diff --git a/.ci/generate_test_report_lib_test.py b/.ci/generate_test_report_lib_test.py
index a8659e1d6a3e3..431e10da6405a 100644
--- a/.ci/generate_test_report_lib_test.py
+++ b/.ci/generate_test_report_lib_test.py
@@ -407,7 +407,6 @@ def test_no_failures_multiple_build_failed_ninja_log(self):
                 ]
             ],
         )
-        print(test)
         self.assertEqual(
             generate_test_report_lib.generate_report(
                 "Foo",
diff --git a/.ci/premerge_advisor_explain.py b/.ci/premerge_advisor_explain.py
index 06c6cb9aaa46b..94f7949332e3a 100644
--- a/.ci/premerge_advisor_explain.py
+++ b/.ci/premerge_advisor_explain.py
@@ -40,7 +40,9 @@ def main(commit_sha: str, build_log_files: list[str]):
             explanation_request["failures"].append(
                 {"name": name, "message": failure_message}
             )
-    advisor_response = requests.get(PREMERGE_ADVISOR_URL, json=explanation_request)
+    advisor_response = requests.get(
+        PREMERGE_ADVISOR_URL, json=explanation_request, timeout=5
+    )
     if advisor_response.status_code == 200:
         print(advisor_response.json())
     else:
diff --git a/.ci/premerge_advisor_upload.py b/.ci/premerge_advisor_upload.py
index cb379b0e77cd6..9e14743c7cc07 100644
--- a/.ci/premerge_advisor_upload.py
+++ b/.ci/premerge_advisor_upload.py
@@ -45,7 +45,7 @@ def main(commit_sha, workflow_run_number, build_log_files):
         for name, failure_message in ninja_failures:
             failure_info["failures"].append({"name": name, "message": failure_message})
     for premerge_advisor_url in PREMERGE_ADVISOR_URLS:
-        requests.post(premerge_advisor_url, json=failure_info)
+        requests.post(premerge_advisor_url, json=failure_info, timeout=5)
 
 
 if __name__ == "__main__":
diff --git a/.github/instructions/lldb.instructions.md b/.github/instructions/lldb.instructions.md
new file mode 100644
index 0000000000000..35bcd27b1b42f
--- /dev/null
+++ b/.github/instructions/lldb.instructions.md
@@ -0,0 +1,79 @@
+---
+applyTo: lldb/**/*
+---
+
+When reviewing code, focus on:
+
+## Language, Libraries & Standards
+
+- Target C++17 and avoid vendor-specific extensions.
+- For Python scripts, follow PEP 8.
+- Prefer standard library or LLVM support libraries instead of reinventing data structures.
+
+## Comments & Documentation
+
+- Each source file should include the standard LLVM file header.
+- Header files must have proper header guards.
+- Non-trivial classes and public methods should have Doxygen documentation.
+- Use `//` or `///` comments normally; avoid block comments unless necessary.
+- Non-trivial code should have comments explaining what it does and why. Avoid comments that explain how it does it at a micro level.
+
+## Language & Compiler Issues
+
+- Write portable code; wrap non-portable code in interfaces.
+- Do not use RTTI or exceptions.
+- Prefer C++-style casts over C-style casts.
+- Do not use static constructors.
+- Use `class` or `struct` consistently; `struct` only for all-public data.
+- When then same class is declared or defined multiple times, make sure it's consistently done using either `class` or `struct`.
+
+## Headers & Library Layering
+
+- Include order: module header → local/private headers → project headers → system headers.
+- Headers must compile standalone (include all dependencies).
+- Maintain proper library layering; avoid circular dependencies.
+- Include minimally; use forward declarations where possible.
+- Keep internal headers private to modules.
+- Use full namespace qualifiers for out-of-line definitions.
+
+## Control Flow & Structure
+
+- Prefer early exits over deep nesting.
+- Do not use `else` after `return`, `continue`, `break`, or `goto`.
+- Encapsulate loops that compute predicates into helper functions.
+
+## Naming
+
+- LLDB's code style differs from LLVM's coding style.
+- Variables are `snake_case`.
+- Functions and methods are `UpperCamelCase`.
+- Static, global and member variables have `s_`, `g_` and `m_` prefixes respectively.
+
+## General Guidelines
+
+- Use `assert` liberally; prefer `llvm_unreachable` for unreachable states.
+- Do not use `using namespace std;` in headers.
+- Provide a virtual method anchor for classes defined in headers.
+- Do not use default labels in fully covered switches over enumerations.
+- Use range-based for loops wherever possible.
+- Capture `end()` outside loops if not using range-based iteration.
+- Including `<iostream>` is forbidded. Use LLVM’s `raw_ostream` instead.
+- Don’t use `inline` when defining a function in a class definition.
+
+## Microscopic Details
+
+- Preserve existing style in modified code.
+- Prefer pre-increment (`++i`) when value is unused.
+- Use `private`, `protected`, or `public` keyword as appropriate to restrict class member visibility.
+- Omit braces for single-statement `if`, `else`, `while`, `for` unless needed.
+
+## Review Style
+
+- Be specific and actionable in feedback.
+- Explain the "why" behind recommendations.
+- Link back to the LLVM Coding Standards: https://llvm.org/docs/CodingStandards.html.
+- Ask clarifying questions when code intent is unclear.
+
+Ignore formatting and assume that's handled by external tools like `clang-format` and `black`.
+Remember that these standards are **guidelines**.
+Always prioritize consistency with the style that is already being used by the surrounding code.
diff --git a/.github/copilot-instructions.md b/.github/instructions/llvm.instructions.md
similarity index 90%
rename from .github/copilot-instructions.md
rename to .github/instructions/llvm.instructions.md
index 03748938700e3..3f1308f51e676 100644
--- a/.github/copilot-instructions.md
+++ b/.github/instructions/llvm.instructions.md
@@ -1,3 +1,7 @@
+---
+applyTo: llvm/**/*
+---
+
 When performing a code review, pay close attention to code modifying a function's
 control flow. Could the change result in the corruption of performance profile
 data? Could the change result in invalid debug information, in particular for
diff --git a/.github/workflows/release-binaries-all.yml b/.github/workflows/release-binaries-all.yml
index 0b52a08202f1a..eef49b5e3625d 100644
--- a/.github/workflows/release-binaries-all.yml
+++ b/.github/workflows/release-binaries-all.yml
@@ -90,7 +90,6 @@ jobs:
         runs-on:
           - ubuntu-22.04
           - ubuntu-22.04-arm
-          - macos-13
           - macos-14
 
     uses: ./.github/workflows/release-binaries.yml
diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml
index 8145926265256..fa73b9d9fe8d0 100644
--- a/.github/workflows/release-binaries.yml
+++ b/.github/workflows/release-binaries.yml
@@ -21,7 +21,6 @@ on:
         options:
           - ubuntu-22.04
           - ubuntu-22.04-arm
-          - macos-13
           - macos-14
 
   workflow_call:
@@ -130,8 +129,6 @@ jobs:
           target_cmake_flags="$target_cmake_flags -DBOOTSTRAP_BOOTSTRAP_COMPILER_RT_ENABLE_IOS=OFF"
           if [ "$RUNNER_ARCH" = "ARM64" ]; then
             arches=arm64
-          else
-            arches=x86_64
           fi
           target_cmake_flags="$target_cmake_flags -DBOOTSTRAP_BOOTSTRAP_DARWIN_osx_ARCHS=$arches -DBOOTSTRAP_BOOTSTRAP_DARWIN_osx_BUILTIN_ARCHS=$arches"
         fi
@@ -147,14 +144,6 @@ jobs:
             build_runs_on="depot-${{ inputs.runs-on }}-16"
             test_runs_on=$build_runs_on
             ;;
-          macos-13)
-            if [ "$GITHUB_EVENT_NAME" = "pull_request" ]; then
-              build_runs_on="${{ inputs.runs-on }}"
-            else
-              build_runs_on="macos-13-large"
-            fi
-            test_runs_on="${{ inputs.runs-on }}"
-            ;;
           macos-14)
             if [ "$GITHUB_EVENT_NAME" = "pull_request" ]; then
               build_runs_on="${{ inputs.runs-on }}"
diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h
index 8960b1984745f..085c0265de3ed 100644
--- a/bolt/include/bolt/Core/BinaryContext.h
+++ b/bolt/include/bolt/Core/BinaryContext.h
@@ -781,11 +781,6 @@ class BinaryContext {
     uint64_t PseudoProbeLooseMatchedSampleCount{0};
     ///   the count of call matched samples
     uint64_t CallMatchedSampleCount{0};
-    ///   the number of stale functions that have matching number of blocks in
-    ///   the profile
-    uint64_t NumStaleFuncsWithEqualBlockCount{0};
-    ///   the number of blocks that have matching size but a differing hash
-    uint64_t NumStaleBlocksWithEqualIcount{0};
   } Stats;
 
   // Original binary execution count stats.
@@ -937,6 +932,16 @@ class BinaryContext {
   std::pair<const MCSymbol *, uint64_t>
   handleAddressRef(uint64_t Address, BinaryFunction &BF, bool IsPCRel);
 
+  /// When \p Address inside function \p BF is a target of a control transfer
+  /// instruction (branch) from another function, return a corresponding symbol
+  /// that should be used by the branch. For example, main or secondary entry
+  /// point.
+  ///
+  /// If \p Address is an invalid destination, such as a constant island, return
+  /// nullptr and mark \p BF as ignored, since we cannot properly handle a
+  /// branch to a constant island.
+  MCSymbol *handleExternalBranchTarget(uint64_t Address, BinaryFunction &BF);
+
   /// Analyze memory contents at the given \p Address and return the type of
   /// memory contents (such as a possible jump table).
   MemoryContentsType analyzeMemoryAt(uint64_t Address, BinaryFunction &BF);
diff --git a/bolt/include/bolt/Passes/MarkRAStates.h b/bolt/include/bolt/Passes/MarkRAStates.h
index 675ab9727142b..202f1dda2aad8 100644
--- a/bolt/include/bolt/Passes/MarkRAStates.h
+++ b/bolt/include/bolt/Passes/MarkRAStates.h
@@ -13,11 +13,16 @@
 #define BOLT_PASSES_MARK_RA_STATES
 
 #include "bolt/Passes/BinaryPasses.h"
+#include <mutex>
 
 namespace llvm {
 namespace bolt {
 
 class MarkRAStates : public BinaryFunctionPass {
+  // setIgnored() is not thread-safe, but the pass is running on functions in
+  // parallel.
+  std::mutex IgnoreMutex;
+
 public:
   explicit MarkRAStates() : BinaryFunctionPass(false) {}
 
diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp
index c33540ada8a05..a383ced1712e3 100644
--- a/bolt/lib/Core/BinaryContext.cpp
+++ b/bolt/lib/Core/BinaryContext.cpp
@@ -518,6 +518,23 @@ BinaryContext::handleAddressRef(uint64_t Address, BinaryFunction &BF,
   return std::make_pair(TargetSymbol, 0);
 }
 
+MCSymbol *BinaryContext::handleExternalBranchTarget(uint64_t Address,
+                                                    BinaryFunction &BF) {
+  if (BF.isInConstantIsland(Address)) {
+    BF.setIgnored();
+    this->outs() << "BOLT-WARNING: ignoring entry point at address 0x"
+                 << Twine::utohexstr(Address)
+                 << " in constant island of function " << BF << '\n';
+    return nullptr;
+  }
+
+  const uint64_t Offset = Address - BF.getAddress();
+  assert(Offset < BF.getSize() &&
+         "Address should be inside the referenced function");
+
+  return Offset ? BF.addEntryPointAtOffset(Offset) : BF.getSymbol();
+}
+
 MemoryContentsType BinaryContext::analyzeMemoryAt(uint64_t Address,
                                                   BinaryFunction &BF) {
   if (!isX86())
@@ -1399,17 +1416,10 @@ void BinaryContext::processInterproceduralReferences() {
             << Function.getPrintName() << " and "
             << TargetFunction->getPrintName() << '\n';
       }
-      if (uint64_t Offset = Address - TargetFunction->getAddress()) {
-        if (!TargetFunction->isInConstantIsland(Address)) {
-          TargetFunction->addEntryPointAtOffset(Offset);
-        } else {
-          TargetFunction->setIgnored();
-          this->outs() << "BOLT-WARNING: Ignoring entry point at address 0x"
-                       << Twine::utohexstr(Address)
-                       << " in constant island of function " << *TargetFunction
-                       << '\n';
-        }
-      }
+
+      // Create an extra entry point if needed. Can also render the target
+      // function ignored if the reference is invalid.
+      handleExternalBranchTarget(Address, *TargetFunction);
 
       continue;
     }
diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp
index 84023efe1084e..ddaad6eef6140 100644
--- a/bolt/lib/Core/BinaryFunction.cpp
+++ b/bolt/lib/Core/BinaryFunction.cpp
@@ -1697,11 +1697,12 @@ bool BinaryFunction::scanExternalRefs() {
       if (!TargetFunction || ignoreFunctionRef(*TargetFunction))
         continue;
 
-      const uint64_t FunctionOffset =
-          TargetAddress - TargetFunction->getAddress();
+      // Get a reference symbol for the function when address is a valid code
+      // reference.
       BranchTargetSymbol =
-          FunctionOffset ? TargetFunction->addEntryPointAtOffset(FunctionOffset)
-                         : TargetFunction->getSymbol();
+          BC.handleExternalBranchTarget(TargetAddress, *TargetFunction);
+      if (!BranchTargetSymbol)
+        continue;
     }
 
     // Can't find more references. Not creating relocations since we are not
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index e1a1856b506cf..1d187de11c35e 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1508,12 +1508,6 @@ Error PrintProgramStats::runOnFunctions(BinaryContext &BC) {
   if (NumAllStaleFunctions) {
     const float PctStale =
         NumAllStaleFunctions / (float)NumAllProfiledFunctions * 100.0f;
-    const float PctStaleFuncsWithEqualBlockCount =
-        (float)BC.Stats.NumStaleFuncsWithEqualBlockCount /
-        NumAllStaleFunctions * 100.0f;
-    const float PctStaleBlocksWithEqualIcount =
-        (float)BC.Stats.NumStaleBlocksWithEqualIcount /
-        BC.Stats.NumStaleBlocks * 100.0f;
     auto printErrorOrWarning = [&]() {
       if (PctStale > opts::StaleThreshold)
         BC.errs() << "BOLT-ERROR: ";
@@ -1536,17 +1530,6 @@ Error PrintProgramStats::runOnFunctions(BinaryContext &BC) {
                 << "%) belong to functions with invalid"
                    " (possibly stale) profile.\n";
     }
-    BC.outs() << "BOLT-INFO: " << BC.Stats.NumStaleFuncsWithEqualBlockCount
-              << " stale function"
-              << (BC.Stats.NumStaleFuncsWithEqualBlockCount == 1 ? "" : "s")
-              << format(" (%.1f%% of all stale)",
-                        PctStaleFuncsWithEqualBlockCount)
-              << " have matching block count.\n";
-    BC.outs() << "BOLT-INFO: " << BC.Stats.NumStaleBlocksWithEqualIcount
-              << " stale block"
-              << (BC.Stats.NumStaleBlocksWithEqualIcount == 1 ? "" : "s")
-              << format(" (%.1f%% of all stale)", PctStaleBlocksWithEqualIcount)
-              << " have matching icount.\n";
     if (PctStale > opts::StaleThreshold) {
       return createFatalBOLTError(
           Twine("BOLT-ERROR: stale functions exceed specified threshold of ") +
diff --git a/bolt/lib/Passes/MarkRAStates.cpp b/bolt/lib/Passes/MarkRAStates.cpp
index af6a5ca7e31e5..b262d66732b7d 100644
--- a/bolt/lib/Passes/MarkRAStates.cpp
+++ b/bolt/lib/Passes/MarkRAStates.cpp
@@ -43,10 +43,11 @@ bool MarkRAStates::runOnFunction(BinaryFunction &BF) {
         // Not all functions have .cfi_negate_ra_state in them. But if one does,
         // we expect psign/pauth instructions to have the hasNegateRAState
         // annotation.
-        BF.setIgnored();
         BC.outs() << "BOLT-INFO: inconsistent RAStates in function "
                   << BF.getPrintName()
                   << ": ptr sign/auth inst without .cfi_negate_ra_state\n";
+        std::lock_guard<std::mutex> Lock(IgnoreMutex);
+        BF.setIgnored();
         return false;
       }
     }
@@ -67,6 +68,7 @@ bool MarkRAStates::runOnFunction(BinaryFunction &BF) {
           BC.outs() << "BOLT-INFO: inconsistent RAStates in function "
                     << BF.getPrintName()
                     << ": ptr signing inst encountered in Signed RA state\n";
+          std::lock_guard<std::mutex> Lock(IgnoreMutex);
           BF.setIgnored();
           return false;
         }
@@ -80,6 +82,7 @@ bool MarkRAStates::runOnFunction(BinaryFunction &BF) {
                     << BF.getPrintName()
                     << ": ptr authenticating inst encountered in Unsigned RA "
                        "state\n";
+          std::lock_guard<std::mutex> Lock(IgnoreMutex);
           BF.setIgnored();
           return false;
         }
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index dc3d918d14bd6..4e062038a3e4c 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -1321,7 +1321,8 @@ std::error_code DataAggregator::parseAggregatedLBREntry() {
     }
 
     using SSI = StringSwitch<int>;
-    AddrNum = SSI(Str).Cases("T", "R", 3).Case("S", 1).Case("E", 0).Default(2);
+    AddrNum =
+        SSI(Str).Cases({"T", "R"}, 3).Case("S", 1).Case("E", 0).Default(2);
     CounterNum = SSI(Str).Case("B", 2).Case("E", 0).Default(1);
   }
 
@@ -2215,7 +2216,7 @@ DataAggregator::writeAggregatedFile(StringRef OutputFilename) const {
     OutFile << "boltedcollection\n";
   if (opts::BasicAggregation) {
     OutFile << "no_lbr";
-    for (const StringMapEntry<std::nullopt_t> &Entry : EventNames)
+    for (const StringMapEntry<EmptyStringSetTag> &Entry : EventNames)
       OutFile << " " << Entry.getKey();
     OutFile << "\n";
 
@@ -2291,7 +2292,7 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC,
 
   ListSeparator LS(",");
   raw_string_ostream EventNamesOS(BP.Header.EventNames);
-  for (const StringMapEntry<std::nullopt_t> &EventEntry : EventNames)
+  for (const StringMapEntry<EmptyStringSetTag> &EventEntry : EventNames)
     EventNamesOS << LS << EventEntry.first().str();
 
   BP.Header.Flags = opts::BasicAggregation ? BinaryFunction::PF_BASIC
diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp b/bolt/lib/Profile/YAMLProfileReader.cpp
index 086e47b661e10..f0f87f9baec38 100644
--- a/bolt/lib/Profile/YAMLProfileReader.cpp
+++ b/bolt/lib/Profile/YAMLProfileReader.cpp
@@ -350,9 +350,6 @@ bool YAMLProfileReader::parseFunctionProfile(
              << MismatchedCalls << " calls, and " << MismatchedEdges
              << " edges in profile did not match function " << BF << '\n';
 
-    if (YamlBF.NumBasicBlocks != BF.size())
-      ++BC.Stats.NumStaleFuncsWithEqualBlockCount;
-
     if (!opts::InferStaleProfile)
       return false;
     ArrayRef<ProbeMatchSpec> ProbeMatchSpecs;
diff --git a/bolt/lib/Profile/YAMLProfileWriter.cpp b/bolt/lib/Profile/YAMLProfileWriter.cpp
index 1632aa1c6bfe2..5c631f93f01da 100644
--- a/bolt/lib/Profile/YAMLProfileWriter.cpp
+++ b/bolt/lib/Profile/YAMLProfileWriter.cpp
@@ -382,7 +382,7 @@ std::error_code YAMLProfileWriter::writeProfile(const RewriteInstance &RI) {
   StringSet<> EventNames = RI.getProfileReader()->getEventNames();
   if (!EventNames.empty()) {
     std::string Sep;
-    for (const StringMapEntry<std::nullopt_t> &EventEntry : EventNames) {
+    for (const StringMapEntry<EmptyStringSetTag> &EventEntry : EventNames) {
       BP.Header.EventNames += Sep + EventEntry.first().str();
       Sep = ",";
     }
diff --git a/bolt/test/AArch64/constant-island-entry.s b/bolt/test/AArch64/constant-island-entry.s
index 6567114eb980a..a82b876fde46d 100644
--- a/bolt/test/AArch64/constant-island-entry.s
+++ b/bolt/test/AArch64/constant-island-entry.s
@@ -1,11 +1,16 @@
-// This test checks that we ignore functions which add an entry point that
-// is in a constant island.
+## This test checks that we ignore functions which add an entry point that
+## is in a constant island.
 
 # RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
 # RUN: %clang %cflags %t.o -pie -Wl,-q -o %t.exe
+
+## Check when the caller is successfully disassembled.
 # RUN: llvm-bolt %t.exe -o %t.bolt 2>&1 | FileCheck %s
 
-# CHECK: BOLT-WARNING: Ignoring entry point at address 0x{{[0-9a-f]+}} in constant island of function func
+## Skip caller to check the identical warning is triggered from ScanExternalRefs().
+# RUN: llvm-bolt %t.exe -o %t.bolt -skip-funcs=caller 2>&1 | FileCheck %s
+
+# CHECK: BOLT-WARNING: ignoring entry point at address 0x{{[0-9a-f]+}} in constant island of function func
 
 .globl func
 .type func, %function
diff --git a/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp b/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp
index 21455db7c7e7b..c4b47a440e44b 100644
--- a/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp
+++ b/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp
@@ -247,7 +247,7 @@ ClangTidyOptions ClangTidyOptions::getDefaults() {
   Options.WarningsAsErrors = "";
   Options.HeaderFileExtensions = {"", "h", "hh", "hpp", "hxx"};
   Options.ImplementationFileExtensions = {"c", "cc", "cpp", "cxx"};
-  Options.HeaderFilterRegex = "";
+  Options.HeaderFilterRegex = ".*";
   Options.ExcludeHeaderFilterRegex = "";
   Options.SystemHeaders = false;
   Options.FormatStyle = "none";
diff --git a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp
index e6115f67656bc..7adff8a641fb8 100644
--- a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp
@@ -61,6 +61,7 @@
 #include "ParentVirtualCallCheck.h"
 #include "PointerArithmeticOnPolymorphicObjectCheck.h"
 #include "PosixReturnCheck.h"
+#include "RawMemoryCallOnNonTrivialTypeCheck.h"
 #include "RedundantBranchConditionCheck.h"
 #include "ReservedIdentifierCheck.h"
 #include "ReturnConstRefFromParameterCheck.h"
@@ -216,6 +217,8 @@ class BugproneModule : public ClangTidyModule {
     CheckFactories.registerCheck<ParentVirtualCallCheck>(
         "bugprone-parent-virtual-call");
     CheckFactories.registerCheck<PosixReturnCheck>("bugprone-posix-return");
+    CheckFactories.registerCheck<RawMemoryCallOnNonTrivialTypeCheck>(
+        "bugprone-raw-memory-call-on-non-trivial-type");
     CheckFactories.registerCheck<ReservedIdentifierCheck>(
         "bugprone-reserved-identifier");
     CheckFactories.registerCheck<SharedPtrArrayMismatchCheck>(
diff --git a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt
index c8943e5b22ef8..c0fdb4daaa305 100644
--- a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt
+++ b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt
@@ -62,6 +62,7 @@ add_clang_library(clangTidyBugproneModule STATIC
   ParentVirtualCallCheck.cpp
   PointerArithmeticOnPolymorphicObjectCheck.cpp
   PosixReturnCheck.cpp
+  RawMemoryCallOnNonTrivialTypeCheck.cpp
   RedundantBranchConditionCheck.cpp
   ReservedIdentifierCheck.cpp
   ReturnConstRefFromParameterCheck.cpp
diff --git a/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.cpp
index 76df992f29fc1..18eb40f4eb6d2 100644
--- a/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.cpp
@@ -20,6 +20,8 @@ namespace clang::tidy::bugprone {
 
 namespace {
 
+// Preserve same name as AST_MATCHER(isCompleteAndHasNoZeroValue)
+// NOLINTNEXTLINE(llvm-prefer-static-over-anonymous-namespace)
 bool isCompleteAndHasNoZeroValue(const EnumDecl *D) {
   const EnumDecl *Definition = D->getDefinition();
   return Definition && Definition->isComplete() &&
diff --git a/clang-tools-extra/clang-tidy/cert/NonTrivialTypesLibcMemoryCallsCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/RawMemoryCallOnNonTrivialTypeCheck.cpp
similarity index 92%
rename from clang-tools-extra/clang-tidy/cert/NonTrivialTypesLibcMemoryCallsCheck.cpp
rename to clang-tools-extra/clang-tidy/bugprone/RawMemoryCallOnNonTrivialTypeCheck.cpp
index e266cf995e8a7..f7f7ccb8877d3 100644
--- a/clang-tools-extra/clang-tidy/cert/NonTrivialTypesLibcMemoryCallsCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/RawMemoryCallOnNonTrivialTypeCheck.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "NonTrivialTypesLibcMemoryCallsCheck.h"
+#include "RawMemoryCallOnNonTrivialTypeCheck.h"
 #include "../utils/OptionsUtils.h"
 #include "clang/AST/Decl.h"
 #include "clang/ASTMatchers/ASTMatchFinder.h"
@@ -17,7 +17,7 @@
 
 using namespace clang::ast_matchers;
 
-namespace clang::tidy::cert {
+namespace clang::tidy::bugprone {
 
 namespace {
 AST_MATCHER(CXXRecordDecl, isTriviallyDefaultConstructible) {
@@ -48,22 +48,21 @@ static constexpr llvm::StringRef ComparisonOperators[] = {
     "operator==", "operator!=", "operator<",
     "operator>",  "operator<=", "operator>="};
 
-NonTrivialTypesLibcMemoryCallsCheck::NonTrivialTypesLibcMemoryCallsCheck(
+RawMemoryCallOnNonTrivialTypeCheck::RawMemoryCallOnNonTrivialTypeCheck(
     StringRef Name, ClangTidyContext *Context)
     : ClangTidyCheck(Name, Context),
       MemSetNames(Options.get("MemSetNames", "")),
       MemCpyNames(Options.get("MemCpyNames", "")),
       MemCmpNames(Options.get("MemCmpNames", "")) {}
 
-void NonTrivialTypesLibcMemoryCallsCheck::storeOptions(
+void RawMemoryCallOnNonTrivialTypeCheck::storeOptions(
     ClangTidyOptions::OptionMap &Opts) {
   Options.store(Opts, "MemSetNames", MemSetNames);
   Options.store(Opts, "MemCpyNames", MemCpyNames);
   Options.store(Opts, "MemCmpNames", MemCmpNames);
 }
 
-void NonTrivialTypesLibcMemoryCallsCheck::registerMatchers(
-    MatchFinder *Finder) {
+void RawMemoryCallOnNonTrivialTypeCheck::registerMatchers(MatchFinder *Finder) {
   using namespace ast_matchers::internal;
   auto IsStructPointer = [](Matcher<CXXRecordDecl> Constraint = anything(),
                             bool Bind = false) {
@@ -103,7 +102,7 @@ void NonTrivialTypesLibcMemoryCallsCheck::registerMatchers(
       this);
 }
 
-void NonTrivialTypesLibcMemoryCallsCheck::check(
+void RawMemoryCallOnNonTrivialTypeCheck::check(
     const MatchFinder::MatchResult &Result) {
   if (const auto *Caller = Result.Nodes.getNodeAs<CallExpr>("lazyConstruct")) {
     diag(Caller->getBeginLoc(), "calling %0 on a non-trivially default "
@@ -122,4 +121,4 @@ void NonTrivialTypesLibcMemoryCallsCheck::check(
   }
 }
 
-} // namespace clang::tidy::cert
+} // namespace clang::tidy::bugprone
diff --git a/clang-tools-extra/clang-tidy/cert/NonTrivialTypesLibcMemoryCallsCheck.h b/clang-tools-extra/clang-tidy/bugprone/RawMemoryCallOnNonTrivialTypeCheck.h
similarity index 59%
rename from clang-tools-extra/clang-tidy/cert/NonTrivialTypesLibcMemoryCallsCheck.h
rename to clang-tools-extra/clang-tidy/bugprone/RawMemoryCallOnNonTrivialTypeCheck.h
index 4589ce444c878..002aac6d37bfb 100644
--- a/clang-tools-extra/clang-tidy/cert/NonTrivialTypesLibcMemoryCallsCheck.h
+++ b/clang-tools-extra/clang-tidy/bugprone/RawMemoryCallOnNonTrivialTypeCheck.h
@@ -6,22 +6,21 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_NONTRIVIALTYPESLIBCMEMORYCALLSCHECK_H
-#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_NONTRIVIALTYPESLIBCMEMORYCALLSCHECK_H
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_RAWMEMORYCALLONNONTRIVIALTYPECHECK_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_RAWMEMORYCALLONNONTRIVIALTYPECHECK_H
 
 #include "../ClangTidyCheck.h"
 
-namespace clang::tidy::cert {
+namespace clang::tidy::bugprone {
 
-/// Flags use of the `C` standard library functions 'memset', 'memcpy' and
+/// Flags use of the C standard library functions 'memset', 'memcpy' and
 /// 'memcmp' and similar derivatives on non-trivial types.
 ///
 /// For the user-facing documentation see:
-/// https://clang.llvm.org/extra/clang-tidy/checks/cert/oop57-cpp.html
-class NonTrivialTypesLibcMemoryCallsCheck : public ClangTidyCheck {
+/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/raw-memory-call-on-non-trivial-type.html
+class RawMemoryCallOnNonTrivialTypeCheck : public ClangTidyCheck {
 public:
-  NonTrivialTypesLibcMemoryCallsCheck(StringRef Name,
-                                      ClangTidyContext *Context);
+  RawMemoryCallOnNonTrivialTypeCheck(StringRef Name, ClangTidyContext *Context);
   bool isLanguageVersionSupported(const LangOptions &LangOpts) const override {
     return LangOpts.CPlusPlus && !LangOpts.ObjC;
   }
@@ -35,6 +34,6 @@ class NonTrivialTypesLibcMemoryCallsCheck : public ClangTidyCheck {
   const StringRef MemCmpNames;
 };
 
-} // namespace clang::tidy::cert
+} // namespace clang::tidy::bugprone
 
-#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_NONTRIVIALTYPESLIBCMEMORYCALLSCHECK_H
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_RAWMEMORYCALLONNONTRIVIALTYPECHECK_H
diff --git a/clang-tools-extra/clang-tidy/bugprone/SmartPtrArrayMismatchCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SmartPtrArrayMismatchCheck.cpp
index ee797ecb694bd..f76e4a722a508 100644
--- a/clang-tools-extra/clang-tidy/bugprone/SmartPtrArrayMismatchCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/SmartPtrArrayMismatchCheck.cpp
@@ -15,13 +15,11 @@ using namespace clang::ast_matchers;
 
 namespace clang::tidy::bugprone {
 
-namespace {
+static constexpr char ConstructExprN[] = "found_construct_expr";
+static constexpr char NewExprN[] = "found_new_expr";
+static constexpr char ConstructorN[] = "found_constructor";
 
-constexpr char ConstructExprN[] = "found_construct_expr";
-constexpr char NewExprN[] = "found_new_expr";
-constexpr char ConstructorN[] = "found_constructor";
-
-bool isInSingleDeclStmt(const DeclaratorDecl *D) {
+static bool isInSingleDeclStmt(const DeclaratorDecl *D) {
   const DynTypedNodeList Parents =
       D->getASTContext().getParentMapContext().getParents(*D);
   for (const DynTypedNode &PNode : Parents)
@@ -30,8 +28,8 @@ bool isInSingleDeclStmt(const DeclaratorDecl *D) {
   return false;
 }
 
-const DeclaratorDecl *getConstructedVarOrField(const Expr *FoundConstructExpr,
-                                               ASTContext &Ctx) {
+static const DeclaratorDecl *
+getConstructedVarOrField(const Expr *FoundConstructExpr, ASTContext &Ctx) {
   const DynTypedNodeList ConstructParents =
       Ctx.getParentMapContext().getParents(*FoundConstructExpr);
   if (ConstructParents.size() != 1)
@@ -43,8 +41,6 @@ const DeclaratorDecl *getConstructedVarOrField(const Expr *FoundConstructExpr,
   return nullptr;
 }
 
-} // namespace
-
 const char SmartPtrArrayMismatchCheck::PointerTypeN[] = "pointer_type";
 
 SmartPtrArrayMismatchCheck::SmartPtrArrayMismatchCheck(
diff --git a/clang-tools-extra/clang-tidy/bugprone/UncheckedStringToNumberConversionCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UncheckedStringToNumberConversionCheck.cpp
index d1e7b895f9a35..d0bf72b35ba8f 100644
--- a/clang-tools-extra/clang-tidy/bugprone/UncheckedStringToNumberConversionCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/UncheckedStringToNumberConversionCheck.cpp
@@ -51,7 +51,7 @@ enum class ConversionKind {
 
 static ConversionKind classifyConversionFunc(const FunctionDecl *FD) {
   return llvm::StringSwitch<ConversionKind>(FD->getName())
-      .Cases("atoi", "atol", ConversionKind::ToInt)
+      .Cases({"atoi", "atol"}, ConversionKind::ToInt)
       .Case("atoll", ConversionKind::ToLongInt)
       .Case("atof", ConversionKind::ToDouble)
       .Default(ConversionKind::None);
diff --git a/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp b/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp
index c1ca2cec7a1eb..fa1eb4abc1dd8 100644
--- a/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp
@@ -12,6 +12,7 @@
 #include "../bugprone/BadSignalToKillThreadCheck.h"
 #include "../bugprone/CommandProcessorCheck.h"
 #include "../bugprone/PointerArithmeticOnPolymorphicObjectCheck.h"
+#include "../bugprone/RawMemoryCallOnNonTrivialTypeCheck.h"
 #include "../bugprone/ReservedIdentifierCheck.h"
 #include "../bugprone/SignalHandlerCheck.h"
 #include "../bugprone/SignedCharMisuseCheck.h"
@@ -39,7 +40,6 @@
 #include "FloatLoopCounter.h"
 #include "LimitedRandomnessCheck.h"
 #include "MutatingCopyCheck.h"
-#include "NonTrivialTypesLibcMemoryCallsCheck.h"
 #include "ProperlySeededRandomGeneratorCheck.h"
 #include "ThrownExceptionTypeCheck.h"
 
@@ -278,7 +278,7 @@ class CERTModule : public ClangTidyModule {
         "cert-oop11-cpp");
     CheckFactories.registerCheck<bugprone::UnhandledSelfAssignmentCheck>(
         "cert-oop54-cpp");
-    CheckFactories.registerCheck<NonTrivialTypesLibcMemoryCallsCheck>(
+    CheckFactories.registerCheck<bugprone::RawMemoryCallOnNonTrivialTypeCheck>(
         "cert-oop57-cpp");
     CheckFactories.registerCheck<MutatingCopyCheck>("cert-oop58-cpp");
 
diff --git a/clang-tools-extra/clang-tidy/cert/CMakeLists.txt b/clang-tools-extra/clang-tidy/cert/CMakeLists.txt
index 453d1d30921e9..ce57faadcf749 100644
--- a/clang-tools-extra/clang-tidy/cert/CMakeLists.txt
+++ b/clang-tools-extra/clang-tidy/cert/CMakeLists.txt
@@ -10,7 +10,6 @@ add_clang_library(clangTidyCERTModule STATIC
   FloatLoopCounter.cpp
   LimitedRandomnessCheck.cpp
   MutatingCopyCheck.cpp
-  NonTrivialTypesLibcMemoryCallsCheck.cpp
   ProperlySeededRandomGeneratorCheck.cpp
   ThrownExceptionTypeCheck.cpp
 
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidNonConstGlobalVariablesCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidNonConstGlobalVariablesCheck.cpp
index f0e66e44690b2..2c0baa5716954 100644
--- a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidNonConstGlobalVariablesCheck.cpp
+++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidNonConstGlobalVariablesCheck.cpp
@@ -17,7 +17,8 @@ namespace clang::tidy::cppcoreguidelines {
 AvoidNonConstGlobalVariablesCheck::AvoidNonConstGlobalVariablesCheck(
     StringRef Name, ClangTidyContext *Context)
     : ClangTidyCheck(Name, Context),
-      AllowInternalLinkage(Options.get("AllowInternalLinkage", false)) {}
+      AllowInternalLinkage(Options.get("AllowInternalLinkage", false)),
+      AllowThreadLocal(Options.get("AllowThreadLocal", false)) {}
 
 void AvoidNonConstGlobalVariablesCheck::registerMatchers(MatchFinder *Finder) {
   auto NamespaceMatcher = AllowInternalLinkage
@@ -31,6 +32,8 @@ void AvoidNonConstGlobalVariablesCheck::registerMatchers(MatchFinder *Finder) {
       GlobalContext,
       AllowInternalLinkage ? varDecl(unless(isStaticStorageClass()))
                            : varDecl(),
+      AllowThreadLocal ? varDecl(unless(hasThreadStorageDuration()))
+                       : varDecl(),
       unless(anyOf(
           isConstexpr(), hasType(isConstQualified()),
           hasType(referenceType())))); // References can't be changed, only the
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidNonConstGlobalVariablesCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidNonConstGlobalVariablesCheck.h
index 5e7c968b12f97..d8f2a733e3b01 100644
--- a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidNonConstGlobalVariablesCheck.h
+++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidNonConstGlobalVariablesCheck.h
@@ -27,6 +27,7 @@ class AvoidNonConstGlobalVariablesCheck : public ClangTidyCheck {
 
 private:
   const bool AllowInternalLinkage;
+  const bool AllowThreadLocal;
 };
 
 } // namespace clang::tidy::cppcoreguidelines
diff --git a/clang-tools-extra/clang-tidy/google/FunctionNamingCheck.cpp b/clang-tools-extra/clang-tidy/google/FunctionNamingCheck.cpp
index 3d75f4dd25bd1..ce0e4e6896f37 100644
--- a/clang-tools-extra/clang-tidy/google/FunctionNamingCheck.cpp
+++ b/clang-tools-extra/clang-tidy/google/FunctionNamingCheck.cpp
@@ -14,9 +14,7 @@ using namespace clang::ast_matchers;
 
 namespace clang::tidy::google::objc {
 
-namespace {
-
-std::string validFunctionNameRegex(bool RequirePrefix) {
+static std::string validFunctionNameRegex(bool RequirePrefix) {
   // Allow the following name patterns for all functions:
   // • ABFoo (prefix + UpperCamelCase)
   // • ABURL (prefix + capitalized acronym/initialism)
@@ -43,7 +41,7 @@ std::string validFunctionNameRegex(bool RequirePrefix) {
 /// For now we will only fix functions of static storage class with names like
 /// 'functionName' or 'function_name' and convert them to 'FunctionName'. For
 /// other cases the user must determine an appropriate name on their own.
-FixItHint generateFixItHint(const FunctionDecl *Decl) {
+static FixItHint generateFixItHint(const FunctionDecl *Decl) {
   // A fixit can be generated for functions of static storage class but
   // otherwise the check cannot determine the appropriate function name prefix
   // to use.
@@ -82,8 +80,6 @@ FixItHint generateFixItHint(const FunctionDecl *Decl) {
   return {};
 }
 
-} // namespace
-
 void FunctionNamingCheck::registerMatchers(MatchFinder *Finder) {
   // Enforce Objective-C function naming conventions on all functions except:
   // • Functions defined in system headers.
diff --git a/clang-tools-extra/clang-tidy/llvm/UseNewMLIROpBuilderCheck.cpp b/clang-tools-extra/clang-tidy/llvm/UseNewMLIROpBuilderCheck.cpp
index bd51cc5037dca..0014153cceaa3 100644
--- a/clang-tools-extra/clang-tidy/llvm/UseNewMLIROpBuilderCheck.cpp
+++ b/clang-tools-extra/clang-tidy/llvm/UseNewMLIROpBuilderCheck.cpp
@@ -18,18 +18,15 @@
 #include "llvm/Support/FormatVariadic.h"
 
 namespace clang::tidy::llvm_check {
-namespace {
 
 using namespace ::clang::ast_matchers;
 using namespace ::clang::transformer;
 
-EditGenerator rewrite(RangeSelector Call, RangeSelector Builder,
-                      RangeSelector CallArgs) {
+static EditGenerator rewrite(RangeSelector Call, RangeSelector Builder) {
   // This is using an EditGenerator rather than ASTEdit as we want to warn even
   // if in macro.
-  return [Call = std::move(Call), Builder = std::move(Builder),
-          CallArgs =
-              std::move(CallArgs)](const MatchFinder::MatchResult &Result)
+  return [Call = std::move(Call),
+          Builder = std::move(Builder)](const MatchFinder::MatchResult &Result)
              -> Expected<SmallVector<transformer::Edit, 1>> {
     Expected<CharSourceRange> CallRange = Call(Result);
     if (!CallRange)
@@ -54,7 +51,7 @@ EditGenerator rewrite(RangeSelector Call, RangeSelector Builder,
     auto NextToken = [&](std::optional<Token> CurrentToken) {
       if (!CurrentToken)
         return CurrentToken;
-      if (CurrentToken->getEndLoc() >= CallRange->getEnd())
+      if (CurrentToken->is(clang::tok::eof))
         return std::optional<Token>();
       return clang::Lexer::findNextToken(CurrentToken->getLocation(), SM,
                                          LangOpts);
@@ -68,9 +65,10 @@ EditGenerator rewrite(RangeSelector Call, RangeSelector Builder,
       return llvm::make_error<llvm::StringError>(llvm::errc::invalid_argument,
                                                  "missing '<' token");
     }
+
     std::optional<Token> EndToken = NextToken(LessToken);
-    for (std::optional<Token> GreaterToken = NextToken(EndToken);
-         GreaterToken && GreaterToken->getKind() != clang::tok::greater;
+    std::optional<Token> GreaterToken = NextToken(EndToken);
+    for (; GreaterToken && GreaterToken->getKind() != clang::tok::greater;
          GreaterToken = NextToken(GreaterToken)) {
       EndToken = GreaterToken;
     }
@@ -79,12 +77,21 @@ EditGenerator rewrite(RangeSelector Call, RangeSelector Builder,
                                                  "missing '>' token");
     }
 
+    std::optional<Token> ArgStart = NextToken(GreaterToken);
+    if (!ArgStart || ArgStart->getKind() != clang::tok::l_paren) {
+      return llvm::make_error<llvm::StringError>(llvm::errc::invalid_argument,
+                                                 "missing '(' token");
+    }
+    std::optional<Token> Arg = NextToken(ArgStart);
+    if (!Arg) {
+      return llvm::make_error<llvm::StringError>(llvm::errc::invalid_argument,
+                                                 "unexpected end of file");
+    }
+    const bool HasArgs = Arg->getKind() != clang::tok::r_paren;
+
     Expected<CharSourceRange> BuilderRange = Builder(Result);
     if (!BuilderRange)
       return BuilderRange.takeError();
-    Expected<CharSourceRange> CallArgsRange = CallArgs(Result);
-    if (!CallArgsRange)
-      return CallArgsRange.takeError();
 
     // Helper for concatting below.
     auto GetText = [&](const CharSourceRange &Range) {
@@ -93,43 +100,42 @@ EditGenerator rewrite(RangeSelector Call, RangeSelector Builder,
 
     Edit Replace;
     Replace.Kind = EditKind::Range;
-    Replace.Range = *CallRange;
-    std::string CallArgsStr;
-    // Only emit args if there are any.
-    if (auto CallArgsText = GetText(*CallArgsRange).ltrim();
-        !CallArgsText.rtrim().empty()) {
-      CallArgsStr = llvm::formatv(", {}", CallArgsText);
+    Replace.Range.setBegin(CallRange->getBegin());
+    Replace.Range.setEnd(ArgStart->getEndLoc());
+    const Expr *BuilderExpr = Result.Nodes.getNodeAs<Expr>("builder");
+    std::string BuilderText = GetText(*BuilderRange).str();
+    if (BuilderExpr->getType()->isPointerType()) {
+      BuilderText = BuilderExpr->isImplicitCXXThis()
+                        ? "*this"
+                        : llvm::formatv("*{}", BuilderText).str();
     }
-    Replace.Replacement =
-        llvm::formatv("{}::create({}{})",
-                      GetText(CharSourceRange::getTokenRange(
-                          LessToken->getEndLoc(), EndToken->getLastLoc())),
-                      GetText(*BuilderRange), CallArgsStr);
+    const StringRef OpType = GetText(CharSourceRange::getTokenRange(
+        LessToken->getEndLoc(), EndToken->getLastLoc()));
+    Replace.Replacement = llvm::formatv("{}::create({}{}", OpType, BuilderText,
+                                        HasArgs ? ", " : "");
 
     return SmallVector<Edit, 1>({Replace});
   };
 }
 
-RewriteRuleWith<std::string> useNewMlirOpBuilderCheckRule() {
+static RewriteRuleWith<std::string> useNewMlirOpBuilderCheckRule() {
   Stencil Message = cat("use 'OpType::create(builder, ...)' instead of "
                         "'builder.create<OpType>(...)'");
   // Match a create call on an OpBuilder.
+  auto BuilderType = cxxRecordDecl(isSameOrDerivedFrom("::mlir::OpBuilder"));
   ast_matchers::internal::Matcher<Stmt> Base =
       cxxMemberCallExpr(
-          on(expr(hasType(
-                      cxxRecordDecl(isSameOrDerivedFrom("::mlir::OpBuilder"))))
+          on(expr(anyOf(hasType(BuilderType), hasType(pointsTo(BuilderType))))
                  .bind("builder")),
-          callee(cxxMethodDecl(hasTemplateArgument(0, templateArgument()))),
-          callee(cxxMethodDecl(hasName("create"))))
+          callee(cxxMethodDecl(hasTemplateArgument(0, templateArgument()),
+                               hasName("create"))))
           .bind("call");
   return applyFirst(
       //  Attempt rewrite given an lvalue builder, else just warn.
       {makeRule(cxxMemberCallExpr(unless(on(cxxTemporaryObjectExpr())), Base),
-                rewrite(node("call"), node("builder"), callArgs("call")),
-                Message),
+                rewrite(node("call"), node("builder")), Message),
        makeRule(Base, noopEdit(node("call")), Message)});
 }
-} // namespace
 
 UseNewMlirOpBuilderCheck::UseNewMlirOpBuilderCheck(StringRef Name,
                                                    ClangTidyContext *Context)
diff --git a/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.cpp b/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.cpp
index 415852d6f14e9..1d2706499dab5 100644
--- a/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.cpp
+++ b/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.cpp
@@ -43,12 +43,6 @@ struct OptionEnumMapping<misc::UseInternalLinkageCheck::FixModeKind> {
 
 namespace clang::tidy::misc {
 
-namespace {
-
-AST_MATCHER(Decl, isFirstDecl) { return Node.isFirstDecl(); }
-
-AST_MATCHER(FunctionDecl, hasBody) { return Node.hasBody(); }
-
 static bool isInMainFile(SourceLocation L, SourceManager &SM,
                          const FileExtensionsSet &HeaderFileExtensions) {
   for (;;) {
@@ -65,6 +59,12 @@ static bool isInMainFile(SourceLocation L, SourceManager &SM,
   }
 }
 
+namespace {
+
+AST_MATCHER(Decl, isFirstDecl) { return Node.isFirstDecl(); }
+
+AST_MATCHER(FunctionDecl, hasBody) { return Node.hasBody(); }
+
 AST_MATCHER_P(Decl, isAllRedeclsInMainFile, FileExtensionsSet,
               HeaderFileExtensions) {
   return llvm::all_of(Node.redecls(), [&](const Decl *D) {
diff --git a/clang-tools-extra/clang-tidy/modernize/AvoidCArraysCheck.cpp b/clang-tools-extra/clang-tidy/modernize/AvoidCArraysCheck.cpp
index 92900192957e5..71d89d3ab6098 100644
--- a/clang-tools-extra/clang-tidy/modernize/AvoidCArraysCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/AvoidCArraysCheck.cpp
@@ -15,6 +15,14 @@ using namespace clang::ast_matchers;
 
 namespace clang::tidy::modernize {
 
+template <typename TargetType, typename NodeType>
+static const TargetType *getAs(const NodeType *Node) {
+  if constexpr (std::is_same_v<NodeType, clang::DynTypedNode>)
+    return Node->template get<TargetType>();
+  else
+    return llvm::dyn_cast<TargetType>(Node);
+}
+
 namespace {
 
 AST_MATCHER(clang::TypeLoc, hasValidBeginLoc) {
@@ -39,14 +47,6 @@ AST_MATCHER(clang::ParmVarDecl, isArgvOfMain) {
   return FD ? FD->isMain() : false;
 }
 
-template <typename TargetType, typename NodeType>
-const TargetType *getAs(const NodeType *Node) {
-  if constexpr (std::is_same_v<NodeType, clang::DynTypedNode>)
-    return Node->template get<TargetType>();
-  else
-    return llvm::dyn_cast<TargetType>(Node);
-}
-
 AST_MATCHER(clang::TypeLoc, isWithinImplicitTemplateInstantiation) {
   const auto IsImplicitTemplateInstantiation = [](const auto *Node) {
     const auto IsImplicitInstantiation = [](const auto *Node) {
diff --git a/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp
index 01796a6f4af2d..084349be7b609 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp
@@ -20,14 +20,13 @@ using namespace clang::ast_matchers;
 using namespace clang::ast_matchers::internal;
 
 namespace clang::tidy::modernize {
-namespace {
 
-const char IteratorDeclStmtId[] = "iterator_decl";
-const char DeclWithNewId[] = "decl_new";
-const char DeclWithCastId[] = "decl_cast";
-const char DeclWithTemplateCastId[] = "decl_template";
+static const char IteratorDeclStmtId[] = "iterator_decl";
+static const char DeclWithNewId[] = "decl_new";
+static const char DeclWithCastId[] = "decl_cast";
+static const char DeclWithTemplateCastId[] = "decl_template";
 
-size_t getTypeNameLength(bool RemoveStars, StringRef Text) {
+static size_t getTypeNameLength(bool RemoveStars, StringRef Text) {
   enum CharType { Space, Alpha, Punctuation };
   CharType LastChar = Space, BeforeSpace = Punctuation;
   size_t NumChars = 0;
@@ -54,6 +53,7 @@ size_t getTypeNameLength(bool RemoveStars, StringRef Text) {
   return NumChars;
 }
 
+namespace {
 /// Matches variable declarations that have explicit initializers that
 /// are not initializer lists.
 ///
@@ -65,7 +65,7 @@ size_t getTypeNameLength(bool RemoveStars, StringRef Text) {
 ///   MyType C;
 /// \endcode
 ///
-/// varDecl(hasWrittenNonListInitializer()) maches \c I and \c A but not \c B
+/// varDecl(hasWrittenNonListInitializer()) matches \c I and \c A but not \c B
 /// or \c C.
 AST_MATCHER(VarDecl, hasWrittenNonListInitializer) {
   const Expr *Init = Node.getAnyInitializer();
@@ -108,6 +108,15 @@ AST_MATCHER_P(QualType, isSugarFor, Matcher<QualType>, SugarMatcher) {
   }
 }
 
+/// Matches declaration reference or member expressions with explicit template
+/// arguments.
+AST_POLYMORPHIC_MATCHER(hasExplicitTemplateArgs,
+                        AST_POLYMORPHIC_SUPPORTED_TYPES(DeclRefExpr,
+                                                        MemberExpr)) {
+  return Node.hasExplicitTemplateArgs();
+}
+} // namespace
+
 /// Matches named declarations that have one of the standard iterator
 /// names: iterator, reverse_iterator, const_iterator, const_reverse_iterator.
 ///
@@ -118,7 +127,7 @@ AST_MATCHER_P(QualType, isSugarFor, Matcher<QualType>, SugarMatcher) {
 /// \endcode
 ///
 /// namedDecl(hasStdIteratorName()) matches \c I and \c CI.
-Matcher<NamedDecl> hasStdIteratorName() {
+static Matcher<NamedDecl> hasStdIteratorName() {
   static const StringRef IteratorNames[] = {"iterator", "reverse_iterator",
                                             "const_iterator",
                                             "const_reverse_iterator"};
@@ -137,7 +146,7 @@ Matcher<NamedDecl> hasStdIteratorName() {
 ///
 /// recordDecl(hasStdContainerName()) matches \c vector and \c forward_list
 /// but not \c my_vec.
-Matcher<NamedDecl> hasStdContainerName() {
+static Matcher<NamedDecl> hasStdContainerName() {
   static StringRef ContainerNames[] = {"array",         "deque",
                                        "forward_list",  "list",
                                        "vector",
@@ -154,17 +163,9 @@ Matcher<NamedDecl> hasStdContainerName() {
   return hasAnyName(ContainerNames);
 }
 
-/// Matches declaration reference or member expressions with explicit template
-/// arguments.
-AST_POLYMORPHIC_MATCHER(hasExplicitTemplateArgs,
-                        AST_POLYMORPHIC_SUPPORTED_TYPES(DeclRefExpr,
-                                                        MemberExpr)) {
-  return Node.hasExplicitTemplateArgs();
-}
-
 /// Returns a DeclarationMatcher that matches standard iterators nested
 /// inside records with a standard container name.
-DeclarationMatcher standardIterator() {
+static DeclarationMatcher standardIterator() {
   return decl(
       namedDecl(hasStdIteratorName()),
       hasDeclContext(recordDecl(hasStdContainerName(), isInStdNamespace())));
@@ -172,19 +173,19 @@ DeclarationMatcher standardIterator() {
 
 /// Returns a TypeMatcher that matches typedefs for standard iterators
 /// inside records with a standard container name.
-TypeMatcher typedefIterator() {
+static TypeMatcher typedefIterator() {
   return typedefType(hasDeclaration(standardIterator()));
 }
 
 /// Returns a TypeMatcher that matches records named for standard
 /// iterators nested inside records named for standard containers.
-TypeMatcher nestedIterator() {
+static TypeMatcher nestedIterator() {
   return recordType(hasDeclaration(standardIterator()));
 }
 
 /// Returns a TypeMatcher that matches types declared with using
 /// declarations and which name standard iterators for standard containers.
-TypeMatcher iteratorFromUsingDeclaration() {
+static TypeMatcher iteratorFromUsingDeclaration() {
   auto HasIteratorDecl = hasDeclaration(namedDecl(hasStdIteratorName()));
   // Unwrap the nested name specifier to test for one of the standard
   // containers.
@@ -198,7 +199,7 @@ TypeMatcher iteratorFromUsingDeclaration() {
 
 /// This matcher returns declaration statements that contain variable
 /// declarations with written non-list initializer for standard iterators.
-StatementMatcher makeIteratorDeclMatcher() {
+static StatementMatcher makeIteratorDeclMatcher() {
   return declStmt(unless(has(
                       varDecl(anyOf(unless(hasWrittenNonListInitializer()),
                                     unless(hasType(isSugarFor(anyOf(
@@ -207,7 +208,7 @@ StatementMatcher makeIteratorDeclMatcher() {
       .bind(IteratorDeclStmtId);
 }
 
-StatementMatcher makeDeclWithNewMatcher() {
+static StatementMatcher makeDeclWithNewMatcher() {
   return declStmt(
              unless(has(varDecl(anyOf(
                  unless(hasInitializer(ignoringParenImpCasts(cxxNewExpr()))),
@@ -225,13 +226,13 @@ StatementMatcher makeDeclWithNewMatcher() {
       .bind(DeclWithNewId);
 }
 
-StatementMatcher makeDeclWithCastMatcher() {
+static StatementMatcher makeDeclWithCastMatcher() {
   return declStmt(
              unless(has(varDecl(unless(hasInitializer(explicitCastExpr()))))))
       .bind(DeclWithCastId);
 }
 
-StatementMatcher makeDeclWithTemplateCastMatcher() {
+static StatementMatcher makeDeclWithTemplateCastMatcher() {
   auto ST =
       substTemplateTypeParmType(hasReplacementType(equalsBoundNode("arg")));
 
@@ -252,7 +253,7 @@ StatementMatcher makeDeclWithTemplateCastMatcher() {
       .bind(DeclWithTemplateCastId);
 }
 
-StatementMatcher makeCombinedMatcher() {
+static StatementMatcher makeCombinedMatcher() {
   return declStmt(
       // At least one varDecl should be a child of the declStmt to ensure
       // it's a declaration list and avoid matching other declarations,
@@ -265,8 +266,6 @@ StatementMatcher makeCombinedMatcher() {
             makeDeclWithCastMatcher(), makeDeclWithTemplateCastMatcher()));
 }
 
-} // namespace
-
 UseAutoCheck::UseAutoCheck(StringRef Name, ClangTidyContext *Context)
     : ClangTidyCheck(Name, Context),
       MinTypeNameLength(Options.get("MinTypeNameLength", 5)),
diff --git a/clang-tools-extra/clang-tidy/modernize/UseDesignatedInitializersCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseDesignatedInitializersCheck.cpp
index cc7c2d1e1dff5..c1094b1fc194a 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseDesignatedInitializersCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/UseDesignatedInitializersCheck.cpp
@@ -40,6 +40,12 @@ static constexpr char StrictCppStandardComplianceName[] =
     "StrictCppStandardCompliance";
 static constexpr bool StrictCppStandardComplianceDefault = true;
 
+static unsigned getNumberOfDesignated(const InitListExpr *SyntacticInitList) {
+  return llvm::count_if(*SyntacticInitList, [](auto *InitExpr) {
+    return isa<DesignatedInitExpr>(InitExpr);
+  });
+}
+
 namespace {
 
 struct Designators {
@@ -74,12 +80,6 @@ struct Designators {
   }
 };
 
-unsigned getNumberOfDesignated(const InitListExpr *SyntacticInitList) {
-  return llvm::count_if(*SyntacticInitList, [](auto *InitExpr) {
-    return isa<DesignatedInitExpr>(InitExpr);
-  });
-}
-
 AST_MATCHER(CXXRecordDecl, isAggregate) {
   return Node.hasDefinition() && Node.isAggregate();
 }
diff --git a/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.cpp
index ade0085267db3..e585dd1d40002 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.cpp
@@ -81,41 +81,44 @@ AST_MATCHER(CXXMemberCallExpr, hasSameNumArgsAsDeclNumParams) {
 AST_MATCHER(DeclRefExpr, hasExplicitTemplateArgs) {
   return Node.hasExplicitTemplateArgs();
 }
+} // namespace
 
 // Helper Matcher which applies the given QualType Matcher either directly or by
 // resolving a pointer type to its pointee. Used to match v.push_back() as well
 // as p->push_back().
-auto hasTypeOrPointeeType(
+static auto hasTypeOrPointeeType(
     const ast_matchers::internal::Matcher<QualType> &TypeMatcher) {
   return anyOf(hasType(TypeMatcher),
                hasType(pointerType(pointee(TypeMatcher))));
 }
 
 // Matches if the node has canonical type matching any of the given names.
-auto hasWantedType(llvm::ArrayRef<StringRef> TypeNames) {
+static auto hasWantedType(llvm::ArrayRef<StringRef> TypeNames) {
   return hasCanonicalType(hasDeclaration(cxxRecordDecl(hasAnyName(TypeNames))));
 }
 
 // Matches member call expressions of the named method on the listed container
 // types.
-auto cxxMemberCallExprOnContainer(StringRef MethodName,
-                                  llvm::ArrayRef<StringRef> ContainerNames) {
+static auto
+cxxMemberCallExprOnContainer(StringRef MethodName,
+                             llvm::ArrayRef<StringRef> ContainerNames) {
   return cxxMemberCallExpr(
       hasDeclaration(functionDecl(hasName(MethodName))),
       on(hasTypeOrPointeeType(hasWantedType(ContainerNames))));
 }
 
-const auto DefaultContainersWithPushBack =
+static const auto DefaultContainersWithPushBack =
     "::std::vector; ::std::list; ::std::deque";
-const auto DefaultContainersWithPush =
+static const auto DefaultContainersWithPush =
     "::std::stack; ::std::queue; ::std::priority_queue";
-const auto DefaultContainersWithPushFront =
+static const auto DefaultContainersWithPushFront =
     "::std::forward_list; ::std::list; ::std::deque";
-const auto DefaultSmartPointers =
+static const auto DefaultSmartPointers =
     "::std::shared_ptr; ::std::unique_ptr; ::std::auto_ptr; ::std::weak_ptr";
-const auto DefaultTupleTypes = "::std::pair; ::std::tuple";
-const auto DefaultTupleMakeFunctions = "::std::make_pair; ::std::make_tuple";
-const auto DefaultEmplacyFunctions =
+static const auto DefaultTupleTypes = "::std::pair; ::std::tuple";
+static const auto DefaultTupleMakeFunctions =
+    "::std::make_pair; ::std::make_tuple";
+static const auto DefaultEmplacyFunctions =
     "vector::emplace_back; vector::emplace;"
     "deque::emplace; deque::emplace_front; deque::emplace_back;"
     "forward_list::emplace_after; forward_list::emplace_front;"
@@ -129,7 +132,6 @@ const auto DefaultEmplacyFunctions =
     "unordered_multiset::emplace; unordered_multiset::emplace_hint;"
     "unordered_multimap::emplace; unordered_multimap::emplace_hint;"
     "stack::emplace; queue::emplace; priority_queue::emplace";
-} // namespace
 
 UseEmplaceCheck::UseEmplaceCheck(StringRef Name, ClangTidyContext *Context)
     : ClangTidyCheck(Name, Context), IgnoreImplicitConstructors(Options.get(
diff --git a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp
index 6f6da57d7822b..4cf8574e56c5e 100644
--- a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp
@@ -29,6 +29,8 @@ AST_MATCHER(Stmt, isMacroExpansion) {
 
 AST_MATCHER(Stmt, isC23) { return Finder->getASTContext().getLangOpts().C23; }
 
+// Preserve same name as AST_MATCHER(isNULLMacroExpansion)
+// NOLINTNEXTLINE(llvm-prefer-static-over-anonymous-namespace)
 bool isNULLMacroExpansion(const Stmt *Statement, ASTContext &Context) {
   SourceManager &SM = Context.getSourceManager();
   const LangOptions &LO = Context.getLangOpts();
diff --git a/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.cpp
index 0ab59fff39d88..874b9618bd882 100644
--- a/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.cpp
@@ -7,6 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "RedundantParenthesesCheck.h"
+#include "../utils/Matchers.h"
+#include "../utils/OptionsUtils.h"
 #include "clang/AST/Expr.h"
 #include "clang/ASTMatchers/ASTMatchFinder.h"
 #include "clang/ASTMatchers/ASTMatchers.h"
@@ -32,15 +34,30 @@ AST_MATCHER(ParenExpr, isInMacro) {
 
 } // namespace
 
+RedundantParenthesesCheck::RedundantParenthesesCheck(StringRef Name,
+                                                     ClangTidyContext *Context)
+    : ClangTidyCheck(Name, Context),
+      AllowedDecls(utils::options::parseStringList(
+          Options.get("AllowedDecls", "std::max;std::min"))) {}
+
+void RedundantParenthesesCheck::storeOptions(
+    ClangTidyOptions::OptionMap &Opts) {
+  Options.store(Opts, "AllowedDecls",
+                utils::options::serializeStringList(AllowedDecls));
+}
+
 void RedundantParenthesesCheck::registerMatchers(MatchFinder *Finder) {
   const auto ConstantExpr =
       expr(anyOf(integerLiteral(), floatLiteral(), characterLiteral(),
                  cxxBoolLiteral(), stringLiteral(), cxxNullPtrLiteralExpr()));
   Finder->addMatcher(
-      parenExpr(subExpr(anyOf(parenExpr(), ConstantExpr, declRefExpr())),
-                unless(anyOf(isInMacro(),
-                             // sizeof(...) is common used.
-                             hasParent(unaryExprOrTypeTraitExpr()))))
+      parenExpr(
+          subExpr(anyOf(parenExpr(), ConstantExpr,
+                        declRefExpr(to(namedDecl(unless(
+                            matchers::matchesAnyListedName(AllowedDecls))))))),
+          unless(anyOf(isInMacro(),
+                       // sizeof(...) is common used.
+                       hasParent(unaryExprOrTypeTraitExpr()))))
           .bind("dup"),
       this);
 }
diff --git a/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.h b/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.h
index 9a0409b83fff3..2638a09730f7e 100644
--- a/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.h
+++ b/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.h
@@ -20,13 +20,16 @@ namespace clang::tidy::readability {
 /// https://clang.llvm.org/extra/clang-tidy/checks/readability/redundant-parentheses.html
 class RedundantParenthesesCheck : public ClangTidyCheck {
 public:
-  RedundantParenthesesCheck(StringRef Name, ClangTidyContext *Context)
-      : ClangTidyCheck(Name, Context) {}
+  RedundantParenthesesCheck(StringRef Name, ClangTidyContext *Context);
+  void storeOptions(ClangTidyOptions::OptionMap &Opts) override;
   void registerMatchers(ast_matchers::MatchFinder *Finder) override;
   void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
   bool isLanguageVersionSupported(const LangOptions &LangOpts) const override {
     return LangOpts.CPlusPlus | LangOpts.C99;
   }
+
+private:
+  const std::vector<StringRef> AllowedDecls;
 };
 
 } // namespace clang::tidy::readability
diff --git a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp
index 64157f530b8c0..1ae8756c339e7 100644
--- a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp
+++ b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp
@@ -93,7 +93,7 @@ Configuration files:
     WarningsAsErrors:             ''
     HeaderFileExtensions:         ['', 'h','hh','hpp','hxx']
     ImplementationFileExtensions: ['c','cc','cpp','cxx']
-    HeaderFilterRegex:            ''
+    HeaderFilterRegex:            '.*'
     FormatStyle:                  none
     InheritParentConfig:          true
     User:                         user
@@ -132,14 +132,16 @@ file, if any.
 
 static cl::opt<std::string> HeaderFilter("header-filter", desc(R"(
 Regular expression matching the names of the
-headers to output diagnostics from. Diagnostics
+headers to output diagnostics from. The default
+value is '.*', i.e. diagnostics from all non-system
+headers are displayed by default. Diagnostics
 from the main file of each translation unit are
 always displayed.
 Can be used together with -line-filter.
 This option overrides the 'HeaderFilterRegex'
 option in .clang-tidy file, if any.
 )"),
-                                         cl::init(""),
+                                         cl::init(".*"),
                                          cl::cat(ClangTidyCategory));
 
 static cl::opt<std::string> ExcludeHeaderFilter("exclude-header-filter",
@@ -379,9 +381,9 @@ static void printStats(const ClangTidyStats &Stats) {
                    << " with check filters";
     llvm::errs() << ").\n";
     if (Stats.ErrorsIgnoredNonUserCode)
-      llvm::errs() << "Use -header-filter=.* to display errors from all "
-                      "non-system headers. Use -system-headers to display "
-                      "errors from system headers as well.\n";
+      llvm::errs() << "Use -header-filter=.* or leave it as default to display "
+                      "errors from all non-system headers. Use -system-headers "
+                      "to display errors from system headers as well.\n";
   }
 }
 
diff --git a/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.cpp b/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.cpp
index fd4320eb8144b..706dd67c16776 100644
--- a/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.cpp
+++ b/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.cpp
@@ -44,10 +44,9 @@ ExceptionAnalyzer::ExceptionInfo &ExceptionAnalyzer::ExceptionInfo::merge(
 }
 
 // FIXME: This could be ported to clang later.
-namespace {
 
-bool isUnambiguousPublicBaseClass(const Type *DerivedType,
-                                  const Type *BaseType) {
+static bool isUnambiguousPublicBaseClass(const Type *DerivedType,
+                                         const Type *BaseType) {
   const auto *DerivedClass =
       DerivedType->getCanonicalTypeUnqualified()->getAsCXXRecordDecl();
   const auto *BaseClass =
@@ -78,11 +77,11 @@ bool isUnambiguousPublicBaseClass(const Type *DerivedType,
          IsPublicBaseClass;
 }
 
-inline bool isPointerOrPointerToMember(const Type *T) {
+static bool isPointerOrPointerToMember(const Type *T) {
   return T->isPointerType() || T->isMemberPointerType();
 }
 
-std::optional<QualType> getPointeeOrArrayElementQualType(QualType T) {
+static std::optional<QualType> getPointeeOrArrayElementQualType(QualType T) {
   if (T->isAnyPointerType() || T->isMemberPointerType())
     return T->getPointeeType();
 
@@ -92,7 +91,7 @@ std::optional<QualType> getPointeeOrArrayElementQualType(QualType T) {
   return std::nullopt;
 }
 
-bool isBaseOf(const Type *DerivedType, const Type *BaseType) {
+static bool isBaseOf(const Type *DerivedType, const Type *BaseType) {
   const auto *DerivedClass = DerivedType->getAsCXXRecordDecl();
   const auto *BaseClass = BaseType->getAsCXXRecordDecl();
   if (!DerivedClass || !BaseClass)
@@ -103,12 +102,12 @@ bool isBaseOf(const Type *DerivedType, const Type *BaseType) {
 }
 
 // Check if T1 is more or Equally qualified than T2.
-bool moreOrEquallyQualified(QualType T1, QualType T2) {
+static bool moreOrEquallyQualified(QualType T1, QualType T2) {
   return T1.getQualifiers().isStrictSupersetOf(T2.getQualifiers()) ||
          T1.getQualifiers() == T2.getQualifiers();
 }
 
-bool isStandardPointerConvertible(QualType From, QualType To) {
+static bool isStandardPointerConvertible(QualType From, QualType To) {
   assert((From->isPointerType() || From->isMemberPointerType()) &&
          (To->isPointerType() || To->isMemberPointerType()) &&
          "Pointer conversion should be performed on pointer types only.");
@@ -150,7 +149,7 @@ bool isStandardPointerConvertible(QualType From, QualType To) {
   return false;
 }
 
-bool isFunctionPointerConvertible(QualType From, QualType To) {
+static bool isFunctionPointerConvertible(QualType From, QualType To) {
   if (!From->isFunctionPointerType() && !From->isFunctionType() &&
       !From->isMemberFunctionPointerType())
     return false;
@@ -192,8 +191,8 @@ bool isFunctionPointerConvertible(QualType From, QualType To) {
 // from the C rules.
 //
 // The function should only be called in C++ mode.
-bool isQualificationConvertiblePointer(QualType From, QualType To,
-                                       LangOptions LangOpts) {
+static bool isQualificationConvertiblePointer(QualType From, QualType To,
+                                              LangOptions LangOpts) {
 
   // [N4659 7.5 (1)]
   // A cv-decomposition of a type T is a sequence of cv_i and P_i such that T is
@@ -320,7 +319,6 @@ bool isQualificationConvertiblePointer(QualType From, QualType To,
 
   return From.getTypePtr() == To.getTypePtr();
 }
-} // namespace
 
 static bool canThrow(const FunctionDecl *Func) {
   // consteval specifies that every call to the function must produce a
diff --git a/clang-tools-extra/clang-tidy/utils/FixItHintUtils.cpp b/clang-tools-extra/clang-tidy/utils/FixItHintUtils.cpp
index 086c7f3a15d45..b30c83e3aeb35 100644
--- a/clang-tools-extra/clang-tidy/utils/FixItHintUtils.cpp
+++ b/clang-tools-extra/clang-tidy/utils/FixItHintUtils.cpp
@@ -21,6 +21,11 @@ FixItHint changeVarDeclToReference(const VarDecl &Var, ASTContext &Context) {
   SourceLocation AmpLocation = Var.getLocation();
   auto Token = utils::lexer::getPreviousToken(
       AmpLocation, Context.getSourceManager(), Context.getLangOpts());
+
+  // For parameter packs the '&' must go before the '...' token
+  if (Token.is(tok::ellipsis))
+    return FixItHint::CreateInsertion(Token.getLocation(), "&");
+
   if (!Token.is(tok::unknown))
     AmpLocation = Lexer::getLocForEndOfToken(Token.getLocation(), 0,
                                              Context.getSourceManager(),
diff --git a/clang-tools-extra/clangd/Selection.cpp b/clang-tools-extra/clangd/Selection.cpp
index 06165dfbbcdd2..faa00d20497fa 100644
--- a/clang-tools-extra/clangd/Selection.cpp
+++ b/clang-tools-extra/clangd/Selection.cpp
@@ -958,6 +958,18 @@ class SelectionVisitor : public RecursiveASTVisitor<SelectionVisitor> {
         claimRange(SourceRange(FTL.getLParenLoc(), FTL.getEndLoc()), Result);
         return;
       }
+      if (auto ATL = TL->getAs<AttributedTypeLoc>()) {
+        // For attributed function types like `int foo() [[attr]]`, the
+        // AttributedTypeLoc's range includes the function name. We want to
+        // allow the function name to be associated with the FunctionDecl
+        // rather than the AttributedTypeLoc, so we only claim the attribute
+        // range itself.
+        if (ATL.getModifiedLoc().getAs<FunctionTypeLoc>()) {
+          // Only claim the attribute's source range, not the whole type.
+          claimRange(ATL.getLocalSourceRange(), Result);
+          return;
+        }
+      }
     }
     claimRange(getSourceRange(N), Result);
   }
diff --git a/clang-tools-extra/clangd/support/DirectiveTree.cpp b/clang-tools-extra/clangd/support/DirectiveTree.cpp
index 97b0598e82c58..16d12f332a0be 100644
--- a/clang-tools-extra/clangd/support/DirectiveTree.cpp
+++ b/clang-tools-extra/clangd/support/DirectiveTree.cpp
@@ -305,8 +305,8 @@ class BranchChooser {
     if (&Value >= Tokens.end() || &Value.nextNC() < Tokens.end())
       return std::nullopt;
     return llvm::StringSwitch<std::optional<bool>>(Value.text())
-        .Cases("true", "1", true)
-        .Cases("false", "0", false)
+        .Cases({"true", "1"}, true)
+        .Cases({"false", "0"}, false)
         .Default(std::nullopt);
   }
 
diff --git a/clang-tools-extra/clangd/support/Markup.cpp b/clang-tools-extra/clangd/support/Markup.cpp
index 304917de252bf..9ba993a04709c 100644
--- a/clang-tools-extra/clangd/support/Markup.cpp
+++ b/clang-tools-extra/clangd/support/Markup.cpp
@@ -475,31 +475,61 @@ std::string Block::asPlainText() const {
   return llvm::StringRef(OS.str()).trim().str();
 }
 
+void Paragraph::renderNewlinesMarkdown(llvm::raw_ostream &OS,
+                                       llvm::StringRef ParagraphText) const {
+  llvm::StringRef Line, Rest;
+
+  for (std::tie(Line, Rest) = ParagraphText.ltrim("\n").rtrim().split('\n');
+       !(Line.empty() && Rest.empty());
+       std::tie(Line, Rest) = Rest.split('\n')) {
+
+    if (Line.empty()) {
+      // Blank lines are preserved in markdown.
+      OS << '\n';
+      continue;
+    }
+
+    OS << Line;
+
+    if (!Rest.empty() && isHardLineBreakAfter(Line, Rest, /*IsMarkdown=*/true))
+      // In markdown, 2 spaces before a line break forces a line break.
+      OS << "  ";
+    OS << '\n';
+  }
+}
+
 void Paragraph::renderEscapedMarkdown(llvm::raw_ostream &OS) const {
   bool NeedsSpace = false;
   bool HasChunks = false;
+  std::string ParagraphText;
+  ParagraphText.reserve(EstimatedStringSize);
+  llvm::raw_string_ostream ParagraphTextOS(ParagraphText);
   for (auto &C : Chunks) {
     if (C.SpaceBefore || NeedsSpace)
-      OS << " ";
+      ParagraphTextOS << " ";
     switch (C.Kind) {
     case ChunkKind::PlainText:
-      OS << renderText(C.Contents, !HasChunks, /*EscapeMarkdown=*/true);
+      ParagraphTextOS << renderText(C.Contents, !HasChunks,
+                                    /*EscapeMarkdown=*/true);
       break;
     case ChunkKind::InlineCode:
-      OS << renderInlineBlock(C.Contents);
+      ParagraphTextOS << renderInlineBlock(C.Contents);
       break;
     case ChunkKind::Bold:
-      OS << renderText("**" + C.Contents + "**", !HasChunks,
-                       /*EscapeMarkdown=*/true);
+      ParagraphTextOS << renderText("**" + C.Contents + "**", !HasChunks,
+                                    /*EscapeMarkdown=*/true);
       break;
     case ChunkKind::Emphasized:
-      OS << renderText("*" + C.Contents + "*", !HasChunks,
-                       /*EscapeMarkdown=*/true);
+      ParagraphTextOS << renderText("*" + C.Contents + "*", !HasChunks,
+                                    /*EscapeMarkdown=*/true);
       break;
     }
     HasChunks = true;
     NeedsSpace = C.SpaceAfter;
   }
+
+  renderNewlinesMarkdown(OS, ParagraphText);
+
   // A paragraph in markdown is separated by a blank line.
   OS << "\n\n";
 }
@@ -507,28 +537,39 @@ void Paragraph::renderEscapedMarkdown(llvm::raw_ostream &OS) const {
 void Paragraph::renderMarkdown(llvm::raw_ostream &OS) const {
   bool NeedsSpace = false;
   bool HasChunks = false;
+  std::string ParagraphText;
+  ParagraphText.reserve(EstimatedStringSize);
+  llvm::raw_string_ostream ParagraphTextOS(ParagraphText);
   for (auto &C : Chunks) {
     if (C.SpaceBefore || NeedsSpace)
-      OS << " ";
+      ParagraphTextOS << " ";
     switch (C.Kind) {
     case ChunkKind::PlainText:
-      OS << renderText(C.Contents, !HasChunks, /*EscapeMarkdown=*/false);
+      ParagraphTextOS << renderText(C.Contents, !HasChunks,
+                                    /*EscapeMarkdown=*/false);
       break;
     case ChunkKind::InlineCode:
-      OS << renderInlineBlock(C.Contents);
+      ParagraphTextOS << renderInlineBlock(C.Contents);
       break;
     case ChunkKind::Bold:
-      OS << "**" << renderText(C.Contents, !HasChunks, /*EscapeMarkdown=*/false)
-         << "**";
+      ParagraphTextOS << "**"
+                      << renderText(C.Contents, !HasChunks,
+                                    /*EscapeMarkdown=*/false)
+                      << "**";
       break;
     case ChunkKind::Emphasized:
-      OS << "*" << renderText(C.Contents, !HasChunks, /*EscapeMarkdown=*/false)
-         << "*";
+      ParagraphTextOS << "*"
+                      << renderText(C.Contents, !HasChunks,
+                                    /*EscapeMarkdown=*/false)
+                      << "*";
       break;
     }
     HasChunks = true;
     NeedsSpace = C.SpaceAfter;
   }
+
+  renderNewlinesMarkdown(OS, ParagraphText);
+
   // A paragraph in markdown is separated by a blank line.
   OS << "\n\n";
 }
@@ -537,8 +578,6 @@ std::unique_ptr<Block> Paragraph::clone() const {
   return std::make_unique<Paragraph>(*this);
 }
 
-/// Choose a marker to delimit `Text` from a prioritized list of options.
-/// This is more readable than escaping for plain-text.
 llvm::StringRef Paragraph::chooseMarker(llvm::ArrayRef<llvm::StringRef> Options,
                                         llvm::StringRef Text) const {
   // Prefer a delimiter whose characters don't appear in the text.
@@ -548,23 +587,36 @@ llvm::StringRef Paragraph::chooseMarker(llvm::ArrayRef<llvm::StringRef> Options,
   return Options.front();
 }
 
-bool Paragraph::punctuationIndicatesLineBreak(llvm::StringRef Line) const {
+bool Paragraph::punctuationIndicatesLineBreak(llvm::StringRef Line,
+                                              bool IsMarkdown) const {
   constexpr llvm::StringLiteral Punctuation = R"txt(.:,;!?)txt";
 
+  if (!IsMarkdown && Line.ends_with("  "))
+    return true;
+
   Line = Line.rtrim();
   return !Line.empty() && Punctuation.contains(Line.back());
 }
 
-bool Paragraph::isHardLineBreakIndicator(llvm::StringRef Rest) const {
+bool Paragraph::isHardLineBreakIndicator(llvm::StringRef Rest,
+                                         bool IsMarkdown) const {
+  // Plaintext indicators:
   // '-'/'*' md list, '@'/'\' documentation command, '>' md blockquote,
-  // '#' headings, '`' code blocks, two spaces (markdown force newline)
-  constexpr llvm::StringLiteral LinebreakIndicators = R"txt(-*@\>#`)txt";
+  // '#' headings, '`' code blocks
+  constexpr llvm::StringLiteral LinebreakIndicatorsPlainText =
+      R"txt(-*@\>#`)txt";
+  // Markdown indicators:
+  // Only '@' and '\' documentation commands/escaped markdown syntax.
+  constexpr llvm::StringLiteral LinebreakIndicatorsMarkdown = R"txt(@\)txt";
 
   Rest = Rest.ltrim(" \t");
   if (Rest.empty())
     return false;
 
-  if (LinebreakIndicators.contains(Rest.front()))
+  if (IsMarkdown)
+    return LinebreakIndicatorsMarkdown.contains(Rest.front());
+
+  if (LinebreakIndicatorsPlainText.contains(Rest.front()))
     return true;
 
   if (llvm::isDigit(Rest.front())) {
@@ -575,64 +627,18 @@ bool Paragraph::isHardLineBreakIndicator(llvm::StringRef Rest) const {
   return false;
 }
 
-bool Paragraph::isHardLineBreakAfter(llvm::StringRef Line,
-                                     llvm::StringRef Rest) const {
-  // In Markdown, 2 spaces before a line break forces a line break.
-  // Add a line break for plaintext in this case too.
+bool Paragraph::isHardLineBreakAfter(llvm::StringRef Line, llvm::StringRef Rest,
+                                     bool IsMarkdown) const {
   // Should we also consider whether Line is short?
-  return Line.ends_with("  ") || punctuationIndicatesLineBreak(Line) ||
-         isHardLineBreakIndicator(Rest);
+  return punctuationIndicatesLineBreak(Line, IsMarkdown) ||
+         isHardLineBreakIndicator(Rest, IsMarkdown);
 }
 
-void Paragraph::renderPlainText(llvm::raw_ostream &OS) const {
-  bool NeedsSpace = false;
-  std::string ConcatenatedText;
-  ConcatenatedText.reserve(EstimatedStringSize);
-
-  llvm::raw_string_ostream ConcatenatedOS(ConcatenatedText);
-
-  for (auto &C : Chunks) {
-
-    if (C.Kind == ChunkKind::PlainText) {
-      if (C.SpaceBefore || NeedsSpace)
-        ConcatenatedOS << ' ';
-
-      ConcatenatedOS << C.Contents;
-      NeedsSpace = llvm::isSpace(C.Contents.back()) || C.SpaceAfter;
-      continue;
-    }
-
-    if (C.SpaceBefore || NeedsSpace)
-      ConcatenatedOS << ' ';
-    llvm::StringRef Marker = "";
-    if (C.Preserve && C.Kind == ChunkKind::InlineCode)
-      Marker = chooseMarker({"`", "'", "\""}, C.Contents);
-    else if (C.Kind == ChunkKind::Bold)
-      Marker = "**";
-    else if (C.Kind == ChunkKind::Emphasized)
-      Marker = "*";
-    ConcatenatedOS << Marker << C.Contents << Marker;
-    NeedsSpace = C.SpaceAfter;
-  }
-
-  // We go through the contents line by line to handle the newlines
-  // and required spacing correctly.
-  //
-  // Newlines are added if:
-  // - the line ends with 2 spaces and a newline follows
-  // - the line ends with punctuation that indicates a line break (.:,;!?)
-  // - the next line starts with a hard line break indicator (-@>#`, or a digit
-  //   followed by '.' or ')'), ignoring leading whitespace.
-  //
-  // Otherwise, newlines in the input are replaced with a single space.
-  //
-  // Multiple spaces are collapsed into a single space.
-  //
-  // Lines containing only whitespace are ignored.
+void Paragraph::renderNewlinesPlaintext(llvm::raw_ostream &OS,
+                                        llvm::StringRef ParagraphText) const {
   llvm::StringRef Line, Rest;
 
-  for (std::tie(Line, Rest) =
-           llvm::StringRef(ConcatenatedText).trim().split('\n');
+  for (std::tie(Line, Rest) = ParagraphText.trim().split('\n');
        !(Line.empty() && Rest.empty());
        std::tie(Line, Rest) = Rest.split('\n')) {
 
@@ -653,7 +659,7 @@ void Paragraph::renderPlainText(llvm::raw_ostream &OS) const {
 
     OS << canonicalizeSpaces(Line);
 
-    if (isHardLineBreakAfter(Line, Rest))
+    if (isHardLineBreakAfter(Line, Rest, /*IsMarkdown=*/false))
       OS << '\n';
     else if (!Rest.empty())
       // Since we removed any trailing whitespace from the input using trim(),
@@ -661,6 +667,40 @@ void Paragraph::renderPlainText(llvm::raw_ostream &OS) const {
       // Therefore, we can add a space without worrying about trailing spaces.
       OS << ' ';
   }
+}
+
+void Paragraph::renderPlainText(llvm::raw_ostream &OS) const {
+  bool NeedsSpace = false;
+  std::string ParagraphText;
+  ParagraphText.reserve(EstimatedStringSize);
+
+  llvm::raw_string_ostream ParagraphTextOS(ParagraphText);
+
+  for (auto &C : Chunks) {
+
+    if (C.Kind == ChunkKind::PlainText) {
+      if (C.SpaceBefore || NeedsSpace)
+        ParagraphTextOS << ' ';
+
+      ParagraphTextOS << C.Contents;
+      NeedsSpace = llvm::isSpace(C.Contents.back()) || C.SpaceAfter;
+      continue;
+    }
+
+    if (C.SpaceBefore || NeedsSpace)
+      ParagraphTextOS << ' ';
+    llvm::StringRef Marker = "";
+    if (C.Preserve && C.Kind == ChunkKind::InlineCode)
+      Marker = chooseMarker({"`", "'", "\""}, C.Contents);
+    else if (C.Kind == ChunkKind::Bold)
+      Marker = "**";
+    else if (C.Kind == ChunkKind::Emphasized)
+      Marker = "*";
+    ParagraphTextOS << Marker << C.Contents << Marker;
+    NeedsSpace = C.SpaceAfter;
+  }
+
+  renderNewlinesPlaintext(OS, ParagraphText);
 
   // Paragraphs are separated by a blank line.
   OS << "\n\n";
diff --git a/clang-tools-extra/clangd/support/Markup.h b/clang-tools-extra/clangd/support/Markup.h
index eea6328f69a12..219a7dad1e175 100644
--- a/clang-tools-extra/clangd/support/Markup.h
+++ b/clang-tools-extra/clangd/support/Markup.h
@@ -92,9 +92,84 @@ class Paragraph : public Block {
 
   llvm::StringRef chooseMarker(llvm::ArrayRef<llvm::StringRef> Options,
                                llvm::StringRef Text) const;
-  bool punctuationIndicatesLineBreak(llvm::StringRef Line) const;
-  bool isHardLineBreakIndicator(llvm::StringRef Rest) const;
-  bool isHardLineBreakAfter(llvm::StringRef Line, llvm::StringRef Rest) const;
+
+  /// Checks if the given line ends with punctuation that indicates a line break
+  /// (.:,;!?).
+  ///
+  /// If \p IsMarkdown is false, lines ending with 2 spaces are also considered
+  /// as indicating a line break. This is not needed for markdown because the
+  /// client renderer will handle this case.
+  bool punctuationIndicatesLineBreak(llvm::StringRef Line,
+                                     bool IsMarkdown) const;
+
+  /// Checks if the given line starts with a hard line break indicator.
+  ///
+  /// If \p IsMarkdown is true, only '@' and '\' are considered as indicators.
+  /// Otherwise, '-', '*', '@', '\', '>', '#', '`' and a digit followed by '.'
+  /// or ')' are also considered as indicators.
+  bool isHardLineBreakIndicator(llvm::StringRef Rest, bool IsMarkdown) const;
+
+  /// Checks if a hard line break should be added after the given line.
+  bool isHardLineBreakAfter(llvm::StringRef Line, llvm::StringRef Rest,
+                            bool IsMarkdown) const;
+
+  /// \brief Go through the contents line by line to handle the newlines
+  /// and required spacing correctly for markdown rendering.
+  ///
+  /// Newlines are added if:
+  /// - the line ends with punctuation that indicates a line break (.:,;!?)
+  /// - the next line starts with a hard line break indicator \\ (escaped
+  /// markdown/doxygen command) or @ (doxygen command)
+  ///
+  /// This newline handling is only used when the client requests markdown
+  /// for hover/signature help content.
+  /// Markdown does not add any newlines inside paragraphs unless the user
+  /// explicitly adds them. For hover/signature help content, we still want to
+  /// add newlines in some cases to improve readability, especially when doxygen
+  /// parsing is disabled or not implemented (like for signature help).
+  /// Therefore we add newlines in the above mentioned cases.
+  ///
+  /// In addition to that, we need to consider that the user can configure
+  /// clangd to treat documentation comments as plain text, while the client
+  /// requests markdown.
+  /// In this case, all markdown syntax is escaped and will
+  /// not be rendered as expected by markdown.
+  /// Examples are lists starting with '-' or headings starting with '#'.
+  /// With the above next line heuristics, these cases are also covered by the
+  /// '\\' new line indicator.
+  ///
+  /// FIXME: The heuristic fails e.g. for lists starting with '*' because it is
+  /// also used for emphasis in markdown and should not be treated as a newline.
+  ///
+  /// \param OS The stream to render to.
+  /// \param ParagraphText The text of the paragraph to render.
+  void renderNewlinesMarkdown(llvm::raw_ostream &OS,
+                              llvm::StringRef ParagraphText) const;
+
+  /// \brief Go through the contents line by line to handle the newlines
+  /// and required spacing correctly for plain text rendering.
+  ///
+  /// Newlines are added if:
+  /// - the line ends with 2 spaces and a newline follows
+  /// - the line ends with punctuation that indicates a line break (.:,;!?)
+  /// - the next line starts with a hard line break indicator (-@>#`\\ or a
+  ///   digit followed by '.' or ')'), ignoring leading whitespace.
+  ///
+  /// Otherwise, newlines in the input are replaced with a single space.
+  ///
+  /// Multiple spaces are collapsed into a single space.
+  ///
+  /// Lines containing only whitespace are ignored.
+  ///
+  /// This newline handling is only used when the client requests plain
+  /// text for hover/signature help content.
+  /// Therefore with this approach we mimic the behavior of markdown rendering
+  /// for these clients.
+  ///
+  /// \param OS The stream to render to.
+  /// \param ParagraphText The text of the paragraph to render.
+  void renderNewlinesPlaintext(llvm::raw_ostream &OS,
+                               llvm::StringRef ParagraphText) const;
 };
 
 /// Represents a sequence of one or more documents. Knows how to print them in a
diff --git a/clang-tools-extra/clangd/unittests/HoverTests.cpp b/clang-tools-extra/clangd/unittests/HoverTests.cpp
index 718c1bc5f355a..eb858ff616e90 100644
--- a/clang-tools-extra/clangd/unittests/HoverTests.cpp
+++ b/clang-tools-extra/clangd/unittests/HoverTests.cpp
@@ -4087,16 +4087,16 @@ As well as warnings
 
 @brief brief doc
 
-longer doc
+longer doc  
 @note this is a note
 
-As you see, notes are "inlined".
+As you see, notes are "inlined".  
 @warning this is a warning
 
-As well as warnings
-@param a this is a param
-@return it returns something
-@retval 0 if successful
+As well as warnings  
+@param a this is a param  
+@return it returns something  
+@retval 0 if successful  
 @retval 1 if failed
 
 ---
@@ -4166,9 +4166,9 @@ As well as warnings)"},
 
 @brief brief doc
 
-longer doc
-@param a this is a param
-@param b does not exist
+longer doc  
+@param a this is a param  
+@param b does not exist  
 @return it returns something
 
 ---
@@ -4315,19 +4315,19 @@ TEST(Hover, ParseDocumentation) {
                },
                {
                    "foo.\nbar",
-                   "foo.\nbar",
-                   "foo.\nbar",
+                   "foo.  \nbar",
+                   "foo.  \nbar",
                    "foo.\nbar",
                },
                {
                    "foo. \nbar",
-                   "foo. \nbar",
-                   "foo. \nbar",
+                   "foo.   \nbar",
+                   "foo.   \nbar",
                    "foo.\nbar",
                },
                {
                    "foo\n*bar",
-                   "foo\n\\*bar",
+                   "foo  \n\\*bar",
                    "foo\n*bar",
                    "foo\n*bar",
                },
@@ -4354,6 +4354,24 @@ TEST(Hover, ParseDocumentation) {
                    "\\`not\nparsed\\`",
                    "`not\nparsed`",
                    "`not parsed`",
+               },
+               {
+                   R"(@brief this is a typical use case
+@param x this is x
+\param y this is y
+@return something)",
+                   R"(@brief this is a typical use case  
+@param x this is x  
+\\param y this is y  
+@return something)",
+                   R"(@brief this is a typical use case  
+@param x this is x  
+\param y this is y  
+@return something)",
+                   R"(@brief this is a typical use case
+@param x this is x
+\param y this is y
+@return something)",
                }};
 
   for (const auto &C : Cases) {
diff --git a/clang-tools-extra/clangd/unittests/SelectionTests.cpp b/clang-tools-extra/clangd/unittests/SelectionTests.cpp
index 3df19d8fc174d..63c0403ab2e70 100644
--- a/clang-tools-extra/clangd/unittests/SelectionTests.cpp
+++ b/clang-tools-extra/clangd/unittests/SelectionTests.cpp
@@ -311,6 +311,19 @@ TEST(SelectionTest, CommonAncestor) {
       {"[[void foo^()]];", "FunctionProtoTypeLoc"},
       {"[[^void foo^()]];", "FunctionDecl"},
       {"[[void ^foo()]];", "FunctionDecl"},
+      // Tricky case: with function attributes, the AttributedTypeLoc's range
+      // includes the function name, but we want the name to be associated with
+      // the CXXMethodDecl.
+      {"struct X { [[const int* ^Get() const <:[clang::lifetimebound]:> "
+       "{return nullptr;}]]; };",
+       "CXXMethodDecl"},
+      // When the cursor is on the attribute itself, we should select the
+      // AttributedTypeLoc. Note: Due to a bug or deliberate quirk in the AST
+      // modeling of AttributedTypeLoc, its range ends at the attribute name
+      // token, not including the closing brackets ":>:>".
+      {"struct X { const [[int* Foo() const <:<:clang::life^timebound]]:>:> "
+       "{return nullptr;}; };",
+       "AttributedTypeLoc"},
       // Tricky case: two VarDecls share a specifier.
       {"[[int ^a]], b;", "VarDecl"},
       {"[[int a, ^b]];", "VarDecl"},
diff --git a/clang-tools-extra/clangd/unittests/SymbolDocumentationTests.cpp b/clang-tools-extra/clangd/unittests/SymbolDocumentationTests.cpp
index b3185cc10dd5a..676f7dfc74483 100644
--- a/clang-tools-extra/clangd/unittests/SymbolDocumentationTests.cpp
+++ b/clang-tools-extra/clangd/unittests/SymbolDocumentationTests.cpp
@@ -195,10 +195,10 @@ More description documentation)",
 normal text<i>this is an italic text</i>
 <code>this is a code block</code>)",
           R"(\<b>this is a bold text\</b>
-normal text\<i>this is an italic text\</i>
+normal text\<i>this is an italic text\</i>  
 \<code>this is a code block\</code>)",
           R"(\<b>this is a bold text\</b>
-normal text\<i>this is an italic text\</i>
+normal text\<i>this is an italic text\</i>  
 \<code>this is a code block\</code>)",
           "<b>this is a bold text</b> normal text<i>this is an italic text</i> "
           "<code>this is a code block</code>",
@@ -712,10 +712,10 @@ TEST(SymbolDocumentation, MarkdownCodeSpans) {
 line
 \c span`)",
        R"(\`multi
-line
+line  
 \\c span\`)",
        R"(`multi
-line
+line  
 \c span`)",
        R"(`multi line
 \c span`)"},
diff --git a/clang-tools-extra/clangd/unittests/support/MarkupTests.cpp b/clang-tools-extra/clangd/unittests/support/MarkupTests.cpp
index 5f91f31557176..af4782c07ae52 100644
--- a/clang-tools-extra/clangd/unittests/support/MarkupTests.cpp
+++ b/clang-tools-extra/clangd/unittests/support/MarkupTests.cpp
@@ -304,9 +304,9 @@ TEST(Paragraph, SeparationOfChunks) {
 
   P.appendSpace().appendCode("code").appendText(".\n  newline");
   EXPECT_EQ(P.asEscapedMarkdown(),
-            "after `foobar` bat`no` `space` text `code`.\n  newline");
+            "after `foobar` bat`no` `space` text `code`.  \n  newline");
   EXPECT_EQ(P.asMarkdown(),
-            "after `foobar` bat`no` `space` text `code`.\n  newline");
+            "after `foobar` bat`no` `space` text `code`.  \n  newline");
   EXPECT_EQ(P.asPlainText(), "after foobar batno space text code.\nnewline");
 }
 
@@ -371,21 +371,117 @@ TEST(Paragraph, SeparationOfChunks3) {
   EXPECT_EQ(P.asPlainText(), "after\nfoobar");
 
   P.appendText("- bat\n");
-  EXPECT_EQ(P.asEscapedMarkdown(), "after  \n  foobar\n\\- bat");
+  EXPECT_EQ(P.asEscapedMarkdown(), "after  \n  foobar  \n\\- bat");
   EXPECT_EQ(P.asMarkdown(), "after  \n  foobar\n- bat");
   EXPECT_EQ(P.asPlainText(), "after\nfoobar\n- bat");
 
   P.appendText("- baz");
-  EXPECT_EQ(P.asEscapedMarkdown(), "after  \n  foobar\n\\- bat\n\\- baz");
+  EXPECT_EQ(P.asEscapedMarkdown(), "after  \n  foobar  \n\\- bat  \n\\- baz");
   EXPECT_EQ(P.asMarkdown(), "after  \n  foobar\n- bat\n- baz");
   EXPECT_EQ(P.asPlainText(), "after\nfoobar\n- bat\n- baz");
 
   P.appendText(" faz ");
-  EXPECT_EQ(P.asEscapedMarkdown(), "after  \n  foobar\n\\- bat\n\\- baz faz");
+  EXPECT_EQ(P.asEscapedMarkdown(),
+            "after  \n  foobar  \n\\- bat  \n\\- baz faz");
   EXPECT_EQ(P.asMarkdown(), "after  \n  foobar\n- bat\n- baz faz");
   EXPECT_EQ(P.asPlainText(), "after\nfoobar\n- bat\n- baz faz");
 }
 
+TEST(Paragraph, PunctuationLineBreaks) {
+
+  struct {
+    std::string Text;
+    std::string EscapedMarkdown;
+    std::string Markdown;
+    std::string PlainText;
+  } Cases[] = {
+      {"Line ending with dot.\nForces a visual linebreak.",
+       "Line ending with dot.  \nForces a visual linebreak.",
+       "Line ending with dot.  \nForces a visual linebreak.",
+       "Line ending with dot.\nForces a visual linebreak."},
+      {"Line ending with colon:\nForces a visual linebreak.",
+       "Line ending with colon:  \nForces a visual linebreak.",
+       "Line ending with colon:  \nForces a visual linebreak.",
+       "Line ending with colon:\nForces a visual linebreak."},
+      {"Line ending with semicolon:\nForces a visual linebreak.",
+       "Line ending with semicolon:  \nForces a visual linebreak.",
+       "Line ending with semicolon:  \nForces a visual linebreak.",
+       "Line ending with semicolon:\nForces a visual linebreak."},
+      {"Line ending with comma,\nForces a visual linebreak.",
+       "Line ending with comma,  \nForces a visual linebreak.",
+       "Line ending with comma,  \nForces a visual linebreak.",
+       "Line ending with comma,\nForces a visual linebreak."},
+      {"Line ending with exclamation mark!\nForces a visual linebreak.",
+       "Line ending with exclamation mark!  \nForces a visual linebreak.",
+       "Line ending with exclamation mark!  \nForces a visual linebreak.",
+       "Line ending with exclamation mark!\nForces a visual linebreak."},
+      {"Line ending with question mark?\nForces a visual linebreak.",
+       "Line ending with question mark?  \nForces a visual linebreak.",
+       "Line ending with question mark?  \nForces a visual linebreak.",
+       "Line ending with question mark?\nForces a visual linebreak."},
+  };
+
+  for (const auto &C : Cases) {
+    Paragraph P;
+    P.appendText(C.Text);
+    EXPECT_EQ(P.asEscapedMarkdown(), C.EscapedMarkdown);
+    EXPECT_EQ(P.asMarkdown(), C.Markdown);
+    EXPECT_EQ(P.asPlainText(), C.PlainText);
+  }
+}
+
+TEST(Paragraph, LineBreakIndicators) {
+
+  struct {
+    std::string Text;
+    std::string EscapedMarkdown;
+    std::string Markdown;
+    std::string PlainText;
+  } Cases[] = {
+      {"Visual linebreak for\n- list items\n- and so on",
+       "Visual linebreak for  \n\\- list items  \n\\- and so on",
+       "Visual linebreak for\n- list items\n- and so on",
+       "Visual linebreak for\n- list items\n- and so on"},
+      {"Visual linebreak for\n* list items\n* and so on",
+       "Visual linebreak for  \n\\* list items  \n\\* and so on",
+       "Visual linebreak for\n* list items\n* and so on",
+       "Visual linebreak for\n* list items\n* and so on"},
+      {"Visual linebreak for\n@command any doxygen command\n\\other other "
+       "doxygen command",
+       "Visual linebreak for  \n@command any doxygen command  \n\\\\other "
+       "other doxygen command",
+       "Visual linebreak for  \n@command any doxygen command  \n\\other other "
+       "doxygen command",
+       "Visual linebreak for\n@command any doxygen command\n\\other other "
+       "doxygen command"},
+      {"Visual linebreak for\n>blockquoute line 1\n> blockquoute line 2",
+       "Visual linebreak for  \n\\>blockquoute line 1  \n\\> blockquoute line "
+       "2",
+       "Visual linebreak for\n>blockquoute line 1\n> blockquoute line 2",
+       "Visual linebreak for\n>blockquoute line 1\n> blockquoute line 2"},
+      {"Visual linebreak for\n# Heading 1\ntext under heading\n## Heading "
+       "2\ntext under heading 2",
+       "Visual linebreak for  \n\\# Heading 1\ntext under heading  \n\\## "
+       "Heading 2\ntext under heading 2",
+       "Visual linebreak for\n# Heading 1\ntext under heading\n## Heading "
+       "2\ntext under heading 2",
+       "Visual linebreak for\n# Heading 1 text under heading\n## Heading 2 "
+       "text under heading 2"},
+      {"Visual linebreak for\n`inline code`",
+       "Visual linebreak for  \n\\`inline code\\`",
+       "Visual linebreak for\n`inline code`",
+       "Visual linebreak for\n`inline code`"},
+  };
+
+  for (const auto &C : Cases) {
+    Paragraph P;
+    P.appendText(C.Text);
+    EXPECT_EQ(P.asEscapedMarkdown(), C.EscapedMarkdown);
+    EXPECT_EQ(P.asMarkdown(), C.Markdown);
+    EXPECT_EQ(P.asPlainText(), C.PlainText);
+  }
+}
+
 TEST(Paragraph, ExtraSpaces) {
   // Make sure spaces inside chunks are preserved for markdown
   // and dropped for plain text.
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 915b79329dac4..f4eeb3e7942a8 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -70,6 +70,11 @@ Potentially Breaking Changes
     :doc:`bugprone-signed-char-misuse
     <clang-tidy/checks/bugprone/signed-char-misuse>`
 
+- :program:`clang-tidy` now displays warnings from all non-system headers by
+  default. Previously, users had to explicitly opt-in to header warnings using
+  `-header-filter='.*'`. To disable warnings from non-system, set `-header-filter`
+  to an empty string.
+
 Improvements to clangd
 ----------------------
 
@@ -132,6 +137,11 @@ Improvements to clang-tidy
   when run over C files. If ``-std`` is not specified, it defaults to
   ``c99-or-later``.
 
+- :program:`clang-tidy` now displays warnings from all non-system headers by
+  default. Previously, users had to explicitly opt-in to header warnings using
+  `-header-filter='.*'`. To disable warnings from non-system, set `-header-filter`
+  to an empty string.
+
 - :program:`clang-tidy` no longer attempts to analyze code from system headers
   by default, greatly improving performance. This behavior is disabled if the
   `SystemHeaders` option is enabled.
@@ -174,17 +184,17 @@ Improvements to clang-tidy
 New checks
 ^^^^^^^^^^
 
+- New :doc:`bugprone-derived-method-shadowing-base-method
+  <clang-tidy/checks/bugprone/derived-method-shadowing-base-method>` check.
+
+  Finds derived class methods that shadow a (non-virtual) base class method.
+
 - New :doc:`bugprone-invalid-enum-default-initialization
   <clang-tidy/checks/bugprone/invalid-enum-default-initialization>` check.
 
   Detects default initialization (to 0) of variables with ``enum`` type where
   the enum has no enumerator with value of 0.
 
-- New :doc:`bugprone-derived-method-shadowing-base-method
-  <clang-tidy/checks/bugprone/derived-method-shadowing-base-method>` check.
-
-  Finds derived class methods that shadow a (non-virtual) base class method.
-
 - New :doc:`cppcoreguidelines-pro-bounds-avoid-unchecked-container-access
   <clang-tidy/checks/cppcoreguidelines/pro-bounds-avoid-unchecked-container-access>`
   check.
@@ -254,6 +264,11 @@ New check aliases
   <clang-tidy/checks/bugprone/throwing-static-initialization>`
   keeping initial check as an alias to the new one.
 
+- Renamed :doc:`cert-oop57-cpp <clang-tidy/checks/cert/oop57-cpp>` to
+  :doc:`bugprone-raw-memory-call-on-non-trivial-type
+  <clang-tidy/checks/bugprone/raw-memory-call-on-non-trivial-type>`
+  keeping initial check as an alias to the new one.
+
 Changes in existing checks
 ^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -321,6 +336,11 @@ Changes in existing checks
   an additional matcher that generalizes the copy-and-swap idiom pattern
   detection.
 
+- Improved :doc:`cppcoreguidelines-avoid-non-const-global-variables
+  <clang-tidy/checks/cppcoreguidelines/avoid-non-const-global-variables>` check
+  by adding a new option `AllowThreadLocal` that suppresses warnings on
+  non-const global variables with thread-local storage duration.
+
 - Improved :doc:`cppcoreguidelines-init-variables
   <clang-tidy/checks/cppcoreguidelines/init-variables>` check by fixing the
   insertion location for function pointers with multiple parameters.
@@ -352,7 +372,7 @@ Changes in existing checks
 
 - Improved :doc:`misc-const-correctness
   <clang-tidy/checks/misc/const-correctness>` check to avoid false
-  positives when pointers is transferred to non-const references 
+  positives when pointers is transferred to non-const references
   and avoid false positives of function pointer and fix false
   positives on return of non-const pointer.
 
@@ -407,7 +427,8 @@ Changes in existing checks
 
 - Improved :doc:`performance-unnecessary-value-param
   <clang-tidy/checks/performance/unnecessary-value-param>` by printing
-  the type of the diagnosed variable.
+  the type of the diagnosed variable and correctly generating fix-it hints for
+  parameter-pack arguments.
 
 - Improved :doc:`portability-template-virtual-member-function
   <clang-tidy/checks/portability/template-virtual-member-function>` check to
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/raw-memory-call-on-non-trivial-type.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/raw-memory-call-on-non-trivial-type.rst
new file mode 100644
index 0000000000000..0ce7f80e8381d
--- /dev/null
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/raw-memory-call-on-non-trivial-type.rst
@@ -0,0 +1,35 @@
+.. title:: clang-tidy - bugprone-raw-memory-call-on-non-trivial-type
+
+bugprone-raw-memory-call-on-non-trivial-type
+============================================
+
+Flags use of the C standard library functions ``memset``, ``memcpy`` and
+``memcmp`` and similar derivatives on non-trivial types.
+
+The check will detect the following functions: ``memset``, ``std::memset``,
+``std::memcpy``, ``memcpy``, ``std::memmove``, ``memmove``, ``std::strcpy``,
+``strcpy``, ``memccpy``, ``stpncpy``, ``strncpy``, ``std::memcmp``, ``memcmp``,
+``std::strcmp``, ``strcmp``, ``strncmp``.
+
+Options
+-------
+
+.. option:: MemSetNames
+
+   Specify extra functions to flag that act similarly to ``memset``. Specify 
+   names in a semicolon-delimited list. Default is an empty string.
+
+.. option:: MemCpyNames
+
+   Specify extra functions to flag that act similarly to ``memcpy``. Specify 
+   names in a semicolon-delimited list. Default is an empty string.
+
+.. option:: MemCmpNames
+
+   Specify extra functions to flag that act similarly to ``memcmp``. Specify 
+   names in a semicolon-delimited list. Default is an empty string.
+
+This check corresponds to the CERT C++ Coding Standard rule
+`OOP57-CPP. Prefer special member functions and overloaded operators to C
+Standard Library functions
+<https://wiki.sei.cmu.edu/confluence/display/cplusplus/OOP57-CPP.+Prefer+special+member+functions+and+overloaded+operators+to+C+Standard+Library+functions>`_.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/cert/oop57-cpp.rst b/clang-tools-extra/docs/clang-tidy/checks/cert/oop57-cpp.rst
index 4787abf1554ab..414f788bf2500 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/cert/oop57-cpp.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/cert/oop57-cpp.rst
@@ -1,38 +1,13 @@
 .. title:: clang-tidy - cert-oop57-cpp
+.. meta::
+   :http-equiv=refresh: 5;URL=../bugprone/raw-memory-call-on-non-trivial-type.html
 
 cert-oop57-cpp
 ==============
 
-  Flags use of the `C` standard library functions ``memset``, ``memcpy`` and
-  ``memcmp`` and similar derivatives on non-trivial types.
-
-Options
--------
-
-.. option:: MemSetNames
-
-   Specify extra functions to flag that act similarly to ``memset``.
-   Specify names in a semicolon delimited list.
-   Default is an empty string.
-   The check will detect the following functions:
-   `memset`, `std::memset`.
-
-.. option:: MemCpyNames
-
-   Specify extra functions to flag that act similarly to ``memcpy``.
-   Specify names in a semicolon delimited list.
-   Default is an empty string.
-   The check will detect the following functions:
-   `std::memcpy`, `memcpy`, `std::memmove`, `memmove`, `std::strcpy`,
-   `strcpy`, `memccpy`, `stpncpy`, `strncpy`.
-
-.. option:: MemCmpNames
-
-   Specify extra functions to flag that act similarly to ``memcmp``.
-   Specify names in a semicolon delimited list.
-   Default is an empty string.
-   The check will detect the following functions:
-   `std::memcmp`, `memcmp`, `std::strcmp`, `strcmp`, `strncmp`.
+The `cert-oop57-cpp` check is an alias, please see
+`bugprone-raw-memory-call-on-non-trivial-type <../bugprone/raw-memory-call-on-non-trivial-type.html>`_
+for more information.
 
 This check corresponds to the CERT C++ Coding Standard rule
 `OOP57-CPP. Prefer special member functions and overloaded operators to C
diff --git a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/avoid-non-const-global-variables.rst b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/avoid-non-const-global-variables.rst
index 8da284ca13e3d..3d5fef3a07dca 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/avoid-non-const-global-variables.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/avoid-non-const-global-variables.rst
@@ -49,3 +49,8 @@ Options
 
    When set to `true`, static non-const variables and variables in anonymous
    namespaces will not generate a warning. The default value is `false`.
+
+.. option:: AllowThreadLocal
+
+   When set to `true`, non-const global variables with thread-local storage
+   duration will not generate a warning. The default value is `false`.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/pro-bounds-avoid-unchecked-container-access.rst b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/pro-bounds-avoid-unchecked-container-access.rst
index 556d90213b216..1ecdcdb1ed4c7 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/pro-bounds-avoid-unchecked-container-access.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/pro-bounds-avoid-unchecked-container-access.rst
@@ -39,8 +39,8 @@ Options
 
 .. option:: ExcludeClasses
 
-    Semicolon-delimited list of class names for overwriting the default
-    exclusion list. The default is:
+    Semicolon-separated list of regular expressions matching class names that
+    overwrites the default exclusion list. The default is:
     `::std::map;::std::unordered_map;::std::flat_map`.
     
 .. option:: FixMode
diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst
index d3c89e469188d..c7a922a91c6e0 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/list.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst
@@ -129,6 +129,7 @@ Clang-Tidy Checks
    :doc:`bugprone-parent-virtual-call <bugprone/parent-virtual-call>`, "Yes"
    :doc:`bugprone-pointer-arithmetic-on-polymorphic-object <bugprone/pointer-arithmetic-on-polymorphic-object>`,
    :doc:`bugprone-posix-return <bugprone/posix-return>`, "Yes"
+   :doc:`bugprone-raw-memory-call-on-non-trivial-type <bugprone/raw-memory-call-on-non-trivial-type>`,
    :doc:`bugprone-redundant-branch-condition <bugprone/redundant-branch-condition>`, "Yes"
    :doc:`bugprone-reserved-identifier <bugprone/reserved-identifier>`, "Yes"
    :doc:`bugprone-return-const-ref-from-parameter <bugprone/return-const-ref-from-parameter>`,
@@ -180,7 +181,6 @@ Clang-Tidy Checks
    :doc:`cert-mem57-cpp <cert/mem57-cpp>`,
    :doc:`cert-msc50-cpp <cert/msc50-cpp>`,
    :doc:`cert-msc51-cpp <cert/msc51-cpp>`,
-   :doc:`cert-oop57-cpp <cert/oop57-cpp>`,
    :doc:`cert-oop58-cpp <cert/oop58-cpp>`,
    :doc:`concurrency-mt-unsafe <concurrency/mt-unsafe>`,
    :doc:`concurrency-thread-canceltype-asynchronous <concurrency/thread-canceltype-asynchronous>`,
@@ -442,8 +442,8 @@ Check aliases
    :doc:`cert-dcl51-cpp <cert/dcl51-cpp>`, :doc:`bugprone-reserved-identifier <bugprone/reserved-identifier>`, "Yes"
    :doc:`cert-dcl54-cpp <cert/dcl54-cpp>`, :doc:`misc-new-delete-overloads <misc/new-delete-overloads>`,
    :doc:`cert-dcl59-cpp <cert/dcl59-cpp>`, :doc:`google-build-namespaces <google/build-namespaces>`,
-   :doc:`cert-err09-cpp <cert/err09-cpp>`, :doc:`misc-throw-by-value-catch-by-reference <misc/throw-by-value-catch-by-reference>`,
    :doc:`cert-env33-c <cert/env33-c>`, :doc:`bugprone-command-processor <bugprone/command-processor>`,
+   :doc:`cert-err09-cpp <cert/err09-cpp>`, :doc:`misc-throw-by-value-catch-by-reference <misc/throw-by-value-catch-by-reference>`,
    :doc:`cert-err34-c <cert/err34-c>`, :doc:`bugprone-unchecked-string-to-number-conversion <bugprone/unchecked-string-to-number-conversion>`,
    :doc:`cert-err52-cpp <cert/err52-cpp>`, :doc:`modernize-avoid-setjmp-longjmp <modernize/avoid-setjmp-longjmp>`,
    :doc:`cert-err58-cpp <cert/err58-cpp>`, :doc:`bugprone-throwing-static-initialization <bugprone/throwing-static-initialization>`,
@@ -459,6 +459,7 @@ Check aliases
    :doc:`cert-msc54-cpp <cert/msc54-cpp>`, :doc:`bugprone-signal-handler <bugprone/signal-handler>`,
    :doc:`cert-oop11-cpp <cert/oop11-cpp>`, :doc:`performance-move-constructor-init <performance/move-constructor-init>`,
    :doc:`cert-oop54-cpp <cert/oop54-cpp>`, :doc:`bugprone-unhandled-self-assignment <bugprone/unhandled-self-assignment>`,
+   :doc:`cert-oop57-cpp <cert/oop57-cpp>`, :doc:`bugprone-raw-memory-call-on-non-trivial-type <bugprone/raw-memory-call-on-non-trivial-type>`,
    :doc:`cert-pos44-c <cert/pos44-c>`, :doc:`bugprone-bad-signal-to-kill-thread <bugprone/bad-signal-to-kill-thread>`,
    :doc:`cert-pos47-c <cert/pos47-c>`, :doc:`concurrency-thread-canceltype-asynchronous <concurrency/thread-canceltype-asynchronous>`,
    :doc:`cert-sig30-c <cert/sig30-c>`, :doc:`bugprone-signal-handler <bugprone/signal-handler>`,
diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-std-format.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-std-format.rst
index cfa11d3cac8bf..7038e7bfc5d26 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-std-format.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-std-format.rst
@@ -62,12 +62,12 @@ Options
 
 .. option:: StrFormatLikeFunctions
 
-   A semicolon-separated list of (fully qualified) function names to
-   replace, with the requirement that the first parameter contains the
-   printf-style format string and the arguments to be formatted follow
-   immediately afterwards. Qualified member function names are supported,
-   but the replacement function name must be unqualified. The default value
-   for this option is `absl::StrFormat`.
+   A semicolon-separated list of regular expressions matching the 
+   (fully qualified) names of functions to replace, with the requirement that
+   the first parameter contains the printf-style format string and the
+   arguments to be formatted follow immediately afterwards. Qualified member
+   function names are supported, but the replacement function name must be
+   unqualified. The default value is `absl::StrFormat`.
 
 .. option:: ReplacementFormatFunction
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-std-print.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-std-print.rst
index 0cf51e3961a05..eb2159bc848d1 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-std-print.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-std-print.rst
@@ -122,25 +122,27 @@ Options
 
 .. option:: PrintfLikeFunctions
 
-   A semicolon-separated list of (fully qualified) function names to
-   replace, with the requirement that the first parameter contains the
-   printf-style format string and the arguments to be formatted follow
-   immediately afterwards. Qualified member function names are supported,
-   but the replacement function name must be unqualified. If neither this
-   option nor `FprintfLikeFunctions` are set then the default value for
-   this option is `printf; absl::PrintF`, otherwise it is empty.
+   A semicolon-separated list of regular expressions matching the 
+   (fully qualified) names of functions to replace, with the requirement
+   that the first parameter contains the printf-style format string and the
+   arguments to be formatted follow immediately afterwards. Qualified member
+   function names are supported, but the replacement function name must be
+   unqualified. If neither this option nor `FprintfLikeFunctions` are set then
+   the default value is `printf; absl::PrintF`, otherwise it is the empty
+   string.
 
 
 .. option:: FprintfLikeFunctions
 
-   A semicolon-separated list of (fully qualified) function names to
-   replace, with the requirement that the first parameter is retained, the
-   second parameter contains the printf-style format string and the
-   arguments to be formatted follow immediately afterwards. Qualified
-   member function names are supported, but the replacement function name
-   must be unqualified. If neither this option nor `PrintfLikeFunctions`
-   are set then the default value for this option is `fprintf;
-   absl::FPrintF`, otherwise it is empty.
+   A semicolon-separated list of regular expressions matching the 
+   (fully qualified) names of functions to replace, with the requirement
+   that the first parameter is retained, the second parameter contains the
+   printf-style format string and the arguments to be formatted follow
+   immediately afterwards. Qualified member function names are supported,
+   but the replacement function name must be unqualified. If neither this
+   option nor `PrintfLikeFunctions` are set then the default value is 
+   `fprintf;absl::FPrintF`, otherwise it is the empty string.
+
 
 .. option:: ReplacementPrintFunction
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/container-size-empty.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/container-size-empty.rst
index da6f770b3d74b..cc012fdcd7649 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/readability/container-size-empty.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/readability/container-size-empty.rst
@@ -30,8 +30,8 @@ Options
 
 .. option:: ExcludedComparisonTypes
 
-    A semicolon-separated list of class names for which the check will ignore
-    comparisons of objects with default-constructed objects of the same type.
-    If a class is listed here, the check will not suggest using ``empty()``
-    instead of such comparisons for objects of that class.
-    Default value is: `::std::array`.
+    A semicolon-separated list of regular expressions matching class names for
+    which the check will ignore comparisons of objects with default-constructed
+    objects of the same type. If a class is listed here, the check will not
+    suggest using ``empty()`` instead of such comparisons for objects of that
+    class. Default value is: `::std::array`.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-parentheses.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-parentheses.rst
index 23d975e646490..20e3891c72d7f 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-parentheses.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-parentheses.rst
@@ -27,3 +27,16 @@ affect the semantics.
 .. code-block:: c++
 
   int a = (1 * 2) + 3; // no warning
+
+Options
+-------
+
+.. option:: AllowedDecls
+
+  Semicolon-separated list of regular expressions matching names of declarations
+  to ignore when the parentheses are around. Declarations can include variables
+  or functions. The default is an `std::max;std::min`.
+  
+  Some STL library functions may have the same name as widely used function-like
+  macro. For example, ``std::max`` and ``max`` macro. A workaround to distinguish
+  them is adding parentheses around functions to prevent function-like macro.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-string-cstr.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-string-cstr.rst
index 2789f9c096ccf..7b507771d6799 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-string-cstr.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-string-cstr.rst
@@ -11,10 +11,10 @@ Options
 
 .. option:: StringParameterFunctions
 
-   A semicolon-separated list of (fully qualified) function/method/operator
-   names, with the requirement that any parameter currently accepting a
-   ``const char*`` input should also be able to accept ``std::string``
-   inputs, or proper overload candidates that can do so should exist. This
-   can be used to configure functions such as ``fmt::format``,
-   ``spdlog::logger::info``, or wrappers around these and similar
-   functions. The default value is the empty string.
+   A semicolon-separated list of regular expressions matching the
+   (fully qualified) names of function/method/operator, with the requirement
+   that any parameter currently accepting a ``const char*`` input should also
+   be able to accept ``std::string`` inputs, or proper overload candidates that
+   can do so should exist. This can be used to configure functions such as
+   ``fmt::format``, ``spdlog::logger::info``, or wrappers around these and
+   similar functions. The default value is the empty string.
diff --git a/clang-tools-extra/docs/clang-tidy/index.rst b/clang-tools-extra/docs/clang-tidy/index.rst
index bd2c40e948f34..6ff82bf230f4b 100644
--- a/clang-tools-extra/docs/clang-tidy/index.rst
+++ b/clang-tools-extra/docs/clang-tidy/index.rst
@@ -215,7 +215,9 @@ An overview of all the command-line options:
                                        This option overrides the 'FormatStyle` option in
                                        .clang-tidy file, if any.
     --header-filter=<string>         - Regular expression matching the names of the
-                                       headers to output diagnostics from. Diagnostics
+                                       headers to output diagnostics from. The default
+                                       value is '.*', i.e. diagnostics from all non-system
+                                       headers are displayed by default. Diagnostics
                                        from the main file of each translation unit are
                                        always displayed.
                                        Can be used together with -line-filter.
@@ -338,7 +340,7 @@ An overview of all the command-line options:
       WarningsAsErrors:    ''
       HeaderFileExtensions:         ['', 'h','hh','hpp','hxx']
       ImplementationFileExtensions: ['c','cc','cpp','cxx']
-      HeaderFilterRegex:   ''
+      HeaderFilterRegex:   '.*'
       FormatStyle:         none
       InheritParentConfig: true
       User:                user
diff --git a/clang-tools-extra/test/CMakeLists.txt b/clang-tools-extra/test/CMakeLists.txt
index a70d2ef2d92f2..78447e7a00db8 100644
--- a/clang-tools-extra/test/CMakeLists.txt
+++ b/clang-tools-extra/test/CMakeLists.txt
@@ -87,4 +87,7 @@ add_lit_testsuite(check-clang-extra "Running clang-tools-extra/test"
 
 add_lit_testsuites(CLANG-EXTRA ${CMAKE_CURRENT_SOURCE_DIR}
   DEPENDS ${CLANG_TOOLS_TEST_DEPS}
+   SKIP "^clang-doc"
   )
+
+add_subdirectory(clang-doc)
diff --git a/clang-tools-extra/test/clang-doc/CMakeLists.txt b/clang-tools-extra/test/clang-doc/CMakeLists.txt
new file mode 100644
index 0000000000000..4446b2a3c897f
--- /dev/null
+++ b/clang-tools-extra/test/clang-doc/CMakeLists.txt
@@ -0,0 +1,7 @@
+# Specialize the clang-doc target to avoid building other projects
+add_lit_testsuite(check-clang-extra-clang-doc "Running clang-doc tests"
+  ${CMAKE_CURRENT_BINARY_DIR}
+  EXCLUDE_FROM_CHECK_ALL
+  DEPENDS clang-doc
+  DEPENDS ${LLVM_UTILS_DEPS}
+)
diff --git a/clang-tools-extra/test/clang-tidy/checkers/abseil/no-internal-dependencies.cpp b/clang-tools-extra/test/clang-tidy/checkers/abseil/no-internal-dependencies.cpp
index 2949d7fdd0274..f6eb7c5e25949 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/abseil/no-internal-dependencies.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/abseil/no-internal-dependencies.cpp
@@ -1,4 +1,4 @@
-// RUN: %check_clang_tidy %s abseil-no-internal-dependencies %t,  -- -- -I %S/Inputs
+// RUN: %check_clang_tidy %s abseil-no-internal-dependencies %t,  -- -header-filter='' -- -I %S/Inputs
 // RUN: clang-tidy -checks='-*, abseil-no-internal-dependencies' -header-filter='.*' %s -- -I %S/Inputs 2>&1 | FileCheck %s
 
 #include "absl/strings/internal-file.h"
diff --git a/clang-tools-extra/test/clang-tidy/checkers/abseil/no-namespace.cpp b/clang-tools-extra/test/clang-tidy/checkers/abseil/no-namespace.cpp
index 78821c373f5c4..c8a5752ed86a6 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/abseil/no-namespace.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/abseil/no-namespace.cpp
@@ -1,4 +1,4 @@
-// RUN: %check_clang_tidy %s abseil-no-namespace %t -- -- -I %S/Inputs
+// RUN: %check_clang_tidy %s abseil-no-namespace %t -- -header-filter='' -- -I %S/Inputs
 // RUN: clang-tidy -checks='-*, abseil-no-namespace' -header-filter='.*' %s -- -I %S/Inputs 2>&1 | FileCheck %s
 
 /// Warning will not be triggered on internal Abseil code that is included.
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cert/oop57-cpp.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/raw-memory-call-on-non-trivial-type.cpp
similarity index 93%
rename from clang-tools-extra/test/clang-tidy/checkers/cert/oop57-cpp.cpp
rename to clang-tools-extra/test/clang-tidy/checkers/bugprone/raw-memory-call-on-non-trivial-type.cpp
index e34315fc98d25..41a86ff385dbf 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/cert/oop57-cpp.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/raw-memory-call-on-non-trivial-type.cpp
@@ -1,8 +1,8 @@
-// RUN: %check_clang_tidy %s cert-oop57-cpp %t -- \
+// RUN: %check_clang_tidy %s bugprone-raw-memory-call-on-non-trivial-type %t -- \
 // RUN: -config='{CheckOptions: \
-// RUN:  {cert-oop57-cpp.MemSetNames: mymemset, \
-// RUN:  cert-oop57-cpp.MemCpyNames: mymemcpy, \
-// RUN:  cert-oop57-cpp.MemCmpNames: mymemcmp}}' \
+// RUN:  {bugprone-raw-memory-call-on-non-trivial-type.MemSetNames: mymemset, \
+// RUN:  bugprone-raw-memory-call-on-non-trivial-type.MemCpyNames: mymemcpy, \
+// RUN:  bugprone-raw-memory-call-on-non-trivial-type.MemCmpNames: mymemcmp}}' \
 // RUN: --
 
 void mymemset(void *, unsigned char, decltype(sizeof(int)));
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/reserved-identifier.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/reserved-identifier.cpp
index 0f36efe656bf9..b17e8903c41c2 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/reserved-identifier.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/reserved-identifier.cpp
@@ -1,8 +1,9 @@
-// RUN: %check_clang_tidy %s bugprone-reserved-identifier %t -- -- \
+// RUN: %check_clang_tidy %s bugprone-reserved-identifier %t -- \
+// RUN:   -header-filter='' -- \
 // RUN:   -I%S/Inputs/reserved-identifier \
 // RUN:   -isystem %S/Inputs/reserved-identifier/system
 
-// no warnings expected without -header-filter=
+// no warnings expected with -header-filter=''
 #include "user-header.h"
 #include <system-header.h>
 
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/avoid-non-const-global-variables.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/avoid-non-const-global-variables.cpp
index 334332def216f..30bdd68a21b84 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/avoid-non-const-global-variables.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/avoid-non-const-global-variables.cpp
@@ -1,6 +1,8 @@
 // RUN: %check_clang_tidy %s -check-suffixes=,DEFAULT cppcoreguidelines-avoid-non-const-global-variables %t
 // RUN: %check_clang_tidy %s -check-suffixes=,INTERNAL-LINKAGE cppcoreguidelines-avoid-non-const-global-variables %t -- \
 // RUN: -config="{CheckOptions: {cppcoreguidelines-avoid-non-const-global-variables.AllowInternalLinkage : 'true'}}"
+// RUN: %check_clang_tidy %s -check-suffixes=,THREAD-LOCAL cppcoreguidelines-avoid-non-const-global-variables %t -- \
+// RUN: -config="{CheckOptions: {cppcoreguidelines-avoid-non-const-global-variables.AllowThreadLocal : 'true'}}"
 
 int nonConstInt = 0;
 // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: variable 'nonConstInt' is non-const and globally accessible, consider making it const [cppcoreguidelines-avoid-non-const-global-variables]
@@ -42,14 +44,23 @@ namespace {
 int nonConstAnonymousNamespaceInt = 0;
 // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:5: warning: variable 'nonConstAnonymousNamespaceInt' is non-const and globally accessible, consider making it const [cppcoreguidelines-avoid-non-const-global-variables]
 // CHECK-MESSAGES-INTERNAL-LINKAGE-NOT: :[[@LINE-2]]:5: warning: variable 'nonConstAnonymousNamespaceInt' is non-const and globally accessible, consider making it const [cppcoreguidelines-avoid-non-const-global-variables]
+// CHECK-MESSAGES-THREAD-LOCAL: :[[@LINE-3]]:5: warning: variable 'nonConstAnonymousNamespaceInt' is non-const and globally accessible, consider making it const [cppcoreguidelines-avoid-non-const-global-variables]
 } // namespace
 
 static int nonConstStaticInt = 0;
 // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:12: warning: variable 'nonConstStaticInt' is non-const and globally accessible, consider making it const [cppcoreguidelines-avoid-non-const-global-variables]
 // CHECK-MESSAGES-INTERNAL-LINKAGE-NOT: :[[@LINE-2]]:12: warning: variable 'nonConstStaticInt' is non-const and globally accessible, consider making it const [cppcoreguidelines-avoid-non-const-global-variables]
+// CHECK-MESSAGES-THREAD-LOCAL: :[[@LINE-3]]:12: warning: variable 'nonConstStaticInt' is non-const and globally accessible, consider making it const [cppcoreguidelines-avoid-non-const-global-variables]
 
 static const int constStaticInt = 0;
 
+thread_local int threadLocalInt = 0;
+// CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:18: warning: variable 'threadLocalInt' is non-const and globally accessible, consider making it const [cppcoreguidelines-avoid-non-const-global-variables]
+// CHECK-MESSAGES-INTERNAL-LINKAGE: :[[@LINE-2]]:18: warning: variable 'threadLocalInt' is non-const and globally accessible, consider making it const [cppcoreguidelines-avoid-non-const-global-variables]
+// CHECK-MESSAGES-THREAD-LOCAL-NOT: :[[@LINE-3]]:18: warning: variable 'threadLocalInt' is non-const and globally accessible, consider making it const [cppcoreguidelines-avoid-non-const-global-variables]
+
+thread_local const int threadLocalConstInt = 0;
+
 class DummyClass {
 public:
   int nonConstPublicMemberVariable = 0;
@@ -137,6 +148,7 @@ DummyEnum nonConstAnonymousNamespaceEnumInstance = DummyEnum::first;
 }
 // CHECK-MESSAGES-DEFAULT: :[[@LINE-2]]:11: warning: variable 'nonConstAnonymousNamespaceEnumInstance' is non-const and globally accessible, consider making it const [cppcoreguidelines-avoid-non-const-global-variables]
 // CHECK-MESSAGES-INTERNAL-LINKAGE-NOT: :[[@LINE-2]]:11: warning: variable 'nonConstAnonymousNamespaceEnumInstance' is non-const and globally accessible, consider making it const [cppcoreguidelines-avoid-non-const-global-variables]
+// CHECK-MESSAGES-THREAD-LOCAL: :[[@LINE-4]]:11: warning: variable 'nonConstAnonymousNamespaceEnumInstance' is non-const and globally accessible, consider making it const [cppcoreguidelines-avoid-non-const-global-variables]
 
 // CHECKING FOR NON-CONST GLOBAL STRUCT ///////////////////////////////////////
 struct DummyStruct {
@@ -181,6 +193,7 @@ DummyStruct nonConstAnonymousNamespaceStructInstance;
 }
 // CHECK-MESSAGES-DEFAULT: :[[@LINE-2]]:13: warning: variable 'nonConstAnonymousNamespaceStructInstance' is non-const and globally accessible, consider making it const [cppcoreguidelines-avoid-non-const-global-variables]
 // CHECK-MESSAGES-INTERNAL-LINKAGE-NOT: :[[@LINE-2]]:11: warning: variable 'nonConstAnonymousNamespaceEnumInstance' is non-const and globally accessible, consider making it const [cppcoreguidelines-avoid-non-const-global-variables]
+// CHECK-MESSAGES-THREAD-LOCAL: :[[@LINE-4]]:13: warning: variable 'nonConstAnonymousNamespaceStructInstance' is non-const and globally accessible, consider making it const [cppcoreguidelines-avoid-non-const-global-variables]
 
 // CHECKING FOR NON-CONST GLOBAL UNION ////////////////////////////////////////
 union DummyUnion {
@@ -222,6 +235,7 @@ DummyUnion nonConstAnonymousNamespaceUnionInstance = {0x0};
 }
 // CHECK-MESSAGES-DEFAULT: :[[@LINE-2]]:12: warning: variable 'nonConstAnonymousNamespaceUnionInstance' is non-const and globally accessible, consider making it const [cppcoreguidelines-avoid-non-const-global-variables]
 // CHECK-MESSAGES-INTERNAL-LINKAGE-NOT: :[[@LINE-3]]:12: warning: variable 'nonConstAnonymousNamespaceUnionInstance' is non-const and globally accessible, consider making it const [cppcoreguidelines-avoid-non-const-global-variables]
+// CHECK-MESSAGES-THREAD-LOCAL: :[[@LINE-4]]:12: warning: variable 'nonConstAnonymousNamespaceUnionInstance' is non-const and globally accessible, consider making it const [cppcoreguidelines-avoid-non-const-global-variables]
 
 // CHECKING FOR NON-CONST GLOBAL FUNCTION POINTER /////////////////////////////
 int dummyFunction() {
diff --git a/clang-tools-extra/test/clang-tidy/checkers/google/upgrade-googletest-case.cpp b/clang-tools-extra/test/clang-tidy/checkers/google/upgrade-googletest-case.cpp
index edb11b9863532..5b30541a96a42 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/google/upgrade-googletest-case.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/google/upgrade-googletest-case.cpp
@@ -1,5 +1,5 @@
-// RUN: %check_clang_tidy %s google-upgrade-googletest-case %t -- -- -I%S/Inputs
-// RUN: %check_clang_tidy -check-suffix=NOSUITE %s google-upgrade-googletest-case %t -- -- -DNOSUITE -I%S/Inputs/gtest/nosuite
+// RUN: %check_clang_tidy %s google-upgrade-googletest-case %t -- -- -isystem%S/Inputs
+// RUN: %check_clang_tidy -check-suffix=NOSUITE %s google-upgrade-googletest-case %t -- -- -DNOSUITE -isystem%S/Inputs/gtest/nosuite
 
 #include "gtest/gtest.h"
 
diff --git a/clang-tools-extra/test/clang-tidy/checkers/llvm/use-new-mlir-op-builder.cpp b/clang-tools-extra/test/clang-tidy/checkers/llvm/use-new-mlir-op-builder.cpp
index b57eab089c748..c4a1d8d66cdeb 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/llvm/use-new-mlir-op-builder.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/llvm/use-new-mlir-op-builder.cpp
@@ -2,6 +2,7 @@
 
 namespace mlir {
 class Location {};
+class Value {};
 class OpBuilder {
 public:
   template <typename OpTy, typename... Args>
@@ -28,6 +29,13 @@ struct NamedOp {
   static NamedOp create(OpBuilder &builder, Location location, const char* name) {
     return NamedOp(name);
   }
+  Value getResult() { return Value(); }
+};
+struct OperandOp {
+  OperandOp(Value val) {}
+  static OperandOp create(OpBuilder &builder, Location location, Value val) {
+    return OperandOp(val);
+  }
 };
 } // namespace mlir
 
@@ -40,6 +48,22 @@ void g(mlir::OpBuilder &b) {
   b.create<T>(b.getUnknownLoc(), "gaz");
 }
 
+class CustomBuilder : public mlir::ImplicitLocOpBuilder {
+public:
+  mlir::NamedOp f(const char *name) {
+    // CHECK-MESSAGES: :[[@LINE+2]]:12: warning: use 'OpType::create(builder, ...)'
+    // CHECK-FIXES: return mlir::NamedOp::create(*this, name);
+    return create<mlir::NamedOp>(name);
+  }
+
+  mlir::NamedOp g(const char *name) {
+    using mlir::NamedOp;
+    // CHECK-MESSAGES: :[[@LINE+2]]:12: warning: use 'OpType::create(builder, ...)'
+    // CHECK-FIXES: return NamedOp::create(*this, name);
+    return create<NamedOp>(name);
+  }
+};
+
 void f() {
   mlir::OpBuilder builder;
   // CHECK-MESSAGES: :[[@LINE+2]]:3: warning: use 'OpType::create(builder, ...)' instead of 'builder.create<OpType>(...)' [llvm-use-new-mlir-op-builder]
@@ -47,15 +71,18 @@ void f() {
   builder.create<mlir::  ModuleOp>(builder.getUnknownLoc());
 
   using mlir::NamedOp;
+  using mlir::OperandOp;
+
   // CHECK-MESSAGES: :[[@LINE+2]]:3: warning: use 'OpType::create(builder, ...)' instead of 'builder.create<OpType>(...)' [llvm-use-new-mlir-op-builder]
   // CHECK-FIXES: NamedOp::create(builder, builder.getUnknownLoc(), "baz");
   builder.create<NamedOp>(builder.getUnknownLoc(), "baz");
 
-  // CHECK-MESSAGES: :[[@LINE+3]]:3: warning: use 'OpType::create(builder, ...)' instead of 'builder.create<OpType>(...)' [llvm-use-new-mlir-op-builder]
-  // CHECK-FIXES: NamedOp::create(builder, builder.getUnknownLoc(),
-  // CHECK-FIXES:   "caz");
+  // CHECK-MESSAGES: :[[@LINE+4]]:3: warning: use 'OpType::create(builder, ...)' instead of 'builder.create<OpType>(...)' [llvm-use-new-mlir-op-builder]
+  // CHECK-FIXES: NamedOp::create(builder,
+  // CHECK-FIXES:      builder.getUnknownLoc(),
+  // CHECK-FIXES:      "caz");
   builder.
-   create<NamedOp>(
+   create<NamedOp>  (
      builder.getUnknownLoc(),
      "caz");
 
@@ -66,10 +93,26 @@ void f() {
 
   mlir::ImplicitLocOpBuilder ib;
   // CHECK-MESSAGES: :[[@LINE+2]]:3: warning: use 'OpType::create(builder, ...)' instead of 'builder.create<OpType>(...)' [llvm-use-new-mlir-op-builder]
-  // CHECK-FIXES: mlir::ModuleOp::create(ib);
+  // CHECK-FIXES: mlir::ModuleOp::create(ib );
   ib.create<mlir::ModuleOp>(   );
 
   // CHECK-MESSAGES: :[[@LINE+2]]:3: warning: use 'OpType::create(builder, ...)' instead of 'builder.create<OpType>(...)' [llvm-use-new-mlir-op-builder]
   // CHECK-FIXES: mlir::OpBuilder().create<mlir::ModuleOp>(builder.getUnknownLoc());
   mlir::OpBuilder().create<mlir::ModuleOp>(builder.getUnknownLoc());
+
+  auto *p = &builder;
+  // CHECK-MESSAGES: :[[@LINE+2]]:3: warning: use 'OpType::create(builder, ...)'
+  // CHECK-FIXES: NamedOp::create(*p, builder.getUnknownLoc(), "eaz");
+  p->create<NamedOp>(builder.getUnknownLoc(), "eaz");
+
+  CustomBuilder cb;
+  cb.f("faz");
+  cb.g("gaz");
+
+  // CHECK-FIXES:      OperandOp::create(builder, builder.getUnknownLoc(),
+  // CHECK-FIXES-NEXT:   NamedOp::create(builder, builder.getUnknownLoc(), "haz").getResult());
+  // CHECK-MESSAGES: :[[@LINE+2]]:3: warning: use 'OpType::create(builder, ...)' instead of 'builder.create<OpType>(...)' [llvm-use-new-mlir-op-builder]
+  // CHECK-MESSAGES: :[[@LINE+2]]:5: warning: use 'OpType::create(builder, ...)' instead of 'builder.create<OpType>(...)' [llvm-use-new-mlir-op-builder]
+  builder.create<OperandOp>(builder.getUnknownLoc(),
+    builder.create<NamedOp>(builder.getUnknownLoc(), "haz").getResult());
 }
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/replace-auto-ptr.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/replace-auto-ptr.cpp
index 2281c1acad94f..371f3ddf6d650 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/replace-auto-ptr.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/replace-auto-ptr.cpp
@@ -1,4 +1,4 @@
-// RUN: %check_clang_tidy %s modernize-replace-auto-ptr %t -- -- -I %S/Inputs/replace-auto-ptr
+// RUN: %check_clang_tidy %s modernize-replace-auto-ptr %t -- -- -isystem %S/Inputs/replace-auto-ptr
 
 // CHECK-FIXES: #include <utility>
 
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using.cpp
index 8288f39126a11..5b8eca2825645 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using.cpp
@@ -1,4 +1,4 @@
-// RUN: %check_clang_tidy %s modernize-use-using %t -- -- -fno-delayed-template-parsing -I %S/Inputs/use-using/
+// RUN: %check_clang_tidy %s modernize-use-using %t -- -- -fno-delayed-template-parsing -isystem %S/Inputs/use-using/
 
 typedef int Type;
 // CHECK-MESSAGES: :[[@LINE-1]]:1: warning: use 'using' instead of 'typedef' [modernize-use-using]
diff --git a/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-templates.cpp b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-templates.cpp
index 688c79bbaa9ac..61758c5dac071 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-templates.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-templates.cpp
@@ -96,3 +96,34 @@ void lambdaNonConstAutoValue() {
   };
   fn(ExpensiveToCopyType());
 }
+
+template <typename... Args>
+void ParameterPack(Args... args) {
+  // CHECK-MESSAGES: [[@LINE-1]]:28: warning: the parameter 'args' of type 'ExpensiveToCopyType'
+  // CHECK-FIXES: void ParameterPack(const Args&... args) {
+}
+
+template <typename... Args>
+void ParameterPackConst(Args const... args) {
+  // CHECK-MESSAGES: [[@LINE-1]]:39: warning: the const qualified parameter 'args' of type 'const ExpensiveToCopyType'
+  // CHECK-FIXES: void ParameterPackConst(Args const&... args) {
+}
+
+template <typename... Args>
+void ParameterPackWithParams(const ExpensiveToCopyType E1, ExpensiveToCopyType E2, Args... args) {
+  // CHECK-MESSAGES: [[@LINE-1]]:56: warning: the const qualified parameter 'E1'
+  // CHECK-MESSAGES: [[@LINE-2]]:80: warning: the parameter 'E2'
+  // CHECK-MESSAGES: [[@LINE-3]]:92: warning: the parameter 'args'
+  // CHECK-FIXES: void ParameterPackWithParams(const ExpensiveToCopyType& E1, const ExpensiveToCopyType& E2, const Args&... args) {
+}
+
+template <typename... Args>
+void PackWithNonExpensive(int x, Args... args) {}
+
+void instantiatedParameterPack() {
+  ExpensiveToCopyType E;
+  ParameterPack(E);
+  ParameterPackConst(E);
+  ParameterPackWithParams(E, E, E);
+  PackWithNonExpensive(5, 5);
+}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/duplicate-include.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/duplicate-include.cpp
index 223f07724c5d0..c452f69fad07d 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/duplicate-include.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/duplicate-include.cpp
@@ -1,4 +1,6 @@
-// RUN: %check_clang_tidy %s readability-duplicate-include %t -- -- -isystem %S/Inputs/duplicate-include/system -I %S/Inputs/duplicate-include
+// RUN: %check_clang_tidy %s readability-duplicate-include %t -- \
+// RUN:   -header-filter='' \
+// RUN:   -- -isystem %S/Inputs/duplicate-include/system -I %S/Inputs/duplicate-include
 
 int a;
 #include <string.h>
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming.cpp
index 91807337176d9..1d06df3bbfaf2 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming.cpp
@@ -86,7 +86,9 @@
 // RUN:     readability-identifier-naming.LocalPointerPrefix: 'l_', \
 // RUN:     readability-identifier-naming.LocalConstantPointerCase: CamelCase, \
 // RUN:     readability-identifier-naming.LocalConstantPointerPrefix: 'lc_', \
-// RUN:   }}' -- -fno-delayed-template-parsing -Dbad_macro \
+// RUN:   }}' \
+// RUN:   -header-filter='' \
+// RUN:   -- -fno-delayed-template-parsing -Dbad_macro \
 // RUN:   -I%S/Inputs/identifier-naming \
 // RUN:   -isystem %S/Inputs/identifier-naming/system
 
@@ -95,8 +97,7 @@
 #include <system-header.h>
 #include <coroutines.h>
 #include "user-header.h"
-// NO warnings or fixes expected from declarations within header files without
-// the -header-filter= option
+// NO warnings or fixes expected from declarations with the -header-filter='' option
 
 namespace FOO_NS {
 // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: invalid case style for namespace 'FOO_NS' [readability-identifier-naming]
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-parentheses.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-parentheses.cpp
index 926cb118c77cf..c77608c66469c 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-parentheses.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-parentheses.cpp
@@ -62,3 +62,12 @@ void exceptions() {
   // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: redundant parentheses around expression [readability-redundant-parentheses]
   // CHECK-FIXES:    alignof(3);
 }
+
+namespace std {
+  template<class T> T max(T, T);
+  template<class T> T min(T, T);
+} // namespace std
+void ignoreStdMaxMin() {
+  (std::max)(1,2);
+  (std::min)(1,2);
+}
diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/default-header-filter.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/default-header-filter.cpp
new file mode 100644
index 0000000000000..489b302ac0512
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/infrastructure/default-header-filter.cpp
@@ -0,0 +1,27 @@
+
+// RUN: clang-tidy -checks='-*,google-explicit-constructor' --config='{}' %s -- -I %S/Inputs/file-filter -isystem %S/Inputs/file-filter/system 2>&1 | FileCheck --check-prefix=CHECK-DEFAULT %s
+// RUN: clang-tidy -checks='-*,google-explicit-constructor' --config='{}' -header-filter='' %s -- -I %S/Inputs/file-filter -isystem %S/Inputs/file-filter/system 2>&1 | FileCheck --check-prefix=CHECK-EMPTY %s
+// RUN: clang-tidy -checks='-*,google-explicit-constructor' --config='{}' -header-filter='.*' %s -- -I %S/Inputs/file-filter -isystem %S/Inputs/file-filter/system 2>&1 | FileCheck --check-prefix=CHECK-EXPLICIT %s
+// RUN: clang-tidy -checks='-*,google-explicit-constructor' --config='{}' %s -- -I %S/Inputs/file-filter -isystem %S/Inputs/file-filter/system 2>&1 | FileCheck --check-prefix=CHECK-NO-SYSTEM %s
+// RUN: clang-tidy -checks='-*,google-explicit-constructor' --config='{}' -system-headers %s -- -I %S/Inputs/file-filter -isystem %S/Inputs/file-filter/system 2>&1 | FileCheck --check-prefix=CHECK-WITH-SYSTEM %s
+
+#include "header1.h"
+// CHECK-DEFAULT: header1.h:1:12: warning: single-argument constructors must be marked explicit
+// CHECK-EMPTY-NOT: header1.h:1:12: warning:
+// CHECK-EXPLICIT: header1.h:1:12: warning: single-argument constructors must be marked explicit
+// CHECK-NO-SYSTEM: header1.h:1:12: warning: single-argument constructors must be marked explicit
+// CHECK-WITH-SYSTEM-DAG: header1.h:1:12: warning: single-argument constructors must be marked explicit
+
+#include <system-header.h>
+// CHECK-DEFAULT-NOT: system-header.h:1:12: warning:
+// CHECK-EMPTY-NOT: system-header.h:1:12: warning:
+// CHECK-EXPLICIT-NOT: system-header.h:1:12: warning:
+// CHECK-NO-SYSTEM-NOT: system-header.h:1:12: warning:
+// CHECK-WITH-SYSTEM-DAG: system-header.h:1:12: warning: single-argument constructors must be marked explicit
+
+class A { A(int); };
+// CHECK-DEFAULT: :[[@LINE-1]]:11: warning: single-argument constructors must be marked explicit
+// CHECK-EMPTY: :[[@LINE-2]]:11: warning: single-argument constructors must be marked explicit
+// CHECK-EXPLICIT: :[[@LINE-3]]:11: warning: single-argument constructors must be marked explicit
+// CHECK-NO-SYSTEM: :[[@LINE-4]]:11: warning: single-argument constructors must be marked explicit
+// CHECK-WITH-SYSTEM: :[[@LINE-5]]:11: warning: single-argument constructors must be marked explicit
diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/file-filter.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/file-filter.cpp
index d9ec1049963b0..485e9fb1f0cb7 100644
--- a/clang-tools-extra/test/clang-tidy/infrastructure/file-filter.cpp
+++ b/clang-tools-extra/test/clang-tidy/infrastructure/file-filter.cpp
@@ -66,7 +66,7 @@ class A { A(int); };
 // CHECK4-NOT: warning:
 // CHECK4-QUIET-NOT: warning:
 
-// CHECK: Use -header-filter=.* to display errors from all non-system headers.
+// CHECK: Use -header-filter=.* or leave it as default to display errors from all non-system headers.
 // CHECK-QUIET-NOT: Suppressed
 // CHECK2-QUIET-NOT: Suppressed
 // CHECK3: Use -header-filter=.* {{.*}}
diff --git a/clang/AreaTeamMembers.txt b/clang/AreaTeamMembers.txt
index 964d11e79f694..2928943f47533 100644
--- a/clang/AreaTeamMembers.txt
+++ b/clang/AreaTeamMembers.txt
@@ -13,5 +13,5 @@ rnk@google.com (email), rnk (Discourse), rnk (GitHub), rnk (Discord)
 Other Members
 -------------
 Eli Friedman
-efriedma@quicinc.com> (email), efriedma-quic (Discourse), efriedma-quic (GitHub)
+efriedma@qti.qualcomm.com> (email), efriedma-quic (Discourse), efriedma-quic (GitHub)
 
diff --git a/clang/Maintainers.rst b/clang/Maintainers.rst
index 8fb2201aae16c..1d16ea9fe5638 100644
--- a/clang/Maintainers.rst
+++ b/clang/Maintainers.rst
@@ -46,7 +46,7 @@ Clang LLVM IR generation
 | rjmccall\@apple.com (email), rjmccall (Phabricator), rjmccall (GitHub)
 
 | Eli Friedman
-| efriedma\@quicinc.com (email), efriedma (Phabricator), efriedma-quic (GitHub)
+| efriedma\@qti.qualcomm.com (email), efriedma (Phabricator), efriedma-quic (GitHub)
 
 | Anton Korobeynikov
 | anton\@korobeynikov.info (email), asl (Phabricator), asl (GitHub)
diff --git a/clang/bindings/python/clang/cindex.py b/clang/bindings/python/clang/cindex.py
index 2786add27f5e8..c48bc9c2eb7de 100644
--- a/clang/bindings/python/clang/cindex.py
+++ b/clang/bindings/python/clang/cindex.py
@@ -2362,6 +2362,13 @@ def get_bitfield_width(self) -> int:
         """
         return conf.lib.clang_getFieldDeclBitWidth(self)  # type: ignore [no-any-return]
 
+    @cursor_null_guard
+    def is_function_inlined(self) -> bool:
+        """
+        Check if the function is inlined.
+        """
+        return bool(conf.lib.clang_Cursor_isFunctionInlined(self))
+
     @cursor_null_guard
     def has_attrs(self) -> bool:
         """
@@ -4310,6 +4317,7 @@ def set_property(self, property, value):
     ("clang_Cursor_isAnonymous", [Cursor], bool),
     ("clang_Cursor_isAnonymousRecordDecl", [Cursor], bool),
     ("clang_Cursor_isBitField", [Cursor], bool),
+    ("clang_Cursor_isFunctionInlined", [Cursor], c_uint),
     ("clang_Location_isInSystemHeader", [SourceLocation], bool),
     ("clang_PrintingPolicy_dispose", [PrintingPolicy]),
     ("clang_PrintingPolicy_getProperty", [PrintingPolicy, c_int], c_uint),
diff --git a/clang/bindings/python/tests/cindex/test_cursor.py b/clang/bindings/python/tests/cindex/test_cursor.py
index eb0d1d50601a6..7cb616a7ef148 100644
--- a/clang/bindings/python/tests/cindex/test_cursor.py
+++ b/clang/bindings/python/tests/cindex/test_cursor.py
@@ -784,6 +784,21 @@ def test_storage_class(self):
         cursor = get_cursor(tu, "reg")
         self.assertEqual(cursor.storage_class, StorageClass.REGISTER)
 
+    def test_function_inlined(self):
+        tu = get_tu(
+            """
+inline void f_inline(void);
+void f_noninline(void);
+int d_noninline;
+"""
+        )
+        cursor = get_cursor(tu, "f_inline")
+        self.assertEqual(cursor.is_function_inlined(), True)
+        cursor = get_cursor(tu, "f_noninline")
+        self.assertEqual(cursor.is_function_inlined(), False)
+        cursor = get_cursor(tu, "d_noninline")
+        self.assertEqual(cursor.is_function_inlined(), False)
+
     def test_availability(self):
         tu = get_tu("class A { A(A const&) = delete; };", lang="cpp")
 
diff --git a/clang/docs/AMDGPUSupport.rst b/clang/docs/AMDGPUSupport.rst
index 3eada5f900613..18e3de8abe92a 100644
--- a/clang/docs/AMDGPUSupport.rst
+++ b/clang/docs/AMDGPUSupport.rst
@@ -49,10 +49,6 @@ Predefined Macros
      - Defined as 1 if the CU mode is enabled and 0 if the WGP mode is enabled.
    * - ``__AMDGCN_UNSAFE_FP_ATOMICS__``
      - Defined if unsafe floating-point atomics are allowed.
-   * - ``__AMDGCN_WAVEFRONT_SIZE__``
-     - Defines the wavefront size. Allowed values are 32 and 64 (deprecated).
-   * - ``__AMDGCN_WAVEFRONT_SIZE``
-     - Alias to ``__AMDGCN_WAVEFRONT_SIZE__`` (deprecated).
    * - ``__HAS_FMAF__``
      - Defined if FMAF instruction is available (deprecated).
    * - ``__HAS_LDEXPF__``
diff --git a/clang/docs/AllocToken.rst b/clang/docs/AllocToken.rst
index b65e18ccfa967..1a740e5e22c29 100644
--- a/clang/docs/AllocToken.rst
+++ b/clang/docs/AllocToken.rst
@@ -49,6 +49,39 @@ change or removal. These may (experimentally) be selected with ``-Xclang
 * ``increment``: This mode assigns a simple, incrementally increasing token ID
   to each allocation site.
 
+The following command-line options affect generated token IDs:
+
+* ``-falloc-token-max=<N>``
+    Configures the maximum number of tokens. No max by default (tokens bounded
+    by ``SIZE_MAX``).
+
+Querying Token IDs with ``__builtin_infer_alloc_token``
+=======================================================
+
+For use cases where the token ID must be known at compile time, Clang provides
+a builtin function:
+
+.. code-block:: c
+
+    size_t __builtin_infer_alloc_token(<args>, ...);
+
+This builtin returns the token ID inferred from its argument expressions, which
+mirror arguments normally passed to any allocation function. The argument
+expressions are **unevaluated**, so it can be used with expressions that would
+have side effects without any runtime impact.
+
+For example, it can be used as follows:
+
+.. code-block:: c
+
+    struct MyType { ... };
+    void *__partition_alloc(size_t size, size_t partition);
+    #define partition_alloc(...) __partition_alloc(__VA_ARGS__, __builtin_infer_alloc_token(__VA_ARGS__))
+
+    void foo(void) {
+        MyType *x = partition_alloc(sizeof(*x));
+    }
+
 Allocation Token Instrumentation
 ================================
 
@@ -70,16 +103,6 @@ example:
     // Instrumented:
     ptr = __alloc_token_malloc(size, <token id>);
 
-The following command-line options affect generated token IDs:
-
-* ``-falloc-token-max=<N>``
-    Configures the maximum number of tokens. No max by default (tokens bounded
-    by ``SIZE_MAX``).
-
-    .. code-block:: console
-
-        % clang++ -fsanitize=alloc-token -falloc-token-max=512 example.cc
-
 Runtime Interface
 -----------------
 
diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst
index 570cab262c115..0b4a4849f6ccc 100644
--- a/clang/docs/ClangFormatStyleOptions.rst
+++ b/clang/docs/ClangFormatStyleOptions.rst
@@ -197,57 +197,29 @@ the configuration (without a prefix: ``Auto``).
 
 .. _AlignAfterOpenBracket:
 
-**AlignAfterOpenBracket** (``BracketAlignmentStyle``) :versionbadge:`clang-format 3.8` :ref:`¶ <AlignAfterOpenBracket>`
+**AlignAfterOpenBracket** (``Boolean``) :versionbadge:`clang-format 3.8` :ref:`¶ <AlignAfterOpenBracket>`
   If ``true``, horizontally aligns arguments after an open bracket.
 
-  This applies to round brackets (parentheses), angle brackets and square
-  brackets.
-
-  Possible values:
-
-  * ``BAS_Align`` (in configuration: ``Align``)
-    Align parameters on the open bracket, e.g.:
-
-    .. code-block:: c++
-
-      someLongFunction(argument1,
-                       argument2);
-
-  * ``BAS_DontAlign`` (in configuration: ``DontAlign``)
-    Don't align, instead use ``ContinuationIndentWidth``, e.g.:
-
-    .. code-block:: c++
-
-      someLongFunction(argument1,
-          argument2);
-
-  * ``BAS_AlwaysBreak`` (in configuration: ``AlwaysBreak``)
-    Always break after an open bracket, if the parameters don't fit
-    on a single line, e.g.:
-
-    .. code-block:: c++
 
-      someLongFunction(
-          argument1, argument2);
-
-  * ``BAS_BlockIndent`` (in configuration: ``BlockIndent``)
-    Always break after an open bracket, if the parameters don't fit
-    on a single line. Closing brackets will be placed on a new line.
-    E.g.:
-
-    .. code-block:: c++
+  .. code-block:: c++
 
-      someLongFunction(
-          argument1, argument2
-      )
+    true:                         vs.   false
+    someLongFunction(argument1,         someLongFunction(argument1,
+                     argument2);            argument2);
 
 
-    .. note::
-
-     This currently only applies to braced initializer lists (when
-     ``Cpp11BracedListStyle`` is not ``Block``) and parentheses.
+  .. note::
 
+    As of clang-format 22 this option is a bool with the previous
+    option of ``Align`` replaced with ``true``, ``DontAlign`` replaced
+    with ``false``, and the options of ``AlwaysBreak`` and ``BlockIndent``
+    replaced with ``true`` and with setting of new style options using
+    ``BreakAfterOpenBracketBracedList``, ``BreakAfterOpenBracketFunction``,
+    ``BreakAfterOpenBracketIf``, ``BreakBeforeCloseBracketBracedList``,
+    ``BreakBeforeCloseBracketFunction``, and ``BreakBeforeCloseBracketIf``.
 
+  This applies to round brackets (parentheses), angle brackets and square
+  brackets.
 
 .. _AlignArrayOfStructures:
 
@@ -2746,6 +2718,67 @@ the configuration (without a prefix: ``Auto``).
      @Mock
      DataLoad loader;
 
+.. _BreakAfterOpenBracketBracedList:
+
+**BreakAfterOpenBracketBracedList** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ <BreakAfterOpenBracketBracedList>`
+  Force break after the left bracket of a braced initializer list (when
+  ``Cpp11BracedListStyle`` is ``true``) when the list exceeds the column
+  limit.
+
+  .. code-block:: c++
+
+    true:                             false:
+    vector<int> x {         vs.       vector<int> x {1,
+       1, 2, 3}                            2, 3}
+
+.. _BreakAfterOpenBracketFunction:
+
+**BreakAfterOpenBracketFunction** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ <BreakAfterOpenBracketFunction>`
+  Force break after the left parenthesis of a function (declaration,
+  definition, call) when the parameters exceed the column limit.
+
+  .. code-block:: c++
+
+    true:                             false:
+    foo (                   vs.       foo (a,
+       a , b)                              b)
+
+.. _BreakAfterOpenBracketIf:
+
+**BreakAfterOpenBracketIf** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ <BreakAfterOpenBracketIf>`
+  Force break after the left parenthesis of an if control statement
+  when the expression exceeds the column limit.
+
+  .. code-block:: c++
+
+    true:                             false:
+    if constexpr (          vs.       if constexpr (a ||
+       a || b)                                      b)
+
+.. _BreakAfterOpenBracketLoop:
+
+**BreakAfterOpenBracketLoop** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ <BreakAfterOpenBracketLoop>`
+  Force break after the left parenthesis of a loop control statement
+  when the expression exceeds the column limit.
+
+  .. code-block:: c++
+
+    true:                             false:
+    while (                  vs.      while (a &&
+       a && b) {                             b) {
+
+.. _BreakAfterOpenBracketSwitch:
+
+**BreakAfterOpenBracketSwitch** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ <BreakAfterOpenBracketSwitch>`
+  Force break after the left parenthesis of a switch control statement
+  when the expression exceeds the column limit.
+
+  .. code-block:: c++
+
+    true:                             false:
+    switch (                 vs.      switch (a +
+       a + b) {                               b) {
+
 .. _BreakAfterReturnType:
 
 **BreakAfterReturnType** (``ReturnTypeBreakingStyle``) :versionbadge:`clang-format 19` :ref:`¶ <BreakAfterReturnType>`
@@ -3383,6 +3416,79 @@ the configuration (without a prefix: ``Auto``).
 
 
 
+.. _BreakBeforeCloseBracketBracedList:
+
+**BreakBeforeCloseBracketBracedList** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ <BreakBeforeCloseBracketBracedList>`
+  Force break before the right bracket of a braced initializer list (when
+  ``Cpp11BracedListStyle`` is ``true``) when the list exceeds the column
+  limit. The break before the right bracket is only made if there is a
+  break after the opening bracket.
+
+  .. code-block:: c++
+
+    true:                             false:
+    vector<int> x {         vs.       vector<int> x {
+       1, 2, 3                           1, 2, 3}
+    }
+
+.. _BreakBeforeCloseBracketFunction:
+
+**BreakBeforeCloseBracketFunction** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ <BreakBeforeCloseBracketFunction>`
+  Force break before the right parenthesis of a function (declaration,
+  definition, call) when the parameters exceed the column limit.
+
+  .. code-block:: c++
+
+    true:                             false:
+    foo (                   vs.       foo (
+       a , b                             a , b)
+    )
+
+.. _BreakBeforeCloseBracketIf:
+
+**BreakBeforeCloseBracketIf** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ <BreakBeforeCloseBracketIf>`
+  Force break before the right parenthesis of an if control statement
+  when the expression exceeds the column limit. The break before the
+  closing parenthesis is only made if there is a break after the opening
+  parenthesis.
+
+  .. code-block:: c++
+
+    true:                             false:
+    if constexpr (          vs.       if constexpr (
+       a || b                            a || b )
+    )
+
+.. _BreakBeforeCloseBracketLoop:
+
+**BreakBeforeCloseBracketLoop** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ <BreakBeforeCloseBracketLoop>`
+  Force break before the right parenthesis of a loop control statement
+  when the expression exceeds the column limit. The break before the
+  closing parenthesis is only made if there is a break after the opening
+  parenthesis.
+
+  .. code-block:: c++
+
+    true:                             false:
+    while (                  vs.      while (
+       a && b                            a && b) {
+    ) {
+
+.. _BreakBeforeCloseBracketSwitch:
+
+**BreakBeforeCloseBracketSwitch** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ <BreakBeforeCloseBracketSwitch>`
+  Force break before the right parenthesis of a switch control statement
+  when the expression exceeds the column limit. The break before the
+  closing parenthesis is only made if there is a break after the opening
+  parenthesis.
+
+  .. code-block:: c++
+
+    true:                             false:
+    switch (                 vs.      switch (
+       a + b                             a + b) {
+    ) {
+
 .. _BreakBeforeConceptDeclarations:
 
 **BreakBeforeConceptDeclarations** (``BreakBeforeConceptDeclarationsStyle``) :versionbadge:`clang-format 12` :ref:`¶ <BreakBeforeConceptDeclarations>`
diff --git a/clang/docs/HIPSupport.rst b/clang/docs/HIPSupport.rst
index ec2af2a6f569d..ab9ea110e6d54 100644
--- a/clang/docs/HIPSupport.rst
+++ b/clang/docs/HIPSupport.rst
@@ -180,8 +180,7 @@ Predefined Macros
      - Alias to ``__HIP_API_PER_THREAD_DEFAULT_STREAM__``. Deprecated.
 
 Note that some architecture specific AMDGPU macros will have default values when
-used from the HIP host compilation. Other :doc:`AMDGPU macros <AMDGPUSupport>`
-like ``__AMDGCN_WAVEFRONT_SIZE__`` (deprecated) will default to 64 for example.
+used from the HIP host compilation.
 
 Compilation Modes
 =================
diff --git a/clang/docs/LibASTMatchersReference.html b/clang/docs/LibASTMatchersReference.html
index 9b30057b5257f..5b2a96d00d592 100644
--- a/clang/docs/LibASTMatchersReference.html
+++ b/clang/docs/LibASTMatchersReference.html
@@ -1028,6 +1028,15 @@ <h2 id="decl-matchers">Node Matchers</h2>
 </pre></td></tr>
 
 
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('requiresExprBodyDecl0')"><a name="requiresExprBodyDecl0Anchor">requiresExprBodyDecl</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1RequiresExprBodyDecl.html">RequiresExprBodyDecl</a>&gt;...</td></tr>
+<tr><td colspan="4" class="doc" id="requiresExprBodyDecl0"><pre>Matches concept requirement body declaration.
+
+Example matches '{ *p; }'
+  template&lt;typename T&gt;
+  concept dereferencable = requires(T p) { *p; }
+</pre></td></tr>
+
+
 <tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('staticAssertDecl0')"><a name="staticAssertDecl0Anchor">staticAssertDecl</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1StaticAssertDecl.html">StaticAssertDecl</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="staticAssertDecl0"><pre>Matches a C++ static_assert declaration.
 
@@ -1190,6 +1199,17 @@ <h2 id="decl-matchers">Node Matchers</h2>
   matches using enum X::x </pre></td></tr>
 
 
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('usingShadowDecl0')"><a name="usingShadowDecl0Anchor">usingShadowDecl</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UsingShadowDecl.html">UsingShadowDecl</a>&gt;...</td></tr>
+<tr><td colspan="4" class="doc" id="usingShadowDecl0"><pre>Matches shadow declarations introduced into a scope by a
+       (resolved) using declaration.
+
+Given
+  namespace n { int f; }
+  namespace declToImport { using n::f; }
+usingShadowDecl()
+  matches f </pre></td></tr>
+
+
 <tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('valueDecl0')"><a name="valueDecl0Anchor">valueDecl</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1ValueDecl.html">ValueDecl</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="valueDecl0"><pre>Matches any value declaration.
 
@@ -1210,6 +1230,15 @@ <h2 id="decl-matchers">Node Matchers</h2>
 </pre></td></tr>
 
 
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt;</td><td class="name" onclick="toggle('requiresExpr0')"><a name="requiresExpr0Anchor">requiresExpr</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1RequiresExpr.html">RequiresExpr</a>&gt;...</td></tr>
+<tr><td colspan="4" class="doc" id="requiresExpr0"><pre>Matches concept requirement.
+
+Example matches 'requires(T p) { *p; }'
+  template&lt;typename T&gt;
+  concept dereferencable = requires(T p) { *p; }
+</pre></td></tr>
+
+
 <tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1LambdaCapture.html">LambdaCapture</a>&gt;</td><td class="name" onclick="toggle('lambdaCapture0')"><a name="lambdaCapture0Anchor">lambdaCapture</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1LambdaCapture.html">LambdaCapture</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="lambdaCapture0"><pre>Matches lambda captures.
 
@@ -1679,6 +1708,19 @@ <h2 id="decl-matchers">Node Matchers</h2>
 </pre></td></tr>
 
 
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxNamedCastExpr0')"><a name="cxxNamedCastExpr0Anchor">cxxNamedCastExpr</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXNamedCastExpr.html">CXXNamedCastExpr</a>&gt;...</td></tr>
+<tr><td colspan="4" class="doc" id="cxxNamedCastExpr0"><pre>Matches any named cast expression.
+
+Example: Matches all four of the casts in
+  struct S { virtual void f(); };
+  S* p = nullptr;
+  S* ptr1 = static_cast&lt;S*&gt;(p);
+  S* ptr2 = reinterpret_cast&lt;S*&gt;(p);
+  S* ptr3 = dynamic_cast&lt;S*&gt;(p);
+  S* ptr4 = const_cast&lt;S*&gt;(p);
+</pre></td></tr>
+
+
 <tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('cxxNewExpr0')"><a name="cxxNewExpr0Anchor">cxxNewExpr</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="cxxNewExpr0"><pre>Matches new expressions.
 
@@ -2168,7 +2210,7 @@ <h2 id="decl-matchers">Node Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('ompExecutableDirective0')"><a name="ompExecutableDirective0Anchor">ompExecutableDirective</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1OMPExecutableDirective.html">OMPExecutableDirective</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('ompExecutableDirective0')"><a name="ompExecutableDirective0Anchor">ompExecutableDirective</a></td><td>Matcher&lt;OMPExecutableDirective&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="ompExecutableDirective0"><pre>Matches any ``#pragma omp`` executable directive.
 
 Given
@@ -2393,17 +2435,6 @@ <h2 id="decl-matchers">Node Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;</td><td class="name" onclick="toggle('elaboratedTypeLoc0')"><a name="elaboratedTypeLoc0Anchor">elaboratedTypeLoc</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1ElaboratedTypeLoc.html">ElaboratedTypeLoc</a>&gt;...</td></tr>
-<tr><td colspan="4" class="doc" id="elaboratedTypeLoc0"><pre>Matches C or C++ elaborated `TypeLoc`s.
-
-Given
-  struct s {};
-  struct s ss;
-elaboratedTypeLoc()
-  matches the `TypeLoc` of the variable declaration of `ss`.
-</pre></td></tr>
-
-
 <tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt;</td><td class="name" onclick="toggle('pointerTypeLoc0')"><a name="pointerTypeLoc0Anchor">pointerTypeLoc</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1PointerTypeLoc.html">PointerTypeLoc</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="pointerTypeLoc0"><pre>Matches pointer `TypeLoc`s.
 
@@ -2474,7 +2505,7 @@ <h2 id="decl-matchers">Node Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('autoType0')"><a name="autoType0Anchor">autoType</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1AutoType.html">AutoType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('autoType0')"><a name="autoType0Anchor">autoType</a></td><td>Matcher&lt;AutoType&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="autoType0"><pre>Matches types nodes representing C++11 auto types.
 
 Given:
@@ -2544,7 +2575,7 @@ <h2 id="decl-matchers">Node Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('decltypeType0')"><a name="decltypeType0Anchor">decltypeType</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DecltypeType.html">DecltypeType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('decltypeType0')"><a name="decltypeType0Anchor">decltypeType</a></td><td>Matcher&lt;DecltypeType&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="decltypeType0"><pre>Matches types nodes representing C++11 decltype(&lt;expr&gt;) types.
 
 Given:
@@ -2556,7 +2587,7 @@ <h2 id="decl-matchers">Node Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('deducedTemplateSpecializationType0')"><a name="deducedTemplateSpecializationType0Anchor">deducedTemplateSpecializationType</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DeducedTemplateSpecializationType.html">DeducedTemplateSpecializationType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('deducedTemplateSpecializationType0')"><a name="deducedTemplateSpecializationType0Anchor">deducedTemplateSpecializationType</a></td><td>Matcher&lt;DeducedTemplateSpecializationType&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="deducedTemplateSpecializationType0"><pre>Matches C++17 deduced template specialization types, e.g. deduced class
 template types.
 
@@ -2570,7 +2601,7 @@ <h2 id="decl-matchers">Node Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('dependentNameType0')"><a name="dependentNameType0Anchor">dependentNameType</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DependentNameType.html">DependentNameType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('dependentNameType0')"><a name="dependentNameType0Anchor">dependentNameType</a></td><td>Matcher&lt;DependentNameType&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="dependentNameType0"><pre>Matches a dependent name type
 
 Example matches T::type
@@ -2607,38 +2638,7 @@ <h2 id="decl-matchers">Node Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('dependentTemplateSpecializationType0')"><a name="dependentTemplateSpecializationType0Anchor">dependentTemplateSpecializationType</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DependentTemplateSpecializationType.html">DependentTemplateSpecializationType</a>&gt;...</td></tr>
-<tr><td colspan="4" class="doc" id="dependentTemplateSpecializationType0"><pre>Matches a dependent template specialization type
-
-Example matches A&lt;T&gt;::template B&lt;T&gt;
-  template&lt;typename T&gt; struct A;
-  template&lt;typename T&gt; struct declToImport {
-    typename A&lt;T&gt;::template B&lt;T&gt; a;
-  };
-</pre></td></tr>
-
-
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('elaboratedType0')"><a name="elaboratedType0Anchor">elaboratedType</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1ElaboratedType.html">ElaboratedType</a>&gt;...</td></tr>
-<tr><td colspan="4" class="doc" id="elaboratedType0"><pre>Matches types specified with an elaborated type keyword or with a
-qualified name.
-
-Given
-  namespace N {
-    namespace M {
-      class D {};
-    }
-  }
-  class C {};
-
-  class C c;
-  N::M::D d;
-
-elaboratedType() matches the type of the variable declarations of both
-c and d.
-</pre></td></tr>
-
-
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('enumType0')"><a name="enumType0Anchor">enumType</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('enumType0')"><a name="enumType0Anchor">enumType</a></td><td>Matcher&lt;EnumType&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="enumType0"><pre>Matches enum types.
 
 Given
@@ -2688,7 +2688,7 @@ <h2 id="decl-matchers">Node Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('injectedClassNameType0')"><a name="injectedClassNameType0Anchor">injectedClassNameType</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('injectedClassNameType0')"><a name="injectedClassNameType0Anchor">injectedClassNameType</a></td><td>Matcher&lt;InjectedClassNameType&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="injectedClassNameType0"><pre>Matches injected class name types.
 
 Example matches S s, but not S&lt;T&gt; s.
@@ -2800,7 +2800,7 @@ <h2 id="decl-matchers">Node Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('recordType0')"><a name="recordType0Anchor">recordType</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('recordType0')"><a name="recordType0Anchor">recordType</a></td><td>Matcher&lt;RecordType&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="recordType0"><pre>Matches record types (e.g. structs, classes).
 
 Given
@@ -2831,7 +2831,7 @@ <h2 id="decl-matchers">Node Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('substTemplateTypeParmType0')"><a name="substTemplateTypeParmType0Anchor">substTemplateTypeParmType</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1SubstTemplateTypeParmType.html">SubstTemplateTypeParmType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('substTemplateTypeParmType0')"><a name="substTemplateTypeParmType0Anchor">substTemplateTypeParmType</a></td><td>Matcher&lt;SubstTemplateTypeParmType&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="substTemplateTypeParmType0"><pre>Matches types that represent the result of substituting a type for a
 template type parameter.
 
@@ -2845,7 +2845,7 @@ <h2 id="decl-matchers">Node Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('tagType0')"><a name="tagType0Anchor">tagType</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('tagType0')"><a name="tagType0Anchor">tagType</a></td><td>Matcher&lt;TagType&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="tagType0"><pre>Matches tag types (record and enum types).
 
 Given
@@ -2860,7 +2860,7 @@ <h2 id="decl-matchers">Node Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('templateSpecializationType0')"><a name="templateSpecializationType0Anchor">templateSpecializationType</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('templateSpecializationType0')"><a name="templateSpecializationType0Anchor">templateSpecializationType</a></td><td>Matcher&lt;TemplateSpecializationType&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="templateSpecializationType0"><pre>Matches template specialization types.
 
 Given
@@ -2875,7 +2875,7 @@ <h2 id="decl-matchers">Node Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('templateTypeParmType0')"><a name="templateTypeParmType0Anchor">templateTypeParmType</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('templateTypeParmType0')"><a name="templateTypeParmType0Anchor">templateTypeParmType</a></td><td>Matcher&lt;TemplateTypeParmType&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="templateTypeParmType0"><pre>Matches template type parameter types.
 
 Example matches T, but not int.
@@ -2899,7 +2899,7 @@ <h2 id="decl-matchers">Node Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('unaryTransformType0')"><a name="unaryTransformType0Anchor">unaryTransformType</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UnaryTransformType.html">UnaryTransformType</a>&gt;...</td></tr>
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('unaryTransformType0')"><a name="unaryTransformType0Anchor">unaryTransformType</a></td><td>Matcher&lt;UnaryTransformType&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="unaryTransformType0"><pre>Matches types nodes representing unary type transformations.
 
 Given:
@@ -3077,8 +3077,8 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
 
 
 <tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXBaseSpecifier.html">CXXBaseSpecifier</a>&gt;</td><td class="name" onclick="toggle('isPrivate1')"><a name="isPrivate1Anchor">isPrivate</a></td><td></td></tr>
-<tr><td colspan="4" class="doc" id="isPrivate1"><pre>Matches private C++ declarations and C++ base specifers that specify private
-inheritance.
+<tr><td colspan="4" class="doc" id="isPrivate1"><pre>Matches private C++ declarations and C++ base specifiers that specify
+private inheritance.
 
 Examples:
   class C {
@@ -3094,7 +3094,7 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
 
 
 <tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXBaseSpecifier.html">CXXBaseSpecifier</a>&gt;</td><td class="name" onclick="toggle('isProtected1')"><a name="isProtected1Anchor">isProtected</a></td><td></td></tr>
-<tr><td colspan="4" class="doc" id="isProtected1"><pre>Matches protected C++ declarations and C++ base specifers that specify
+<tr><td colspan="4" class="doc" id="isProtected1"><pre>Matches protected C++ declarations and C++ base specifiers that specify
 protected inheritance.
 
 Examples:
@@ -3110,7 +3110,7 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
 
 
 <tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXBaseSpecifier.html">CXXBaseSpecifier</a>&gt;</td><td class="name" onclick="toggle('isPublic1')"><a name="isPublic1Anchor">isPublic</a></td><td></td></tr>
-<tr><td colspan="4" class="doc" id="isPublic1"><pre>Matches public C++ declarations and C++ base specifers that specify public
+<tr><td colspan="4" class="doc" id="isPublic1"><pre>Matches public C++ declarations and C++ base specifiers that specify public
 inheritance.
 
 Examples:
@@ -3127,7 +3127,7 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
 
 
 <tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXBaseSpecifier.html">CXXBaseSpecifier</a>&gt;</td><td class="name" onclick="toggle('isVirtual1')"><a name="isVirtual1Anchor">isVirtual</a></td><td></td></tr>
-<tr><td colspan="4" class="doc" id="isVirtual1"><pre>Matches declarations of virtual methods and C++ base specifers that specify
+<tr><td colspan="4" class="doc" id="isVirtual1"><pre>Matches declarations of virtual methods and C++ base specifiers that specify
 virtual inheritance.
 
 Example:
@@ -3709,7 +3709,7 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
 
 
 <tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXMethodDecl.html">CXXMethodDecl</a>&gt;</td><td class="name" onclick="toggle('isVirtual0')"><a name="isVirtual0Anchor">isVirtual</a></td><td></td></tr>
-<tr><td colspan="4" class="doc" id="isVirtual0"><pre>Matches declarations of virtual methods and C++ base specifers that specify
+<tr><td colspan="4" class="doc" id="isVirtual0"><pre>Matches declarations of virtual methods and C++ base specifiers that specify
 virtual inheritance.
 
 Example:
@@ -4161,6 +4161,12 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
 </pre></td></tr>
 
 
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('declaresSameEntityAsBoundNode0')"><a name="declaresSameEntityAsBoundNode0Anchor">declaresSameEntityAsBoundNode</a></td><td>std::string ID</td></tr>
+<tr><td colspan="4" class="doc" id="declaresSameEntityAsBoundNode0"><pre>Matches a declaration if it declares the same entity as the node previously
+bound to ID.
+</pre></td></tr>
+
+
 <tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('equalsBoundNode1')"><a name="equalsBoundNode1Anchor">equalsBoundNode</a></td><td>std::string ID</td></tr>
 <tr><td colspan="4" class="doc" id="equalsBoundNode1"><pre>Matches if a node equals a previously bound node.
 
@@ -4322,8 +4328,8 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
 
 
 <tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('isPrivate0')"><a name="isPrivate0Anchor">isPrivate</a></td><td></td></tr>
-<tr><td colspan="4" class="doc" id="isPrivate0"><pre>Matches private C++ declarations and C++ base specifers that specify private
-inheritance.
+<tr><td colspan="4" class="doc" id="isPrivate0"><pre>Matches private C++ declarations and C++ base specifiers that specify
+private inheritance.
 
 Examples:
   class C {
@@ -4339,7 +4345,7 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
 
 
 <tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('isProtected0')"><a name="isProtected0Anchor">isProtected</a></td><td></td></tr>
-<tr><td colspan="4" class="doc" id="isProtected0"><pre>Matches protected C++ declarations and C++ base specifers that specify
+<tr><td colspan="4" class="doc" id="isProtected0"><pre>Matches protected C++ declarations and C++ base specifiers that specify
 protected inheritance.
 
 Examples:
@@ -4355,7 +4361,7 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
 
 
 <tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('isPublic0')"><a name="isPublic0Anchor">isPublic</a></td><td></td></tr>
-<tr><td colspan="4" class="doc" id="isPublic0"><pre>Matches public C++ declarations and C++ base specifers that specify public
+<tr><td colspan="4" class="doc" id="isPublic0"><pre>Matches public C++ declarations and C++ base specifiers that specify public
 inheritance.
 
 Examples:
@@ -4371,7 +4377,7 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DependentNameType.html">DependentNameType</a>&gt;</td><td class="name" onclick="toggle('hasDependentName1')"><a name="hasDependentName1Anchor">hasDependentName</a></td><td>std::string N</td></tr>
+<tr><td>Matcher&lt;DependentNameType&gt;</td><td class="name" onclick="toggle('hasDependentName1')"><a name="hasDependentName1Anchor">hasDependentName</a></td><td>std::string N</td></tr>
 <tr><td colspan="4" class="doc" id="hasDependentName1"><pre>Matches the dependent name of a DependentScopeDeclRefExpr or
 DependentNameType
 
@@ -5046,7 +5052,7 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
 int z;
 
 Example matches f() because it has external formal linkage despite being
-unique to the translation unit as though it has internal likage
+unique to the translation unit as though it has internal linkage
 (matcher = functionDecl(hasExternalFormalLinkage()))
 
 namespace {
@@ -5182,7 +5188,7 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1OMPExecutableDirective.html">OMPExecutableDirective</a>&gt;</td><td class="name" onclick="toggle('isAllowedToContainClauseKind0')"><a name="isAllowedToContainClauseKind0Anchor">isAllowedToContainClauseKind</a></td><td>OpenMPClauseKind CKind</td></tr>
+<tr><td>Matcher&lt;OMPExecutableDirective&gt;</td><td class="name" onclick="toggle('isAllowedToContainClauseKind0')"><a name="isAllowedToContainClauseKind0Anchor">isAllowedToContainClauseKind</a></td><td>OpenMPClauseKind CKind</td></tr>
 <tr><td colspan="4" class="doc" id="isAllowedToContainClauseKind0"><pre>Matches if the OpenMP directive is allowed to contain the specified OpenMP
 clause kind.
 
@@ -5192,7 +5198,7 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
   #pragma omp parallel for
   #pragma omp          for
 
-`ompExecutableDirective(isAllowedToContainClause(OMPC_default))`` matches
+``ompExecutableDirective(isAllowedToContainClause(OMPC_default))`` matches
 ``omp parallel`` and ``omp parallel for``.
 
 If the matcher is use from clang-query, ``OpenMPClauseKind`` parameter
@@ -5201,7 +5207,7 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1OMPExecutableDirective.html">OMPExecutableDirective</a>&gt;</td><td class="name" onclick="toggle('isStandaloneDirective0')"><a name="isStandaloneDirective0Anchor">isStandaloneDirective</a></td><td></td></tr>
+<tr><td>Matcher&lt;OMPExecutableDirective&gt;</td><td class="name" onclick="toggle('isStandaloneDirective0')"><a name="isStandaloneDirective0Anchor">isStandaloneDirective</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isStandaloneDirective0"><pre>Matches standalone OpenMP directives,
 i.e., directives that can't have a structured block.
 
@@ -5545,10 +5551,10 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
 
 Given
   void a(int);
-  void b(long);
+  void b(unsigned long);
   void c(double);
 functionDecl(hasAnyParameter(hasType(isInteger())))
-matches "a(int)", "b(long)", but not "c(double)".
+matches "a(int)", "b(unsigned long)", but not "c(double)".
 </pre></td></tr>
 
 
@@ -5781,7 +5787,7 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
 <tr><td colspan="4" class="doc" id="equalsIntegralValue0"><pre>Matches a TemplateArgument of integral type with a given value.
 
 Note that 'Value' is a string as the template argument's value is
-an arbitrary precision integer. 'Value' must be euqal to the canonical
+an arbitrary precision integer. 'Value' must be equal to the canonical
 representation of that integral value in base 10.
 
 Given
@@ -5806,7 +5812,7 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;</td><td class="name" onclick="toggle('templateArgumentCountIs3')"><a name="templateArgumentCountIs3Anchor">templateArgumentCountIs</a></td><td>unsigned N</td></tr>
+<tr><td>Matcher&lt;TemplateSpecializationType&gt;</td><td class="name" onclick="toggle('templateArgumentCountIs3')"><a name="templateArgumentCountIs3Anchor">templateArgumentCountIs</a></td><td>unsigned N</td></tr>
 <tr><td colspan="4" class="doc" id="templateArgumentCountIs3"><pre>Matches if the number of template arguments equals N.
 
 Given
@@ -6571,8 +6577,8 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 
 
 <tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1AbstractConditionalOperator.html">AbstractConditionalOperator</a>&gt;</td><td class="name" onclick="toggle('hasCondition5')"><a name="hasCondition5Anchor">hasCondition</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasCondition5"><pre>Matches the condition expression of an if statement, for loop,
-switch statement or conditional operator.
+<tr><td colspan="4" class="doc" id="hasCondition5"><pre>Matches the condition expression of an if statement, for loop, while loop,
+do-while loop, switch statement or conditional operator.
 
 Example matches true (matcher = hasCondition(cxxBoolLiteral(equals(true))))
   if (true) {}
@@ -6600,8 +6606,8 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration15')"><a name="hasDeclaration15Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasDeclaration15"><pre>Matches a node if the declaration associated with that node
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration16')"><a name="hasDeclaration16Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasDeclaration16"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
 The associated declaration is:
@@ -6626,11 +6632,11 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 
 Usable as: Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;,
   Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
+  Matcher&lt;EnumType&gt;, Matcher&lt;InjectedClassNameType&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;,
+  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;RecordType&gt;,
+  Matcher&lt;TagType&gt;, Matcher&lt;TemplateSpecializationType&gt;,
+  Matcher&lt;TemplateTypeParmType&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;,
+  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UsingType.html">UsingType</a>&gt;
 </pre></td></tr>
 
 
@@ -6701,7 +6707,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1AutoType.html">AutoType</a>&gt;</td><td class="name" onclick="toggle('hasDeducedType0')"><a name="hasDeducedType0Anchor">hasDeducedType</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td></tr>
+<tr><td>Matcher&lt;AutoType&gt;</td><td class="name" onclick="toggle('hasDeducedType0')"><a name="hasDeducedType0Anchor">hasDeducedType</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td></tr>
 <tr><td colspan="4" class="doc" id="hasDeducedType0"><pre>Matches AutoType nodes where the deduced type is a specific type.
 
 Note: There is no TypeLoc for the deduced type and thus no
@@ -6713,7 +6719,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 autoType(hasDeducedType(isInteger()))
   matches "auto a"
 
-Usable as: Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1AutoType.html">AutoType</a>&gt;
+Usable as: Matcher&lt;AutoType&gt;
 </pre></td></tr>
 
 
@@ -7026,8 +7032,8 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration13')"><a name="hasDeclaration13Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasDeclaration13"><pre>Matches a node if the declaration associated with that node
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration14')"><a name="hasDeclaration14Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasDeclaration14"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
 The associated declaration is:
@@ -7052,11 +7058,11 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 
 Usable as: Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;,
   Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
+  Matcher&lt;EnumType&gt;, Matcher&lt;InjectedClassNameType&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;,
+  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;RecordType&gt;,
+  Matcher&lt;TagType&gt;, Matcher&lt;TemplateSpecializationType&gt;,
+  Matcher&lt;TemplateTypeParmType&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;,
+  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UsingType.html">UsingType</a>&gt;
 </pre></td></tr>
 
 
@@ -7489,8 +7495,8 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration12')"><a name="hasDeclaration12Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasDeclaration12"><pre>Matches a node if the declaration associated with that node
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration13')"><a name="hasDeclaration13Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasDeclaration13"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
 The associated declaration is:
@@ -7515,11 +7521,11 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 
 Usable as: Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;,
   Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
+  Matcher&lt;EnumType&gt;, Matcher&lt;InjectedClassNameType&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;,
+  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;RecordType&gt;,
+  Matcher&lt;TagType&gt;, Matcher&lt;TemplateSpecializationType&gt;,
+  Matcher&lt;TemplateTypeParmType&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;,
+  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UsingType.html">UsingType</a>&gt;
 </pre></td></tr>
 
 
@@ -7952,8 +7958,8 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration14')"><a name="hasDeclaration14Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasDeclaration14"><pre>Matches a node if the declaration associated with that node
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration15')"><a name="hasDeclaration15Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasDeclaration15"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
 The associated declaration is:
@@ -7978,11 +7984,11 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 
 Usable as: Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;,
   Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
+  Matcher&lt;EnumType&gt;, Matcher&lt;InjectedClassNameType&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;,
+  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;RecordType&gt;,
+  Matcher&lt;TagType&gt;, Matcher&lt;TemplateSpecializationType&gt;,
+  Matcher&lt;TemplateTypeParmType&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;,
+  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UsingType.html">UsingType</a>&gt;
 </pre></td></tr>
 
 
@@ -8204,7 +8210,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 
 
 <tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DecayedType.html">DecayedType</a>&gt;</td><td class="name" onclick="toggle('hasDecayedType0')"><a name="hasDecayedType0Anchor">hasDecayedType</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerType</td></tr>
-<tr><td colspan="4" class="doc" id="hasDecayedType0"><pre>Matches the decayed type, whoes decayed type matches InnerMatcher
+<tr><td colspan="4" class="doc" id="hasDecayedType0"><pre>Matches the decayed type, whose decayed type matches InnerMatcher
 </pre></td></tr>
 
 
@@ -8223,8 +8229,8 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration11')"><a name="hasDeclaration11Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasDeclaration11"><pre>Matches a node if the declaration associated with that node
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration12')"><a name="hasDeclaration12Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasDeclaration12"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
 The associated declaration is:
@@ -8249,11 +8255,11 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 
 Usable as: Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;,
   Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
+  Matcher&lt;EnumType&gt;, Matcher&lt;InjectedClassNameType&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;,
+  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;RecordType&gt;,
+  Matcher&lt;TagType&gt;, Matcher&lt;TemplateSpecializationType&gt;,
+  Matcher&lt;TemplateTypeParmType&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;,
+  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UsingType.html">UsingType</a>&gt;
 </pre></td></tr>
 
 
@@ -8373,24 +8379,11 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
     }
   }
 
-cxxRcordDecl(hasDeclContext(namedDecl(hasName("M")))) matches the
+cxxRecordDecl(hasDeclContext(namedDecl(hasName("M")))) matches the
 declaration of class D.
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DecltypeType.html">DecltypeType</a>&gt;</td><td class="name" onclick="toggle('hasUnderlyingType0')"><a name="hasUnderlyingType0Anchor">hasUnderlyingType</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td></tr>
-<tr><td colspan="4" class="doc" id="hasUnderlyingType0"><pre>Matches DecltypeType or UsingType nodes to find the underlying type.
-
-Given
-  decltype(1) a = 1;
-  decltype(2.0) b = 2.0;
-decltypeType(hasUnderlyingType(isInteger()))
-  matches the type of "a"
-
-Usable as: Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DecltypeType.html">DecltypeType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UsingType.html">UsingType</a>&gt;
-</pre></td></tr>
-
-
 <tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DecompositionDecl.html">DecompositionDecl</a>&gt;</td><td class="name" onclick="toggle('hasAnyBinding0')"><a name="hasAnyBinding0Anchor">hasAnyBinding</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1BindingDecl.html">BindingDecl</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasAnyBinding0"><pre>Matches any binding of a DecompositionDecl.
 
@@ -8451,66 +8444,16 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 
 
 <tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DoStmt.html">DoStmt</a>&gt;</td><td class="name" onclick="toggle('hasCondition3')"><a name="hasCondition3Anchor">hasCondition</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasCondition3"><pre>Matches the condition expression of an if statement, for loop,
-switch statement or conditional operator.
+<tr><td colspan="4" class="doc" id="hasCondition3"><pre>Matches the condition expression of an if statement, for loop, while loop,
+do-while loop, switch statement or conditional operator.
 
 Example matches true (matcher = hasCondition(cxxBoolLiteral(equals(true))))
   if (true) {}
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1ElaboratedTypeLoc.html">ElaboratedTypeLoc</a>&gt;</td><td class="name" onclick="toggle('hasNamedTypeLoc0')"><a name="hasNamedTypeLoc0Anchor">hasNamedTypeLoc</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypeLoc.html">TypeLoc</a>&gt; InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasNamedTypeLoc0"><pre>Matches elaborated `TypeLoc`s that have a named `TypeLoc` matching
-`InnerMatcher`.
-
-Given
-  template &lt;typename T&gt;
-  class C {};
-  class C&lt;int&gt; c;
-
-  class D {};
-  class D d;
-elaboratedTypeLoc(hasNamedTypeLoc(templateSpecializationTypeLoc()));
-  matches the `TypeLoc` of the variable declaration of `c`, but not `d`.
-</pre></td></tr>
-
-
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1ElaboratedType.html">ElaboratedType</a>&gt;</td><td class="name" onclick="toggle('hasQualifier0')"><a name="hasQualifier0Anchor">hasQualifier</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1NestedNameSpecifier.html">NestedNameSpecifier</a>&gt; InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasQualifier0"><pre>Matches ElaboratedTypes whose qualifier, a NestedNameSpecifier,
-matches InnerMatcher if the qualifier exists.
-
-Given
-  namespace N {
-    namespace M {
-      class D {};
-    }
-  }
-  N::M::D d;
-
-elaboratedType(hasQualifier(hasPrefix(specifiesNamespace(hasName("N"))))
-matches the type of the variable declaration of d.
-</pre></td></tr>
-
-
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1ElaboratedType.html">ElaboratedType</a>&gt;</td><td class="name" onclick="toggle('namesType0')"><a name="namesType0Anchor">namesType</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="namesType0"><pre>Matches ElaboratedTypes whose named type matches InnerMatcher.
-
-Given
-  namespace N {
-    namespace M {
-      class D {};
-    }
-  }
-  N::M::D d;
-
-elaboratedType(namesType(recordType(
-hasDeclaration(namedDecl(hasName("D")))))) matches the type of the variable
-declaration of d.
-</pre></td></tr>
-
-
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration10')"><a name="hasDeclaration10Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasDeclaration10"><pre>Matches a node if the declaration associated with that node
+<tr><td>Matcher&lt;EnumType&gt;</td><td class="name" onclick="toggle('hasDeclaration11')"><a name="hasDeclaration11Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasDeclaration11"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
 The associated declaration is:
@@ -8535,11 +8478,11 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 
 Usable as: Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;,
   Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
+  Matcher&lt;EnumType&gt;, Matcher&lt;InjectedClassNameType&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;,
+  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;RecordType&gt;,
+  Matcher&lt;TagType&gt;, Matcher&lt;TemplateSpecializationType&gt;,
+  Matcher&lt;TemplateTypeParmType&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;,
+  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UsingType.html">UsingType</a>&gt;
 </pre></td></tr>
 
 
@@ -8788,14 +8731,26 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 
 
 <tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1ForStmt.html">ForStmt</a>&gt;</td><td class="name" onclick="toggle('hasCondition1')"><a name="hasCondition1Anchor">hasCondition</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasCondition1"><pre>Matches the condition expression of an if statement, for loop,
-switch statement or conditional operator.
+<tr><td colspan="4" class="doc" id="hasCondition1"><pre>Matches the condition expression of an if statement, for loop, while loop,
+do-while loop, switch statement or conditional operator.
 
 Example matches true (matcher = hasCondition(cxxBoolLiteral(equals(true))))
   if (true) {}
 </pre></td></tr>
 
 
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1ForStmt.html">ForStmt</a>&gt;</td><td class="name" onclick="toggle('hasConditionVariableStatement1')"><a name="hasConditionVariableStatement1Anchor">hasConditionVariableStatement</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DeclStmt.html">DeclStmt</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasConditionVariableStatement1"><pre>Matches the condition variable statement in an if statement, for loop,
+while loop or switch statement.
+
+Given
+  if (A* a = GetAPointer()) {}
+  for (; A* a = GetAPointer(); ) {}
+hasConditionVariableStatement(...)
+  matches both 'A* a = GetAPointer()'.
+</pre></td></tr>
+
+
 <tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1ForStmt.html">ForStmt</a>&gt;</td><td class="name" onclick="toggle('hasIncrement0')"><a name="hasIncrement0Anchor">hasIncrement</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasIncrement0"><pre>Matches the increment statement of a for loop.
 
@@ -9099,8 +9054,8 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 
 
 <tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1IfStmt.html">IfStmt</a>&gt;</td><td class="name" onclick="toggle('hasCondition0')"><a name="hasCondition0Anchor">hasCondition</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasCondition0"><pre>Matches the condition expression of an if statement, for loop,
-switch statement or conditional operator.
+<tr><td colspan="4" class="doc" id="hasCondition0"><pre>Matches the condition expression of an if statement, for loop, while loop,
+do-while loop, switch statement or conditional operator.
 
 Example matches true (matcher = hasCondition(cxxBoolLiteral(equals(true))))
   if (true) {}
@@ -9108,12 +9063,14 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 
 
 <tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1IfStmt.html">IfStmt</a>&gt;</td><td class="name" onclick="toggle('hasConditionVariableStatement0')"><a name="hasConditionVariableStatement0Anchor">hasConditionVariableStatement</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DeclStmt.html">DeclStmt</a>&gt; InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasConditionVariableStatement0"><pre>Matches the condition variable statement in an if statement.
+<tr><td colspan="4" class="doc" id="hasConditionVariableStatement0"><pre>Matches the condition variable statement in an if statement, for loop,
+while loop or switch statement.
 
 Given
   if (A* a = GetAPointer()) {}
+  for (; A* a = GetAPointer(); ) {}
 hasConditionVariableStatement(...)
-  matches 'A* a = GetAPointer()'.
+  matches both 'A* a = GetAPointer()'.
 </pre></td></tr>
 
 
@@ -9179,8 +9136,8 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration9')"><a name="hasDeclaration9Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasDeclaration9"><pre>Matches a node if the declaration associated with that node
+<tr><td>Matcher&lt;InjectedClassNameType&gt;</td><td class="name" onclick="toggle('hasDeclaration10')"><a name="hasDeclaration10Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasDeclaration10"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
 The associated declaration is:
@@ -9205,16 +9162,16 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 
 Usable as: Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;,
   Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
+  Matcher&lt;EnumType&gt;, Matcher&lt;InjectedClassNameType&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;,
+  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;RecordType&gt;,
+  Matcher&lt;TagType&gt;, Matcher&lt;TemplateSpecializationType&gt;,
+  Matcher&lt;TemplateTypeParmType&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;,
+  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UsingType.html">UsingType</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration8')"><a name="hasDeclaration8Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasDeclaration8"><pre>Matches a node if the declaration associated with that node
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration9')"><a name="hasDeclaration9Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasDeclaration9"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
 The associated declaration is:
@@ -9239,11 +9196,11 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 
 Usable as: Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;,
   Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
+  Matcher&lt;EnumType&gt;, Matcher&lt;InjectedClassNameType&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;,
+  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;RecordType&gt;,
+  Matcher&lt;TagType&gt;, Matcher&lt;TemplateSpecializationType&gt;,
+  Matcher&lt;TemplateTypeParmType&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;,
+  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UsingType.html">UsingType</a>&gt;
 </pre></td></tr>
 
 
@@ -9293,8 +9250,8 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration7')"><a name="hasDeclaration7Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasDeclaration7"><pre>Matches a node if the declaration associated with that node
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration8')"><a name="hasDeclaration8Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasDeclaration8"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
 The associated declaration is:
@@ -9319,11 +9276,11 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 
 Usable as: Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;,
   Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
+  Matcher&lt;EnumType&gt;, Matcher&lt;InjectedClassNameType&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;,
+  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;RecordType&gt;,
+  Matcher&lt;TagType&gt;, Matcher&lt;TemplateSpecializationType&gt;,
+  Matcher&lt;TemplateTypeParmType&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;,
+  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UsingType.html">UsingType</a>&gt;
 </pre></td></tr>
 
 
@@ -9456,7 +9413,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1OMPExecutableDirective.html">OMPExecutableDirective</a>&gt;</td><td class="name" onclick="toggle('hasAnyClause0')"><a name="hasAnyClause0Anchor">hasAnyClause</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1OMPClause.html">OMPClause</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;OMPExecutableDirective&gt;</td><td class="name" onclick="toggle('hasAnyClause0')"><a name="hasAnyClause0Anchor">hasAnyClause</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1OMPClause.html">OMPClause</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasAnyClause0"><pre>Matches any clause in an OpenMP directive.
 
 Given
@@ -9469,7 +9426,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1OMPExecutableDirective.html">OMPExecutableDirective</a>&gt;</td><td class="name" onclick="toggle('hasStructuredBlock0')"><a name="hasStructuredBlock0Anchor">hasStructuredBlock</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;OMPExecutableDirective&gt;</td><td class="name" onclick="toggle('hasStructuredBlock0')"><a name="hasStructuredBlock0Anchor">hasStructuredBlock</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasStructuredBlock0"><pre>Matches the structured-block of the OpenMP executable directive
 
 Prerequisite: the executable directive must not be standalone directive.
@@ -9826,8 +9783,8 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration6')"><a name="hasDeclaration6Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasDeclaration6"><pre>Matches a node if the declaration associated with that node
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration7')"><a name="hasDeclaration7Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasDeclaration7"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
 The associated declaration is:
@@ -9852,11 +9809,11 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 
 Usable as: Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;,
   Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
+  Matcher&lt;EnumType&gt;, Matcher&lt;InjectedClassNameType&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;,
+  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;RecordType&gt;,
+  Matcher&lt;TagType&gt;, Matcher&lt;TemplateSpecializationType&gt;,
+  Matcher&lt;TemplateTypeParmType&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;,
+  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UsingType.html">UsingType</a>&gt;
 </pre></td></tr>
 
 
@@ -9920,8 +9877,8 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration5')"><a name="hasDeclaration5Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasDeclaration5"><pre>Matches a node if the declaration associated with that node
+<tr><td>Matcher&lt;RecordType&gt;</td><td class="name" onclick="toggle('hasDeclaration6')"><a name="hasDeclaration6Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasDeclaration6"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
 The associated declaration is:
@@ -9946,11 +9903,11 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 
 Usable as: Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;,
   Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
+  Matcher&lt;EnumType&gt;, Matcher&lt;InjectedClassNameType&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;,
+  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;RecordType&gt;,
+  Matcher&lt;TagType&gt;, Matcher&lt;TemplateSpecializationType&gt;,
+  Matcher&lt;TemplateTypeParmType&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;,
+  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UsingType.html">UsingType</a>&gt;
 </pre></td></tr>
 
 
@@ -10066,7 +10023,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1SubstTemplateTypeParmType.html">SubstTemplateTypeParmType</a>&gt;</td><td class="name" onclick="toggle('hasReplacementType0')"><a name="hasReplacementType0Anchor">hasReplacementType</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td></tr>
+<tr><td>Matcher&lt;SubstTemplateTypeParmType&gt;</td><td class="name" onclick="toggle('hasReplacementType0')"><a name="hasReplacementType0Anchor">hasReplacementType</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td></tr>
 <tr><td colspan="4" class="doc" id="hasReplacementType0"><pre>Matches template type parameter substitutions that have a replacement
 type that matches the provided matcher.
 
@@ -10094,14 +10051,26 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 
 
 <tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1SwitchStmt.html">SwitchStmt</a>&gt;</td><td class="name" onclick="toggle('hasCondition4')"><a name="hasCondition4Anchor">hasCondition</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasCondition4"><pre>Matches the condition expression of an if statement, for loop,
-switch statement or conditional operator.
+<tr><td colspan="4" class="doc" id="hasCondition4"><pre>Matches the condition expression of an if statement, for loop, while loop,
+do-while loop, switch statement or conditional operator.
 
 Example matches true (matcher = hasCondition(cxxBoolLiteral(equals(true))))
   if (true) {}
 </pre></td></tr>
 
 
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1SwitchStmt.html">SwitchStmt</a>&gt;</td><td class="name" onclick="toggle('hasConditionVariableStatement3')"><a name="hasConditionVariableStatement3Anchor">hasConditionVariableStatement</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DeclStmt.html">DeclStmt</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasConditionVariableStatement3"><pre>Matches the condition variable statement in an if statement, for loop,
+while loop or switch statement.
+
+Given
+  if (A* a = GetAPointer()) {}
+  for (; A* a = GetAPointer(); ) {}
+hasConditionVariableStatement(...)
+  matches both 'A* a = GetAPointer()'.
+</pre></td></tr>
+
+
 <tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1SwitchStmt.html">SwitchStmt</a>&gt;</td><td class="name" onclick="toggle('hasInitStatement1')"><a name="hasInitStatement1Anchor">hasInitStatement</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasInitStatement1"><pre>Matches selection statements with initializer.
 
@@ -10125,8 +10094,8 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration4')"><a name="hasDeclaration4Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasDeclaration4"><pre>Matches a node if the declaration associated with that node
+<tr><td>Matcher&lt;TagType&gt;</td><td class="name" onclick="toggle('hasDeclaration5')"><a name="hasDeclaration5Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasDeclaration5"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
 The associated declaration is:
@@ -10151,11 +10120,11 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 
 Usable as: Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;,
   Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
+  Matcher&lt;EnumType&gt;, Matcher&lt;InjectedClassNameType&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;,
+  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;RecordType&gt;,
+  Matcher&lt;TagType&gt;, Matcher&lt;TemplateSpecializationType&gt;,
+  Matcher&lt;TemplateTypeParmType&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;,
+  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UsingType.html">UsingType</a>&gt;
 </pre></td></tr>
 
 
@@ -10284,7 +10253,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;</td><td class="name" onclick="toggle('forEachTemplateArgument3')"><a name="forEachTemplateArgument3Anchor">forEachTemplateArgument</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;TemplateSpecializationType&gt;</td><td class="name" onclick="toggle('forEachTemplateArgument3')"><a name="forEachTemplateArgument3Anchor">forEachTemplateArgument</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="forEachTemplateArgument3"><pre>Matches templateSpecializationType, class template specialization,
 variable template specialization, and function template specialization
 nodes where the template argument matches the inner matcher. This matcher
@@ -10310,7 +10279,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;</td><td class="name" onclick="toggle('hasAnyTemplateArgument3')"><a name="hasAnyTemplateArgument3Anchor">hasAnyTemplateArgument</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;TemplateSpecializationType&gt;</td><td class="name" onclick="toggle('hasAnyTemplateArgument3')"><a name="hasAnyTemplateArgument3Anchor">hasAnyTemplateArgument</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasAnyTemplateArgument3"><pre>Matches templateSpecializationTypes, class template specializations,
 variable template specializations, and function template specializations
 that have at least one TemplateArgument matching the given InnerMatcher.
@@ -10332,8 +10301,8 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration3')"><a name="hasDeclaration3Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasDeclaration3"><pre>Matches a node if the declaration associated with that node
+<tr><td>Matcher&lt;TemplateSpecializationType&gt;</td><td class="name" onclick="toggle('hasDeclaration4')"><a name="hasDeclaration4Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasDeclaration4"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
 The associated declaration is:
@@ -10358,15 +10327,15 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 
 Usable as: Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;,
   Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
+  Matcher&lt;EnumType&gt;, Matcher&lt;InjectedClassNameType&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;,
+  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;RecordType&gt;,
+  Matcher&lt;TagType&gt;, Matcher&lt;TemplateSpecializationType&gt;,
+  Matcher&lt;TemplateTypeParmType&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;,
+  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UsingType.html">UsingType</a>&gt;
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;</td><td class="name" onclick="toggle('hasTemplateArgument3')"><a name="hasTemplateArgument3Anchor">hasTemplateArgument</a></td><td>unsigned N, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;TemplateSpecializationType&gt;</td><td class="name" onclick="toggle('hasTemplateArgument3')"><a name="hasTemplateArgument3Anchor">hasTemplateArgument</a></td><td>unsigned N, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasTemplateArgument3"><pre>Matches templateSpecializationType, class template specializations,
 variable template specializations, and function template specializations
 where the n'th TemplateArgument matches the given InnerMatcher.
@@ -10387,8 +10356,8 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration2')"><a name="hasDeclaration2Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasDeclaration2"><pre>Matches a node if the declaration associated with that node
+<tr><td>Matcher&lt;TemplateTypeParmType&gt;</td><td class="name" onclick="toggle('hasDeclaration3')"><a name="hasDeclaration3Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasDeclaration3"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
 The associated declaration is:
@@ -10413,11 +10382,11 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 
 Usable as: Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;,
   Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
+  Matcher&lt;EnumType&gt;, Matcher&lt;InjectedClassNameType&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;,
+  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;RecordType&gt;,
+  Matcher&lt;TagType&gt;, Matcher&lt;TemplateSpecializationType&gt;,
+  Matcher&lt;TemplateTypeParmType&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;,
+  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UsingType.html">UsingType</a>&gt;
 </pre></td></tr>
 
 
@@ -10473,8 +10442,8 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration1')"><a name="hasDeclaration1Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasDeclaration1"><pre>Matches a node if the declaration associated with that node
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration2')"><a name="hasDeclaration2Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasDeclaration2"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
 The associated declaration is:
@@ -10499,11 +10468,41 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 
 Usable as: Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;,
   Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
+  Matcher&lt;EnumType&gt;, Matcher&lt;InjectedClassNameType&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;,
+  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;RecordType&gt;,
+  Matcher&lt;TagType&gt;, Matcher&lt;TemplateSpecializationType&gt;,
+  Matcher&lt;TemplateTypeParmType&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;,
+  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UsingType.html">UsingType</a>&gt;
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('hasQualifier0')"><a name="hasQualifier0Anchor">hasQualifier</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1NestedNameSpecifier.html">NestedNameSpecifier</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasQualifier0"><pre>Matches Types whose qualifier, a NestedNameSpecifier,
+matches InnerMatcher if the qualifier exists.
+
+Given
+  namespace N {
+    namespace M {
+      class D {};
+    }
+  }
+  N::M::D d;
+
+elaboratedType(hasQualifier(hasPrefix(specifiesNamespace(hasName("N"))))
+matches the type of the variable declaration of d.
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('hasUnderlyingType0')"><a name="hasUnderlyingType0Anchor">hasUnderlyingType</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt; Inner</td></tr>
+<tr><td colspan="4" class="doc" id="hasUnderlyingType0"><pre>Matches QualType nodes to find the underlying type.
+
+Given
+  decltype(1) a = 1;
+  decltype(2.0) b = 2.0;
+decltypeType(hasUnderlyingType(isInteger()))
+  matches the type of "a"
+
+Usable as: Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;
 </pre></td></tr>
 
 
@@ -10556,8 +10555,8 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration0')"><a name="hasDeclaration0Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasDeclaration0"><pre>Matches a node if the declaration associated with that node
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration1')"><a name="hasDeclaration1Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasDeclaration1"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
 The associated declaration is:
@@ -10582,11 +10581,11 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 
 Usable as: Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;,
   Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;,
-  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;
+  Matcher&lt;EnumType&gt;, Matcher&lt;InjectedClassNameType&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;,
+  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;RecordType&gt;,
+  Matcher&lt;TagType&gt;, Matcher&lt;TemplateSpecializationType&gt;,
+  Matcher&lt;TemplateTypeParmType&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;,
+  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UsingType.html">UsingType</a>&gt;
 </pre></td></tr>
 
 
@@ -10602,16 +10601,37 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
   matches using X::b but not using X::a </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UsingType.html">UsingType</a>&gt;</td><td class="name" onclick="toggle('hasUnderlyingType1')"><a name="hasUnderlyingType1Anchor">hasUnderlyingType</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td></tr>
-<tr><td colspan="4" class="doc" id="hasUnderlyingType1"><pre>Matches DecltypeType or UsingType nodes to find the underlying type.
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UsingType.html">UsingType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration0')"><a name="hasDeclaration0Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasDeclaration0"><pre>Matches a node if the declaration associated with that node
+matches the given matcher.
 
-Given
-  decltype(1) a = 1;
-  decltype(2.0) b = 2.0;
-decltypeType(hasUnderlyingType(isInteger()))
-  matches the type of "a"
+The associated declaration is:
+- for type nodes, the declaration of the underlying type
+- for CallExpr, the declaration of the callee
+- for MemberExpr, the declaration of the referenced member
+- for CXXConstructExpr, the declaration of the constructor
+- for CXXNewExpr, the declaration of the operator new
+- for ObjCIvarExpr, the declaration of the ivar
+
+For type nodes, hasDeclaration will generally match the declaration of the
+sugared type. Given
+  class X {};
+  typedef X Y;
+  Y y;
+in varDecl(hasType(hasDeclaration(decl()))) the decl will match the
+typedefDecl. A common use case is to match the underlying, desugared type.
+This can be achieved by using the hasUnqualifiedDesugaredType matcher:
+  varDecl(hasType(hasUnqualifiedDesugaredType(
+      recordType(hasDeclaration(decl())))))
+In this matcher, the decl will match the CXXRecordDecl of class X.
 
-Usable as: Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DecltypeType.html">DecltypeType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UsingType.html">UsingType</a>&gt;
+Usable as: Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;,
+  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;,
+  Matcher&lt;EnumType&gt;, Matcher&lt;InjectedClassNameType&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;,
+  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;, Matcher&lt;RecordType&gt;,
+  Matcher&lt;TagType&gt;, Matcher&lt;TemplateSpecializationType&gt;,
+  Matcher&lt;TemplateTypeParmType&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;,
+  Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1UsingType.html">UsingType</a>&gt;
 </pre></td></tr>
 
 
@@ -10832,13 +10852,25 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 
 
 <tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1WhileStmt.html">WhileStmt</a>&gt;</td><td class="name" onclick="toggle('hasCondition2')"><a name="hasCondition2Anchor">hasCondition</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasCondition2"><pre>Matches the condition expression of an if statement, for loop,
-switch statement or conditional operator.
+<tr><td colspan="4" class="doc" id="hasCondition2"><pre>Matches the condition expression of an if statement, for loop, while loop,
+do-while loop, switch statement or conditional operator.
 
 Example matches true (matcher = hasCondition(cxxBoolLiteral(equals(true))))
   if (true) {}
 </pre></td></tr>
 
+
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1WhileStmt.html">WhileStmt</a>&gt;</td><td class="name" onclick="toggle('hasConditionVariableStatement2')"><a name="hasConditionVariableStatement2Anchor">hasConditionVariableStatement</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DeclStmt.html">DeclStmt</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasConditionVariableStatement2"><pre>Matches the condition variable statement in an if statement, for loop,
+while loop or switch statement.
+
+Given
+  if (A* a = GetAPointer()) {}
+  for (; A* a = GetAPointer(); ) {}
+hasConditionVariableStatement(...)
+  matches both 'A* a = GetAPointer()'.
+</pre></td></tr>
+
 <!--END_TRAVERSAL_MATCHERS -->
 </table>
 
diff --git a/clang/docs/Modules.rst b/clang/docs/Modules.rst
index acbe45e0be970..e45ee9ff9eac2 100644
--- a/clang/docs/Modules.rst
+++ b/clang/docs/Modules.rst
@@ -421,13 +421,7 @@ As an example, the module map file for the C standard library might look a bit l
 
 .. parsed-literal::
 
-  module std [system] [extern_c] {
-    module assert {
-      textual header "assert.h"
-      header "bits/assert-decls.h"
-      export *
-    }
-
+  module std [system] {
     module complex {
       header "complex.h"
       export *
@@ -440,7 +434,6 @@ As an example, the module map file for the C standard library might look a bit l
 
     module errno {
       header "errno.h"
-      header "sys/errno.h"
       export *
     }
 
@@ -673,14 +666,14 @@ of checking *use-declaration*\s, and must still be a lexically-valid header
 file. In the future, we intend to pre-tokenize such headers and include the
 token sequence within the prebuilt module representation.
 
-A header with the ``exclude`` specifier is excluded from the module. It will not be included when the module is built, nor will it be considered to be part of the module, even if an ``umbrella`` header or directory would otherwise make it part of the module.
+A header with the ``exclude`` specifier is excluded from the module. It will not be included when the module is built, nor will it be considered to be part of the module, even if an ``umbrella`` directory would otherwise make it part of the module.
 
-**Example:** The C header ``assert.h`` is an excellent candidate for a textual header, because it is meant to be included multiple times (possibly with different ``NDEBUG`` settings). However, declarations within it should typically be split into a separate modular header.
+**Example:** A "X macro" header is an excellent candidate for a textual header, because it is can't be compiled standalone, and by itself does not contain any declarations.
 
 .. parsed-literal::
 
-  module std [system] {
-    textual header "assert.h"
+  module MyLib [system] {
+    textual header "xmacros.h"
   }
 
 A given header shall not be referenced by more than one *header-declaration*.
diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst
index 61b5babbd18a8..10a8d095fede3 100644
--- a/clang/docs/OpenMPSupport.rst
+++ b/clang/docs/OpenMPSupport.rst
@@ -1,662 +1,662 @@
-.. raw:: html
-
-  <style type="text/css">
-    .none { background-color: #FFCCCC }
-    .part { background-color: #FFFF99 }
-    .good { background-color: #CCFF99 }
-  </style>
-
-.. role:: none
-.. role:: part
-.. role:: good
-
-.. contents::
-   :local:
-
-==============
-OpenMP Support
-==============
-
-Clang fully supports OpenMP 4.5, almost all of 5.0 and most of 5.1/2.
-Clang supports offloading to X86_64, AArch64, PPC64[LE], NVIDIA GPUs (all models) and AMD GPUs (all models).
-
-In addition, the LLVM OpenMP runtime `libomp` supports the OpenMP Tools
-Interface (OMPT) on x86, x86_64, AArch64, and PPC64 on Linux, Windows, and macOS.
-OMPT is also supported for NVIDIA and AMD GPUs.
-
-For the list of supported features from OpenMP 5.0 and 5.1
-see `OpenMP implementation details`_ and `OpenMP 51 implementation details`_.
-
-General improvements
-====================
-- New collapse clause scheme to avoid expensive remainder operations.
-  Compute loop index variables after collapsing a loop nest via the
-  collapse clause by replacing the expensive remainder operation with
-  multiplications and additions.
-
-- When using the collapse clause on a loop nest the default behavior
-  is to automatically extend the representation of the loop counter to
-  64 bits for the cases where the sizes of the collapsed loops are not
-  known at compile time. To prevent this conservative choice and use
-  at most 32 bits, compile your program with the
-  `-fopenmp-optimistic-collapse`.
-
-
-GPU devices support
-===================
-
-Data-sharing modes
-------------------
-
-Clang supports two data-sharing models for Cuda devices: `Generic` and `Cuda`
-modes. The default mode is `Generic`. `Cuda` mode can give an additional
-performance and can be activated using the `-fopenmp-cuda-mode` flag. In
-`Generic` mode all local variables that can be shared in the parallel regions
-are stored in the global memory. In `Cuda` mode local variables are not shared
-between the threads and it is user responsibility to share the required data
-between the threads in the parallel regions. Often, the optimizer is able to
-reduce the cost of `Generic` mode to the level of `Cuda` mode, but the flag,
-as well as other assumption flags, can be used for tuning.
-
-Features not supported or with limited support for Cuda devices
----------------------------------------------------------------
-
-- Cancellation constructs are not supported.
-
-- Doacross loop nest is not supported.
-
-- User-defined reductions are supported only for trivial types.
-
-- Nested parallelism: inner parallel regions are executed sequentially.
-
-- Debug information for OpenMP target regions is supported, but sometimes it may
-  be required to manually specify the address class of the inspected variables.
-  In some cases the local variables are actually allocated in the global memory,
-  but the debug info may be not aware of it.
-
-
-.. _OpenMP implementation details:
-
-OpenMP 5.0 Implementation Details
-=================================
-
-The following table provides a quick overview over various OpenMP 5.0 features
-and their implementation status. Please post on the
-`Discourse forums (Runtimes - OpenMP category)`_ for more
-information or if you want to help with the
-implementation.
-
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-|Category                      | Feature                                                      | Status                   | Reviews                                                               |
-+==============================+==============================================================+==========================+=======================================================================+
-| loop                         | support != in the canonical loop form                        | :good:`done`             | D54441                                                                |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| loop                         | #pragma omp loop (directive)                                 | :part:`partial`          | D145823 (combined forms)                                              |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| loop                         | #pragma omp loop bind                                        | :part:`worked on`        | D144634 (needs review)                                                |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| loop                         | collapse imperfectly nested loop                             | :good:`done`             |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| loop                         | collapse non-rectangular nested loop                         | :good:`done`             |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| loop                         | C++ range-base for loop                                      | :good:`done`             |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| loop                         | clause: if for SIMD directives                               | :good:`done`             |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| loop                         | inclusive scan (matching C++17 PSTL)                         | :good:`done`             |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| memory management            | memory allocators                                            | :good:`done`             | r341687,r357929                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| memory management            | allocate directive and allocate clause                       | :good:`done`             | r355614,r335952                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| OMPD                         | OMPD interfaces                                              | :good:`done`             | https://reviews.llvm.org/D99914   (Supports only HOST(CPU) and Linux  |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| OMPT                         | OMPT interfaces (callback support)                           | :good:`done`             |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| thread affinity              | thread affinity                                              | :good:`done`             |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| task                         | taskloop reduction                                           | :good:`done`             |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| task                         | task affinity                                                | :part:`not upstream`     | https://github.com/jklinkenberg/openmp/tree/task-affinity             |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| task                         | clause: depend on the taskwait construct                     | :good:`done`             | D113540 (regular codegen only)                                        |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| task                         | depend objects and detachable tasks                          | :good:`done`             |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| task                         | mutexinoutset dependence-type for tasks                      | :good:`done`             | D53380,D57576                                                         |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| task                         | combined taskloop constructs                                 | :good:`done`             |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| task                         | master taskloop                                              | :good:`done`             |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| task                         | parallel master taskloop                                     | :good:`done`             |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| task                         | master taskloop simd                                         | :good:`done`             |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| task                         | parallel master taskloop simd                                | :good:`done`             |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| SIMD                         | atomic and simd constructs inside SIMD code                  | :good:`done`             |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| SIMD                         | SIMD nontemporal                                             | :good:`done`             |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | infer target functions from initializers                     | :part:`worked on`        |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | infer target variables from initializers                     | :good:`done`             | D146418                                                               |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | OMP_TARGET_OFFLOAD environment variable                      | :good:`done`             | D50522                                                                |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | support full 'defaultmap' functionality                      | :good:`done`             | D69204                                                                |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | device specific functions                                    | :good:`done`             |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | clause: device_type                                          | :good:`done`             |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | clause: extended device                                      | :good:`done`             |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | clause: uses_allocators clause                               | :good:`done`             | https://github.com/llvm/llvm-project/pull/157025                      |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | clause: in_reduction                                         | :part:`worked on`        | r308768                                                               |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | omp_get_device_num()                                         | :good:`done`             | D54342,D128347                                                        |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | structure mapping of references                              | :none:`unclaimed`        |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | nested target declare                                        | :good:`done`             | D51378                                                                |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | implicitly map 'this' (this[:1])                             | :good:`done`             | D55982                                                                |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | allow access to the reference count (omp_target_is_present)  | :good:`done`             |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | requires directive                                           | :good:`done`             |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | clause: unified_shared_memory                                | :good:`done`             | D52625,D52359                                                         |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | clause: unified_address                                      | :part:`partial`          |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | clause: reverse_offload                                      | :part:`partial`          | D52780,D155003                                                        |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | clause: atomic_default_mem_order                             | :good:`done`             | D53513                                                                |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | clause: dynamic_allocators                                   | :part:`unclaimed parts`  | D53079                                                                |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | user-defined mappers                                         | :good:`done`             | D56326,D58638,D58523,D58074,D60972,D59474                             |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | map array-section with implicit mapper                       | :good:`done`             |  https://github.com/llvm/llvm-project/pull/101101                     |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | mapping lambda expression                                    | :good:`done`             | D51107                                                                |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | clause: use_device_addr for target data                      | :good:`done`             |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | support close modifier on map clause                         | :good:`done`             | D55719,D55892                                                         |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | teams construct on the host device                           | :good:`done`             | r371553                                                               |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | support non-contiguous array sections for target update      | :good:`done`             | https://github.com/llvm/llvm-project/pull/144635                      |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | pointer attachment                                           | :part:`being repaired`   | @abhinavgaba (https://github.com/llvm/llvm-project/pull/153683)       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| atomic                       | hints for the atomic construct                               | :good:`done`             | D51233                                                                |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| base language                | C11 support                                                  | :good:`done`             |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| base language                | C++11/14/17 support                                          | :good:`done`             |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| base language                | lambda support                                               | :good:`done`             |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| misc                         | array shaping                                                | :good:`done`             | D74144                                                                |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| misc                         | library shutdown (omp_pause_resource[_all])                  | :good:`done`             | D55078                                                                |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| misc                         | metadirectives                                               | :part:`mostly done`      | D91944, https://github.com/llvm/llvm-project/pull/128640              |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| misc                         | conditional modifier for lastprivate clause                  | :good:`done`             |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| misc                         | iterator and multidependences                                | :good:`done`             |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| misc                         | depobj directive and depobj dependency kind                  | :good:`done`             |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| misc                         | user-defined function variants                               | :good:`done`.            | D67294, D64095, D71847, D71830, D109635                               |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| misc                         | pointer/reference to pointer based array reductions          | :good:`done`             |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| misc                         | prevent new type definitions in clauses                      | :good:`done`             |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| memory model                 | memory model update (seq_cst, acq_rel, release, acquire,...) | :good:`done`             |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-
-
-.. _OpenMP 51 implementation details:
-
-OpenMP 5.1 Implementation Details
-=================================
-
-The following table provides a quick overview over various OpenMP 5.1 features
-and their implementation status.
-Please post on the
-`Discourse forums (Runtimes - OpenMP category)`_ for more
-information or if you want to help with the
-implementation.
-
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-|Category                      | Feature                                                      | Status                   | Reviews                                                               |
-+==============================+==============================================================+==========================+=======================================================================+
-| atomic                       | 'compare' clause on atomic construct                         | :good:`done`             | D120290, D120007, D118632, D120200, D116261, D118547, D116637         |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| atomic                       | 'fail' clause on atomic construct                            | :part:`worked on`        | D123235 (in progress)                                                 |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| base language                | C++ attribute specifier syntax                               | :good:`done`             | D105648                                                               |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | 'present' map type modifier                                  | :good:`done`             | D83061, D83062, D84422                                                |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | 'present' motion modifier                                    | :good:`done`             | D84711, D84712                                                        |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | 'present' in defaultmap clause                               | :good:`done`             | D92427                                                                |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | map clause reordering based on 'present' modifier            | :none:`unclaimed`        |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | device-specific environment variables                        | :none:`unclaimed`        |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | omp_target_is_accessible routine                             | :good:`done`             | https://github.com/llvm/llvm-project/pull/138294                      |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | omp_get_mapped_ptr routine                                   | :good:`done`             | D141545                                                               |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | new async target memory copy routines                        | :good:`done`             | D136103                                                               |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | thread_limit clause on target construct                      | :part:`partial`          | D141540 (offload), D152054 (host, in progress)                        |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | has_device_addr clause on target construct                   | :none:`unclaimed`        |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | iterators in map clause or motion clauses                    | :none:`unclaimed`        |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | indirect clause on declare target directive                  | :part:`In Progress`      |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | allow virtual functions calls for mapped object on device    | :part:`partial`          |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | interop construct                                            | :part:`partial`          | parsing/sema done: D98558, D98834, D98815                             |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | assorted routines for querying interoperable properties      | :part:`partial`          | D106674                                                               |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| loop                         | Loop tiling transformation                                   | :good:`done`             | D76342                                                                |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| loop                         | Loop unrolling transformation                                | :good:`done`             | D99459                                                                |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| loop                         | 'reproducible'/'unconstrained' modifiers in 'order' clause   | :part:`partial`          | D127855                                                               |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| memory management            | alignment for allocate directive and clause                  | :good:`done`             | D115683                                                               |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| memory management            | 'allocator' modifier for allocate clause                     | :good:`done`             | https://github.com/llvm/llvm-project/pull/114883                      |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| memory management            | 'align' modifier for allocate clause                         | :good:`done`             | https://github.com/llvm/llvm-project/pull/121814                      |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| memory management            | new memory management routines                               | :none:`unclaimed`        |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| memory management            | changes to omp_alloctrait_key enum                           | :none:`unclaimed`        |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| memory model                 | seq_cst clause on flush construct                            | :good:`done`             | https://github.com/llvm/llvm-project/pull/114072                      |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| misc                         | 'omp_all_memory' keyword and use in 'depend' clause          | :good:`done`             | D125828, D126321                                                      |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| misc                         | error directive                                              | :good:`done`             | D139166                                                               |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| misc                         | scope construct                                              | :good:`done`             | D157933, https://github.com/llvm/llvm-project/pull/109197             |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| misc                         | routines for controlling and querying team regions           | :part:`partial`          | D95003 (libomp only)                                                  |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| misc                         | changes to ompt_scope_endpoint_t enum                        | :none:`unclaimed`        |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| misc                         | omp_display_env routine                                      | :good:`done`             | D74956                                                                |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| misc                         | extended OMP_PLACES syntax                                   | :none:`unclaimed`        |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| misc                         | OMP_NUM_TEAMS and OMP_TEAMS_THREAD_LIMIT env vars            | :good:`done`             | D138769                                                               |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| misc                         | 'target_device' selector in context specifier                | :none:`worked on`        |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| misc                         | begin/end declare variant                                    | :good:`done`             | D71179                                                                |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| misc                         | dispatch construct and function variant argument adjustment  | :part:`worked on`        | D99537, D99679                                                        |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| misc                         | assumes directives                                           | :part:`worked on`        |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| misc                         | assume directive                                             | :good:`done`             |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| misc                         | nothing directive                                            | :good:`done`             | D123286                                                               |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| misc                         | masked construct and related combined constructs             | :good:`done`             | D99995, D100514, PR-121741(parallel_masked_taskloop)                  |
-|                              |                                                              |                          | PR-121746(parallel_masked_task_loop_simd),PR-121914(masked_taskloop)  |
-|                              |                                                              |                          | PR-121916(masked_taskloop_simd)                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| misc                         | default(firstprivate) & default(private)                     | :good:`done`             | D75591 (firstprivate), D125912 (private)                              |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| other                        | deprecating master construct                                 | :none:`unclaimed`        |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| OMPT                         | new barrier types added to ompt_sync_region_t enum           | :none:`unclaimed`        |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| OMPT                         | async data transfers added to ompt_target_data_op_t enum     | :none:`unclaimed`        |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| OMPT                         | new barrier state values added to ompt_state_t enum          | :none:`unclaimed`        |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| OMPT                         | new 'emi' callbacks for external monitoring interfaces       | :good:`done`             |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| OMPT                         | device tracing interface                                     | :none:`in progress`      | jplehr                                                                |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| task                         | 'strict' modifier for taskloop construct                     | :none:`unclaimed`        |                                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| task                         | inoutset in depend clause                                    | :good:`done`             | D97085, D118383                                                       |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| task                         | nowait clause on taskwait                                    | :part:`partial`          | parsing/sema done: D131830, D141531                                   |
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-
-
-.. _OpenMP 5.2 implementation details:
-
-OpenMP 5.2 Implementation Details
-=================================
-
-The following table provides a quick overview of various OpenMP 5.2 features
-and their implementation status. Please post on the
-`Discourse forums (Runtimes - OpenMP category)`_ for more
-information or if you want to help with the
-implementation.
-
-
-
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-|Feature                                                      | C/C++ Status              |  Fortran Status           | Reviews                                                                  |
-+=============================================================+===========================+===========================+==========================================================================+
-| omp_in_explicit_task()                                      | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| semantics of explicit_task_var and implicit_task_var        | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| ompx sentinel for C/C++ directive extensions                | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| ompx prefix for clause extensions                           | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| if clause on teams construct                                | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| step modifier added                                         | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| declare mapper: Add iterator modifier on map clause         | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| declare mapper: Add iterator modifier on map clause         | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| memspace and traits modifiers to uses allocator         i   | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| Add otherwise clause to metadirectives                      | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| doacross clause with support for omp_cur_iteration          | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| position of interop_type in init clause on iterop           | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| implicit map type for target enter/exit data                | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| work OMPT type for work-sharing loop constructs             | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| allocate and firstprivate on scope directive                | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| Change loop consistency for order clause                    | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| Add memspace and traits modifiers to uses_allocators        | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| Keep original base pointer on map w/o matched candidate     | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| Pure procedure support for certain directives               | :none:`N/A`               | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| ALLOCATE statement support for allocators                   | :none:`N/A`               | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| dispatch construct extension to support end directive       | :none:`N/A`               | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-
-
-
-.. _OpenMP 5.2 Deprecations:
-
-OpenMP 5.2 Deprecations
-=======================
-
-
-
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-|                                                             | C/C++ Status              |  Fortran Status           | Reviews                                                                  |
-+=============================================================+===========================+===========================+==========================================================================+
-| Linear clause syntax                                        | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| The minus operator                                          | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| Map clause modifiers without commas                         | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| The use of allocate directives with ALLOCATE statement      | :good:`N/A`               | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| uses_allocators list syntax                                 | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| The default clause on metadirectives                        | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| The delimited form of the declare target directive          | :none:`unclaimed`         | :good:`N/A`               |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| The use of the to clause on the declare target directive    | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| The syntax of the destroy clause on the depobj construct    | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| keyword source and sink as task-dependence modifiers        | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| interop types in any position on init clause of interop     | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| ompd prefix usage for some ICVs                             | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-
-.. _OpenMP 6.0 implementation details:
-
-OpenMP 6.0 Implementation Details
-=================================
-
-The following table provides a quick overview of various OpenMP 6.0 features
-and their implementation status. Please post on the
-`Discourse forums (Runtimes - OpenMP category)`_ for more
-information or if you want to help with the
-implementation.
-
-
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-|Feature                                                      | C/C++ Status              |  Fortran Status           | Reviews                                                                  |
-+=============================================================+===========================+===========================+==========================================================================+
-| free-agent threads                                          | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| threadset clause                                            | :part:`in progress`       | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| Recording of task graphs                                    | :part:`in progress`       | :part:`in progress`       | clang: jtb20, flang: kparzysz                                            |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| Parallel inductions                                         | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| init_complete for scan directive                            | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| loop interchange transformation                             | :good:`done`              | :none:`unclaimed`         | Clang (interchange): https://github.com/llvm/llvm-project/pull/93022     |
-|                                                             |                           |                           | Clang (permutation): https://github.com/llvm/llvm-project/pull/92030     |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| loop reverse transformation                                 | :good:`done`              | :none:`unclaimed`         | https://github.com/llvm/llvm-project/pull/92916                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| loop stripe transformation                                  | :good:`done`              | :none:`unclaimed`         | https://github.com/llvm/llvm-project/pull/119891                         |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| loop fusion transformation                                  | :part:`in progress`       | :none:`unclaimed`         | https://github.com/llvm/llvm-project/pull/139293                         |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| loop index set splitting transformation                     | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| loop transformation apply clause                            | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| loop fuse transformation                                    | :good:`done`              | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| workdistribute construct                                    |                           | :none:`in progress`       | @skc7, @mjklemm                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| task_iteration                                              | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| memscope clause for atomic and flush                        | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| transparent clause (hull tasks)                             | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| rule-based compound directives                              | :part:`In Progress`       | :part:`In Progress`       | kparzysz                                                                 |
-|                                                             |                           |                           | Testing for Fortran missing                                              |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| C23, C++23                                                  | :none:`unclaimed`         |                           |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| Fortran 2023                                                |                           | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| decl attribute for declarative directives                   | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| C attribute syntax                                          | :none:`unclaimed`         |                           |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| pure directives in DO CONCURRENT                            |                           | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| Optional argument for all clauses                           | :none:`partial`           | :none:`In Progress`       | Parse/Sema (nowait): https://github.com/llvm/llvm-project/pull/159628    |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| Function references for locator list items                  | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| All clauses accept directive name modifier                  | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| Extensions to depobj construct                              | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| Extensions to atomic construct                              | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| Private reductions                                          | :good:`mostly`            | :none:`unclaimed`         | Parse/Sema:https://github.com/llvm/llvm-project/pull/129938              |
-|                                                             |                           |                           | Codegen: https://github.com/llvm/llvm-project/pull/134709                |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| Self maps                                                   | :part:`partial`           | :none:`unclaimed`         | parsing/sema done: https://github.com/llvm/llvm-project/pull/129888      |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| Release map type for declare mapper                         | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| Extensions to interop construct                             | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| no_openmp_constructs                                        | :good:`done`              | :none:`unclaimed`         | https://github.com/llvm/llvm-project/pull/125933                         |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| safe_sync and progress with identifier and API              | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| OpenMP directives in concurrent loop regions                | :good:`done`              | :none:`unclaimed`         | https://github.com/llvm/llvm-project/pull/125621                         |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| atomics constructs on concurrent loop regions               | :good:`done`              | :none:`unclaimed`         | https://github.com/llvm/llvm-project/pull/125621                         |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| Loop construct with DO CONCURRENT                           |                           | :part:`In Progress`       |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| device_type clause for target construct                     | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| nowait for ancestor target directives                       | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| New API for devices' num_teams/thread_limit                 | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| Host and device environment variables                       | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| num_threads ICV and clause accepts list                     | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| Numeric names for environment variables                     | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| Increment between places for OMP_PLACES                     | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| OMP_AVAILABLE_DEVICES envirable                             | :none:`unclaimed`         | :none:`unclaimed`         | (should wait for "Traits for default device envirable" being done)       |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| Traits for default device envirable                         | :part:`in progress`       | :none:`unclaimed`         | ro-i                                                                     |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| Optionally omit array length expression                     | :good:`done`              | :none:`unclaimed`         | (Parse) https://github.com/llvm/llvm-project/pull/148048,                |
-|                                                             |                           |                           | (Sema) https://github.com/llvm/llvm-project/pull/152786                  |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| Canonical loop sequences                                    | :part:`in progress`       | :part:`in progress`       | Clang: https://github.com/llvm/llvm-project/pull/139293                  |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| Clarifications to Fortran map semantics                     | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| default clause at target construct                          | :part:`In Progress`       | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| ref count update use_device_{ptr, addr}                     | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| Clarifications to implicit reductions                       | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| ref modifier for map clauses                                | :part:`In Progress`       | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| map-type modifiers in arbitrary position                    | :good:`done`              | :none:`unclaimed`         | https://github.com/llvm/llvm-project/pull/90499                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| Lift nesting restriction on concurrent loop                 | :good:`done`              | :none:`unclaimed`         | https://github.com/llvm/llvm-project/pull/125621                         |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| priority clause for target constructs                       | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| changes to target_data construct                            | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| Non-const do_not_sync for nowait/nogroup                    | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| need_device_addr modifier for adjust_args clause            | :part:`partial`           | :none:`unclaimed`         | Parsing/Sema: https://github.com/llvm/llvm-project/pull/143442           |
-|                                                             |                           |                           |               https://github.com/llvm/llvm-project/pull/149586           |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| Prescriptive num_threads                                    | :good:`done`              | :none:`unclaimed`         |  https://github.com/llvm/llvm-project/pull/160659                        |
-|                                                             |                           |                           |  https://github.com/llvm/llvm-project/pull/146403                        |
-|                                                             |                           |                           |  https://github.com/llvm/llvm-project/pull/146404                        |
-|                                                             |                           |                           |  https://github.com/llvm/llvm-project/pull/146405                        |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| Message and severity clauses                                | :good:`done`              | :none:`unclaimed`         |  https://github.com/llvm/llvm-project/pull/146093                        |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| Local clause on declare target                              | :part:`In Progress`       | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| groupprivate directive                                      | :part:`In Progress`       | :part:`partial`           | Flang: kparzysz, mjklemm                                                 |
-|                                                             |                           |                           |                                                                          |
-|                                                             |                           |                           | Flang parser: https://github.com/llvm/llvm-project/pull/153807           |
-|                                                             |                           |                           | Flang sema: https://github.com/llvm/llvm-project/pull/154779             |
-|                                                             |                           |                           | Clang parse/sema: https://github.com/llvm/llvm-project/pull/158134       |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| variable-category on default clause                         | :good:`done`              | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| Changes to omp_target_is_accessible                         | :part:`In Progress`       | :part:`In Progress`       |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| defaultmap implicit-behavior 'storage'                      | :good:`done`              | :none:`unclaimed`         | https://github.com/llvm/llvm-project/pull/158336                         |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| defaultmap implicit-behavior 'private'                      | :good:`done`              | :none:`unclaimed`         | https://github.com/llvm/llvm-project/pull/158712                         |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-
-.. _OpenMP 6.1 implementation details:
-
-OpenMP 6.1 Implementation Details (Experimental)
-================================================
-
-The following table provides a quick overview over various OpenMP 6.1 features
-and their implementation status. Since OpenMP 6.1 has not yet been released, the
-following features are experimental and are subject to change at any time.
-Please post on the `Discourse forums (Runtimes - OpenMP category)`_ for more
-information or if you want to help with the
-implementation.
-
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-|Feature                                                      | C/C++ Status              | Fortran Status            | Reviews                                                                  |
-+=============================================================+===========================+===========================+==========================================================================+
-| dyn_groupprivate clause                                     | :part:`In Progress`       | :part:`In Progress`       | C/C++: kevinsala (https://github.com/llvm/llvm-project/pull/152651       |
-|                                                             |                           |                           | https://github.com/llvm/llvm-project/pull/152830                         |
-|                                                             |                           |                           | https://github.com/llvm/llvm-project/pull/152831)                        |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| loop flatten transformation                                 | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| loop grid/tile modifiers for sizes clause                   | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| attach map-type modifier                                    | :part:`In Progress`       | :none:`unclaimed`         | C/C++: @abhinavgaba;                                                     |
-|                                                             |                           |                           | RT: @abhinavgaba (https://github.com/llvm/llvm-project/pull/149036,      |
-|                                                             |                           |                           | https://github.com/llvm/llvm-project/pull/158370)                        |
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-
-
-OpenMP Extensions
-=================
-
-The following table provides a quick overview over various OpenMP
-extensions and their implementation status.  These extensions are not
-currently defined by any standard, so links to associated LLVM
-documentation are provided.  As these extensions mature, they will be
-considered for standardization. Please post on the
-`Discourse forums (Runtimes - OpenMP category)`_ to provide feedback.
-
-+------------------------------+-----------------------------------------------------------------------------------+--------------------------+--------------------------------------------------------+
-|Category                      | Feature                                                                           | Status                   | Reviews                                                |
-+==============================+===================================================================================+==========================+========================================================+
-| atomic extension             | `'atomic' strictly nested within 'teams'                                          | :good:`prototyped`       | D126323                                                |
-|                              | <https://openmp.llvm.org/docs/openacc/OpenMPExtensions.html#atomicWithinTeams>`_  |                          |                                                        |
-+------------------------------+-----------------------------------------------------------------------------------+--------------------------+--------------------------------------------------------+
-| device extension             | `'ompx_hold' map type modifier                                                    | :good:`prototyped`       | D106509, D106510                                       |
-|                              | <https://openmp.llvm.org/docs/openacc/OpenMPExtensions.html#ompx-hold>`_          |                          |                                                        |
-+------------------------------+-----------------------------------------------------------------------------------+--------------------------+--------------------------------------------------------+
-| device extension             | `'ompx_bare' clause on 'target teams' construct                                   | :good:`prototyped`       | #66844, #70612                                         |
-|                              | <https://www.osti.gov/servlets/purl/2205717>`_                                    |                          |                                                        |
-+------------------------------+-----------------------------------------------------------------------------------+--------------------------+--------------------------------------------------------+
-| device extension             | Multi-dim 'num_teams' and 'thread_limit' clause on 'target teams ompx_bare'       | :good:`partial`          | #99732, #101407, #102715                               |
-|                              | construct                                                                         |                          |                                                        |
-+------------------------------+-----------------------------------------------------------------------------------+--------------------------+--------------------------------------------------------+
-
-.. _Discourse forums (Runtimes - OpenMP category): https://discourse.llvm.org/c/runtimes/openmp/35
+.. raw:: html
+
+  <style type="text/css">
+    .none { background-color: #FFCCCC }
+    .part { background-color: #FFFF99 }
+    .good { background-color: #CCFF99 }
+  </style>
+
+.. role:: none
+.. role:: part
+.. role:: good
+
+.. contents::
+   :local:
+
+==============
+OpenMP Support
+==============
+
+Clang fully supports OpenMP 4.5, almost all of 5.0 and most of 5.1/2.
+Clang supports offloading to X86_64, AArch64, PPC64[LE], NVIDIA GPUs (all models) and AMD GPUs (all models).
+
+In addition, the LLVM OpenMP runtime `libomp` supports the OpenMP Tools
+Interface (OMPT) on x86, x86_64, AArch64, and PPC64 on Linux, Windows, and macOS.
+OMPT is also supported for NVIDIA and AMD GPUs.
+
+For the list of supported features from OpenMP 5.0 and 5.1
+see `OpenMP implementation details`_ and `OpenMP 51 implementation details`_.
+
+General improvements
+====================
+- New collapse clause scheme to avoid expensive remainder operations.
+  Compute loop index variables after collapsing a loop nest via the
+  collapse clause by replacing the expensive remainder operation with
+  multiplications and additions.
+
+- When using the collapse clause on a loop nest the default behavior
+  is to automatically extend the representation of the loop counter to
+  64 bits for the cases where the sizes of the collapsed loops are not
+  known at compile time. To prevent this conservative choice and use
+  at most 32 bits, compile your program with the
+  `-fopenmp-optimistic-collapse`.
+
+
+GPU devices support
+===================
+
+Data-sharing modes
+------------------
+
+Clang supports two data-sharing models for Cuda devices: `Generic` and `Cuda`
+modes. The default mode is `Generic`. `Cuda` mode can give an additional
+performance and can be activated using the `-fopenmp-cuda-mode` flag. In
+`Generic` mode all local variables that can be shared in the parallel regions
+are stored in the global memory. In `Cuda` mode local variables are not shared
+between the threads and it is user responsibility to share the required data
+between the threads in the parallel regions. Often, the optimizer is able to
+reduce the cost of `Generic` mode to the level of `Cuda` mode, but the flag,
+as well as other assumption flags, can be used for tuning.
+
+Features not supported or with limited support for Cuda devices
+---------------------------------------------------------------
+
+- Cancellation constructs are not supported.
+
+- Doacross loop nest is not supported.
+
+- User-defined reductions are supported only for trivial types.
+
+- Nested parallelism: inner parallel regions are executed sequentially.
+
+- Debug information for OpenMP target regions is supported, but sometimes it may
+  be required to manually specify the address class of the inspected variables.
+  In some cases the local variables are actually allocated in the global memory,
+  but the debug info may be not aware of it.
+
+
+.. _OpenMP implementation details:
+
+OpenMP 5.0 Implementation Details
+=================================
+
+The following table provides a quick overview over various OpenMP 5.0 features
+and their implementation status. Please post on the
+`Discourse forums (Runtimes - OpenMP category)`_ for more
+information or if you want to help with the
+implementation.
+
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+|Category                      | Feature                                                      | Status                   | Reviews                                                               |
++==============================+==============================================================+==========================+=======================================================================+
+| loop                         | support != in the canonical loop form                        | :good:`done`             | D54441                                                                |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| loop                         | #pragma omp loop (directive)                                 | :part:`partial`          | D145823 (combined forms)                                              |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| loop                         | #pragma omp loop bind                                        | :part:`worked on`        | D144634 (needs review)                                                |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| loop                         | collapse imperfectly nested loop                             | :good:`done`             |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| loop                         | collapse non-rectangular nested loop                         | :good:`done`             |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| loop                         | C++ range-base for loop                                      | :good:`done`             |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| loop                         | clause: if for SIMD directives                               | :good:`done`             |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| loop                         | inclusive scan (matching C++17 PSTL)                         | :good:`done`             |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| memory management            | memory allocators                                            | :good:`done`             | r341687,r357929                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| memory management            | allocate directive and allocate clause                       | :good:`done`             | r355614,r335952                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| OMPD                         | OMPD interfaces                                              | :good:`done`             | https://reviews.llvm.org/D99914   (Supports only HOST(CPU) and Linux  |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| OMPT                         | OMPT interfaces (callback support)                           | :good:`done`             |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| thread affinity              | thread affinity                                              | :good:`done`             |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| task                         | taskloop reduction                                           | :good:`done`             |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| task                         | task affinity                                                | :part:`not upstream`     | https://github.com/jklinkenberg/openmp/tree/task-affinity             |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| task                         | clause: depend on the taskwait construct                     | :good:`done`             | D113540 (regular codegen only)                                        |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| task                         | depend objects and detachable tasks                          | :good:`done`             |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| task                         | mutexinoutset dependence-type for tasks                      | :good:`done`             | D53380,D57576                                                         |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| task                         | combined taskloop constructs                                 | :good:`done`             |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| task                         | master taskloop                                              | :good:`done`             |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| task                         | parallel master taskloop                                     | :good:`done`             |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| task                         | master taskloop simd                                         | :good:`done`             |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| task                         | parallel master taskloop simd                                | :good:`done`             |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| SIMD                         | atomic and simd constructs inside SIMD code                  | :good:`done`             |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| SIMD                         | SIMD nontemporal                                             | :good:`done`             |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | infer target functions from initializers                     | :part:`worked on`        |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | infer target variables from initializers                     | :good:`done`             | D146418                                                               |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | OMP_TARGET_OFFLOAD environment variable                      | :good:`done`             | D50522                                                                |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | support full 'defaultmap' functionality                      | :good:`done`             | D69204                                                                |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | device specific functions                                    | :good:`done`             |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | clause: device_type                                          | :good:`done`             |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | clause: extended device                                      | :good:`done`             |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | clause: uses_allocators clause                               | :good:`done`             | https://github.com/llvm/llvm-project/pull/157025                      |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | clause: in_reduction                                         | :part:`worked on`        | r308768                                                               |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | omp_get_device_num()                                         | :good:`done`             | D54342,D128347                                                        |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | structure mapping of references                              | :none:`unclaimed`        |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | nested target declare                                        | :good:`done`             | D51378                                                                |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | implicitly map 'this' (this[:1])                             | :good:`done`             | D55982                                                                |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | allow access to the reference count (omp_target_is_present)  | :good:`done`             |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | requires directive                                           | :good:`done`             |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | clause: unified_shared_memory                                | :good:`done`             | D52625,D52359                                                         |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | clause: unified_address                                      | :part:`partial`          |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | clause: reverse_offload                                      | :part:`partial`          | D52780,D155003                                                        |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | clause: atomic_default_mem_order                             | :good:`done`             | D53513                                                                |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | clause: dynamic_allocators                                   | :part:`unclaimed parts`  | D53079                                                                |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | user-defined mappers                                         | :good:`done`             | D56326,D58638,D58523,D58074,D60972,D59474                             |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | map array-section with implicit mapper                       | :good:`done`             |  https://github.com/llvm/llvm-project/pull/101101                     |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | mapping lambda expression                                    | :good:`done`             | D51107                                                                |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | clause: use_device_addr for target data                      | :good:`done`             |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | support close modifier on map clause                         | :good:`done`             | D55719,D55892                                                         |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | teams construct on the host device                           | :good:`done`             | r371553                                                               |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | support non-contiguous array sections for target update      | :good:`done`             | https://github.com/llvm/llvm-project/pull/144635                      |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | pointer attachment                                           | :part:`being repaired`   | @abhinavgaba (https://github.com/llvm/llvm-project/pull/153683)       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| atomic                       | hints for the atomic construct                               | :good:`done`             | D51233                                                                |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| base language                | C11 support                                                  | :good:`done`             |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| base language                | C++11/14/17 support                                          | :good:`done`             |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| base language                | lambda support                                               | :good:`done`             |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| misc                         | array shaping                                                | :good:`done`             | D74144                                                                |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| misc                         | library shutdown (omp_pause_resource[_all])                  | :good:`done`             | D55078                                                                |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| misc                         | metadirectives                                               | :part:`mostly done`      | D91944, https://github.com/llvm/llvm-project/pull/128640              |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| misc                         | conditional modifier for lastprivate clause                  | :good:`done`             |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| misc                         | iterator and multidependences                                | :good:`done`             |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| misc                         | depobj directive and depobj dependency kind                  | :good:`done`             |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| misc                         | user-defined function variants                               | :good:`done`.            | D67294, D64095, D71847, D71830, D109635                               |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| misc                         | pointer/reference to pointer based array reductions          | :good:`done`             |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| misc                         | prevent new type definitions in clauses                      | :good:`done`             |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| memory model                 | memory model update (seq_cst, acq_rel, release, acquire,...) | :good:`done`             |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+
+
+.. _OpenMP 51 implementation details:
+
+OpenMP 5.1 Implementation Details
+=================================
+
+The following table provides a quick overview over various OpenMP 5.1 features
+and their implementation status.
+Please post on the
+`Discourse forums (Runtimes - OpenMP category)`_ for more
+information or if you want to help with the
+implementation.
+
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+|Category                      | Feature                                                      | Status                   | Reviews                                                               |
++==============================+==============================================================+==========================+=======================================================================+
+| atomic                       | 'compare' clause on atomic construct                         | :good:`done`             | D120290, D120007, D118632, D120200, D116261, D118547, D116637         |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| atomic                       | 'fail' clause on atomic construct                            | :part:`worked on`        | D123235 (in progress)                                                 |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| base language                | C++ attribute specifier syntax                               | :good:`done`             | D105648                                                               |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | 'present' map type modifier                                  | :good:`done`             | D83061, D83062, D84422                                                |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | 'present' motion modifier                                    | :good:`done`             | D84711, D84712                                                        |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | 'present' in defaultmap clause                               | :good:`done`             | D92427                                                                |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | map clause reordering based on 'present' modifier            | :none:`unclaimed`        |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | device-specific environment variables                        | :none:`unclaimed`        |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | omp_target_is_accessible routine                             | :good:`done`             | https://github.com/llvm/llvm-project/pull/138294                      |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | omp_get_mapped_ptr routine                                   | :good:`done`             | D141545                                                               |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | new async target memory copy routines                        | :good:`done`             | D136103                                                               |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | thread_limit clause on target construct                      | :part:`partial`          | D141540 (offload), D152054 (host, in progress)                        |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | has_device_addr clause on target construct                   | :none:`unclaimed`        |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | iterators in map clause or motion clauses                    | :none:`unclaimed`        |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | indirect clause on declare target directive                  | :part:`In Progress`      |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | allow virtual functions calls for mapped object on device    | :part:`partial`          |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | interop construct                                            | :part:`partial`          | parsing/sema done: D98558, D98834, D98815                             |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| device                       | assorted routines for querying interoperable properties      | :part:`partial`          | D106674                                                               |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| loop                         | Loop tiling transformation                                   | :good:`done`             | D76342                                                                |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| loop                         | Loop unrolling transformation                                | :good:`done`             | D99459                                                                |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| loop                         | 'reproducible'/'unconstrained' modifiers in 'order' clause   | :part:`partial`          | D127855                                                               |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| memory management            | alignment for allocate directive and clause                  | :good:`done`             | D115683                                                               |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| memory management            | 'allocator' modifier for allocate clause                     | :good:`done`             | https://github.com/llvm/llvm-project/pull/114883                      |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| memory management            | 'align' modifier for allocate clause                         | :good:`done`             | https://github.com/llvm/llvm-project/pull/121814                      |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| memory management            | new memory management routines                               | :none:`unclaimed`        |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| memory management            | changes to omp_alloctrait_key enum                           | :none:`unclaimed`        |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| memory model                 | seq_cst clause on flush construct                            | :good:`done`             | https://github.com/llvm/llvm-project/pull/114072                      |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| misc                         | 'omp_all_memory' keyword and use in 'depend' clause          | :good:`done`             | D125828, D126321                                                      |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| misc                         | error directive                                              | :good:`done`             | D139166                                                               |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| misc                         | scope construct                                              | :good:`done`             | D157933, https://github.com/llvm/llvm-project/pull/109197             |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| misc                         | routines for controlling and querying team regions           | :part:`partial`          | D95003 (libomp only)                                                  |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| misc                         | changes to ompt_scope_endpoint_t enum                        | :none:`unclaimed`        |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| misc                         | omp_display_env routine                                      | :good:`done`             | D74956                                                                |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| misc                         | extended OMP_PLACES syntax                                   | :none:`unclaimed`        |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| misc                         | OMP_NUM_TEAMS and OMP_TEAMS_THREAD_LIMIT env vars            | :good:`done`             | D138769                                                               |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| misc                         | 'target_device' selector in context specifier                | :none:`worked on`        |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| misc                         | begin/end declare variant                                    | :good:`done`             | D71179                                                                |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| misc                         | dispatch construct and function variant argument adjustment  | :part:`worked on`        | D99537, D99679                                                        |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| misc                         | assumes directives                                           | :part:`worked on`        |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| misc                         | assume directive                                             | :good:`done`             |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| misc                         | nothing directive                                            | :good:`done`             | D123286                                                               |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| misc                         | masked construct and related combined constructs             | :good:`done`             | D99995, D100514, PR-121741(parallel_masked_taskloop)                  |
+|                              |                                                              |                          | PR-121746(parallel_masked_task_loop_simd),PR-121914(masked_taskloop)  |
+|                              |                                                              |                          | PR-121916(masked_taskloop_simd)                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| misc                         | default(firstprivate) & default(private)                     | :good:`done`             | D75591 (firstprivate), D125912 (private)                              |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| other                        | deprecating master construct                                 | :none:`unclaimed`        |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| OMPT                         | new barrier types added to ompt_sync_region_t enum           | :none:`unclaimed`        |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| OMPT                         | async data transfers added to ompt_target_data_op_t enum     | :none:`unclaimed`        |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| OMPT                         | new barrier state values added to ompt_state_t enum          | :none:`unclaimed`        |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| OMPT                         | new 'emi' callbacks for external monitoring interfaces       | :good:`done`             |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| OMPT                         | device tracing interface                                     | :none:`in progress`      | jplehr                                                                |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| task                         | 'strict' modifier for taskloop construct                     | :none:`unclaimed`        |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| task                         | inoutset in depend clause                                    | :good:`done`             | D97085, D118383                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| task                         | nowait clause on taskwait                                    | :part:`partial`          | parsing/sema done: D131830, D141531                                   |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+
+
+.. _OpenMP 5.2 implementation details:
+
+OpenMP 5.2 Implementation Details
+=================================
+
+The following table provides a quick overview of various OpenMP 5.2 features
+and their implementation status. Please post on the
+`Discourse forums (Runtimes - OpenMP category)`_ for more
+information or if you want to help with the
+implementation.
+
+
+
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+|Feature                                                      | C/C++ Status              |  Fortran Status           | Reviews                                                                  |
++=============================================================+===========================+===========================+==========================================================================+
+| omp_in_explicit_task()                                      | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| semantics of explicit_task_var and implicit_task_var        | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| ompx sentinel for C/C++ directive extensions                | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| ompx prefix for clause extensions                           | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| if clause on teams construct                                | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| step modifier added                                         | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| declare mapper: Add iterator modifier on map clause         | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| declare mapper: Add iterator modifier on map clause         | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| memspace and traits modifiers to uses allocator         i   | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| Add otherwise clause to metadirectives                      | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| doacross clause with support for omp_cur_iteration          | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| position of interop_type in init clause on iterop           | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| implicit map type for target enter/exit data                | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| work OMPT type for work-sharing loop constructs             | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| allocate and firstprivate on scope directive                | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| Change loop consistency for order clause                    | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| Add memspace and traits modifiers to uses_allocators        | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| Keep original base pointer on map w/o matched candidate     | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| Pure procedure support for certain directives               | :none:`N/A`               | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| ALLOCATE statement support for allocators                   | :none:`N/A`               | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| dispatch construct extension to support end directive       | :none:`N/A`               | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+
+
+
+.. _OpenMP 5.2 Deprecations:
+
+OpenMP 5.2 Deprecations
+=======================
+
+
+
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+|                                                             | C/C++ Status              |  Fortran Status           | Reviews                                                                  |
++=============================================================+===========================+===========================+==========================================================================+
+| Linear clause syntax                                        | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| The minus operator                                          | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| Map clause modifiers without commas                         | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| The use of allocate directives with ALLOCATE statement      | :good:`N/A`               | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| uses_allocators list syntax                                 | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| The default clause on metadirectives                        | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| The delimited form of the declare target directive          | :none:`unclaimed`         | :good:`N/A`               |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| The use of the to clause on the declare target directive    | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| The syntax of the destroy clause on the depobj construct    | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| keyword source and sink as task-dependence modifiers        | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| interop types in any position on init clause of interop     | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| ompd prefix usage for some ICVs                             | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+
+.. _OpenMP 6.0 implementation details:
+
+OpenMP 6.0 Implementation Details
+=================================
+
+The following table provides a quick overview of various OpenMP 6.0 features
+and their implementation status. Please post on the
+`Discourse forums (Runtimes - OpenMP category)`_ for more
+information or if you want to help with the
+implementation.
+
+
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+|Feature                                                      | C/C++ Status              |  Fortran Status           | Reviews                                                                  |
++=============================================================+===========================+===========================+==========================================================================+
+| free-agent threads                                          | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| threadset clause                                            | :part:`partial`           | :none:`unclaimed`         | Parse/Sema/Codegen : https://github.com/llvm/llvm-project/pull/13580     |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| Recording of task graphs                                    | :part:`in progress`       | :part:`in progress`       | clang: jtb20, flang: kparzysz                                            |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| Parallel inductions                                         | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| init_complete for scan directive                            | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| loop interchange transformation                             | :good:`done`              | :none:`unclaimed`         | Clang (interchange): https://github.com/llvm/llvm-project/pull/93022     |
+|                                                             |                           |                           | Clang (permutation): https://github.com/llvm/llvm-project/pull/92030     |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| loop reverse transformation                                 | :good:`done`              | :none:`unclaimed`         | https://github.com/llvm/llvm-project/pull/92916                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| loop stripe transformation                                  | :good:`done`              | :none:`unclaimed`         | https://github.com/llvm/llvm-project/pull/119891                         |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| loop fusion transformation                                  | :part:`in progress`       | :none:`unclaimed`         | https://github.com/llvm/llvm-project/pull/139293                         |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| loop index set splitting transformation                     | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| loop transformation apply clause                            | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| loop fuse transformation                                    | :good:`done`              | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| workdistribute construct                                    |                           | :none:`in progress`       | @skc7, @mjklemm                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| task_iteration                                              | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| memscope clause for atomic and flush                        | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| transparent clause (hull tasks)                             | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| rule-based compound directives                              | :part:`In Progress`       | :part:`In Progress`       | kparzysz                                                                 |
+|                                                             |                           |                           | Testing for Fortran missing                                              |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| C23, C++23                                                  | :none:`unclaimed`         |                           |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| Fortran 2023                                                |                           | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| decl attribute for declarative directives                   | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| C attribute syntax                                          | :none:`unclaimed`         |                           |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| pure directives in DO CONCURRENT                            |                           | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| Optional argument for all clauses                           | :none:`partial`           | :none:`In Progress`       | Parse/Sema (nowait): https://github.com/llvm/llvm-project/pull/159628    |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| Function references for locator list items                  | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| All clauses accept directive name modifier                  | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| Extensions to depobj construct                              | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| Extensions to atomic construct                              | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| Private reductions                                          | :good:`mostly`            | :none:`unclaimed`         | Parse/Sema:https://github.com/llvm/llvm-project/pull/129938              |
+|                                                             |                           |                           | Codegen: https://github.com/llvm/llvm-project/pull/134709                |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| Self maps                                                   | :part:`partial`           | :none:`unclaimed`         | parsing/sema done: https://github.com/llvm/llvm-project/pull/129888      |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| Release map type for declare mapper                         | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| Extensions to interop construct                             | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| no_openmp_constructs                                        | :good:`done`              | :none:`unclaimed`         | https://github.com/llvm/llvm-project/pull/125933                         |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| safe_sync and progress with identifier and API              | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| OpenMP directives in concurrent loop regions                | :good:`done`              | :none:`unclaimed`         | https://github.com/llvm/llvm-project/pull/125621                         |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| atomics constructs on concurrent loop regions               | :good:`done`              | :none:`unclaimed`         | https://github.com/llvm/llvm-project/pull/125621                         |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| Loop construct with DO CONCURRENT                           |                           | :part:`In Progress`       |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| device_type clause for target construct                     | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| nowait for ancestor target directives                       | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| New API for devices' num_teams/thread_limit                 | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| Host and device environment variables                       | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| num_threads ICV and clause accepts list                     | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| Numeric names for environment variables                     | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| Increment between places for OMP_PLACES                     | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| OMP_AVAILABLE_DEVICES envirable                             | :none:`unclaimed`         | :none:`unclaimed`         | (should wait for "Traits for default device envirable" being done)       |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| Traits for default device envirable                         | :part:`in progress`       | :none:`unclaimed`         | ro-i                                                                     |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| Optionally omit array length expression                     | :good:`done`              | :none:`unclaimed`         | (Parse) https://github.com/llvm/llvm-project/pull/148048,                |
+|                                                             |                           |                           | (Sema) https://github.com/llvm/llvm-project/pull/152786                  |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| Canonical loop sequences                                    | :part:`in progress`       | :part:`in progress`       | Clang: https://github.com/llvm/llvm-project/pull/139293                  |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| Clarifications to Fortran map semantics                     | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| default clause at target construct                          | :part:`In Progress`       | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| ref count update use_device_{ptr, addr}                     | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| Clarifications to implicit reductions                       | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| ref modifier for map clauses                                | :part:`In Progress`       | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| map-type modifiers in arbitrary position                    | :good:`done`              | :none:`unclaimed`         | https://github.com/llvm/llvm-project/pull/90499                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| Lift nesting restriction on concurrent loop                 | :good:`done`              | :none:`unclaimed`         | https://github.com/llvm/llvm-project/pull/125621                         |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| priority clause for target constructs                       | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| changes to target_data construct                            | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| Non-const do_not_sync for nowait/nogroup                    | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| need_device_addr modifier for adjust_args clause            | :part:`partial`           | :none:`unclaimed`         | Parsing/Sema: https://github.com/llvm/llvm-project/pull/143442           |
+|                                                             |                           |                           |               https://github.com/llvm/llvm-project/pull/149586           |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| Prescriptive num_threads                                    | :good:`done`              | :none:`unclaimed`         |  https://github.com/llvm/llvm-project/pull/160659                        |
+|                                                             |                           |                           |  https://github.com/llvm/llvm-project/pull/146403                        |
+|                                                             |                           |                           |  https://github.com/llvm/llvm-project/pull/146404                        |
+|                                                             |                           |                           |  https://github.com/llvm/llvm-project/pull/146405                        |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| Message and severity clauses                                | :good:`done`              | :none:`unclaimed`         |  https://github.com/llvm/llvm-project/pull/146093                        |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| Local clause on declare target                              | :part:`In Progress`       | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| groupprivate directive                                      | :part:`In Progress`       | :part:`partial`           | Flang: kparzysz, mjklemm                                                 |
+|                                                             |                           |                           |                                                                          |
+|                                                             |                           |                           | Flang parser: https://github.com/llvm/llvm-project/pull/153807           |
+|                                                             |                           |                           | Flang sema: https://github.com/llvm/llvm-project/pull/154779             |
+|                                                             |                           |                           | Clang parse/sema: https://github.com/llvm/llvm-project/pull/158134       |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| variable-category on default clause                         | :good:`done`              | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| Changes to omp_target_is_accessible                         | :part:`In Progress`       | :part:`In Progress`       |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| defaultmap implicit-behavior 'storage'                      | :good:`done`              | :none:`unclaimed`         | https://github.com/llvm/llvm-project/pull/158336                         |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| defaultmap implicit-behavior 'private'                      | :good:`done`              | :none:`unclaimed`         | https://github.com/llvm/llvm-project/pull/158712                         |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+
+.. _OpenMP 6.1 implementation details:
+
+OpenMP 6.1 Implementation Details (Experimental)
+================================================
+
+The following table provides a quick overview over various OpenMP 6.1 features
+and their implementation status. Since OpenMP 6.1 has not yet been released, the
+following features are experimental and are subject to change at any time.
+Please post on the `Discourse forums (Runtimes - OpenMP category)`_ for more
+information or if you want to help with the
+implementation.
+
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+|Feature                                                      | C/C++ Status              | Fortran Status            | Reviews                                                                  |
++=============================================================+===========================+===========================+==========================================================================+
+| dyn_groupprivate clause                                     | :part:`In Progress`       | :part:`In Progress`       | C/C++: kevinsala (https://github.com/llvm/llvm-project/pull/152651       |
+|                                                             |                           |                           | https://github.com/llvm/llvm-project/pull/152830                         |
+|                                                             |                           |                           | https://github.com/llvm/llvm-project/pull/152831)                        |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| loop flatten transformation                                 | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| loop grid/tile modifiers for sizes clause                   | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| attach map-type modifier                                    | :part:`In Progress`       | :none:`unclaimed`         | C/C++: @abhinavgaba;                                                     |
+|                                                             |                           |                           | RT: @abhinavgaba (https://github.com/llvm/llvm-project/pull/149036,      |
+|                                                             |                           |                           | https://github.com/llvm/llvm-project/pull/158370)                        |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+
+
+OpenMP Extensions
+=================
+
+The following table provides a quick overview over various OpenMP
+extensions and their implementation status.  These extensions are not
+currently defined by any standard, so links to associated LLVM
+documentation are provided.  As these extensions mature, they will be
+considered for standardization. Please post on the
+`Discourse forums (Runtimes - OpenMP category)`_ to provide feedback.
+
++------------------------------+-----------------------------------------------------------------------------------+--------------------------+--------------------------------------------------------+
+|Category                      | Feature                                                                           | Status                   | Reviews                                                |
++==============================+===================================================================================+==========================+========================================================+
+| atomic extension             | `'atomic' strictly nested within 'teams'                                          | :good:`prototyped`       | D126323                                                |
+|                              | <https://openmp.llvm.org/docs/openacc/OpenMPExtensions.html#atomicWithinTeams>`_  |                          |                                                        |
++------------------------------+-----------------------------------------------------------------------------------+--------------------------+--------------------------------------------------------+
+| device extension             | `'ompx_hold' map type modifier                                                    | :good:`prototyped`       | D106509, D106510                                       |
+|                              | <https://openmp.llvm.org/docs/openacc/OpenMPExtensions.html#ompx-hold>`_          |                          |                                                        |
++------------------------------+-----------------------------------------------------------------------------------+--------------------------+--------------------------------------------------------+
+| device extension             | `'ompx_bare' clause on 'target teams' construct                                   | :good:`prototyped`       | #66844, #70612                                         |
+|                              | <https://www.osti.gov/servlets/purl/2205717>`_                                    |                          |                                                        |
++------------------------------+-----------------------------------------------------------------------------------+--------------------------+--------------------------------------------------------+
+| device extension             | Multi-dim 'num_teams' and 'thread_limit' clause on 'target teams ompx_bare'       | :good:`partial`          | #99732, #101407, #102715                               |
+|                              | construct                                                                         |                          |                                                        |
++------------------------------+-----------------------------------------------------------------------------------+--------------------------+--------------------------------------------------------+
+
+.. _Discourse forums (Runtimes - OpenMP category): https://discourse.llvm.org/c/runtimes/openmp/35
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index e6e33e7a9a280..92fc9381a5868 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -281,6 +281,9 @@ Non-comprehensive list of changes in this release
   allocator-level heap organization strategies. A feature to instrument all
   allocation functions with a token ID can be enabled via the
   ``-fsanitize=alloc-token`` flag.
+- A builtin ``__builtin_infer_alloc_token(<args>, ...)`` is provided to allow
+  compile-time querying of allocation token IDs, where the builtin arguments
+  mirror those normally passed to an allocation function.
 
 - Clang now rejects the invalid use of ``constexpr`` with ``auto`` and an explicit type in C. (#GH163090)
 
@@ -448,6 +451,7 @@ Bug Fixes to Attribute Support
   ``[[gnu::error("some error")]]`` now correctly triggers an error. (#GH146520)
 - Fix a crash when the function name is empty in the `swift_name` attribute. (#GH157075)
 - Fixes crashes or missing diagnostics with the `device_kernel` attribute. (#GH161905)
+- Fix handling of parameter indexes when an attribute is applied to a C++23 explicit object member function.
 
 Bug Fixes to C++ Support
 ^^^^^^^^^^^^^^^^^^^^^^^^
@@ -610,6 +614,14 @@ clang-format
   literals.
 - Add ``Leave`` suboption to ``IndentPPDirectives``.
 - Add ``AllowBreakBeforeQtProperty`` option.
+- Add ``BreakAfterOpenBracketBracedList'', ``BreakAfterOpenBracketFunction'',
+  ``BreakAfterOpenBracketIf``, ``BreakAfterOpenBracketLoop``,
+  ``BreakAfterOpenBracketSwitch``, ``BreakBeforeCloseBracketBracedList'',
+  ``BreakBeforeCloseBracketFunction``, ``BreakBeforeCloseBracketIf``,
+  ``BreakBeforeCloseBracketLoop``, ``BreakBeforeCloseBracketSwitch`` options.
+- Deprecate ``AlwaysBreak`` and ``BlockIndent`` suboptions from the
+  ``AlignAfterOpenBracket`` option, and make ``AlignAfterOpenBracket`` a
+  ``bool`` type.
 
 libclang
 --------
@@ -648,6 +660,7 @@ Sanitizers
 
 Python Binding Changes
 ----------------------
+- Exposed ``clang_Cursor_isFunctionInlined``.
 - Exposed ``clang_getCursorLanguage`` via ``Cursor.language``.
 - Add all missing ``CursorKind``s, ``TypeKind``s and
   ``ExceptionSpecificationKind``s from ``Index.h``
@@ -658,6 +671,7 @@ OpenMP Support
   modifier in the ``adjust_args`` clause.
 - Allow array length to be omitted in array section subscript expression.
 - Fixed non-contiguous strided update in the ``omp target update`` directive with the ``from`` clause.
+- Added support for threadset clause in task and taskloop directives.
 - Properly handle array section/assumed-size array privatization in C/C++.
 - Added support to handle new syntax of the ``uses_allocators`` clause.
 - Added support for ``variable-category`` modifier in ``default clause``.
diff --git a/clang/docs/ThreadSafetyAnalysis.rst b/clang/docs/ThreadSafetyAnalysis.rst
index 853a8fae4a907..d0f96f58dac17 100644
--- a/clang/docs/ThreadSafetyAnalysis.rst
+++ b/clang/docs/ThreadSafetyAnalysis.rst
@@ -118,7 +118,7 @@ require exclusive access, while read operations require only shared access.
 At any given moment during program execution, a thread holds a specific set of
 capabilities (e.g. the set of mutexes that it has locked.)  These act like keys
 or tokens that allow the thread to access a given resource.  Just like physical
-security keys, a thread cannot make copy of a capability, nor can it destroy
+security keys, a thread cannot make a copy of a capability, nor can it destroy
 one.  A thread can only release a capability to another thread, or acquire one
 from another thread.  The annotations are deliberately agnostic about the
 exact mechanism used to acquire and release capabilities; it assumes that the
@@ -131,7 +131,7 @@ by calculating an approximation of that set, called the *capability
 environment*.  The capability environment is calculated for every program point,
 and describes the set of capabilities that are statically known to be held, or
 not held, at that particular point.  This environment is a conservative
-approximation of the full set of capabilities that will actually held by a
+approximation of the full set of capabilities that will actually be held by a
 thread at run-time.
 
 
@@ -369,7 +369,7 @@ thread-safe, but too complicated for the analysis to understand.  Reasons for
     void unsafeIncrement() NO_THREAD_SAFETY_ANALYSIS { a++; }
   };
 
-Unlike the other attributes, NO_THREAD_SAFETY_ANALYSIS is not part of the
+Unlike the other attributes, ``NO_THREAD_SAFETY_ANALYSIS`` is not part of the
 interface of a function, and should thus be placed on the function definition
 (in the ``.cc`` or ``.cpp`` file) rather than on the function declaration
 (in the header).
@@ -509,7 +509,7 @@ ASSERT_CAPABILITY(...) and ASSERT_SHARED_CAPABILITY(...)
 *Previously:*  ``ASSERT_EXCLUSIVE_LOCK``, ``ASSERT_SHARED_LOCK``
 
 These are attributes on a function or method which asserts the calling thread
-already holds the given capability, for example by performing a run-time test
+already holds the given capability, for example, by performing a run-time test
 and terminating if the capability is not held.  Presence of this annotation
 causes the analysis to assume the capability is held after calls to the
 annotated function.  See :ref:`mutexheader`, below, for example uses.
@@ -554,19 +554,19 @@ Negative Capabilities
 =====================
 
 Thread Safety Analysis is designed to prevent both race conditions and
-deadlock.  The GUARDED_BY and REQUIRES attributes prevent race conditions, by
+deadlock.  The ``GUARDED_BY`` and ``REQUIRES`` attributes prevent race conditions, by
 ensuring that a capability is held before reading or writing to guarded data,
-and the EXCLUDES attribute prevents deadlock, by making sure that a mutex is
+and the ``EXCLUDES`` attribute prevents deadlock, by making sure that a mutex is
 *not* held.
 
-However, EXCLUDES is an optional attribute, and does not provide the same
-safety guarantee as REQUIRES.  In particular:
+However, ``EXCLUDES`` is an optional attribute, and does not provide the same
+safety guarantee as ``REQUIRES``.  In particular:
 
   * A function which acquires a capability does not have to exclude it.
   * A function which calls a function that excludes a capability does not
-    have transitively exclude that capability.
+    have to transitively exclude that capability.
 
-As a result, EXCLUDES can easily produce false negatives:
+As a result, ``EXCLUDES`` can easily produce false negatives:
 
 .. code-block:: c++
 
@@ -594,8 +594,8 @@ As a result, EXCLUDES can easily produce false negatives:
   };
 
 
-Negative requirements are an alternative EXCLUDES that provide
-a stronger safety guarantee.  A negative requirement uses the  REQUIRES
+Negative requirements are an alternative to ``EXCLUDES`` that provide
+a stronger safety guarantee.  A negative requirement uses the  ``REQUIRES``
 attribute, in conjunction with the ``!`` operator, to indicate that a capability
 should *not* be held.
 
@@ -642,7 +642,7 @@ Frequently Asked Questions
 
 (A) Attributes are part of the formal interface of a function, and should
 always go in the header, where they are visible to anything that includes
-the header.  Attributes in the .cpp file are not visible outside of the
+the header.  Attributes in the ``.cpp`` file are not visible outside of the
 immediate translation unit, which leads to false negatives and false positives.
 
 
@@ -684,7 +684,7 @@ Private Mutexes
 ---------------
 
 Good software engineering practice dictates that mutexes should be private
-members, because the locking mechanism used by a thread-safe class is part of
+members because the locking mechanism used by a thread-safe class is part of
 its internal implementation.  However, private mutexes can sometimes leak into
 the public interface of a class.
 Thread safety attributes follow normal C++ access restrictions, so if ``mu``
diff --git a/clang/docs/tools/dump_ast_matchers.py b/clang/docs/tools/dump_ast_matchers.py
index 46b7bb718ba08..5db6826070934 100755
--- a/clang/docs/tools/dump_ast_matchers.py
+++ b/clang/docs/tools/dump_ast_matchers.py
@@ -6,11 +6,8 @@
 import collections
 import re
 import os
+from urllib.request import urlopen
 
-try:
-    from urllib.request import urlopen
-except ImportError:
-    from urllib2 import urlopen
 
 CLASS_INDEX_PAGE_URL = "https://clang.llvm.org/doxygen/classes.html"
 try:
diff --git a/clang/include/clang/AST/Attr.h b/clang/include/clang/AST/Attr.h
index ce273c167aa22..14d7caa0e16d7 100644
--- a/clang/include/clang/AST/Attr.h
+++ b/clang/include/clang/AST/Attr.h
@@ -16,6 +16,7 @@
 #include "clang/AST/ASTFwd.h"
 #include "clang/AST/AttrIterator.h"
 #include "clang/AST/Decl.h"
+#include "clang/AST/DeclCXX.h"
 #include "clang/AST/Type.h"
 #include "clang/Basic/AttrKinds.h"
 #include "clang/Basic/AttributeCommonInfo.h"
@@ -327,8 +328,8 @@ class ParamIdx {
   ParamIdx(unsigned Idx, const Decl *D)
       : Idx(Idx), HasThis(false), IsValid(true) {
     assert(Idx >= 1 && "Idx must be one-origin");
-    if (const auto *FD = dyn_cast<FunctionDecl>(D))
-      HasThis = FD->isCXXInstanceMember();
+    if (const auto *MethodDecl = dyn_cast<CXXMethodDecl>(D))
+      HasThis = MethodDecl->isImplicitObjectMemberFunction();
   }
 
   /// A type into which \c ParamIdx can be serialized.
diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h
index bc791e46e7c92..4f507485968cd 100644
--- a/clang/include/clang/AST/OpenMPClause.h
+++ b/clang/include/clang/AST/OpenMPClause.h
@@ -1424,6 +1424,86 @@ class OMPDefaultClause : public OMPClause {
   }
 };
 
+/// This represents 'threadset' clause in the '#pragma omp task ...' directive.
+///
+/// \code
+/// #pragma omp task threadset(omp_pool)
+/// \endcode
+/// In this example directive '#pragma omp task' has simple 'threadset'
+/// clause with kind 'omp_pool'.
+class OMPThreadsetClause final : public OMPClause {
+  friend class OMPClauseReader;
+
+  /// Location of '('.
+  SourceLocation LParenLoc;
+
+  /// A kind of the 'threadset' clause.
+  OpenMPThreadsetKind Kind = OMPC_THREADSET_unknown;
+
+  /// Start location of the kind in source code.
+  SourceLocation KindLoc;
+
+  /// Set kind of the clauses.
+  ///
+  /// \param K Argument of clause.
+  void setThreadsetKind(OpenMPThreadsetKind K) { Kind = K; }
+
+  /// Set argument location.
+  ///
+  /// \param KLoc Argument location.
+  void setThreadsetKindLoc(SourceLocation KLoc) { KindLoc = KLoc; }
+
+public:
+  /// Build 'threadset' clause with argument \a A ('omp_team' or 'omp_pool').
+  ///
+  /// \param A Argument of the clause ('omp_team' or 'omp_pool').
+  /// \param ALoc Starting location of the argument.
+  /// \param StartLoc Starting location of the clause.
+  /// \param LParenLoc Location of '('.
+  /// \param EndLoc Ending location of the clause.
+  OMPThreadsetClause(OpenMPThreadsetKind A, SourceLocation ALoc,
+                     SourceLocation StartLoc, SourceLocation LParenLoc,
+                     SourceLocation EndLoc)
+      : OMPClause(llvm::omp::OMPC_threadset, StartLoc, EndLoc),
+        LParenLoc(LParenLoc), Kind(A), KindLoc(ALoc) {}
+
+  /// Build an empty clause.
+  OMPThreadsetClause()
+      : OMPClause(llvm::omp::OMPC_threadset, SourceLocation(),
+                  SourceLocation()) {}
+
+  /// Sets the location of '('.
+  void setLParenLoc(SourceLocation Loc) { LParenLoc = Loc; }
+
+  /// Returns the location of '('.
+  SourceLocation getLParenLoc() const { return LParenLoc; }
+
+  /// Returns kind of the clause.
+  OpenMPThreadsetKind getThreadsetKind() const { return Kind; }
+
+  /// Returns location of clause kind.
+  SourceLocation getThreadsetKindLoc() const { return KindLoc; }
+
+  child_range children() {
+    return child_range(child_iterator(), child_iterator());
+  }
+
+  const_child_range children() const {
+    return const_child_range(const_child_iterator(), const_child_iterator());
+  }
+
+  child_range used_children() {
+    return child_range(child_iterator(), child_iterator());
+  }
+  const_child_range used_children() const {
+    return const_child_range(const_child_iterator(), const_child_iterator());
+  }
+
+  static bool classof(const OMPClause *T) {
+    return T->getClauseKind() == llvm::omp::OMPC_threadset;
+  }
+};
+
 /// This represents 'proc_bind' clause in the '#pragma omp ...'
 /// directive.
 ///
diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h
index 32b2b6bdb989c..8cb0a657023b4 100644
--- a/clang/include/clang/AST/RecursiveASTVisitor.h
+++ b/clang/include/clang/AST/RecursiveASTVisitor.h
@@ -3523,6 +3523,12 @@ bool RecursiveASTVisitor<Derived>::VisitOMPDefaultClause(OMPDefaultClause *) {
   return true;
 }
 
+template <typename Derived>
+bool RecursiveASTVisitor<Derived>::VisitOMPThreadsetClause(
+    OMPThreadsetClause *) {
+  return true;
+}
+
 template <typename Derived>
 bool RecursiveASTVisitor<Derived>::VisitOMPProcBindClause(OMPProcBindClause *) {
   return true;
diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h
index 91ffbb169f947..e5ac4ca0d01c0 100644
--- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h
+++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h
@@ -23,7 +23,10 @@
 #include "clang/Analysis/Analyses/LifetimeSafety/Facts.h"
 #include "clang/Analysis/Analyses/LifetimeSafety/LiveOrigins.h"
 #include "clang/Analysis/Analyses/LifetimeSafety/LoanPropagation.h"
+#include "clang/Analysis/Analyses/LifetimeSafety/Origins.h"
 #include "clang/Analysis/AnalysisDeclContext.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/Support/raw_ostream.h"
 
 namespace clang::lifetimes {
 
@@ -73,7 +76,12 @@ class LifetimeSafetyAnalysis {
   LiveOriginsAnalysis &getLiveOrigins() const { return *LiveOrigins; }
   FactManager &getFactManager() { return FactMgr; }
 
+  static void PrintStats(llvm::raw_ostream& OS);
+
+  static void UpdateMissingOriginCount(const OriginManager& OM);
+
 private:
+  static llvm::StringMap<int> MissingOriginMap;
   AnalysisDeclContext &AC;
   LifetimeSafetyReporter *Reporter;
   LifetimeFactory Factory;
diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h
index ba138b078b379..231cc60b7e097 100644
--- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h
+++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h
@@ -16,7 +16,10 @@
 
 #include "clang/AST/Decl.h"
 #include "clang/AST/Expr.h"
+#include "clang/AST/TypeBase.h"
 #include "clang/Analysis/Analyses/LifetimeSafety/Utils.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/Support/raw_ostream.h"
 
 namespace clang::lifetimes::internal {
 
@@ -76,6 +79,8 @@ class OriginManager {
 
   void dump(OriginID OID, llvm::raw_ostream &OS) const;
 
+  const llvm::StringMap<int> getMissingOrigins() const;
+
 private:
   OriginID getNextOriginID() { return NextOriginID++; }
 
@@ -85,6 +90,7 @@ class OriginManager {
   llvm::SmallVector<Origin> AllOrigins;
   llvm::DenseMap<const clang::ValueDecl *, OriginID> DeclToOriginID;
   llvm::DenseMap<const clang::Expr *, OriginID> ExprToOriginID;
+  llvm::StringMap<int> ExprTypeToMissingOriginCount;
 };
 } // namespace clang::lifetimes::internal
 
diff --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def
index b856ad145824d..3a5b72e20afab 100644
--- a/clang/include/clang/Basic/Builtins.def
+++ b/clang/include/clang/Basic/Builtins.def
@@ -43,6 +43,7 @@
 //  SJ -> sigjmp_buf
 //  K -> ucontext_t
 //  p -> pid_t
+//  e -> _Float16 for HIP/C++ and __fp16 for OpenCL
 //  . -> "...".  This may only occur at the end of the function list.
 //
 // Types may be prefixed with the following modifiers:
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index a2c202158522f..2b400b012d6ed 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -5030,6 +5030,12 @@ def HLSLWaveActiveMax : LangBuiltin<"HLSL_LANG"> {
   let Prototype = "void (...)";
 }
 
+def HLSLWaveActiveMin : LangBuiltin<"HLSL_LANG"> {
+  let Spellings = ["__builtin_hlsl_wave_active_min"];
+  let Attributes = [NoThrow, Const];
+  let Prototype = "void (...)";
+}
+
 def HLSLWaveActiveSum : LangBuiltin<"HLSL_LANG"> {
   let Spellings = ["__builtin_hlsl_wave_active_sum"];
   let Attributes = [NoThrow, Const];
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index f265d82efee75..36cb527a9c806 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -967,6 +967,47 @@ TARGET_BUILTIN(__builtin_amdgcn_image_sample_3d_v4f32_f32, "V4fifffQtV4ibii", "n
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_3d_v4f16_f32, "V4hifffQtV4ibii", "nc", "image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_cube_v4f32_f32, "V4fifffQtV4ibii", "nc", "image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_cube_v4f16_f32, "V4hifffQtV4ibii", "nc", "image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_1d_v4f32_f32, "V4fifQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_1d_v4f16_f32, "V4eifQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_1darray_v4f32_f32, "V4fiffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_1darray_v4f16_f32, "V4eiffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_2d_f32_f32, "fiffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_2d_v4f32_f32, "V4fiffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_2d_v4f16_f32, "V4eiffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_2darray_f32_f32, "fifffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_2darray_v4f32_f32, "V4fifffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_2darray_v4f16_f32, "V4eifffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_3d_v4f32_f32, "V4fifffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_3d_v4f16_f32, "V4eifffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_cube_v4f32_f32, "V4fifffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_cube_v4f16_f32, "V4eifffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_1d_v4f32_f32, "V4fiffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_1d_v4f16_f32, "V4eiffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_1darray_v4f32_f32, "V4fifffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_1darray_v4f16_f32, "V4eifffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_2d_f32_f32, "fifffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_2d_v4f32_f32, "V4fifffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_2d_v4f16_f32, "V4eifffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_2darray_f32_f32, "fiffffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_2darray_v4f32_f32, "V4fiffffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_2darray_v4f16_f32, "V4eiffffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_3d_v4f32_f32, "V4fiffffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_3d_v4f16_f32, "V4eiffffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_cube_v4f32_f32, "V4fiffffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_cube_v4f16_f32, "V4eiffffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_1d_v4f32_f32, "V4fifffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_1d_v4f16_f32, "V4eifffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_1darray_v4f32_f32, "V4fiffffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_1darray_v4f16_f32, "V4eiffffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_2d_f32_f32, "fiffffffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_2d_v4f32_f32, "V4fiffffffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_2d_v4f16_f32, "V4eiffffffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_2darray_f32_f32, "fifffffffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_2darray_v4f32_f32, "V4fifffffffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_2darray_v4f16_f32, "V4eifffffffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_3d_v4f32_f32, "V4fifffffffffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_3d_v4f16_f32, "V4eifffffffffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_gather4_lz_2d_v4f32_f32, "V4fiffQtV4ibii", "nc", "extended-image-insts")
 
 #undef BUILTIN
 #undef TARGET_BUILTIN
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 0c85e280e748b..9e877b92eac68 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -328,7 +328,6 @@ let Features = "ssse3", Attributes = [NoThrow, Const, Constexpr, RequiredVectorW
 }
 
 let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
-  def insertps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant char)">;
   def roundps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int)">;
   def roundss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
   def roundsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
@@ -342,6 +341,8 @@ let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>]
 
 let Features = "sse4.1",
     Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
+  def insertps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, "
+                               "_Vector<4, float>, _Constant char)">;
   def ptestz128
       : X86Builtin<"int(_Vector<2, long long int>, _Vector<2, long long int>)">;
   def ptestc128
@@ -1282,81 +1283,99 @@ let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr] in {
   def knotdi : X86Builtin<"unsigned long long int(unsigned long long int)">;
 }
 
-let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512vl,avx512bw",
+    Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
   def cmpb128_mask : X86Builtin<"unsigned short(_Vector<16, char>, _Vector<16, char>, _Constant int, unsigned short)">;
 }
 
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512vl",
+    Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
   def cmpd128_mask : X86Builtin<"unsigned char(_Vector<4, int>, _Vector<4, int>, _Constant int, unsigned char)">;
   def cmpq128_mask : X86Builtin<"unsigned char(_Vector<2, long long int>, _Vector<2, long long int>, _Constant int, unsigned char)">;
 }
 
-let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512vl,avx512bw",
+    Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
   def cmpw128_mask : X86Builtin<"unsigned char(_Vector<8, short>, _Vector<8, short>, _Constant int, unsigned char)">;
 }
 
-let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512vl,avx512bw",
+    Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
   def cmpb256_mask : X86Builtin<"unsigned int(_Vector<32, char>, _Vector<32, char>, _Constant int, unsigned int)">;
 }
 
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512vl",
+    Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
   def cmpd256_mask : X86Builtin<"unsigned char(_Vector<8, int>, _Vector<8, int>, _Constant int, unsigned char)">;
   def cmpq256_mask : X86Builtin<"unsigned char(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int, unsigned char)">;
 }
 
-let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512vl,avx512bw",
+    Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
   def cmpw256_mask : X86Builtin<"unsigned short(_Vector<16, short>, _Vector<16, short>, _Constant int, unsigned short)">;
 }
 
-let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512bw",
+    Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
   def cmpb512_mask : X86Builtin<"unsigned long long int(_Vector<64, char>, _Vector<64, char>, _Constant int, unsigned long long int)">;
 }
 
-let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512f",
+    Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
   def cmpd512_mask : X86Builtin<"unsigned short(_Vector<16, int>, _Vector<16, int>, _Constant int, unsigned short)">;
   def cmpq512_mask : X86Builtin<"unsigned char(_Vector<8, long long int>, _Vector<8, long long int>, _Constant int, unsigned char)">;
 }
 
-let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512bw",
+    Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
   def cmpw512_mask : X86Builtin<"unsigned int(_Vector<32, short>, _Vector<32, short>, _Constant int, unsigned int)">;
 }
 
-let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512vl,avx512bw",
+    Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
   def ucmpb128_mask : X86Builtin<"unsigned short(_Vector<16, char>, _Vector<16, char>, _Constant int, unsigned short)">;
 }
 
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512vl",
+    Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
   def ucmpd128_mask : X86Builtin<"unsigned char(_Vector<4, int>, _Vector<4, int>, _Constant int, unsigned char)">;
   def ucmpq128_mask : X86Builtin<"unsigned char(_Vector<2, long long int>, _Vector<2, long long int>, _Constant int, unsigned char)">;
 }
 
-let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512vl,avx512bw",
+    Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
   def ucmpw128_mask : X86Builtin<"unsigned char(_Vector<8, short>, _Vector<8, short>, _Constant int, unsigned char)">;
 }
 
-let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512vl,avx512bw",
+    Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
   def ucmpb256_mask : X86Builtin<"unsigned int(_Vector<32, char>, _Vector<32, char>, _Constant int, unsigned int)">;
 }
 
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512vl",
+    Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
   def ucmpd256_mask : X86Builtin<"unsigned char(_Vector<8, int>, _Vector<8, int>, _Constant int, unsigned char)">;
   def ucmpq256_mask : X86Builtin<"unsigned char(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int, unsigned char)">;
 }
 
-let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512vl,avx512bw",
+    Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
   def ucmpw256_mask : X86Builtin<"unsigned short(_Vector<16, short>, _Vector<16, short>, _Constant int, unsigned short)">;
 }
 
-let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512bw",
+    Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
   def ucmpb512_mask : X86Builtin<"unsigned long long int(_Vector<64, char>, _Vector<64, char>, _Constant int, unsigned long long int)">;
 }
 
-let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512f",
+    Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
   def ucmpd512_mask : X86Builtin<"unsigned short(_Vector<16, int>, _Vector<16, int>, _Constant int, unsigned short)">;
   def ucmpq512_mask : X86Builtin<"unsigned char(_Vector<8, long long int>, _Vector<8, long long int>, _Constant int, unsigned char)">;
 }
 
-let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512bw",
+    Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
   def ucmpw512_mask : X86Builtin<"unsigned int(_Vector<32, short>, _Vector<32, short>, _Constant int, unsigned int)">;
 }
 
diff --git a/clang/include/clang/Basic/BuiltinsX86_64.td b/clang/include/clang/Basic/BuiltinsX86_64.td
index 275278c5ac089..062060e6afbbe 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.td
+++ b/clang/include/clang/Basic/BuiltinsX86_64.td
@@ -239,57 +239,6 @@ let Features = "amx-complex", Attributes = [NoThrow] in {
   def tcmmrlfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
 }
 
-let Features = "amx-transpose", Attributes = [NoThrow] in {
-  def t2rpntlvwz0_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
-}
-
-let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in {
-  def t2rpntlvwz0rs_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
-}
-
-let Features = "amx-transpose", Attributes = [NoThrow] in {
-  def t2rpntlvwz0t1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
-}
-
-let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in {
-  def t2rpntlvwz0rst1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
-}
-
-let Features = "amx-transpose", Attributes = [NoThrow] in {
-  def t2rpntlvwz1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
-}
-
-let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in {
-  def t2rpntlvwz1rs_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
-}
-
-let Features = "amx-transpose", Attributes = [NoThrow] in {
-  def t2rpntlvwz1t1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
-}
-
-let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in {
-  def t2rpntlvwz1rst1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
-}
-
-let Features = "amx-transpose", Attributes = [NoThrow] in {
-  def ttransposed_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, _Vector<256, int>)">;
-}
-
-let Features = "amx-bf16,amx-transpose", Attributes = [NoThrow] in {
-  def ttdpbf16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
-}
-
-let Features = "amx-fp16,amx-transpose", Attributes = [NoThrow] in {
-  def ttdpfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
-}
-
-let Features = "amx-complex,amx-transpose", Attributes = [NoThrow] in {
-  def ttcmmimfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
-  def ttcmmrlfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
-  def tconjtcmmimfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
-  def tconjtfp16_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, _Vector<256, int>)">;
-}
-
 let Features = "amx-avx512,avx10.2", Attributes = [NoThrow] in {
   def tcvtrowd2ps_internal : X86Builtin<"_Vector<16, float>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">;
   def tcvtrowps2bf16h_internal : X86Builtin<"_Vector<32, __bf16>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">;
@@ -303,10 +252,6 @@ let Features = "amx-tf32", Attributes = [NoThrow] in {
   def tmmultf32ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
 }
 
-let Features = "amx-tf32,amx-transpose", Attributes = [NoThrow] in {
-  def ttmmultf32ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
-}
-
 let Features = "amx-fp8", Attributes = [NoThrow] in {
   def tdpbf8ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
   def tdpbhf8ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
@@ -321,13 +266,6 @@ let Features = "amx-tile", Attributes = [NoThrow] in {
   def tilezero : X86Builtin<"void(unsigned char)">;
 }
 
-let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in {
-  def t2rpntlvwz0rs : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
-  def t2rpntlvwz0rst1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
-  def t2rpntlvwz1rs : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
-  def t2rpntlvwz1rst1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
-}
-
 let Features = "amx-movrs", Attributes = [NoThrow] in {
   def tileloaddrs64 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
   def tileloaddrst164 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
@@ -359,29 +297,6 @@ let Features = "amx-complex", Attributes = [NoThrow] in {
   def tcmmrlfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
 }
 
-let Features = "amx-transpose", Attributes = [NoThrow] in {
-  def t2rpntlvwz0 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
-  def t2rpntlvwz0t1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
-  def t2rpntlvwz1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
-  def t2rpntlvwz1t1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
-  def ttransposed : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char)">;
-}
-
-let Features = "amx-bf16,amx-transpose", Attributes = [NoThrow] in {
-  def ttdpbf16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
-}
-
-let Features = "amx-fp16,amx-transpose", Attributes = [NoThrow] in {
-  def ttdpfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
-}
-
-let Features = "amx-complex,amx-transpose", Attributes = [NoThrow] in {
-  def ttcmmimfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
-  def ttcmmrlfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
-  def tconjtcmmimfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
-  def tconjtfp16 : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char)">;
-}
-
 let Features = "amx-avx512,avx10.2", Attributes = [NoThrow] in {
   def tcvtrowd2ps : X86Builtin<"_Vector<16, float>(_Constant unsigned char, unsigned int)">;
   def tcvtrowps2bf16h : X86Builtin<"_Vector<32, __bf16>(_Constant unsigned char, unsigned int)">;
@@ -406,10 +321,6 @@ let Features = "amx-tf32", Attributes = [NoThrow] in {
   def tmmultf32ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
 }
 
-let Features = "amx-tf32,amx-transpose", Attributes = [NoThrow] in {
-  def ttmmultf32ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
-}
-
 let Features = "prefetchi", Attributes = [NoThrow, Const] in {
   def prefetchi : X86Builtin<"void(void const *, unsigned int)">;
 }
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 8d6b8a14740ce..d3cca82b4bdff 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -216,6 +216,7 @@ LANGOPT(OpenCLGenericAddressSpace, 1, 0, NotCompatible, "OpenCL generic keyword"
 LANGOPT(OpenCLPipes              , 1, 0, NotCompatible, "OpenCL pipes language constructs and built-ins")
 LANGOPT(NativeHalfType    , 1, 0, NotCompatible, "Native half type support")
 LANGOPT(NativeHalfArgsAndReturns, 1, 0, NotCompatible, "Native half args and returns")
+LANGOPT(NativeInt16Type   , 1, 1, NotCompatible, "Native int 16 type support")
 LANGOPT(CUDA              , 1, 0, NotCompatible, "CUDA")
 LANGOPT(HIP               , 1, 0, NotCompatible, "HIP")
 LANGOPT(OpenMP            , 32, 0, NotCompatible, "OpenMP support and version of OpenMP (31, 40 or 45)")
diff --git a/clang/include/clang/Basic/OpenMPKinds.def b/clang/include/clang/Basic/OpenMPKinds.def
index 202d06fa1fcaa..328a0747a82a8 100644
--- a/clang/include/clang/Basic/OpenMPKinds.def
+++ b/clang/include/clang/Basic/OpenMPKinds.def
@@ -98,6 +98,9 @@
 #ifndef OPENMP_ALLOCATE_MODIFIER
 #define OPENMP_ALLOCATE_MODIFIER(Name)
 #endif
+#ifndef OPENMP_THREADSET_KIND
+#define OPENMP_THREADSET_KIND(Name)
+#endif
 
 // Static attributes for 'schedule' clause.
 OPENMP_SCHEDULE_KIND(static)
@@ -255,6 +258,9 @@ OPENMP_DOACROSS_MODIFIER(sink)
 OPENMP_DOACROSS_MODIFIER(sink_omp_cur_iteration)
 OPENMP_DOACROSS_MODIFIER(source_omp_cur_iteration)
 
+OPENMP_THREADSET_KIND(omp_pool)
+OPENMP_THREADSET_KIND(omp_team)
+
 #undef OPENMP_NUMTASKS_MODIFIER
 #undef OPENMP_NUMTHREADS_MODIFIER
 #undef OPENMP_GRAINSIZE_MODIFIER
@@ -284,4 +290,4 @@ OPENMP_DOACROSS_MODIFIER(source_omp_cur_iteration)
 #undef OPENMP_DEFAULTMAP_MODIFIER
 #undef OPENMP_DOACROSS_MODIFIER
 #undef OPENMP_ALLOCATE_MODIFIER
-
+#undef OPENMP_THREADSET_KIND
diff --git a/clang/include/clang/Basic/OpenMPKinds.h b/clang/include/clang/Basic/OpenMPKinds.h
index ed89a31e2684b..c9ddbcd6d46c1 100644
--- a/clang/include/clang/Basic/OpenMPKinds.h
+++ b/clang/include/clang/Basic/OpenMPKinds.h
@@ -250,6 +250,13 @@ enum OpenMPAllocateClauseModifier {
   OMPC_ALLOCATE_unknown
 };
 
+/// OpenMP modifiers for 'threadset' clause.
+enum OpenMPThreadsetKind {
+#define OPENMP_THREADSET_KIND(Name) OMPC_THREADSET_##Name,
+#include "clang/Basic/OpenMPKinds.def"
+  OMPC_THREADSET_unknown
+};
+
 /// Number of allowed allocate-modifiers.
 static constexpr unsigned NumberOfOMPAllocateClauseModifiers =
     OMPC_ALLOCATE_unknown;
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 8784c9d7d206d..6e1c9425d8d75 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -5999,7 +5999,6 @@ def nofixprebinding : Flag<["-"], "nofixprebinding">;
 def nolibc : Flag<["-"], "nolibc">;
 def nomultidefs : Flag<["-"], "nomultidefs">;
 def nopie : Flag<["-"], "nopie">, Visibility<[ClangOption, FlangOption]>, Flags<[TargetSpecific]>; // OpenBSD
-def no_pie : Flag<["-"], "no-pie">, Visibility<[ClangOption, FlangOption]>;
 def noprebind : Flag<["-"], "noprebind">;
 def noprofilelib : Flag<["-"], "noprofilelib">;
 def noseglinkedit : Flag<["-"], "noseglinkedit">;
@@ -6113,7 +6112,6 @@ defm pthread : BoolOption<"", "pthread",
   PosFlag<SetTrue, [], [ClangOption], "Support POSIX threads in generated code">,
   NegFlag<SetFalse>,
   BothFlags<[], [ClangOption, CC1Option, FlangOption, FC1Option]>>;
-def pie : Flag<["-"], "pie">, Group<Link_Group>;
 def static_pie : Flag<["-"], "static-pie">, Group<Link_Group>;
 def read__only__relocs : Separate<["-"], "read_only_relocs">;
 def remap : Flag<["-"], "remap">;
@@ -6508,6 +6506,8 @@ def fpic : Flag<["-"], "fpic">, Group<f_Group>;
 def fno_pic : Flag<["-"], "fno-pic">, Group<f_Group>;
 def fpie : Flag<["-"], "fpie">, Group<f_Group>;
 def fno_pie : Flag<["-"], "fno-pie">, Group<f_Group>;
+def pie : Flag<["-"], "pie">, Group<Link_Group>;
+def no_pie : Flag<["-"], "no-pie">, Group<Link_Group>;
 
 } // let Vis = [Default, FlangOption]
 
@@ -6695,8 +6695,6 @@ def mamx_tf32 : Flag<["-"], "mamx-tf32">, Group<m_x86_Features_Group>;
 def mno_amx_tf32 : Flag<["-"], "mno-amx-tf32">, Group<m_x86_Features_Group>;
 def mamx_tile : Flag<["-"], "mamx-tile">, Group<m_x86_Features_Group>;
 def mno_amx_tile : Flag<["-"], "mno-amx-tile">, Group<m_x86_Features_Group>;
-def mamx_transpose : Flag<["-"], "mamx-transpose">, Group<m_x86_Features_Group>;
-def mno_amx_transpose : Flag<["-"], "mno-amx-transpose">, Group<m_x86_Features_Group>;
 def mamx_movrs: Flag<["-"], "mamx-movrs">, Group<m_x86_Features_Group>;
 def mno_amx_movrs: Flag<["-"], "mno-amx-movrs">, Group<m_x86_Features_Group>;
 def mcmpccxadd : Flag<["-"], "mcmpccxadd">, Group<m_x86_Features_Group>;
@@ -8628,6 +8626,11 @@ def fobjc_subscripting_legacy_runtime : Flag<["-"], "fobjc-subscripting-legacy-r
 def vtordisp_mode_EQ : Joined<["-"], "vtordisp-mode=">,
   HelpText<"Control vtordisp placement on win32 targets">,
   MarshallingInfoInt<LangOpts<"VtorDispMode">, "1">;
+def fnative_int16_type : Flag<["-"], "fnative-int16-type">,
+  HelpText<"Use 16 bit integer types">,
+  // This option is implied unless we are in HLSL lang mode
+  ImpliedByAnyOf<[!strconcat("!", hlsl.KeyPath)]>,
+  MarshallingInfoFlag<LangOpts<"NativeInt16Type">>;
 def fnative_half_type: Flag<["-"], "fnative-half-type">,
   HelpText<"Use the native half type for __fp16 instead of promoting to float">,
   MarshallingInfoFlag<LangOpts<"NativeHalfType">>,
@@ -9520,7 +9523,7 @@ def emit_pristine_llvm : DXCFlag<"emit-pristine-llvm">,
   HelpText<"Emit pristine LLVM IR from the frontend by not running any LLVM passes at all."
            "Same as -S + -emit-llvm + -disable-llvm-passes.">;
 def fcgl : DXCFlag<"fcgl">, Alias<emit_pristine_llvm>;
-def enable_16bit_types : DXCFlag<"enable-16bit-types">, Alias<fnative_half_type>,
+def enable_16bit_types : DXCFlag<"enable-16bit-types">,
   HelpText<"Enable 16-bit types and disable min precision types."
            "Available in HLSL 2018 and shader model 6.2.">;
 def fdx_rootsignature_version :
diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h
index 2852c4a2916a4..f246defc1fe81 100644
--- a/clang/include/clang/Format/Format.h
+++ b/clang/include/clang/Format/Format.h
@@ -62,49 +62,28 @@ struct FormatStyle {
   /// \version 3.3
   int AccessModifierOffset;
 
-  /// Different styles for aligning after open brackets.
-  enum BracketAlignmentStyle : int8_t {
-    /// Align parameters on the open bracket, e.g.:
-    /// \code
-    ///   someLongFunction(argument1,
-    ///                    argument2);
-    /// \endcode
-    BAS_Align,
-    /// Don't align, instead use ``ContinuationIndentWidth``, e.g.:
-    /// \code
-    ///   someLongFunction(argument1,
-    ///       argument2);
-    /// \endcode
-    BAS_DontAlign,
-    /// Always break after an open bracket, if the parameters don't fit
-    /// on a single line, e.g.:
-    /// \code
-    ///   someLongFunction(
-    ///       argument1, argument2);
-    /// \endcode
-    BAS_AlwaysBreak,
-    /// Always break after an open bracket, if the parameters don't fit
-    /// on a single line. Closing brackets will be placed on a new line.
-    /// E.g.:
-    /// \code
-    ///   someLongFunction(
-    ///       argument1, argument2
-    ///   )
-    /// \endcode
-    ///
-    /// \note
-    ///  This currently only applies to braced initializer lists (when
-    ///  ``Cpp11BracedListStyle`` is not ``Block``) and parentheses.
-    /// \endnote
-    BAS_BlockIndent,
-  };
-
   /// If ``true``, horizontally aligns arguments after an open bracket.
   ///
+  /// \code
+  ///   true:                         vs.   false
+  ///   someLongFunction(argument1,         someLongFunction(argument1,
+  ///                    argument2);            argument2);
+  /// \endcode
+  ///
+  /// \note
+  ///   As of clang-format 22 this option is a bool with the previous
+  ///   option of ``Align`` replaced with ``true``, ``DontAlign`` replaced
+  ///   with ``false``, and the options of ``AlwaysBreak`` and ``BlockIndent``
+  ///   replaced with ``true`` and with setting of new style options using
+  ///   ``BreakAfterOpenBracketBracedList``, ``BreakAfterOpenBracketFunction``,
+  ///   ``BreakAfterOpenBracketIf``, ``BreakBeforeCloseBracketBracedList``,
+  ///   ``BreakBeforeCloseBracketFunction``, and ``BreakBeforeCloseBracketIf``.
+  /// \endnote
+  ///
   /// This applies to round brackets (parentheses), angle brackets and square
   /// brackets.
   /// \version 3.8
-  BracketAlignmentStyle AlignAfterOpenBracket;
+  bool AlignAfterOpenBracket;
 
   /// Different style for aligning array initializers.
   enum ArrayInitializerAlignmentStyle : int8_t {
@@ -1708,6 +1687,57 @@ struct FormatStyle {
   /// \version 16
   AttributeBreakingStyle BreakAfterAttributes;
 
+  /// Force break after the left bracket of a braced initializer list (when
+  /// ``Cpp11BracedListStyle`` is ``true``) when the list exceeds the column
+  /// limit.
+  /// \code
+  ///   true:                             false:
+  ///   vector<int> x {         vs.       vector<int> x {1,
+  ///      1, 2, 3}                            2, 3}
+  /// \endcode
+  /// \version 22
+  bool BreakAfterOpenBracketBracedList;
+
+  /// Force break after the left parenthesis of a function (declaration,
+  /// definition, call) when the parameters exceed the column limit.
+  /// \code
+  ///   true:                             false:
+  ///   foo (                   vs.       foo (a,
+  ///      a , b)                              b)
+  /// \endcode
+  /// \version 22
+  bool BreakAfterOpenBracketFunction;
+
+  /// Force break after the left parenthesis of an if control statement
+  /// when the expression exceeds the column limit.
+  /// \code
+  ///   true:                             false:
+  ///   if constexpr (          vs.       if constexpr (a ||
+  ///      a || b)                                      b)
+  /// \endcode
+  /// \version 22
+  bool BreakAfterOpenBracketIf;
+
+  /// Force break after the left parenthesis of a loop control statement
+  /// when the expression exceeds the column limit.
+  /// \code
+  ///   true:                             false:
+  ///   while (                  vs.      while (a &&
+  ///      a && b) {                             b) {
+  /// \endcode
+  /// \version 22
+  bool BreakAfterOpenBracketLoop;
+
+  /// Force break after the left parenthesis of a switch control statement
+  /// when the expression exceeds the column limit.
+  /// \code
+  ///   true:                             false:
+  ///   switch (                 vs.      switch (a +
+  ///      a + b) {                               b) {
+  /// \endcode
+  /// \version 22
+  bool BreakAfterOpenBracketSwitch;
+
   /// The function declaration return type breaking style to use.
   /// \version 19
   ReturnTypeBreakingStyle BreakAfterReturnType;
@@ -2221,6 +2251,69 @@ struct FormatStyle {
   /// \version 3.7
   BraceBreakingStyle BreakBeforeBraces;
 
+  /// Force break before the right bracket of a braced initializer list (when
+  /// ``Cpp11BracedListStyle`` is ``true``) when the list exceeds the column
+  /// limit. The break before the right bracket is only made if there is a
+  /// break after the opening bracket.
+  /// \code
+  ///   true:                             false:
+  ///   vector<int> x {         vs.       vector<int> x {
+  ///      1, 2, 3                           1, 2, 3}
+  ///   }
+  /// \endcode
+  /// \version 22
+  bool BreakBeforeCloseBracketBracedList;
+
+  /// Force break before the right parenthesis of a function (declaration,
+  /// definition, call) when the parameters exceed the column limit.
+  /// \code
+  ///   true:                             false:
+  ///   foo (                   vs.       foo (
+  ///      a , b                             a , b)
+  ///   )
+  /// \endcode
+  /// \version 22
+  bool BreakBeforeCloseBracketFunction;
+
+  /// Force break before the right parenthesis of an if control statement
+  /// when the expression exceeds the column limit. The break before the
+  /// closing parenthesis is only made if there is a break after the opening
+  /// parenthesis.
+  /// \code
+  ///   true:                             false:
+  ///   if constexpr (          vs.       if constexpr (
+  ///      a || b                            a || b )
+  ///   )
+  /// \endcode
+  /// \version 22
+  bool BreakBeforeCloseBracketIf;
+
+  /// Force break before the right parenthesis of a loop control statement
+  /// when the expression exceeds the column limit. The break before the
+  /// closing parenthesis is only made if there is a break after the opening
+  /// parenthesis.
+  /// \code
+  ///   true:                             false:
+  ///   while (                  vs.      while (
+  ///      a && b                            a && b) {
+  ///   ) {
+  /// \endcode
+  /// \version 22
+  bool BreakBeforeCloseBracketLoop;
+
+  /// Force break before the right parenthesis of a switch control statement
+  /// when the expression exceeds the column limit. The break before the
+  /// closing parenthesis is only made if there is a break after the opening
+  /// parenthesis.
+  /// \code
+  ///   true:                             false:
+  ///   switch (                 vs.      switch (
+  ///      a + b                             a + b) {
+  ///   ) {
+  /// \endcode
+  /// \version 22
+  bool BreakBeforeCloseBracketSwitch;
+
   /// Different ways to break before concept declarations.
   enum BreakBeforeConceptDeclarationsStyle : int8_t {
     /// Keep the template declaration line together with ``concept``.
@@ -5530,10 +5623,23 @@ struct FormatStyle {
            BreakAdjacentStringLiterals == R.BreakAdjacentStringLiterals &&
            BreakAfterAttributes == R.BreakAfterAttributes &&
            BreakAfterJavaFieldAnnotations == R.BreakAfterJavaFieldAnnotations &&
+           BreakAfterOpenBracketBracedList ==
+               R.BreakAfterOpenBracketBracedList &&
+           BreakAfterOpenBracketFunction == R.BreakAfterOpenBracketFunction &&
+           BreakAfterOpenBracketIf == R.BreakAfterOpenBracketIf &&
+           BreakAfterOpenBracketLoop == R.BreakAfterOpenBracketLoop &&
+           BreakAfterOpenBracketSwitch == R.BreakAfterOpenBracketSwitch &&
            BreakAfterReturnType == R.BreakAfterReturnType &&
            BreakArrays == R.BreakArrays &&
            BreakBeforeBinaryOperators == R.BreakBeforeBinaryOperators &&
            BreakBeforeBraces == R.BreakBeforeBraces &&
+           BreakBeforeCloseBracketBracedList ==
+               R.BreakBeforeCloseBracketBracedList &&
+           BreakBeforeCloseBracketFunction ==
+               R.BreakBeforeCloseBracketFunction &&
+           BreakBeforeCloseBracketIf == R.BreakBeforeCloseBracketIf &&
+           BreakBeforeCloseBracketLoop == R.BreakBeforeCloseBracketLoop &&
+           BreakBeforeCloseBracketSwitch == R.BreakBeforeCloseBracketSwitch &&
            BreakBeforeConceptDeclarations == R.BreakBeforeConceptDeclarations &&
            BreakBeforeInlineASMColon == R.BreakBeforeInlineASMColon &&
            BreakBeforeTemplateCloser == R.BreakBeforeTemplateCloser &&
diff --git a/clang/include/clang/Frontend/TextDiagnostic.h b/clang/include/clang/Frontend/TextDiagnostic.h
index e2e88d4d648a2..10028186d27f3 100644
--- a/clang/include/clang/Frontend/TextDiagnostic.h
+++ b/clang/include/clang/Frontend/TextDiagnostic.h
@@ -16,10 +16,12 @@
 #define LLVM_CLANG_FRONTEND_TEXTDIAGNOSTIC_H
 
 #include "clang/Frontend/DiagnosticRenderer.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/FormattedStream.h"
 
 namespace clang {
 
+using llvm::formatted_raw_ostream;
+
 /// Class to encapsulate the logic for formatting and printing a textual
 /// diagnostic message.
 ///
@@ -33,7 +35,7 @@ namespace clang {
 /// DiagnosticClient is implemented through this class as is diagnostic
 /// printing coming out of libclang.
 class TextDiagnostic : public DiagnosticRenderer {
-  raw_ostream &OS;
+  formatted_raw_ostream OS;
   const Preprocessor *PP;
 
 public:
@@ -47,7 +49,7 @@ class TextDiagnostic : public DiagnosticRenderer {
     unsigned End;
     enum llvm::raw_ostream::Colors Color;
     StyleRange(unsigned S, unsigned E, enum llvm::raw_ostream::Colors C)
-        : Start(S), End(E), Color(C){};
+        : Start(S), End(E), Color(C) {};
   };
 
   /// Print the diagonstic level to a raw_ostream.
diff --git a/clang/include/clang/Lex/PPEmbedParameters.h b/clang/include/clang/Lex/PPEmbedParameters.h
index c4fb8d02f6f35..41a69664df366 100644
--- a/clang/include/clang/Lex/PPEmbedParameters.h
+++ b/clang/include/clang/Lex/PPEmbedParameters.h
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// Defines all of the preprocessor directive parmeters for #embed
+// Defines all of the preprocessor directive parameters for #embed
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
index 0d2316f73fb62..dad8efd0f017f 100644
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -7677,7 +7677,7 @@ class Parser : public CodeCompletionHandler {
   /// [GNU] asm-clobbers:
   ///         asm-string-literal
   ///         asm-clobbers ',' asm-string-literal
-  /// \endverbatim 
+  /// \endverbatim
   ///
   StmtResult ParseAsmStatement(bool &msAsm);
 
diff --git a/clang/include/clang/Sema/Attr.h b/clang/include/clang/Sema/Attr.h
index 3f0b10212789a..5836231818eec 100644
--- a/clang/include/clang/Sema/Attr.h
+++ b/clang/include/clang/Sema/Attr.h
@@ -123,6 +123,12 @@ inline bool isInstanceMethod(const Decl *D) {
   return false;
 }
 
+inline bool hasImplicitObjectParameter(const Decl *D) {
+  if (const auto *MethodDecl = dyn_cast<CXXMethodDecl>(D))
+    return MethodDecl->isImplicitObjectMemberFunction();
+  return false;
+}
+
 /// Diagnose mutually exclusive attributes when present on a given
 /// declaration. Returns true if diagnosed.
 template <typename AttrTy>
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 52904c72d1cfc..c67ed99b1f49e 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -2608,13 +2608,13 @@ class Sema final : public SemaBase {
   };
 
   /// Given a function and its FormatAttr or FormatMatchesAttr info, attempts to
-  /// populate the FomatStringInfo parameter with the attribute's correct
+  /// populate the FormatStringInfo parameter with the attribute's correct
   /// format_idx and firstDataArg. Returns true when the format fits the
   /// function and the FormatStringInfo has been populated.
   static bool getFormatStringInfo(const Decl *Function, unsigned FormatIdx,
                                   unsigned FirstArg, FormatStringInfo *FSI);
   static bool getFormatStringInfo(unsigned FormatIdx, unsigned FirstArg,
-                                  bool IsCXXMember, bool IsVariadic,
+                                  bool HasImplicitThisParam, bool IsVariadic,
                                   FormatStringInfo *FSI);
 
   // Used by C++ template instantiation.
@@ -5119,7 +5119,7 @@ class Sema final : public SemaBase {
     // In C++ the implicit 'this' function parameter also counts.
     // Parameters are counted from one.
     bool HP = hasFunctionProto(D);
-    bool HasImplicitThisParam = isInstanceMethod(D);
+    bool HasImplicitThisParam = hasImplicitObjectParameter(D);
     bool IV = HP && isFunctionOrMethodVariadic(D);
     unsigned NumParams =
         (HP ? getFunctionOrMethodNumParams(D) : 0) + HasImplicitThisParam;
diff --git a/clang/include/clang/Sema/SemaOpenMP.h b/clang/include/clang/Sema/SemaOpenMP.h
index f9baeed03c347..ba12b403d9b9a 100644
--- a/clang/include/clang/Sema/SemaOpenMP.h
+++ b/clang/include/clang/Sema/SemaOpenMP.h
@@ -975,6 +975,12 @@ class SemaOpenMP : public SemaBase {
                            OpenMPDefaultClauseVariableCategory VCKind,
                            SourceLocation VCKindLoc, SourceLocation StartLoc,
                            SourceLocation LParenLoc, SourceLocation EndLoc);
+  /// Called on well-formed 'threadset' clause.
+  OMPClause *ActOnOpenMPThreadsetClause(OpenMPThreadsetKind Kind,
+                                        SourceLocation KindLoc,
+                                        SourceLocation StartLoc,
+                                        SourceLocation LParenLoc,
+                                        SourceLocation EndLoc);
   /// Called on well-formed 'proc_bind' clause.
   OMPClause *ActOnOpenMPProcBindClause(llvm::omp::ProcBindKind Kind,
                                        SourceLocation KindLoc,
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 687cd46773f43..2669f62456711 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -12403,6 +12403,11 @@ static QualType DecodeTypeFromStr(const char *&Str, const ASTContext &Context,
   // Read the base type.
   switch (*Str++) {
   default: llvm_unreachable("Unknown builtin type letter!");
+  case 'e':
+    assert(HowLong == 0 && !Signed && !Unsigned &&
+           "Bad modifiers used with 'e'!");
+    Type = Context.getLangOpts().OpenCL ? Context.HalfTy : Context.Float16Ty;
+    break;
   case 'x':
     assert(HowLong == 0 && !Signed && !Unsigned &&
            "Bad modifiers used with 'x'!");
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 8f23001ea5a39..8b57b963c538f 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -859,7 +859,7 @@ static bool interp__builtin_carryop(InterpState &S, CodePtr OpPC,
   APSInt RHS = popToAPSInt(S.Stk, RHST);
   APSInt LHS = popToAPSInt(S.Stk, LHST);
 
-  if (CarryOutPtr.isDummy())
+  if (CarryOutPtr.isDummy() || !CarryOutPtr.isBlockPointer())
     return false;
 
   APSInt CarryOut;
@@ -3296,6 +3296,60 @@ static bool interp__builtin_vec_set(InterpState &S, CodePtr OpPC,
   return true;
 }
 
+static bool evalICmpImm(uint8_t Imm, const APSInt &A, const APSInt &B,
+                        bool IsUnsigned) {
+  switch (Imm & 0x7) {
+  case 0x00: // _MM_CMPINT_EQ
+    return (A == B);
+  case 0x01: // _MM_CMPINT_LT
+    return IsUnsigned ? A.ult(B) : A.slt(B);
+  case 0x02: // _MM_CMPINT_LE
+    return IsUnsigned ? A.ule(B) : A.sle(B);
+  case 0x03: // _MM_CMPINT_FALSE
+    return false;
+  case 0x04: // _MM_CMPINT_NE
+    return (A != B);
+  case 0x05: // _MM_CMPINT_NLT
+    return IsUnsigned ? A.ugt(B) : A.sgt(B);
+  case 0x06: // _MM_CMPINT_NLE
+    return IsUnsigned ? A.uge(B) : A.sge(B);
+  case 0x07: // _MM_CMPINT_TRUE
+    return true;
+  default:
+    llvm_unreachable("Invalid Op");
+  }
+}
+
+static bool interp__builtin_ia32_cmp_mask(InterpState &S, CodePtr OpPC,
+                                          const CallExpr *Call, unsigned ID,
+                                          bool IsUnsigned) {
+  assert(Call->getNumArgs() == 4);
+
+  APSInt Mask = popToAPSInt(S, Call->getArg(3));
+  APSInt Opcode = popToAPSInt(S, Call->getArg(2));
+  unsigned CmpOp = static_cast<unsigned>(Opcode.getZExtValue());
+  const Pointer &RHS = S.Stk.pop<Pointer>();
+  const Pointer &LHS = S.Stk.pop<Pointer>();
+
+  assert(LHS.getNumElems() == RHS.getNumElems());
+
+  APInt RetMask = APInt::getZero(LHS.getNumElems());
+  unsigned VectorLen = LHS.getNumElems();
+  PrimType ElemT = LHS.getFieldDesc()->getPrimType();
+
+  for (unsigned ElemNum = 0; ElemNum < VectorLen; ++ElemNum) {
+    APSInt A, B;
+    INT_TYPE_SWITCH_NO_BOOL(ElemT, {
+      A = LHS.elem<T>(ElemNum).toAPSInt();
+      B = RHS.elem<T>(ElemNum).toAPSInt();
+    });
+    RetMask.setBitVal(ElemNum,
+                      Mask[ElemNum] && evalICmpImm(CmpOp, A, B, IsUnsigned));
+  }
+  pushInteger(S, RetMask, Call->getType());
+  return true;
+}
+
 static bool interp__builtin_ia32_vpconflict(InterpState &S, CodePtr OpPC,
                                             const CallExpr *Call) {
   assert(Call->getNumArgs() == 1);
@@ -3357,7 +3411,7 @@ static bool interp__builtin_x86_byteshift(
 
 static bool interp__builtin_ia32_shuffle_generic(
     InterpState &S, CodePtr OpPC, const CallExpr *Call,
-    llvm::function_ref<std::pair<unsigned, unsigned>(unsigned, unsigned)>
+    llvm::function_ref<std::pair<unsigned, int>(unsigned, unsigned)>
         GetSourceIndex) {
 
   assert(Call->getNumArgs() == 3);
@@ -3374,8 +3428,19 @@ static bool interp__builtin_ia32_shuffle_generic(
 
   for (unsigned DstIdx = 0; DstIdx != NumElems; ++DstIdx) {
     auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask);
-    const Pointer &Src = (SrcVecIdx == 0) ? A : B;
-    TYPE_SWITCH(ElemT, { Dst.elem<T>(DstIdx) = Src.elem<T>(SrcIdx); });
+
+    if (SrcIdx < 0) {
+      // Zero out this element
+      if (ElemT == PT_Float) {
+        Dst.elem<Floating>(DstIdx) = Floating(
+            S.getASTContext().getFloatTypeSemantics(VecT->getElementType()));
+      } else {
+        INT_TYPE_SWITCH_NO_BOOL(ElemT, { Dst.elem<T>(DstIdx) = T::from(0); });
+      }
+    } else {
+      const Pointer &Src = (SrcVecIdx == 0) ? A : B;
+      TYPE_SWITCH(ElemT, { Dst.elem<T>(DstIdx) = Src.elem<T>(SrcIdx); });
+    }
   }
   Dst.initializeAllElements();
 
@@ -4328,7 +4393,8 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
           unsigned SrcIdx = ElemInLane >= NumSelectableElems ? 1 : 0;
           unsigned BitIndex = (DstIdx * BitsPerElem) % MaskBits;
           unsigned Index = (ShuffleMask >> BitIndex) & IndexMask;
-          return std::pair<unsigned, unsigned>{SrcIdx, LaneOffset + Index};
+          return std::pair<unsigned, int>{SrcIdx,
+                                          static_cast<int>(LaneOffset + Index)};
         });
   case X86::BI__builtin_ia32_shufpd:
   case X86::BI__builtin_ia32_shufpd256:
@@ -4346,7 +4412,27 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
           unsigned SrcIdx = ElemInLane >= NumSelectableElems ? 1 : 0;
           unsigned BitIndex = (DstIdx * BitsPerElem) % MaskBits;
           unsigned Index = (ShuffleMask >> BitIndex) & IndexMask;
-          return std::pair<unsigned, unsigned>{SrcIdx, LaneOffset + Index};
+          return std::pair<unsigned, int>{SrcIdx,
+                                          static_cast<int>(LaneOffset + Index)};
+        });
+  case X86::BI__builtin_ia32_insertps128:
+    return interp__builtin_ia32_shuffle_generic(
+        S, OpPC, Call, [](unsigned DstIdx, unsigned Mask) {
+          // Bits [3:0]: zero mask - if bit is set, zero this element
+          if ((Mask & (1 << DstIdx)) != 0) {
+            return std::pair<unsigned, int>{0, -1};
+          }
+          // Bits [7:6]: select element from source vector Y (0-3)
+          // Bits [5:4]: select destination position (0-3)
+          unsigned SrcElem = (Mask >> 6) & 0x3;
+          unsigned DstElem = (Mask >> 4) & 0x3;
+          if (DstIdx == DstElem) {
+            // Insert element from source vector (B) at this position
+            return std::pair<unsigned, int>{1, static_cast<int>(SrcElem)};
+          } else {
+            // Copy from destination vector (A)
+            return std::pair<unsigned, int>{0, static_cast<int>(DstIdx)};
+          }
         });
   case X86::BI__builtin_ia32_pshufb128:
   case X86::BI__builtin_ia32_pshufb256:
@@ -4488,6 +4574,35 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
   case X86::BI__builtin_ia32_vec_set_v4di:
     return interp__builtin_vec_set(S, OpPC, Call, BuiltinID);
 
+  case X86::BI__builtin_ia32_cmpb128_mask:
+  case X86::BI__builtin_ia32_cmpw128_mask:
+  case X86::BI__builtin_ia32_cmpd128_mask:
+  case X86::BI__builtin_ia32_cmpq128_mask:
+  case X86::BI__builtin_ia32_cmpb256_mask:
+  case X86::BI__builtin_ia32_cmpw256_mask:
+  case X86::BI__builtin_ia32_cmpd256_mask:
+  case X86::BI__builtin_ia32_cmpq256_mask:
+  case X86::BI__builtin_ia32_cmpb512_mask:
+  case X86::BI__builtin_ia32_cmpw512_mask:
+  case X86::BI__builtin_ia32_cmpd512_mask:
+  case X86::BI__builtin_ia32_cmpq512_mask:
+    return interp__builtin_ia32_cmp_mask(S, OpPC, Call, BuiltinID,
+                                         /*IsUnsigned=*/false);
+
+  case X86::BI__builtin_ia32_ucmpb128_mask:
+  case X86::BI__builtin_ia32_ucmpw128_mask:
+  case X86::BI__builtin_ia32_ucmpd128_mask:
+  case X86::BI__builtin_ia32_ucmpq128_mask:
+  case X86::BI__builtin_ia32_ucmpb256_mask:
+  case X86::BI__builtin_ia32_ucmpw256_mask:
+  case X86::BI__builtin_ia32_ucmpd256_mask:
+  case X86::BI__builtin_ia32_ucmpq256_mask:
+  case X86::BI__builtin_ia32_ucmpb512_mask:
+  case X86::BI__builtin_ia32_ucmpw512_mask:
+  case X86::BI__builtin_ia32_ucmpd512_mask:
+  case X86::BI__builtin_ia32_ucmpq512_mask:
+    return interp__builtin_ia32_cmp_mask(S, OpPC, Call, BuiltinID,
+                                         /*IsUnsigned=*/true);
   case X86::BI__builtin_ia32_pslldqi128_byteshift:
   case X86::BI__builtin_ia32_pslldqi256_byteshift:
   case X86::BI__builtin_ia32_pslldqi512_byteshift:
diff --git a/clang/lib/AST/CommentSema.cpp b/clang/lib/AST/CommentSema.cpp
index 27ff5ab1f0c6b..d5ba240cb2bde 100644
--- a/clang/lib/AST/CommentSema.cpp
+++ b/clang/lib/AST/CommentSema.cpp
@@ -225,7 +225,7 @@ static ParamCommandPassDirection getParamPassDirection(StringRef Arg) {
   return llvm::StringSwitch<ParamCommandPassDirection>(Arg)
       .Case("[in]", ParamCommandPassDirection::In)
       .Case("[out]", ParamCommandPassDirection::Out)
-      .Cases("[in,out]", "[out,in]", ParamCommandPassDirection::InOut)
+      .Cases({"[in,out]", "[out,in]"}, ParamCommandPassDirection::InOut)
       .Default(static_cast<ParamCommandPassDirection>(-1));
 }
 
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 29ee089505125..97eeba8b9d6cc 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11621,7 +11621,7 @@ static bool evalPackBuiltin(const CallExpr *E, EvalInfo &Info, APValue &Result,
 
 static bool evalShuffleGeneric(
     EvalInfo &Info, const CallExpr *Call, APValue &Out,
-    llvm::function_ref<std::pair<unsigned, unsigned>(unsigned, unsigned)>
+    llvm::function_ref<std::pair<unsigned, int>(unsigned, unsigned)>
         GetSourceIndex) {
 
   const auto *VT = Call->getType()->getAs<VectorType>();
@@ -11644,8 +11644,16 @@ static bool evalShuffleGeneric(
 
   for (unsigned DstIdx = 0; DstIdx != NumElts; ++DstIdx) {
     auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask);
-    const APValue &Src = (SrcVecIdx == 0) ? A : B;
-    ResultElements.push_back(Src.getVectorElt(SrcIdx));
+
+    if (SrcIdx < 0) {
+      // Zero out this element
+      QualType ElemTy = VT->getElementType();
+      ResultElements.push_back(
+          APValue(APFloat::getZero(Info.Ctx.getFloatTypeSemantics(ElemTy))));
+    } else {
+      const APValue &Src = (SrcVecIdx == 0) ? A : B;
+      ResultElements.push_back(Src.getVectorElt(SrcIdx));
+    }
   }
 
   Out = APValue(ResultElements.data(), ResultElements.size());
@@ -12438,7 +12446,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
     if (!evalShuffleGeneric(
             Info, E, R,
             [](unsigned DstIdx,
-               unsigned ShuffleMask) -> std::pair<unsigned, unsigned> {
+               unsigned ShuffleMask) -> std::pair<unsigned, int> {
               constexpr unsigned LaneBits = 128u;
               unsigned NumElemPerLane = LaneBits / 32;
               unsigned NumSelectableElems = NumElemPerLane / 2;
@@ -12451,7 +12459,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
               unsigned BitIndex = (DstIdx * BitsPerElem) % MaskBits;
               unsigned SrcIdx = (ElemInLane < NumSelectableElems) ? 0 : 1;
               unsigned Index = (ShuffleMask >> BitIndex) & IndexMask;
-              return {SrcIdx, LaneOffset + Index};
+              return {SrcIdx, static_cast<int>(LaneOffset + Index)};
             }))
       return false;
     return Success(R, E);
@@ -12463,7 +12471,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
     if (!evalShuffleGeneric(
             Info, E, R,
             [](unsigned DstIdx,
-               unsigned ShuffleMask) -> std::pair<unsigned, unsigned> {
+               unsigned ShuffleMask) -> std::pair<unsigned, int> {
               constexpr unsigned LaneBits = 128u;
               unsigned NumElemPerLane = LaneBits / 64;
               unsigned NumSelectableElems = NumElemPerLane / 2;
@@ -12476,7 +12484,31 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
               unsigned BitIndex = (DstIdx * BitsPerElem) % MaskBits;
               unsigned SrcIdx = (ElemInLane < NumSelectableElems) ? 0 : 1;
               unsigned Index = (ShuffleMask >> BitIndex) & IndexMask;
-              return {SrcIdx, LaneOffset + Index};
+              return {SrcIdx, static_cast<int>(LaneOffset + Index)};
+            }))
+      return false;
+    return Success(R, E);
+  }
+  case X86::BI__builtin_ia32_insertps128: {
+    APValue R;
+    if (!evalShuffleGeneric(
+            Info, E, R,
+            [](unsigned DstIdx, unsigned Mask) -> std::pair<unsigned, int> {
+              // Bits [3:0]: zero mask - if bit is set, zero this element
+              if ((Mask & (1 << DstIdx)) != 0) {
+                return {0, -1};
+              }
+              // Bits [7:6]: select element from source vector Y (0-3)
+              // Bits [5:4]: select destination position (0-3)
+              unsigned SrcElem = (Mask >> 6) & 0x3;
+              unsigned DstElem = (Mask >> 4) & 0x3;
+              if (DstIdx == DstElem) {
+                // Insert element from source vector (B) at this position
+                return {1, static_cast<int>(SrcElem)};
+              } else {
+                // Copy from destination vector (A)
+                return {0, static_cast<int>(DstIdx)};
+              }
             }))
       return false;
     return Success(R, E);
@@ -15766,6 +15798,89 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
     unsigned Idx = static_cast<unsigned>(IdxAPS.getZExtValue() & (N - 1));
     return Success(Vec.getVectorElt(Idx).getInt(), E);
   }
+
+  case clang::X86::BI__builtin_ia32_cmpb128_mask:
+  case clang::X86::BI__builtin_ia32_cmpw128_mask:
+  case clang::X86::BI__builtin_ia32_cmpd128_mask:
+  case clang::X86::BI__builtin_ia32_cmpq128_mask:
+  case clang::X86::BI__builtin_ia32_cmpb256_mask:
+  case clang::X86::BI__builtin_ia32_cmpw256_mask:
+  case clang::X86::BI__builtin_ia32_cmpd256_mask:
+  case clang::X86::BI__builtin_ia32_cmpq256_mask:
+  case clang::X86::BI__builtin_ia32_cmpb512_mask:
+  case clang::X86::BI__builtin_ia32_cmpw512_mask:
+  case clang::X86::BI__builtin_ia32_cmpd512_mask:
+  case clang::X86::BI__builtin_ia32_cmpq512_mask:
+  case clang::X86::BI__builtin_ia32_ucmpb128_mask:
+  case clang::X86::BI__builtin_ia32_ucmpw128_mask:
+  case clang::X86::BI__builtin_ia32_ucmpd128_mask:
+  case clang::X86::BI__builtin_ia32_ucmpq128_mask:
+  case clang::X86::BI__builtin_ia32_ucmpb256_mask:
+  case clang::X86::BI__builtin_ia32_ucmpw256_mask:
+  case clang::X86::BI__builtin_ia32_ucmpd256_mask:
+  case clang::X86::BI__builtin_ia32_ucmpq256_mask:
+  case clang::X86::BI__builtin_ia32_ucmpb512_mask:
+  case clang::X86::BI__builtin_ia32_ucmpw512_mask:
+  case clang::X86::BI__builtin_ia32_ucmpd512_mask:
+  case clang::X86::BI__builtin_ia32_ucmpq512_mask: {
+    assert(E->getNumArgs() == 4);
+
+    bool IsUnsigned =
+        (BuiltinOp >= clang::X86::BI__builtin_ia32_ucmpb128_mask &&
+         BuiltinOp <= clang::X86::BI__builtin_ia32_ucmpq512_mask);
+
+    APValue LHS, RHS;
+    APSInt Mask, Opcode;
+    if (!EvaluateVector(E->getArg(0), LHS, Info) ||
+        !EvaluateVector(E->getArg(1), RHS, Info) ||
+        !EvaluateInteger(E->getArg(2), Opcode, Info) ||
+        !EvaluateInteger(E->getArg(3), Mask, Info))
+      return false;
+
+    assert(LHS.getVectorLength() == RHS.getVectorLength());
+
+    unsigned VectorLen = LHS.getVectorLength();
+    unsigned RetWidth = Mask.getBitWidth();
+
+    APSInt RetMask(llvm::APInt(RetWidth, 0), /*isUnsigned=*/true);
+
+    for (unsigned ElemNum = 0; ElemNum < VectorLen; ++ElemNum) {
+      const APSInt &A = LHS.getVectorElt(ElemNum).getInt();
+      const APSInt &B = RHS.getVectorElt(ElemNum).getInt();
+      bool Result = false;
+
+      switch (Opcode.getExtValue() & 0x7) {
+      case 0: // _MM_CMPINT_EQ
+        Result = (A == B);
+        break;
+      case 1: // _MM_CMPINT_LT
+        Result = IsUnsigned ? A.ult(B) : A.slt(B);
+        break;
+      case 2: // _MM_CMPINT_LE
+        Result = IsUnsigned ? A.ule(B) : A.sle(B);
+        break;
+      case 3: // _MM_CMPINT_FALSE
+        Result = false;
+        break;
+      case 4: // _MM_CMPINT_NE
+        Result = (A != B);
+        break;
+      case 5: // _MM_CMPINT_NLT (>=)
+        Result = IsUnsigned ? A.uge(B) : A.sge(B);
+        break;
+      case 6: // _MM_CMPINT_NLE (>)
+        Result = IsUnsigned ? A.ugt(B) : A.sgt(B);
+        break;
+      case 7: // _MM_CMPINT_TRUE
+        Result = true;
+        break;
+      }
+
+      RetMask.setBitVal(ElemNum, Mask[ElemNum] && Result);
+    }
+
+    return Success(APValue(RetMask), E);
+  }
   }
 }
 
diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp
index 791df7ee1c3d4..59d94590e04d1 100644
--- a/clang/lib/AST/OpenMPClause.cpp
+++ b/clang/lib/AST/OpenMPClause.cpp
@@ -124,6 +124,7 @@ const OMPClauseWithPreInit *OMPClauseWithPreInit::get(const OMPClause *C) {
   case OMPC_nowait:
   case OMPC_untied:
   case OMPC_mergeable:
+  case OMPC_threadset:
   case OMPC_threadprivate:
   case OMPC_groupprivate:
   case OMPC_flush:
@@ -2035,6 +2036,13 @@ void OMPClausePrinter::VisitOMPDefaultClause(OMPDefaultClause *Node) {
   OS << ")";
 }
 
+void OMPClausePrinter::VisitOMPThreadsetClause(OMPThreadsetClause *Node) {
+  OS << "threadset("
+     << getOpenMPSimpleClauseTypeName(OMPC_threadset,
+                                      unsigned(Node->getThreadsetKind()))
+     << ")";
+}
+
 void OMPClausePrinter::VisitOMPProcBindClause(OMPProcBindClause *Node) {
   OS << "proc_bind("
      << getOpenMPSimpleClauseTypeName(OMPC_proc_bind,
diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp
index 05b64ccda0d01..c909e1bcecd38 100644
--- a/clang/lib/AST/StmtProfile.cpp
+++ b/clang/lib/AST/StmtProfile.cpp
@@ -546,6 +546,8 @@ void OMPClauseProfiler::VisitOMPNocontextClause(const OMPNocontextClause *C) {
 
 void OMPClauseProfiler::VisitOMPDefaultClause(const OMPDefaultClause *C) { }
 
+void OMPClauseProfiler::VisitOMPThreadsetClause(const OMPThreadsetClause *C) {}
+
 void OMPClauseProfiler::VisitOMPProcBindClause(const OMPProcBindClause *C) { }
 
 void OMPClauseProfiler::VisitOMPUnifiedAddressClause(
diff --git a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp
index 00c7ed90503e7..5ad18ee26c174 100644
--- a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp
+++ b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp
@@ -23,18 +23,35 @@
 #include "clang/Analysis/AnalysisDeclContext.h"
 #include "clang/Analysis/CFG.h"
 #include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/StringMap.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TimeProfiler.h"
+#include "llvm/Support/raw_ostream.h"
 #include <memory>
 
 namespace clang::lifetimes {
 namespace internal {
 
+llvm::StringMap<int> LifetimeSafetyAnalysis::MissingOriginMap;
+
 LifetimeSafetyAnalysis::LifetimeSafetyAnalysis(AnalysisDeclContext &AC,
                                                LifetimeSafetyReporter *Reporter)
     : AC(AC), Reporter(Reporter) {}
 
+void LifetimeSafetyAnalysis::PrintStats(llvm::raw_ostream& OS) {
+  llvm::errs() << "\n*** LifetimeSafety Missing Origin Stats (expression_type : count) :\n";
+  for (const auto& [expr, count] : LifetimeSafetyAnalysis::MissingOriginMap) {
+    OS << expr << " : " << count << '\n';
+  }
+}
+
+void LifetimeSafetyAnalysis::UpdateMissingOriginCount(const OriginManager& OM) {
+  for (const auto& [expr, missing_origin_count] : OM.getMissingOrigins()) {
+    LifetimeSafetyAnalysis::MissingOriginMap[std::string(expr)] += missing_origin_count;
+  }
+}
+
 void LifetimeSafetyAnalysis::run() {
   llvm::TimeTraceScope TimeProfile("LifetimeSafetyAnalysis");
 
@@ -66,6 +83,7 @@ void LifetimeSafetyAnalysis::run() {
                   LiveOrigins->dump(llvm::dbgs(), FactMgr.getTestPoints()));
 
   runLifetimeChecker(*LoanPropagation, *LiveOrigins, FactMgr, AC, Reporter);
+  UpdateMissingOriginCount(FactMgr.getOriginMgr());
 }
 } // namespace internal
 
diff --git a/clang/lib/Analysis/LifetimeSafety/Origins.cpp b/clang/lib/Analysis/LifetimeSafety/Origins.cpp
index ea51a75324e06..c8570844fe314 100644
--- a/clang/lib/Analysis/LifetimeSafety/Origins.cpp
+++ b/clang/lib/Analysis/LifetimeSafety/Origins.cpp
@@ -7,6 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/Analysis/Analyses/LifetimeSafety/Origins.h"
+#include "clang/AST/TypeBase.h"
+#include "llvm/ADT/StringMap.h"
 
 namespace clang::lifetimes::internal {
 
@@ -22,6 +24,11 @@ void OriginManager::dump(OriginID OID, llvm::raw_ostream &OS) const {
   OS << ")";
 }
 
+const llvm::StringMap<int> OriginManager::getMissingOrigins() const {
+  return ExprTypeToMissingOriginCount;
+}
+
+
 Origin &OriginManager::addOrigin(OriginID ID, const clang::ValueDecl &D) {
   AllOrigins.emplace_back(ID, &D);
   return AllOrigins.back();
@@ -37,6 +44,16 @@ OriginID OriginManager::get(const Expr &E) {
   auto It = ExprToOriginID.find(&E);
   if (It != ExprToOriginID.end())
     return It->second;
+
+  // if the expression has no specific origin, increment the missing origin counter.
+  const QualType ExprType = E.getType();
+  auto CountIt = ExprTypeToMissingOriginCount.find(ExprType.getAsString());
+  if (CountIt == ExprTypeToMissingOriginCount.end()) {
+    ExprTypeToMissingOriginCount[ExprType.getAsString()] = 1;
+  } else {
+    CountIt->second++;
+  }
+  
   // If the expression itself has no specific origin, and it's a reference
   // to a declaration, its origin is that of the declaration it refers to.
   // For pointer types, where we don't pre-emptively create an origin for the
diff --git a/clang/lib/Basic/OpenMPKinds.cpp b/clang/lib/Basic/OpenMPKinds.cpp
index 64b2bff063340..3d41f2d197b81 100644
--- a/clang/lib/Basic/OpenMPKinds.cpp
+++ b/clang/lib/Basic/OpenMPKinds.cpp
@@ -210,6 +210,15 @@ unsigned clang::getOpenMPSimpleClauseType(OpenMPClauseKind Kind, StringRef Str,
 #define OPENMP_ALLOCATE_MODIFIER(Name) .Case(#Name, OMPC_ALLOCATE_##Name)
 #include "clang/Basic/OpenMPKinds.def"
         .Default(OMPC_ALLOCATE_unknown);
+  case OMPC_threadset: {
+    unsigned Type = llvm::StringSwitch<unsigned>(Str)
+#define OPENMP_THREADSET_KIND(Name) .Case(#Name, OMPC_THREADSET_##Name)
+#include "clang/Basic/OpenMPKinds.def"
+                        .Default(OMPC_THREADSET_unknown);
+    if (LangOpts.OpenMP < 60)
+      return OMPC_THREADSET_unknown;
+    return Type;
+  }
   case OMPC_num_threads: {
     unsigned Type = llvm::StringSwitch<unsigned>(Str)
 #define OPENMP_NUMTHREADS_MODIFIER(Name) .Case(#Name, OMPC_NUMTHREADS_##Name)
@@ -565,6 +574,16 @@ const char *clang::getOpenMPSimpleClauseTypeName(OpenMPClauseKind Kind,
 #include "clang/Basic/OpenMPKinds.def"
     }
     llvm_unreachable("Invalid OpenMP 'num_threads' clause modifier");
+  case OMPC_threadset:
+    switch (Type) {
+    case OMPC_THREADSET_unknown:
+      return "unknown";
+#define OPENMP_THREADSET_KIND(Name)                                            \
+  case OMPC_THREADSET_##Name:                                                  \
+    return #Name;
+#include "clang/Basic/OpenMPKinds.def"
+    }
+    llvm_unreachable("Invalid OpenMP 'threadset' clause modifier");
   case OMPC_unknown:
   case OMPC_threadprivate:
   case OMPC_groupprivate:
diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp
index d8ec837f0f7b9..938c6485125ee 100644
--- a/clang/lib/Basic/SourceManager.cpp
+++ b/clang/lib/Basic/SourceManager.cpp
@@ -608,8 +608,7 @@ FileID SourceManager::createFileIDImpl(ContentCache &File, StringRef Filename,
     return FileID::get(LoadedID);
   }
   unsigned FileSize = File.getSize();
-  llvm::ErrorOr<bool> NeedConversion =
-      llvm::needConversion(Filename.str().c_str());
+  llvm::ErrorOr<bool> NeedConversion = llvm::needConversion(Filename);
   if (NeedConversion && *NeedConversion) {
     // Buffer size may increase due to potential z/OS EBCDIC to UTF-8
     // conversion.
diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp
index d4de704689e72..d4d696b8456b6 100644
--- a/clang/lib/Basic/Targets/AMDGPU.cpp
+++ b/clang/lib/Basic/Targets/AMDGPU.cpp
@@ -356,12 +356,6 @@ void AMDGPUTargetInfo::getTargetDefines(const LangOptions &Opts,
   if (hasFastFMA())
     Builder.defineMacro("FP_FAST_FMA");
 
-  Builder.defineMacro("__AMDGCN_WAVEFRONT_SIZE__", Twine(WavefrontSize),
-                      "compile-time-constant access to the wavefront size will "
-                      "be removed in a future release");
-  Builder.defineMacro("__AMDGCN_WAVEFRONT_SIZE", Twine(WavefrontSize),
-                      "compile-time-constant access to the wavefront size will "
-                      "be removed in a future release");
   Builder.defineMacro("__AMDGCN_CUMODE__", Twine(CUMode));
 }
 
diff --git a/clang/lib/Basic/Targets/AVR.cpp b/clang/lib/Basic/Targets/AVR.cpp
index 2673669bc9035..90b4ac1b857cc 100644
--- a/clang/lib/Basic/Targets/AVR.cpp
+++ b/clang/lib/Basic/Targets/AVR.cpp
@@ -30,13 +30,13 @@ struct LLVM_LIBRARY_VISIBILITY MCUInfo {
 
 // NOTE: This list has been synchronized with gcc-avr 5.4.0 and avr-libc 2.0.0.
 static MCUInfo AVRMcus[] = {
-    {"avr1", NULL, "1", 0},
+    {"avr1", nullptr, "1", 0},
     {"at90s1200", "__AVR_AT90S1200__", "1", 0},
     {"attiny11", "__AVR_ATtiny11__", "1", 0},
     {"attiny12", "__AVR_ATtiny12__", "1", 0},
     {"attiny15", "__AVR_ATtiny15__", "1", 0},
     {"attiny28", "__AVR_ATtiny28__", "1", 0},
-    {"avr2", NULL, "2", 1},
+    {"avr2", nullptr, "2", 1},
     {"at90s2313", "__AVR_AT90S2313__", "2", 1},
     {"at90s2323", "__AVR_AT90S2323__", "2", 1},
     {"at90s2333", "__AVR_AT90S2333__", "2", 1},
@@ -50,7 +50,7 @@ static MCUInfo AVRMcus[] = {
     {"at90s8515", "__AVR_AT90S8515__", "2", 1},
     {"at90c8534", "__AVR_AT90c8534__", "2", 1},
     {"at90s8535", "__AVR_AT90S8535__", "2", 1},
-    {"avr25", NULL, "25", 1},
+    {"avr25", nullptr, "25", 1},
     {"ata5272", "__AVR_ATA5272__", "25", 1},
     {"ata6616c", "__AVR_ATA6616c__", "25", 1},
     {"attiny13", "__AVR_ATtiny13__", "25", 1},
@@ -80,13 +80,13 @@ static MCUInfo AVRMcus[] = {
     {"attiny48", "__AVR_ATtiny48__", "25", 1},
     {"attiny88", "__AVR_ATtiny88__", "25", 1},
     {"attiny828", "__AVR_ATtiny828__", "25", 1},
-    {"avr3", NULL, "3", 1},
+    {"avr3", nullptr, "3", 1},
     {"at43usb355", "__AVR_AT43USB355__", "3", 1},
     {"at76c711", "__AVR_AT76C711__", "3", 1},
-    {"avr31", NULL, "31", 1},
+    {"avr31", nullptr, "31", 1},
     {"atmega103", "__AVR_ATmega103__", "31", 1},
     {"at43usb320", "__AVR_AT43USB320__", "31", 1},
-    {"avr35", NULL, "35", 1},
+    {"avr35", nullptr, "35", 1},
     {"attiny167", "__AVR_ATtiny167__", "35", 1},
     {"at90usb82", "__AVR_AT90USB82__", "35", 1},
     {"at90usb162", "__AVR_AT90USB162__", "35", 1},
@@ -97,7 +97,7 @@ static MCUInfo AVRMcus[] = {
     {"atmega16u2", "__AVR_ATmega16U2__", "35", 1},
     {"atmega32u2", "__AVR_ATmega32U2__", "35", 1},
     {"attiny1634", "__AVR_ATtiny1634__", "35", 1},
-    {"avr4", NULL, "4", 1},
+    {"avr4", nullptr, "4", 1},
     {"atmega8", "__AVR_ATmega8__", "4", 1},
     {"ata6289", "__AVR_ATA6289__", "4", 1},
     {"atmega8a", "__AVR_ATmega8A__", "4", 1},
@@ -123,7 +123,7 @@ static MCUInfo AVRMcus[] = {
     {"at90pwm3", "__AVR_AT90PWM3__", "4", 1},
     {"at90pwm3b", "__AVR_AT90PWM3B__", "4", 1},
     {"at90pwm81", "__AVR_AT90PWM81__", "4", 1},
-    {"avr5", NULL, "5", 1},
+    {"avr5", nullptr, "5", 1},
     {"ata5702m322", "__AVR_ATA5702M322__", "5", 1},
     {"ata5782", "__AVR_ATA5782__", "5", 1},
     {"ata5790", "__AVR_ATA5790__", "5", 1},
@@ -230,7 +230,7 @@ static MCUInfo AVRMcus[] = {
     {"at90scr100", "__AVR_AT90SCR100__", "5", 1},
     {"at94k", "__AVR_AT94K__", "5", 1},
     {"m3000", "__AVR_AT000__", "5", 1},
-    {"avr51", NULL, "51", 2},
+    {"avr51", nullptr, "51", 2},
     {"atmega128", "__AVR_ATmega128__", "51", 2},
     {"atmega128a", "__AVR_ATmega128A__", "51", 2},
     {"atmega1280", "__AVR_ATmega1280__", "51", 2},
@@ -243,12 +243,12 @@ static MCUInfo AVRMcus[] = {
     {"at90can128", "__AVR_AT90CAN128__", "51", 2},
     {"at90usb1286", "__AVR_AT90USB1286__", "51", 2},
     {"at90usb1287", "__AVR_AT90USB1287__", "51", 2},
-    {"avr6", NULL, "6", 4},
+    {"avr6", nullptr, "6", 4},
     {"atmega2560", "__AVR_ATmega2560__", "6", 4},
     {"atmega2561", "__AVR_ATmega2561__", "6", 4},
     {"atmega256rfr2", "__AVR_ATmega256RFR2__", "6", 4},
     {"atmega2564rfr2", "__AVR_ATmega2564RFR2__", "6", 4},
-    {"avrxmega2", NULL, "102", 1},
+    {"avrxmega2", nullptr, "102", 1},
     {"atxmega16a4", "__AVR_ATxmega16A4__", "102", 1},
     {"atxmega16a4u", "__AVR_ATxmega16A4U__", "102", 1},
     {"atxmega16c4", "__AVR_ATxmega16C4__", "102", 1},
@@ -262,7 +262,7 @@ static MCUInfo AVRMcus[] = {
     {"atxmega32e5", "__AVR_ATxmega32E5__", "102", 1},
     {"atxmega16e5", "__AVR_ATxmega16E5__", "102", 1},
     {"atxmega8e5", "__AVR_ATxmega8E5__", "102", 1},
-    {"avrxmega4", NULL, "104", 1},
+    {"avrxmega4", nullptr, "104", 1},
     {"atxmega64a3", "__AVR_ATxmega64A3__", "104", 1},
     {"atxmega64a3u", "__AVR_ATxmega64A3U__", "104", 1},
     {"atxmega64a4u", "__AVR_ATxmega64A4U__", "104", 1},
@@ -271,10 +271,10 @@ static MCUInfo AVRMcus[] = {
     {"atxmega64c3", "__AVR_ATxmega64C3__", "104", 1},
     {"atxmega64d3", "__AVR_ATxmega64D3__", "104", 1},
     {"atxmega64d4", "__AVR_ATxmega64D4__", "104", 1},
-    {"avrxmega5", NULL, "105", 1},
+    {"avrxmega5", nullptr, "105", 1},
     {"atxmega64a1", "__AVR_ATxmega64A1__", "105", 1},
     {"atxmega64a1u", "__AVR_ATxmega64A1U__", "105", 1},
-    {"avrxmega6", NULL, "106", 6},
+    {"avrxmega6", nullptr, "106", 6},
     {"atxmega128a3", "__AVR_ATxmega128A3__", "106", 2},
     {"atxmega128a3u", "__AVR_ATxmega128A3U__", "106", 2},
     {"atxmega128b1", "__AVR_ATxmega128B1__", "106", 2},
@@ -294,11 +294,11 @@ static MCUInfo AVRMcus[] = {
     {"atxmega256d3", "__AVR_ATxmega256D3__", "106", 4},
     {"atxmega384c3", "__AVR_ATxmega384C3__", "106", 6},
     {"atxmega384d3", "__AVR_ATxmega384D3__", "106", 6},
-    {"avrxmega7", NULL, "107", 2},
+    {"avrxmega7", nullptr, "107", 2},
     {"atxmega128a1", "__AVR_ATxmega128A1__", "107", 2},
     {"atxmega128a1u", "__AVR_ATxmega128A1U__", "107", 2},
     {"atxmega128a4u", "__AVR_ATxmega128A4U__", "107", 2},
-    {"avrtiny", NULL, "100", 0},
+    {"avrtiny", nullptr, "100", 0},
     {"attiny4", "__AVR_ATtiny4__", "100", 0},
     {"attiny5", "__AVR_ATtiny5__", "100", 0},
     {"attiny9", "__AVR_ATtiny9__", "100", 0},
@@ -307,7 +307,7 @@ static MCUInfo AVRMcus[] = {
     {"attiny40", "__AVR_ATtiny40__", "100", 0},
     {"attiny102", "__AVR_ATtiny102__", "100", 0},
     {"attiny104", "__AVR_ATtiny104__", "100", 0},
-    {"avrxmega3", NULL, "103", 1},
+    {"avrxmega3", nullptr, "103", 1},
     {"attiny202", "__AVR_ATtiny202__", "103", 1},
     {"attiny402", "__AVR_ATtiny402__", "103", 1},
     {"attiny204", "__AVR_ATtiny204__", "103", 1},
diff --git a/clang/lib/Basic/Targets/BPF.cpp b/clang/lib/Basic/Targets/BPF.cpp
index 0411bcca51789..8de1083d758c7 100644
--- a/clang/lib/Basic/Targets/BPF.cpp
+++ b/clang/lib/Basic/Targets/BPF.cpp
@@ -75,6 +75,7 @@ void BPFTargetInfo::getTargetDefines(const LangOptions &Opts,
     Builder.defineMacro("__BPF_FEATURE_GOTOL");
     Builder.defineMacro("__BPF_FEATURE_ST");
     Builder.defineMacro("__BPF_FEATURE_LOAD_ACQ_STORE_REL");
+    Builder.defineMacro("__BPF_FEATURE_GOTOX");
   }
 }
 
diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp
index 9651c3832f51d..ec4e40b0db6eb 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -171,7 +171,7 @@ ArrayRef<const char *> NVPTXTargetInfo::getGCCRegNames() const {
 
 bool NVPTXTargetInfo::hasFeature(StringRef Feature) const {
   return llvm::StringSwitch<bool>(Feature)
-      .Cases("ptx", "nvptx", true)
+      .Cases({"ptx", "nvptx"}, true)
       .Default(false);
 }
 
diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h
index 846b240218172..d4ada2a0e0c38 100644
--- a/clang/lib/Basic/Targets/PPC.h
+++ b/clang/lib/Basic/Targets/PPC.h
@@ -125,9 +125,8 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo {
               .Cases({"power3", "pwr3"}, ArchDefinePpcgr)
               .Cases({"power4", "pwr4"},
                      ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq)
-              .Cases("power5", "pwr5",
-                     ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr |
-                         ArchDefinePpcsq)
+              .Cases({"power5", "pwr5"}, ArchDefinePwr5 | ArchDefinePwr4 |
+                                             ArchDefinePpcgr | ArchDefinePpcsq)
               .Cases({"power5x", "pwr5x"},
                      ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 |
                          ArchDefinePpcgr | ArchDefinePpcsq)
@@ -166,7 +165,7 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo {
                         ArchDefinePwr9 | ArchDefinePwr8 | ArchDefinePwr7 |
                         ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5 |
                         ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq)
-              .Cases("8548", "e500", ArchDefineE500)
+              .Cases({"8548", "e500"}, ArchDefineE500)
               .Default(ArchDefineNone);
     }
     return CPUKnown;
@@ -445,27 +444,17 @@ class LLVM_LIBRARY_VISIBILITY PPC64TargetInfo : public PPCTargetInfo {
     LongWidth = LongAlign = PointerWidth = PointerAlign = 64;
     IntMaxType = SignedLong;
     Int64Type = SignedLong;
-    std::string DataLayout;
 
     if (Triple.isOSAIX()) {
       // TODO: Set appropriate ABI for AIX platform.
-      DataLayout = "E-m:a-Fi64-i64:64-i128:128-n32:64";
       LongDoubleWidth = 64;
       LongDoubleAlign = DoubleAlign = 32;
       LongDoubleFormat = &llvm::APFloat::IEEEdouble();
-    } else if ((Triple.getArch() == llvm::Triple::ppc64le)) {
-      DataLayout = "e-m:e-Fn32-i64:64-i128:128-n32:64";
+    } else if ((Triple.getArch() == llvm::Triple::ppc64le) ||
+               Triple.isPPC64ELFv2ABI()) {
       ABI = "elfv2";
     } else {
-      DataLayout = "E-m:e";
-      if (Triple.isPPC64ELFv2ABI()) {
-        ABI = "elfv2";
-        DataLayout += "-Fn32";
-      } else {
-        ABI = "elfv1";
-        DataLayout += "-Fi64";
-      }
-      DataLayout += "-i64:64-i128:128-n32:64";
+      ABI = "elfv1";
     }
 
     if (Triple.isOSFreeBSD() || Triple.isOSOpenBSD() || Triple.isMusl()) {
@@ -473,14 +462,12 @@ class LLVM_LIBRARY_VISIBILITY PPC64TargetInfo : public PPCTargetInfo {
       LongDoubleFormat = &llvm::APFloat::IEEEdouble();
     }
 
-    if (Triple.isOSAIX() || Triple.isOSLinux())
-      DataLayout += "-S128-v256:256:256-v512:512:512";
-    resetDataLayout(DataLayout);
-
     // Newer PPC64 instruction sets support atomics up to 16 bytes.
     MaxAtomicPromoteWidth = 128;
     // Baseline PPC64 supports inlining atomics up to 8 bytes.
     MaxAtomicInlineWidth = 64;
+
+    calculateDataLayout();
   }
 
   void setMaxAtomicWidth() override {
@@ -495,10 +482,33 @@ class LLVM_LIBRARY_VISIBILITY PPC64TargetInfo : public PPCTargetInfo {
     return TargetInfo::CharPtrBuiltinVaList;
   }
 
+  void calculateDataLayout() {
+    std::string DataLayout;
+
+    if (getTriple().isOSAIX()) {
+      DataLayout = "E-m:a-Fi64-i64:64-i128:128-n32:64";
+    } else if ((getTriple().getArch() == llvm::Triple::ppc64le)) {
+      DataLayout = "e-m:e-Fn32-i64:64-i128:128-n32:64";
+    } else {
+      DataLayout = "E-m:e";
+      if (ABI == "elfv2") {
+        DataLayout += "-Fn32";
+      } else {
+        DataLayout += "-Fi64";
+      }
+      DataLayout += "-i64:64-i128:128-n32:64";
+    }
+
+    if (getTriple().isOSAIX() || getTriple().isOSLinux())
+      DataLayout += "-S128-v256:256:256-v512:512:512";
+    resetDataLayout(DataLayout);
+  }
+
   // PPC64 Linux-specific ABI options.
   bool setABI(const std::string &Name) override {
     if (Name == "elfv1" || Name == "elfv2") {
       ABI = Name;
+      calculateDataLayout();
       return true;
     }
     return false;
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index e71f10c4c16fc..7a90c89dd7dc0 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -396,8 +396,6 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
       HasAMXFP8 = true;
     } else if (Feature == "+amx-movrs") {
       HasAMXMOVRS = true;
-    } else if (Feature == "+amx-transpose") {
-      HasAMXTRANSPOSE = true;
     } else if (Feature == "+amx-avx512") {
       HasAMXAVX512 = true;
     } else if (Feature == "+amx-tf32") {
@@ -925,8 +923,6 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
     Builder.defineMacro("__AMX_FP8__");
   if (HasAMXMOVRS)
     Builder.defineMacro("__AMX_MOVRS__");
-  if (HasAMXTRANSPOSE)
-    Builder.defineMacro("__AMX_TRANSPOSE__");
   if (HasAMXAVX512)
     Builder.defineMacro("__AMX_AVX512__");
   if (HasAMXTF32)
@@ -1068,7 +1064,6 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
       .Case("amx-movrs", true)
       .Case("amx-tf32", true)
       .Case("amx-tile", true)
-      .Case("amx-transpose", true)
       .Case("avx", true)
       .Case("avx10.1", true)
       .Case("avx10.2", true)
@@ -1189,7 +1184,6 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
       .Case("amx-movrs", HasAMXMOVRS)
       .Case("amx-tf32", HasAMXTF32)
       .Case("amx-tile", HasAMXTILE)
-      .Case("amx-transpose", HasAMXTRANSPOSE)
       .Case("avx", SSELevel >= AVX)
       .Case("avx10.1", HasAVX10_1)
       .Case("avx10.2", HasAVX10_2)
diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index be3a473174370..e7da2622e78b5 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -160,7 +160,6 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
   bool HasAMXCOMPLEX = false;
   bool HasAMXFP8 = false;
   bool HasAMXMOVRS = false;
-  bool HasAMXTRANSPOSE = false;
   bool HasAMXAVX512 = false;
   bool HasAMXTF32 = false;
   bool HasSERIALIZE = false;
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
index 50d585dca3b8c..e5066fac19185 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h
+++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
@@ -108,11 +108,11 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy {
 
   cir::LongDoubleType getLongDoubleTy(const llvm::fltSemantics &format) const {
     if (&format == &llvm::APFloat::IEEEdouble())
-      return cir::LongDoubleType::get(getContext(), typeCache.DoubleTy);
+      return cir::LongDoubleType::get(getContext(), typeCache.doubleTy);
     if (&format == &llvm::APFloat::x87DoubleExtended())
-      return cir::LongDoubleType::get(getContext(), typeCache.FP80Ty);
+      return cir::LongDoubleType::get(getContext(), typeCache.fP80Ty);
     if (&format == &llvm::APFloat::IEEEquad())
-      return cir::LongDoubleType::get(getContext(), typeCache.FP128Ty);
+      return cir::LongDoubleType::get(getContext(), typeCache.fP128Ty);
     if (&format == &llvm::APFloat::PPCDoubleDouble())
       llvm_unreachable("NYI: PPC double-double format for long double");
     llvm_unreachable("Unsupported format for long double");
@@ -258,17 +258,17 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy {
     }
   }
 
-  cir::VoidType getVoidTy() { return typeCache.VoidTy; }
+  cir::VoidType getVoidTy() { return typeCache.voidTy; }
 
-  cir::IntType getSInt8Ty() { return typeCache.SInt8Ty; }
-  cir::IntType getSInt16Ty() { return typeCache.SInt16Ty; }
-  cir::IntType getSInt32Ty() { return typeCache.SInt32Ty; }
-  cir::IntType getSInt64Ty() { return typeCache.SInt64Ty; }
+  cir::IntType getSInt8Ty() { return typeCache.sInt8Ty; }
+  cir::IntType getSInt16Ty() { return typeCache.sInt16Ty; }
+  cir::IntType getSInt32Ty() { return typeCache.sInt32Ty; }
+  cir::IntType getSInt64Ty() { return typeCache.sInt64Ty; }
 
-  cir::IntType getUInt8Ty() { return typeCache.UInt8Ty; }
-  cir::IntType getUInt16Ty() { return typeCache.UInt16Ty; }
-  cir::IntType getUInt32Ty() { return typeCache.UInt32Ty; }
-  cir::IntType getUInt64Ty() { return typeCache.UInt64Ty; }
+  cir::IntType getUInt8Ty() { return typeCache.uInt8Ty; }
+  cir::IntType getUInt16Ty() { return typeCache.uInt16Ty; }
+  cir::IntType getUInt32Ty() { return typeCache.uInt32Ty; }
+  cir::IntType getUInt64Ty() { return typeCache.uInt64Ty; }
 
   cir::ConstantOp getConstInt(mlir::Location loc, llvm::APSInt intVal);
 
@@ -280,21 +280,21 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy {
                              llvm::APFloat fpVal);
 
   bool isInt8Ty(mlir::Type i) {
-    return i == typeCache.UInt8Ty || i == typeCache.SInt8Ty;
+    return i == typeCache.uInt8Ty || i == typeCache.sInt8Ty;
   }
   bool isInt16Ty(mlir::Type i) {
-    return i == typeCache.UInt16Ty || i == typeCache.SInt16Ty;
+    return i == typeCache.uInt16Ty || i == typeCache.sInt16Ty;
   }
   bool isInt32Ty(mlir::Type i) {
-    return i == typeCache.UInt32Ty || i == typeCache.SInt32Ty;
+    return i == typeCache.uInt32Ty || i == typeCache.sInt32Ty;
   }
   bool isInt64Ty(mlir::Type i) {
-    return i == typeCache.UInt64Ty || i == typeCache.SInt64Ty;
+    return i == typeCache.uInt64Ty || i == typeCache.sInt64Ty;
   }
   bool isInt(mlir::Type i) { return mlir::isa<cir::IntType>(i); }
 
   // Fetch the type representing a pointer to unsigned int8 values.
-  cir::PointerType getUInt8PtrTy() { return typeCache.UInt8PtrTy; }
+  cir::PointerType getUInt8PtrTy() { return typeCache.uInt8PtrTy; }
 
   /// Get a CIR anonymous record type.
   cir::RecordType getAnonRecordTy(llvm::ArrayRef<mlir::Type> members,
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 3c9c7ecf35aff..0198a9d4eb192 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -771,14 +771,6 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI_WriteBarrier:
   case X86::BI_AddressOfReturnAddress:
   case X86::BI__stosb:
-  case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
-  case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal:
-  case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
-  case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal:
-  case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
-  case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal:
-  case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal:
-  case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal:
   case X86::BI__ud2:
   case X86::BI__int2c:
   case X86::BI__readfsbyte:
diff --git a/clang/lib/CIR/CodeGen/CIRGenClass.cpp b/clang/lib/CIR/CodeGen/CIRGenClass.cpp
index 5046e0945002f..a8296782ebc40 100644
--- a/clang/lib/CIR/CodeGen/CIRGenClass.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenClass.cpp
@@ -362,7 +362,7 @@ static Address applyNonVirtualAndVirtualOffset(
   // not bytes.  So the pointer must be cast to a byte pointer and back.
 
   mlir::Value ptr = addr.getPointer();
-  mlir::Type charPtrType = cgf.cgm.UInt8PtrTy;
+  mlir::Type charPtrType = cgf.cgm.uInt8PtrTy;
   mlir::Value charPtr = cgf.getBuilder().createBitcast(ptr, charPtrType);
   mlir::Value adjusted = cir::PtrStrideOp::create(
       cgf.getBuilder(), loc, charPtrType, charPtr, baseOffset);
@@ -1105,7 +1105,7 @@ mlir::Value CIRGenFunction::getVTTParameter(GlobalDecl gd, bool forVirtualBase,
     // We're the complete constructor, so get the VTT by name.
     cir::GlobalOp vtt = cgm.getVTables().getAddrOfVTT(rd);
     return builder.createVTTAddrPoint(
-        loc, builder.getPointerTo(cgm.VoidPtrTy),
+        loc, builder.getPointerTo(cgm.voidPtrTy),
         mlir::FlatSymbolRefAttr::get(vtt.getSymNameAttr()), subVTTIndex);
   }
 }
diff --git a/clang/lib/CIR/CodeGen/CIRGenCoroutine.cpp b/clang/lib/CIR/CodeGen/CIRGenCoroutine.cpp
index 8723a6e502b38..930ae55405756 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCoroutine.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenCoroutine.cpp
@@ -55,7 +55,7 @@ cir::CallOp CIRGenFunction::emitCoroIDBuiltinCall(mlir::Location loc,
   if (!builtin) {
     fnOp = cgm.createCIRBuiltinFunction(
         loc, cgm.builtinCoroId,
-        cir::FuncType::get({int32Ty, VoidPtrTy, VoidPtrTy, VoidPtrTy}, int32Ty),
+        cir::FuncType::get({int32Ty, voidPtrTy, voidPtrTy, voidPtrTy}, int32Ty),
         /*FD=*/nullptr);
     assert(fnOp && "should always succeed");
   } else {
@@ -75,7 +75,7 @@ cir::CallOp CIRGenFunction::emitCoroAllocBuiltinCall(mlir::Location loc) {
   cir::FuncOp fnOp;
   if (!builtin) {
     fnOp = cgm.createCIRBuiltinFunction(loc, cgm.builtinCoroAlloc,
-                                        cir::FuncType::get({UInt32Ty}, boolTy),
+                                        cir::FuncType::get({uInt32Ty}, boolTy),
                                         /*fd=*/nullptr);
     assert(fnOp && "should always succeed");
   } else {
@@ -95,7 +95,7 @@ CIRGenFunction::emitCoroBeginBuiltinCall(mlir::Location loc,
   if (!builtin) {
     fnOp = cgm.createCIRBuiltinFunction(
         loc, cgm.builtinCoroBegin,
-        cir::FuncType::get({UInt32Ty, VoidPtrTy}, VoidPtrTy),
+        cir::FuncType::get({uInt32Ty, voidPtrTy}, voidPtrTy),
         /*fd=*/nullptr);
     assert(fnOp && "should always succeed");
   } else {
@@ -110,7 +110,7 @@ CIRGenFunction::emitCoroBeginBuiltinCall(mlir::Location loc,
 mlir::LogicalResult
 CIRGenFunction::emitCoroutineBody(const CoroutineBodyStmt &s) {
   mlir::Location openCurlyLoc = getLoc(s.getBeginLoc());
-  cir::ConstantOp nullPtrCst = builder.getNullPtr(VoidPtrTy, openCurlyLoc);
+  cir::ConstantOp nullPtrCst = builder.getNullPtr(voidPtrTy, openCurlyLoc);
 
   auto fn = mlir::cast<cir::FuncOp>(curFn);
   fn.setCoroutine(true);
diff --git a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
index 5667273c00daf..aeea0efeb77c3 100644
--- a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
@@ -80,13 +80,13 @@ CIRGenFunction::emitAutoVarAlloca(const VarDecl &d,
     assert(!cir::MissingFeatures::openMP());
     if (!didCallStackSave) {
       // Save the stack.
-      cir::PointerType defaultTy = AllocaInt8PtrTy;
+      cir::PointerType defaultTy = allocaInt8PtrTy;
       CharUnits align = CharUnits::fromQuantity(
           cgm.getDataLayout().getAlignment(defaultTy, false));
       Address stack = createTempAlloca(defaultTy, align, loc, "saved_stack");
 
       mlir::Value v = builder.createStackSave(loc, defaultTy);
-      assert(v.getType() == AllocaInt8PtrTy);
+      assert(v.getType() == allocaInt8PtrTy);
       builder.createStore(loc, v, stack);
 
       didCallStackSave = true;
diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
index df6ee56eac30b..5ccb431e626ae 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
@@ -2529,7 +2529,7 @@ CIRGenFunction::emitConditionalBlocks(const AbstractConditionalOperator *e,
 
   // If both arms are void, so be it.
   if (!yieldTy)
-    yieldTy = VoidTy;
+    yieldTy = voidTy;
 
   // Insert required yields.
   for (mlir::OpBuilder::InsertPoint &toInsert : insertPoints) {
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
index 8fe0d9b4a69ef..3d3030ca87e2a 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
@@ -490,7 +490,7 @@ void AggExprEmitter::emitArrayInit(Address destPtr, cir::ArrayType arrayTy,
   for (uint64_t i = 0; i != numInitElements; ++i) {
     // Advance to the next element.
     if (i > 0) {
-      one = builder.getConstantInt(loc, cgf.PtrDiffTy, i);
+      one = builder.getConstantInt(loc, cgf.ptrDiffTy, i);
       element = builder.createPtrStride(loc, begin, one);
     }
 
@@ -512,7 +512,7 @@ void AggExprEmitter::emitArrayInit(Address destPtr, cir::ArrayType arrayTy,
         cgf.getTypes().isZeroInitializable(elementType))) {
     // Advance to the start of the rest of the array.
     if (numInitElements) {
-      one = builder.getConstantInt(loc, cgf.PtrDiffTy, 1);
+      one = builder.getConstantInt(loc, cgf.ptrDiffTy, 1);
       element = cir::PtrStrideOp::create(builder, loc, cirElementPtrType,
                                          element, one);
     }
@@ -526,7 +526,7 @@ void AggExprEmitter::emitArrayInit(Address destPtr, cir::ArrayType arrayTy,
 
     // Compute the end of array
     cir::ConstantOp numArrayElementsConst = builder.getConstInt(
-        loc, mlir::cast<cir::IntType>(cgf.PtrDiffTy), numArrayElements);
+        loc, mlir::cast<cir::IntType>(cgf.ptrDiffTy), numArrayElements);
     mlir::Value end = cir::PtrStrideOp::create(builder, loc, cirElementPtrType,
                                                begin, numArrayElementsConst);
 
@@ -563,7 +563,7 @@ void AggExprEmitter::emitArrayInit(Address destPtr, cir::ArrayType arrayTy,
 
           // Advance pointer and store them to temporary variable
           cir::ConstantOp one = builder.getConstInt(
-              loc, mlir::cast<cir::IntType>(cgf.PtrDiffTy), 1);
+              loc, mlir::cast<cir::IntType>(cgf.ptrDiffTy), 1);
           auto nextElement = cir::PtrStrideOp::create(
               builder, loc, cirElementPtrType, currentElement, one);
           cgf.emitStoreThroughLValue(RValue::get(nextElement), tmpLV);
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp
index 7a35382e79a93..9dd9b6d550763 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp
@@ -257,12 +257,12 @@ static mlir::Value emitCXXNewAllocSize(CIRGenFunction &cgf, const CXXNewExpr *e,
   if (!e->isArray()) {
     CharUnits typeSize = cgf.getContext().getTypeSizeInChars(type);
     sizeWithoutCookie = cgf.getBuilder().getConstant(
-        loc, cir::IntAttr::get(cgf.SizeTy, typeSize.getQuantity()));
+        loc, cir::IntAttr::get(cgf.sizeTy, typeSize.getQuantity()));
     return sizeWithoutCookie;
   }
 
   // The width of size_t.
-  unsigned sizeWidth = cgf.cgm.getDataLayout().getTypeSizeInBits(cgf.SizeTy);
+  unsigned sizeWidth = cgf.cgm.getDataLayout().getTypeSizeInBits(cgf.sizeTy);
 
   // The number of elements can be have an arbitrary integer type;
   // essentially, we need to multiply it by a constant factor, add a
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp
index 928e5aa821bb5..6af87a0159f0a 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp
@@ -46,7 +46,7 @@ namespace {
 class ConstExprEmitter;
 
 static mlir::TypedAttr computePadding(CIRGenModule &cgm, CharUnits size) {
-  mlir::Type eltTy = cgm.UCharTy;
+  mlir::Type eltTy = cgm.uCharTy;
   clang::CharUnits::QuantityType arSize = size.getQuantity();
   CIRGenBuilderTy &bld = cgm.getBuilder();
   if (size > CharUnits::One()) {
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
index db6878d479366..119314fe27dce 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
@@ -762,9 +762,9 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
         // FIXME(cir): For now lets pretend we shouldn't use the conversion
         // intrinsics and insert a cast here unconditionally.
         src = builder.createCast(cgf.getLoc(loc), cir::CastKind::floating, src,
-                                 cgf.FloatTy);
+                                 cgf.floatTy);
         srcType = cgf.getContext().FloatTy;
-        mlirSrcType = cgf.FloatTy;
+        mlirSrcType = cgf.floatTy;
       }
     }
 
@@ -1738,7 +1738,7 @@ mlir::Value ScalarExprEmitter::emitSub(const BinOpInfo &ops) {
   //
   // See more in `EmitSub` in CGExprScalar.cpp.
   assert(!cir::MissingFeatures::llvmLoweringPtrDiffConsidersPointee());
-  return cir::PtrDiffOp::create(builder, cgf.getLoc(ops.loc), cgf.PtrDiffTy,
+  return cir::PtrDiffOp::create(builder, cgf.getLoc(ops.loc), cgf.ptrDiffTy,
                                 ops.lhs, ops.rhs);
 }
 
@@ -2220,7 +2220,7 @@ mlir::Value ScalarExprEmitter::VisitUnaryExprOrTypeTraitExpr(
                                      "sizeof operator for VariableArrayType",
                                      e->getStmtClassName());
       return builder.getConstant(
-          loc, cir::IntAttr::get(cgf.cgm.UInt64Ty,
+          loc, cir::IntAttr::get(cgf.cgm.uInt64Ty,
                                  llvm::APSInt(llvm::APInt(64, 1), true)));
     }
   } else if (e->getKind() == UETT_OpenMPRequiredSimdAlign) {
@@ -2228,12 +2228,12 @@ mlir::Value ScalarExprEmitter::VisitUnaryExprOrTypeTraitExpr(
         e->getSourceRange(), "sizeof operator for OpenMpRequiredSimdAlign",
         e->getStmtClassName());
     return builder.getConstant(
-        loc, cir::IntAttr::get(cgf.cgm.UInt64Ty,
+        loc, cir::IntAttr::get(cgf.cgm.uInt64Ty,
                                llvm::APSInt(llvm::APInt(64, 1), true)));
   }
 
   return builder.getConstant(
-      loc, cir::IntAttr::get(cgf.cgm.UInt64Ty,
+      loc, cir::IntAttr::get(cgf.cgm.uInt64Ty,
                              e->EvaluateKnownConstInt(cgf.getContext())));
 }
 
@@ -2329,14 +2329,14 @@ mlir::Value ScalarExprEmitter::VisitAbstractConditionalOperator(
 
     mlir::Value lhs = Visit(lhsExpr);
     if (!lhs) {
-      lhs = builder.getNullValue(cgf.VoidTy, loc);
+      lhs = builder.getNullValue(cgf.voidTy, loc);
       lhsIsVoid = true;
     }
 
     mlir::Value rhs = Visit(rhsExpr);
     if (lhsIsVoid) {
       assert(!rhs && "lhs and rhs types must match");
-      rhs = builder.getNullValue(cgf.VoidTy, loc);
+      rhs = builder.getNullValue(cgf.voidTy, loc);
     }
 
     return builder.createSelect(loc, condV, lhs, rhs);
@@ -2381,7 +2381,7 @@ mlir::Value ScalarExprEmitter::VisitAbstractConditionalOperator(
   if (!insertPoints.empty()) {
     // If both arms are void, so be it.
     if (!yieldTy)
-      yieldTy = cgf.VoidTy;
+      yieldTy = cgf.voidTy;
 
     // Insert required yields.
     for (mlir::OpBuilder::InsertPoint &toInsert : insertPoints) {
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
index 58feb36f78f23..5d5209b9ffb60 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
@@ -242,12 +242,19 @@ void CIRGenFunction::LexicalScope::cleanup() {
     }
   };
 
-  if (returnBlock != nullptr) {
-    // Write out the return block, which loads the value from `__retval` and
-    // issues the `cir.return`.
+  // Cleanup are done right before codegen resumes a scope. This is where
+  // objects are destroyed. Process all return blocks.
+  // TODO(cir): Handle returning from a switch statement through a cleanup
+  // block. We can't simply jump to the cleanup block, because the cleanup block
+  // is not part of the case region. Either reemit all cleanups in the return
+  // block or wait for MLIR structured control flow to support early exits.
+  llvm::SmallVector<mlir::Block *> retBlocks;
+  for (mlir::Block *retBlock : localScope->getRetBlocks()) {
     mlir::OpBuilder::InsertionGuard guard(builder);
-    builder.setInsertionPointToEnd(returnBlock);
-    (void)emitReturn(*returnLoc);
+    builder.setInsertionPointToEnd(retBlock);
+    retBlocks.push_back(retBlock);
+    mlir::Location retLoc = localScope->getRetLoc(retBlock);
+    emitReturn(retLoc);
   }
 
   auto insertCleanupAndLeave = [&](mlir::Block *insPt) {
@@ -274,19 +281,22 @@ void CIRGenFunction::LexicalScope::cleanup() {
 
     if (localScope->depth == 0) {
       // Reached the end of the function.
-      if (returnBlock != nullptr) {
-        if (returnBlock->getUses().empty()) {
-          returnBlock->erase();
+      // Special handling only for single return block case
+      if (localScope->getRetBlocks().size() == 1) {
+        mlir::Block *retBlock = localScope->getRetBlocks()[0];
+        mlir::Location retLoc = localScope->getRetLoc(retBlock);
+        if (retBlock->getUses().empty()) {
+          retBlock->erase();
         } else {
           // Thread return block via cleanup block.
           if (cleanupBlock) {
-            for (mlir::BlockOperand &blockUse : returnBlock->getUses()) {
+            for (mlir::BlockOperand &blockUse : retBlock->getUses()) {
               cir::BrOp brOp = mlir::cast<cir::BrOp>(blockUse.getOwner());
               brOp.setSuccessor(cleanupBlock);
             }
           }
 
-          cir::BrOp::create(builder, *returnLoc, returnBlock);
+          cir::BrOp::create(builder, retLoc, retBlock);
           return;
         }
       }
@@ -324,8 +334,10 @@ void CIRGenFunction::LexicalScope::cleanup() {
   bool entryBlock = builder.getInsertionBlock()->isEntryBlock();
   if (!entryBlock && curBlock->empty()) {
     curBlock->erase();
-    if (returnBlock != nullptr && returnBlock->getUses().empty())
-      returnBlock->erase();
+    for (mlir::Block *retBlock : retBlocks) {
+      if (retBlock->getUses().empty())
+        retBlock->erase();
+    }
     return;
   }
 
@@ -1008,7 +1020,7 @@ CIRGenFunction::emitArrayLength(const clang::ArrayType *origArrayType,
   if (isa<VariableArrayType>(arrayType)) {
     assert(cir::MissingFeatures::vlas());
     cgm.errorNYI(*currSrcLoc, "VLAs");
-    return builder.getConstInt(*currSrcLoc, SizeTy, 0);
+    return builder.getConstInt(*currSrcLoc, sizeTy, 0);
   }
 
   uint64_t countFromCLAs = 1;
@@ -1037,7 +1049,7 @@ CIRGenFunction::emitArrayLength(const clang::ArrayType *origArrayType,
   }
 
   baseType = eltType;
-  return builder.getConstInt(*currSrcLoc, SizeTy, countFromCLAs);
+  return builder.getConstInt(*currSrcLoc, sizeTy, countFromCLAs);
 }
 
 mlir::Value CIRGenFunction::emitAlignmentAssumption(
@@ -1074,7 +1086,7 @@ CIRGenFunction::getVLASize(const VariableArrayType *type) {
     elementType = type->getElementType();
     mlir::Value vlaSize = vlaSizeMap[type->getSizeExpr()];
     assert(vlaSize && "no size for VLA!");
-    assert(vlaSize.getType() == SizeTy);
+    assert(vlaSize.getType() == sizeTy);
 
     if (!numElements) {
       numElements = vlaSize;
@@ -1188,7 +1200,7 @@ void CIRGenFunction::emitVariablyModifiedType(QualType type) {
           // Always zexting here would be wrong if it weren't
           // undefined behavior to have a negative bound.
           // FIXME: What about when size's type is larger than size_t?
-          entry = builder.createIntCast(size, SizeTy);
+          entry = builder.createIntCast(size, sizeTy);
         }
       }
       type = vat->getElementType();
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index c3fcd1a69a88e..e5cecaa573a6e 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -1103,44 +1103,69 @@ class CIRGenFunction : public CIRGenTypeCache {
     // ---
 
   private:
-    // `returnBlock`, `returnLoc`, and all the functions that deal with them
-    // will change and become more complicated when `switch` statements are
-    // upstreamed.  `case` statements within the `switch` are in the same scope
-    // but have their own regions.  Therefore the LexicalScope will need to
-    // keep track of multiple return blocks.
-    mlir::Block *returnBlock = nullptr;
-    std::optional<mlir::Location> returnLoc;
-
-    // See the comment on `getOrCreateRetBlock`.
+    // On switches we need one return block per region, since cases don't
+    // have their own scopes but are distinct regions nonetheless.
+
+    // TODO: This implementation should change once we have support for early
+    //       exits in MLIR structured control flow (llvm-project#161575)
+    llvm::SmallVector<mlir::Block *> retBlocks;
+    llvm::DenseMap<mlir::Block *, mlir::Location> retLocs;
+    llvm::DenseMap<cir::CaseOp, unsigned> retBlockInCaseIndex;
+    std::optional<unsigned> normalRetBlockIndex;
+
+    // There's usually only one ret block per scope, but this needs to be
+    // get or create because of potential unreachable return statements, note
+    // that for those, all source location maps to the first one found.
     mlir::Block *createRetBlock(CIRGenFunction &cgf, mlir::Location loc) {
-      assert(returnBlock == nullptr && "only one return block per scope");
-      // Create the cleanup block but don't hook it up just yet.
+      assert((isa_and_nonnull<cir::CaseOp>(
+                  cgf.builder.getBlock()->getParentOp()) ||
+              retBlocks.size() == 0) &&
+             "only switches can hold more than one ret block");
+
+      // Create the return block but don't hook it up just yet.
       mlir::OpBuilder::InsertionGuard guard(cgf.builder);
-      returnBlock =
-          cgf.builder.createBlock(cgf.builder.getBlock()->getParent());
-      updateRetLoc(returnBlock, loc);
-      return returnBlock;
+      auto *b = cgf.builder.createBlock(cgf.builder.getBlock()->getParent());
+      retBlocks.push_back(b);
+      updateRetLoc(b, loc);
+      return b;
     }
 
     cir::ReturnOp emitReturn(mlir::Location loc);
     void emitImplicitReturn();
 
   public:
-    mlir::Block *getRetBlock() { return returnBlock; }
-    mlir::Location getRetLoc(mlir::Block *b) { return *returnLoc; }
-    void updateRetLoc(mlir::Block *b, mlir::Location loc) { returnLoc = loc; }
-
-    // Create the return block for this scope, or return the existing one.
-    // This get-or-create logic is necessary to handle multiple return
-    // statements within the same scope, which can happen if some of them are
-    // dead code or if there is a `goto` into the middle of the scope.
+    llvm::ArrayRef<mlir::Block *> getRetBlocks() { return retBlocks; }
+    mlir::Location getRetLoc(mlir::Block *b) { return retLocs.at(b); }
+    void updateRetLoc(mlir::Block *b, mlir::Location loc) {
+      retLocs.insert_or_assign(b, loc);
+    }
+
     mlir::Block *getOrCreateRetBlock(CIRGenFunction &cgf, mlir::Location loc) {
-      if (returnBlock == nullptr) {
-        returnBlock = createRetBlock(cgf, loc);
-        return returnBlock;
+      // Check if we're inside a case region
+      if (auto caseOp = mlir::dyn_cast_if_present<cir::CaseOp>(
+              cgf.builder.getBlock()->getParentOp())) {
+        auto iter = retBlockInCaseIndex.find(caseOp);
+        if (iter != retBlockInCaseIndex.end()) {
+          // Reuse existing return block
+          mlir::Block *ret = retBlocks[iter->second];
+          updateRetLoc(ret, loc);
+          return ret;
+        }
+        // Create new return block
+        mlir::Block *ret = createRetBlock(cgf, loc);
+        retBlockInCaseIndex[caseOp] = retBlocks.size() - 1;
+        return ret;
       }
-      updateRetLoc(returnBlock, loc);
-      return returnBlock;
+
+      if (normalRetBlockIndex) {
+        mlir::Block *ret = retBlocks[*normalRetBlockIndex];
+        updateRetLoc(ret, loc);
+        return ret;
+      }
+
+      mlir::Block *ret = createRetBlock(cgf, loc);
+      normalRetBlockIndex = retBlocks.size() - 1;
+      return ret;
     }
 
     mlir::Block *getEntryBlock() { return entryBlock; }
diff --git a/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp b/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp
index 88fedf1acc6a1..f603f5ec4383d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp
@@ -1846,13 +1846,13 @@ mlir::Value CIRGenItaniumCXXABI::getVirtualBaseClassOffset(
     const CXXRecordDecl *classDecl, const CXXRecordDecl *baseClassDecl) {
   CIRGenBuilderTy &builder = cgf.getBuilder();
   mlir::Value vtablePtr = cgf.getVTablePtr(loc, thisAddr, classDecl);
-  mlir::Value vtableBytePtr = builder.createBitcast(vtablePtr, cgm.UInt8PtrTy);
+  mlir::Value vtableBytePtr = builder.createBitcast(vtablePtr, cgm.uInt8PtrTy);
   CharUnits vbaseOffsetOffset =
       cgm.getItaniumVTableContext().getVirtualBaseOffsetOffset(classDecl,
                                                                baseClassDecl);
   mlir::Value offsetVal =
       builder.getSInt64(vbaseOffsetOffset.getQuantity(), loc);
-  auto vbaseOffsetPtr = cir::PtrStrideOp::create(builder, loc, cgm.UInt8PtrTy,
+  auto vbaseOffsetPtr = cir::PtrStrideOp::create(builder, loc, cgm.uInt8PtrTy,
                                                  vtableBytePtr, offsetVal);
 
   mlir::Value vbaseOffset;
@@ -1861,9 +1861,9 @@ mlir::Value CIRGenItaniumCXXABI::getVirtualBaseClassOffset(
     cgm.errorNYI(loc, "getVirtualBaseClassOffset: relative layout");
   } else {
     mlir::Value offsetPtr = builder.createBitcast(
-        vbaseOffsetPtr, builder.getPointerTo(cgm.PtrDiffTy));
+        vbaseOffsetPtr, builder.getPointerTo(cgm.ptrDiffTy));
     vbaseOffset = builder.createLoad(
-        loc, Address(offsetPtr, cgm.PtrDiffTy, cgf.getPointerAlign()));
+        loc, Address(offsetPtr, cgm.ptrDiffTy, cgf.getPointerAlign()));
   }
   return vbaseOffset;
 }
@@ -2244,7 +2244,7 @@ Address CIRGenItaniumCXXABI::initializeArrayCookie(CIRGenFunction &cgf,
 
   // Write the number of elements into the appropriate slot.
   Address numElementsPtr =
-      cookiePtr.withElementType(cgf.getBuilder(), cgf.SizeTy);
+      cookiePtr.withElementType(cgf.getBuilder(), cgf.sizeTy);
   cgf.getBuilder().createStore(loc, numElements, numElementsPtr);
 
   // Finally, compute a pointer to the actual data buffer by skipping
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index 46adfe28e377a..9f9b2db4771df 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -67,28 +67,28 @@ CIRGenModule::CIRGenModule(mlir::MLIRContext &mlirContext,
       abi(createCXXABI(*this)), genTypes(*this), vtables(*this) {
 
   // Initialize cached types
-  VoidTy = cir::VoidType::get(&getMLIRContext());
-  VoidPtrTy = cir::PointerType::get(VoidTy);
-  SInt8Ty = cir::IntType::get(&getMLIRContext(), 8, /*isSigned=*/true);
-  SInt16Ty = cir::IntType::get(&getMLIRContext(), 16, /*isSigned=*/true);
-  SInt32Ty = cir::IntType::get(&getMLIRContext(), 32, /*isSigned=*/true);
-  SInt64Ty = cir::IntType::get(&getMLIRContext(), 64, /*isSigned=*/true);
-  SInt128Ty = cir::IntType::get(&getMLIRContext(), 128, /*isSigned=*/true);
-  UInt8Ty = cir::IntType::get(&getMLIRContext(), 8, /*isSigned=*/false);
-  UInt8PtrTy = cir::PointerType::get(UInt8Ty);
+  voidTy = cir::VoidType::get(&getMLIRContext());
+  voidPtrTy = cir::PointerType::get(voidTy);
+  sInt8Ty = cir::IntType::get(&getMLIRContext(), 8, /*isSigned=*/true);
+  sInt16Ty = cir::IntType::get(&getMLIRContext(), 16, /*isSigned=*/true);
+  sInt32Ty = cir::IntType::get(&getMLIRContext(), 32, /*isSigned=*/true);
+  sInt64Ty = cir::IntType::get(&getMLIRContext(), 64, /*isSigned=*/true);
+  sInt128Ty = cir::IntType::get(&getMLIRContext(), 128, /*isSigned=*/true);
+  uInt8Ty = cir::IntType::get(&getMLIRContext(), 8, /*isSigned=*/false);
+  uInt8PtrTy = cir::PointerType::get(uInt8Ty);
   cirAllocaAddressSpace = getTargetCIRGenInfo().getCIRAllocaAddressSpace();
-  UInt16Ty = cir::IntType::get(&getMLIRContext(), 16, /*isSigned=*/false);
-  UInt32Ty = cir::IntType::get(&getMLIRContext(), 32, /*isSigned=*/false);
-  UInt64Ty = cir::IntType::get(&getMLIRContext(), 64, /*isSigned=*/false);
-  UInt128Ty = cir::IntType::get(&getMLIRContext(), 128, /*isSigned=*/false);
-  FP16Ty = cir::FP16Type::get(&getMLIRContext());
-  BFloat16Ty = cir::BF16Type::get(&getMLIRContext());
-  FloatTy = cir::SingleType::get(&getMLIRContext());
-  DoubleTy = cir::DoubleType::get(&getMLIRContext());
-  FP80Ty = cir::FP80Type::get(&getMLIRContext());
-  FP128Ty = cir::FP128Type::get(&getMLIRContext());
-
-  AllocaInt8PtrTy = cir::PointerType::get(UInt8Ty, cirAllocaAddressSpace);
+  uInt16Ty = cir::IntType::get(&getMLIRContext(), 16, /*isSigned=*/false);
+  uInt32Ty = cir::IntType::get(&getMLIRContext(), 32, /*isSigned=*/false);
+  uInt64Ty = cir::IntType::get(&getMLIRContext(), 64, /*isSigned=*/false);
+  uInt128Ty = cir::IntType::get(&getMLIRContext(), 128, /*isSigned=*/false);
+  fP16Ty = cir::FP16Type::get(&getMLIRContext());
+  bFloat16Ty = cir::BF16Type::get(&getMLIRContext());
+  floatTy = cir::SingleType::get(&getMLIRContext());
+  doubleTy = cir::DoubleType::get(&getMLIRContext());
+  fP80Ty = cir::FP80Type::get(&getMLIRContext());
+  fP128Ty = cir::FP128Type::get(&getMLIRContext());
+
+  allocaInt8PtrTy = cir::PointerType::get(uInt8Ty, cirAllocaAddressSpace);
 
   PointerAlignInBytes =
       astContext
@@ -97,16 +97,16 @@ CIRGenModule::CIRGenModule(mlir::MLIRContext &mlirContext,
           .getQuantity();
 
   const unsigned charSize = astContext.getTargetInfo().getCharWidth();
-  UCharTy = cir::IntType::get(&getMLIRContext(), charSize, /*isSigned=*/false);
+  uCharTy = cir::IntType::get(&getMLIRContext(), charSize, /*isSigned=*/false);
 
   // TODO(CIR): Should be updated once TypeSizeInfoAttr is upstreamed
   const unsigned sizeTypeSize =
       astContext.getTypeSize(astContext.getSignedSizeType());
   SizeSizeInBytes = astContext.toCharUnitsFromBits(sizeTypeSize).getQuantity();
   // In CIRGenTypeCache, UIntPtrTy and SizeType are fields of the same union
-  UIntPtrTy =
+  uIntPtrTy =
       cir::IntType::get(&getMLIRContext(), sizeTypeSize, /*isSigned=*/false);
-  PtrDiffTy =
+  ptrDiffTy =
       cir::IntType::get(&getMLIRContext(), sizeTypeSize, /*isSigned=*/true);
 
   std::optional<cir::SourceLanguage> sourceLanguage = getCIRSourceLanguage();
diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp
index 50101373f3e9c..527dfd21db8a5 100644
--- a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp
@@ -126,7 +126,7 @@ class OpenACCClauseCIREmitter final
         .CaseLower("default", mlir::acc::DeviceType::Default)
         .CaseLower("host", mlir::acc::DeviceType::Host)
         .CaseLower("multicore", mlir::acc::DeviceType::Multicore)
-        .CasesLower("nvidia", "acc_device_nvidia",
+        .CasesLower({"nvidia", "acc_device_nvidia"},
                     mlir::acc::DeviceType::Nvidia)
         .CaseLower("radeon", mlir::acc::DeviceType::Radeon);
   }
diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp
index be063033ddcfc..890f8a6c8339d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp
@@ -617,11 +617,11 @@ void OpenACCRecipeBuilderBase::createReductionRecipeCombiner(
   if (const auto *cat = cgf.getContext().getAsConstantArrayType(origType)) {
     // If we're in an array, we have to emit the combiner for each element of
     // the array.
-    auto itrTy = mlir::cast<cir::IntType>(cgf.PtrDiffTy);
+    auto itrTy = mlir::cast<cir::IntType>(cgf.ptrDiffTy);
     auto itrPtrTy = cir::PointerType::get(itrTy);
 
     mlir::Value zero =
-        builder.getConstInt(loc, mlir::cast<cir::IntType>(cgf.PtrDiffTy), 0);
+        builder.getConstInt(loc, mlir::cast<cir::IntType>(cgf.ptrDiffTy), 0);
     mlir::Value itr =
         cir::AllocaOp::create(builder, loc, itrPtrTy, itrTy, "itr",
                               cgf.cgm.getSize(cgf.getPointerAlign()));
@@ -633,7 +633,7 @@ void OpenACCRecipeBuilderBase::createReductionRecipeCombiner(
         [&](mlir::OpBuilder &b, mlir::Location loc) {
           auto loadItr = cir::LoadOp::create(builder, loc, {itr});
           mlir::Value arraySize = builder.getConstInt(
-              loc, mlir::cast<cir::IntType>(cgf.PtrDiffTy), cat->getZExtSize());
+              loc, mlir::cast<cir::IntType>(cgf.ptrDiffTy), cat->getZExtSize());
           auto cmp = builder.createCompare(loc, cir::CmpOpKind::lt, loadItr,
                                            arraySize);
           builder.createCondition(cmp);
diff --git a/clang/lib/CIR/CodeGen/CIRGenTypeCache.h b/clang/lib/CIR/CodeGen/CIRGenTypeCache.h
index ff5842cd86e04..0f63e91f45564 100644
--- a/clang/lib/CIR/CodeGen/CIRGenTypeCache.h
+++ b/clang/lib/CIR/CodeGen/CIRGenTypeCache.h
@@ -26,47 +26,47 @@ struct CIRGenTypeCache {
   CIRGenTypeCache() {}
 
   // ClangIR void type
-  cir::VoidType VoidTy;
+  cir::VoidType voidTy;
 
   // ClangIR signed integral types of common sizes
-  cir::IntType SInt8Ty;
-  cir::IntType SInt16Ty;
-  cir::IntType SInt32Ty;
-  cir::IntType SInt64Ty;
-  cir::IntType SInt128Ty;
+  cir::IntType sInt8Ty;
+  cir::IntType sInt16Ty;
+  cir::IntType sInt32Ty;
+  cir::IntType sInt64Ty;
+  cir::IntType sInt128Ty;
 
   // ClangIR unsigned integral type of common sizes
-  cir::IntType UInt8Ty;
-  cir::IntType UInt16Ty;
-  cir::IntType UInt32Ty;
-  cir::IntType UInt64Ty;
-  cir::IntType UInt128Ty;
+  cir::IntType uInt8Ty;
+  cir::IntType uInt16Ty;
+  cir::IntType uInt32Ty;
+  cir::IntType uInt64Ty;
+  cir::IntType uInt128Ty;
 
   // ClangIR floating-point types with fixed formats
-  cir::FP16Type FP16Ty;
-  cir::BF16Type BFloat16Ty;
-  cir::SingleType FloatTy;
-  cir::DoubleType DoubleTy;
-  cir::FP80Type FP80Ty;
-  cir::FP128Type FP128Ty;
+  cir::FP16Type fP16Ty;
+  cir::BF16Type bFloat16Ty;
+  cir::SingleType floatTy;
+  cir::DoubleType doubleTy;
+  cir::FP80Type fP80Ty;
+  cir::FP128Type fP128Ty;
 
   /// ClangIR char
-  mlir::Type UCharTy;
+  mlir::Type uCharTy;
 
   /// intptr_t, size_t, and ptrdiff_t, which we assume are the same size.
   union {
-    mlir::Type UIntPtrTy;
-    mlir::Type SizeTy;
+    mlir::Type uIntPtrTy;
+    mlir::Type sizeTy;
   };
 
-  mlir::Type PtrDiffTy;
+  mlir::Type ptrDiffTy;
 
   /// void* in address space 0
-  cir::PointerType VoidPtrTy;
-  cir::PointerType UInt8PtrTy;
+  cir::PointerType voidPtrTy;
+  cir::PointerType uInt8PtrTy;
 
   /// void* in alloca address space
-  cir::PointerType AllocaInt8PtrTy;
+  cir::PointerType allocaInt8PtrTy;
 
   /// The size and alignment of a pointer into the generic address space.
   union {
diff --git a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
index d1b91d0c73c04..03618d4a8a8a6 100644
--- a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
@@ -71,7 +71,7 @@ mlir::Type CIRGenTypes::convertFunctionTypeInternal(QualType qft) {
   if (!isFuncTypeConvertible(ft)) {
     cgm.errorNYI(SourceLocation(), "function type involving an incomplete type",
                  qft);
-    return cir::FuncType::get(SmallVector<mlir::Type, 1>{}, cgm.VoidTy);
+    return cir::FuncType::get(SmallVector<mlir::Type, 1>{}, cgm.voidTy);
   }
 
   const CIRGenFunctionInfo *fi;
@@ -298,7 +298,7 @@ mlir::Type CIRGenTypes::convertType(QualType type) {
     switch (cast<BuiltinType>(ty)->getKind()) {
     // void
     case BuiltinType::Void:
-      resultType = cgm.VoidTy;
+      resultType = cgm.voidTy;
       break;
 
     // bool
@@ -338,42 +338,42 @@ mlir::Type CIRGenTypes::convertType(QualType type) {
 
     // Floating-point types
     case BuiltinType::Float16:
-      resultType = cgm.FP16Ty;
+      resultType = cgm.fP16Ty;
       break;
     case BuiltinType::Half:
       if (astContext.getLangOpts().NativeHalfType ||
           !astContext.getTargetInfo().useFP16ConversionIntrinsics()) {
-        resultType = cgm.FP16Ty;
+        resultType = cgm.fP16Ty;
       } else {
         cgm.errorNYI(SourceLocation(), "processing of built-in type", type);
-        resultType = cgm.SInt32Ty;
+        resultType = cgm.sInt32Ty;
       }
       break;
     case BuiltinType::BFloat16:
-      resultType = cgm.BFloat16Ty;
+      resultType = cgm.bFloat16Ty;
       break;
     case BuiltinType::Float:
       assert(&astContext.getFloatTypeSemantics(type) ==
                  &llvm::APFloat::IEEEsingle() &&
              "ClangIR NYI: 'float' in a format other than IEEE 32-bit");
-      resultType = cgm.FloatTy;
+      resultType = cgm.floatTy;
       break;
     case BuiltinType::Double:
       assert(&astContext.getFloatTypeSemantics(type) ==
                  &llvm::APFloat::IEEEdouble() &&
              "ClangIR NYI: 'double' in a format other than IEEE 64-bit");
-      resultType = cgm.DoubleTy;
+      resultType = cgm.doubleTy;
       break;
     case BuiltinType::LongDouble:
       resultType =
           builder.getLongDoubleTy(astContext.getFloatTypeSemantics(type));
       break;
     case BuiltinType::Float128:
-      resultType = cgm.FP128Ty;
+      resultType = cgm.fP128Ty;
       break;
     case BuiltinType::Ibm128:
       cgm.errorNYI(SourceLocation(), "processing of built-in type", type);
-      resultType = cgm.SInt32Ty;
+      resultType = cgm.sInt32Ty;
       break;
 
     case BuiltinType::NullPtr:
@@ -386,7 +386,7 @@ mlir::Type CIRGenTypes::convertType(QualType type) {
 
     default:
       cgm.errorNYI(SourceLocation(), "processing of built-in type", type);
-      resultType = cgm.SInt32Ty;
+      resultType = cgm.sInt32Ty;
       break;
     }
     break;
@@ -439,7 +439,7 @@ mlir::Type CIRGenTypes::convertType(QualType type) {
     // int X[] -> [0 x int], unless the element type is not sized.  If it is
     // unsized (e.g. an incomplete record) just use [0 x i8].
     if (!cir::isSized(elemTy)) {
-      elemTy = cgm.SInt8Ty;
+      elemTy = cgm.sInt8Ty;
     }
 
     resultType = cir::ArrayType::get(elemTy, 0);
@@ -454,7 +454,7 @@ mlir::Type CIRGenTypes::convertType(QualType type) {
     // i8 just to have a concrete type"
     if (!cir::isSized(elemTy)) {
       cgm.errorNYI(SourceLocation(), "arrays of undefined struct type", type);
-      resultType = cgm.UInt32Ty;
+      resultType = cgm.uInt32Ty;
       break;
     }
 
@@ -477,7 +477,7 @@ mlir::Type CIRGenTypes::convertType(QualType type) {
     // Return a placeholder 'i32' type.  This can be changed later when the
     // type is defined (see UpdateCompletedType), but is likely to be the
     // "right" answer.
-    resultType = cgm.UInt32Ty;
+    resultType = cgm.uInt32Ty;
     break;
   }
 
@@ -490,7 +490,7 @@ mlir::Type CIRGenTypes::convertType(QualType type) {
     const auto *bitIntTy = cast<BitIntType>(type);
     if (bitIntTy->getNumBits() > cir::IntType::maxBitwidth()) {
       cgm.errorNYI(SourceLocation(), "large _BitInt type", type);
-      resultType = cgm.SInt32Ty;
+      resultType = cgm.sInt32Ty;
     } else {
       resultType = cir::IntType::get(&getMLIRContext(), bitIntTy->getNumBits(),
                                      bitIntTy->isSigned());
@@ -515,7 +515,7 @@ mlir::Type CIRGenTypes::convertType(QualType type) {
   default:
     cgm.errorNYI(SourceLocation(), "processing of type",
                  type->getTypeClassName());
-    resultType = cgm.SInt32Ty;
+    resultType = cgm.sInt32Ty;
     break;
   }
 
diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
index 2d2ef422bfaef..7ba03ce40140c 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
@@ -286,14 +286,14 @@ void cir::ConditionOp::getSuccessorRegions(
   // Parent is a loop: condition may branch to the body or to the parent op.
   if (auto loopOp = dyn_cast<LoopOpInterface>(getOperation()->getParentOp())) {
     regions.emplace_back(&loopOp.getBody(), loopOp.getBody().getArguments());
-    regions.emplace_back(loopOp->getResults());
+    regions.emplace_back(getOperation(), loopOp->getResults());
   }
 
   assert(!cir::MissingFeatures::awaitOp());
 }
 
 MutableOperandRange
-cir::ConditionOp::getMutableSuccessorOperands(RegionBranchPoint point) {
+cir::ConditionOp::getMutableSuccessorOperands(RegionSuccessor point) {
   // No values are yielded to the successor region.
   return MutableOperandRange(getOperation(), 0, 0);
 }
@@ -989,7 +989,8 @@ void cir::IfOp::getSuccessorRegions(mlir::RegionBranchPoint point,
                                     SmallVectorImpl<RegionSuccessor> &regions) {
   // The `then` and the `else` region branch back to the parent operation.
   if (!point.isParent()) {
-    regions.push_back(RegionSuccessor());
+    regions.push_back(
+        RegionSuccessor(getOperation(), getOperation()->getResults()));
     return;
   }
 
@@ -1039,7 +1040,7 @@ void cir::ScopeOp::getSuccessorRegions(
     mlir::RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> &regions) {
   // The only region always branch back to the parent operation.
   if (!point.isParent()) {
-    regions.push_back(RegionSuccessor(getODSResults(0)));
+    regions.push_back(RegionSuccessor(getOperation(), getODSResults(0)));
     return;
   }
 
@@ -1124,7 +1125,8 @@ Block *cir::BrCondOp::getSuccessorForOperands(ArrayRef<Attribute> operands) {
 void cir::CaseOp::getSuccessorRegions(
     mlir::RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> &regions) {
   if (!point.isParent()) {
-    regions.push_back(RegionSuccessor());
+    regions.push_back(
+        RegionSuccessor(getOperation(), getOperation()->getResults()));
     return;
   }
   regions.push_back(RegionSuccessor(&getCaseRegion()));
@@ -1188,7 +1190,8 @@ static void printSwitchOp(OpAsmPrinter &p, cir::SwitchOp op,
 void cir::SwitchOp::getSuccessorRegions(
     mlir::RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> &region) {
   if (!point.isParent()) {
-    region.push_back(RegionSuccessor());
+    region.push_back(
+        RegionSuccessor(getOperation(), getOperation()->getResults()));
     return;
   }
 
@@ -1402,7 +1405,8 @@ void cir::GlobalOp::getSuccessorRegions(
     mlir::RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> &regions) {
   // The `ctor` and `dtor` regions always branch back to the parent operation.
   if (!point.isParent()) {
-    regions.push_back(RegionSuccessor());
+    regions.push_back(
+        RegionSuccessor(getOperation(), getOperation()->getResults()));
     return;
   }
 
@@ -1961,7 +1965,7 @@ void cir::TernaryOp::getSuccessorRegions(
     mlir::RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> &regions) {
   // The `true` and the `false` region branch back to the parent operation.
   if (!point.isParent()) {
-    regions.push_back(RegionSuccessor(this->getODSResults(0)));
+    regions.push_back(RegionSuccessor(getOperation(), this->getODSResults(0)));
     return;
   }
 
@@ -2978,7 +2982,8 @@ void cir::TryOp::getSuccessorRegions(
     llvm::SmallVectorImpl<mlir::RegionSuccessor> &regions) {
   // The `try` and the `catchers` region branch back to the parent operation.
   if (!point.isParent()) {
-    regions.push_back(mlir::RegionSuccessor());
+    regions.push_back(
+        RegionSuccessor(getOperation(), getOperation()->getResults()));
     return;
   }
 
diff --git a/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp b/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp
index 21c96febf8403..ca7554e4e3754 100644
--- a/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp
@@ -606,10 +606,12 @@ class CIRTryOpFlattening : public mlir::OpRewritePattern<cir::TryOp> {
     // `cir.try_call`.
     llvm::SmallVector<cir::CallOp, 4> callsToRewrite;
     tryOp.getTryRegion().walk([&](CallOp op) {
+      if (op.getNothrow())
+        return;
+
       // Only grab calls within immediate closest TryOp scope.
       if (op->getParentOfType<cir::TryOp>() != tryOp)
         return;
-      assert(!cir::MissingFeatures::opCallExceptionAttr());
       callsToRewrite.push_back(op);
     });
 
diff --git a/clang/lib/CIR/Interfaces/CIRLoopOpInterface.cpp b/clang/lib/CIR/Interfaces/CIRLoopOpInterface.cpp
index 0ce5017a399da..6de51f12837ba 100644
--- a/clang/lib/CIR/Interfaces/CIRLoopOpInterface.cpp
+++ b/clang/lib/CIR/Interfaces/CIRLoopOpInterface.cpp
@@ -17,7 +17,7 @@ namespace cir {
 void LoopOpInterface::getLoopOpSuccessorRegions(
     LoopOpInterface op, mlir::RegionBranchPoint point,
     llvm::SmallVectorImpl<mlir::RegionSuccessor> &regions) {
-  assert(point.isParent() || point.getRegionOrNull());
+  assert(point.isParent() || point.getTerminatorPredecessorOrNull());
 
   // Branching to first region: go to condition or body (do-while).
   if (point.isParent()) {
@@ -25,15 +25,18 @@ void LoopOpInterface::getLoopOpSuccessorRegions(
     return;
   }
 
+  mlir::Region *parentRegion =
+      point.getTerminatorPredecessorOrNull()->getParentRegion();
+
   // Branching from condition: go to body or exit.
-  if (&op.getCond() == point.getRegionOrNull()) {
-    regions.emplace_back(mlir::RegionSuccessor(op->getResults()));
+  if (&op.getCond() == parentRegion) {
+    regions.emplace_back(mlir::RegionSuccessor(op, op->getResults()));
     regions.emplace_back(&op.getBody(), op.getBody().getArguments());
     return;
   }
 
   // Branching from body: go to step (for) or condition.
-  if (&op.getBody() == point.getRegionOrNull()) {
+  if (&op.getBody() == parentRegion) {
     // FIXME(cir): Should we consider break/continue statements here?
     mlir::Region *afterBody =
         (op.maybeGetStep() ? op.maybeGetStep() : &op.getCond());
@@ -42,7 +45,7 @@ void LoopOpInterface::getLoopOpSuccessorRegions(
   }
 
   // Branching from step: go to condition.
-  if (op.maybeGetStep() == point.getRegionOrNull()) {
+  if (op.maybeGetStep() == parentRegion) {
     regions.emplace_back(&op.getCond(), op.getCond().getArguments());
     return;
   }
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index aefc262dca17f..b967a26dd19d7 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -313,7 +313,7 @@ getCodeModel(const CodeGenOptions &CodeGenOpts) {
                            .Case("kernel", llvm::CodeModel::Kernel)
                            .Case("medium", llvm::CodeModel::Medium)
                            .Case("large", llvm::CodeModel::Large)
-                           .Cases("default", "", ~1u)
+                           .Cases({"default", ""}, ~1u)
                            .Default(~0u);
   assert(CodeModel != ~0u && "invalid code model!");
   if (CodeModel == ~1u)
@@ -800,16 +800,6 @@ static void addSanitizers(const Triple &TargetTriple,
       MPM.addPass(DataFlowSanitizerPass(LangOpts.NoSanitizeFiles,
                                         PB.getVirtualFileSystemPtr()));
     }
-
-    if (LangOpts.Sanitize.has(SanitizerKind::AllocToken)) {
-      if (Level == OptimizationLevel::O0) {
-        // The default pass builder only infers libcall function attrs when
-        // optimizing, so we insert it here because we need it for accurate
-        // memory allocation function detection.
-        MPM.addPass(InferFunctionAttrsPass());
-      }
-      MPM.addPass(AllocTokenPass(getAllocTokenOptions(LangOpts, CodeGenOpts)));
-    }
   };
   if (ClSanitizeOnOptimizerEarlyEP) {
     PB.registerOptimizerEarlyEPCallback(
@@ -852,6 +842,23 @@ static void addSanitizers(const Triple &TargetTriple,
   }
 }
 
+static void addAllocTokenPass(const Triple &TargetTriple,
+                              const CodeGenOptions &CodeGenOpts,
+                              const LangOptions &LangOpts, PassBuilder &PB) {
+  PB.registerOptimizerLastEPCallback([&](ModulePassManager &MPM,
+                                         OptimizationLevel Level,
+                                         ThinOrFullLTOPhase) {
+    if (Level == OptimizationLevel::O0 &&
+        LangOpts.Sanitize.has(SanitizerKind::AllocToken)) {
+      // The default pass builder only infers libcall function attrs when
+      // optimizing, so we insert it here because we need it for accurate
+      // memory allocation function detection with -fsanitize=alloc-token.
+      MPM.addPass(InferFunctionAttrsPass());
+    }
+    MPM.addPass(AllocTokenPass(getAllocTokenOptions(LangOpts, CodeGenOpts)));
+  });
+}
+
 void EmitAssemblyHelper::RunOptimizationPipeline(
     BackendAction Action, std::unique_ptr<raw_pwrite_stream> &OS,
     std::unique_ptr<llvm::ToolOutputFile> &ThinLinkOS, BackendConsumer *BC) {
@@ -1106,6 +1113,7 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
     if (!IsThinLTOPostLink) {
       addSanitizers(TargetTriple, CodeGenOpts, LangOpts, PB);
       addKCFIPass(TargetTriple, LangOpts, PB);
+      addAllocTokenPass(TargetTriple, CodeGenOpts, LangOpts, PB);
     }
 
     if (std::optional<GCOVOptions> Options =
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index fd14cd6926fe2..b81e0d02da2c9 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -4506,6 +4506,15 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
     return RValue::get(AI);
   }
 
+  case Builtin::BI__builtin_infer_alloc_token: {
+    llvm::MDNode *MDN = buildAllocToken(E);
+    llvm::Value *MDV = MetadataAsValue::get(getLLVMContext(), MDN);
+    llvm::Function *F =
+        CGM.getIntrinsic(llvm::Intrinsic::alloc_token_id, {IntPtrTy});
+    llvm::CallBase *TokenID = Builder.CreateCall(F, MDV);
+    return RValue::get(TokenID);
+  }
+
   case Builtin::BIbzero:
   case Builtin::BI__builtin_bzero: {
     Address Dest = EmitPointerWithAlignment(E->getArg(0));
diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index 6af806686a3b9..ca579c915f49d 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -345,7 +345,7 @@ void CGDebugInfo::setLocation(SourceLocation Loc) {
   if (Loc.isInvalid())
     return;
 
-  CurLoc = CGM.getContext().getSourceManager().getExpansionLoc(Loc);
+  CurLoc = CGM.getContext().getSourceManager().getFileLoc(Loc);
 
   // If we've changed files in the middle of a lexical scope go ahead
   // and create a new lexical scope with file node if it's different
@@ -572,7 +572,7 @@ llvm::DIFile *CGDebugInfo::getOrCreateFile(SourceLocation Loc) {
     FileName = TheCU->getFile()->getFilename();
     CSInfo = TheCU->getFile()->getChecksum();
   } else {
-    PresumedLoc PLoc = SM.getPresumedLoc(Loc);
+    PresumedLoc PLoc = SM.getPresumedLoc(SM.getFileLoc(Loc));
     FileName = PLoc.getFilename();
 
     if (FileName.empty()) {
@@ -599,7 +599,8 @@ llvm::DIFile *CGDebugInfo::getOrCreateFile(SourceLocation Loc) {
     if (CSKind)
       CSInfo.emplace(*CSKind, Checksum);
   }
-  return createFile(FileName, CSInfo, getSource(SM, SM.getFileID(Loc)));
+  return createFile(FileName, CSInfo,
+                    getSource(SM, SM.getFileID(SM.getFileLoc(Loc))));
 }
 
 llvm::DIFile *CGDebugInfo::createFile(
@@ -654,7 +655,7 @@ unsigned CGDebugInfo::getLineNumber(SourceLocation Loc) {
   if (Loc.isInvalid())
     return 0;
   SourceManager &SM = CGM.getContext().getSourceManager();
-  return SM.getPresumedLoc(Loc).getLine();
+  return SM.getPresumedLoc(SM.getFileLoc(Loc)).getLine();
 }
 
 unsigned CGDebugInfo::getColumnNumber(SourceLocation Loc, bool Force) {
@@ -666,7 +667,8 @@ unsigned CGDebugInfo::getColumnNumber(SourceLocation Loc, bool Force) {
   if (Loc.isInvalid() && CurLoc.isInvalid())
     return 0;
   SourceManager &SM = CGM.getContext().getSourceManager();
-  PresumedLoc PLoc = SM.getPresumedLoc(Loc.isValid() ? Loc : CurLoc);
+  PresumedLoc PLoc =
+      SM.getPresumedLoc(Loc.isValid() ? SM.getFileLoc(Loc) : CurLoc);
   return PLoc.isValid() ? PLoc.getColumn() : 0;
 }
 
@@ -1174,14 +1176,16 @@ llvm::DIType *CGDebugInfo::CreateType(const BuiltinType *BT) {
 }
 
 llvm::DIType *CGDebugInfo::CreateType(const BitIntType *Ty) {
-
-  StringRef Name = Ty->isUnsigned() ? "unsigned _BitInt" : "_BitInt";
+  SmallString<32> Name;
+  llvm::raw_svector_ostream OS(Name);
+  OS << (Ty->isUnsigned() ? "unsigned _BitInt(" : "_BitInt(")
+     << Ty->getNumBits() << ")";
   llvm::dwarf::TypeKind Encoding = Ty->isUnsigned()
                                        ? llvm::dwarf::DW_ATE_unsigned
                                        : llvm::dwarf::DW_ATE_signed;
-
   return DBuilder.createBasicType(Name, CGM.getContext().getTypeSize(Ty),
-                                  Encoding);
+                                  Encoding, llvm::DINode::FlagZero, 0,
+                                  Ty->getNumBits());
 }
 
 llvm::DIType *CGDebugInfo::CreateType(const ComplexType *Ty) {
@@ -5000,7 +5004,7 @@ void CGDebugInfo::EmitLocation(CGBuilderTy &Builder, SourceLocation Loc) {
   // Update our current location
   setLocation(Loc);
 
-  if (CurLoc.isInvalid() || CurLoc.isMacroID() || LexicalBlockStack.empty())
+  if (CurLoc.isInvalid() || LexicalBlockStack.empty())
     return;
 
   llvm::MDNode *Scope = LexicalBlockStack.back();
@@ -6276,7 +6280,8 @@ void CGDebugInfo::EmitGlobalAlias(const llvm::GlobalValue *GV,
 void CGDebugInfo::AddStringLiteralDebugInfo(llvm::GlobalVariable *GV,
                                             const StringLiteral *S) {
   SourceLocation Loc = S->getStrTokenLoc(0);
-  PresumedLoc PLoc = CGM.getContext().getSourceManager().getPresumedLoc(Loc);
+  SourceManager &SM = CGM.getContext().getSourceManager();
+  PresumedLoc PLoc = SM.getPresumedLoc(SM.getFileLoc(Loc));
   if (!PLoc.isValid())
     return;
 
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 301d5770cf78f..01f2161f27555 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -2297,9 +2297,13 @@ void CodeGenFunction::EmitStoreOfScalar(llvm::Value *Value, Address Addr,
           CGM.getABIInfo().getOptimalVectorMemoryType(VecTy, getLangOpts());
       if (!ClangVecTy->isPackedVectorBoolType(getContext()) &&
           VecTy != NewVecTy) {
-        SmallVector<int, 16> Mask(NewVecTy->getNumElements(), -1);
+        SmallVector<int, 16> Mask(NewVecTy->getNumElements(),
+                                  VecTy->getNumElements());
         std::iota(Mask.begin(), Mask.begin() + VecTy->getNumElements(), 0);
-        Value = Builder.CreateShuffleVector(Value, Mask, "extractVec");
+        // Use undef instead of poison for the padding lanes, to make sure no
+        // padding bits are poisoned, which may break coercion.
+        Value = Builder.CreateShuffleVector(Value, llvm::UndefValue::get(VecTy),
+                                            Mask, "extractVec");
         SrcTy = NewVecTy;
       }
       if (Addr.getElementType() != SrcTy)
diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
index 384bd59e7533a..fbf4a5722caed 100644
--- a/clang/lib/CodeGen/CGHLSLBuiltins.cpp
+++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
@@ -206,7 +206,7 @@ static Intrinsic::ID getWaveActiveSumIntrinsic(llvm::Triple::ArchType Arch,
   }
 }
 
-// Return wave active sum that corresponds to the QT scalar type
+// Return wave active max that corresponds to the QT scalar type
 static Intrinsic::ID getWaveActiveMaxIntrinsic(llvm::Triple::ArchType Arch,
                                                CGHLSLRuntime &RT, QualType QT) {
   switch (Arch) {
@@ -225,6 +225,25 @@ static Intrinsic::ID getWaveActiveMaxIntrinsic(llvm::Triple::ArchType Arch,
   }
 }
 
+// Return wave active min that corresponds to the QT scalar type
+static Intrinsic::ID getWaveActiveMinIntrinsic(llvm::Triple::ArchType Arch,
+                                               CGHLSLRuntime &RT, QualType QT) {
+  switch (Arch) {
+  case llvm::Triple::spirv:
+    if (QT->isUnsignedIntegerType())
+      return Intrinsic::spv_wave_reduce_umin;
+    return Intrinsic::spv_wave_reduce_min;
+  case llvm::Triple::dxil: {
+    if (QT->isUnsignedIntegerType())
+      return Intrinsic::dx_wave_reduce_umin;
+    return Intrinsic::dx_wave_reduce_min;
+  }
+  default:
+    llvm_unreachable("Intrinsic WaveActiveMin"
+                     " not supported by target architecture");
+  }
+}
+
 // Returns the mangled name for a builtin function that the SPIR-V backend
 // will expand into a spec Constant.
 static std::string getSpecConstantFunctionName(clang::QualType SpecConstantType,
@@ -742,6 +761,17 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
                                &CGM.getModule(), IID, {OpExpr->getType()}),
                            ArrayRef{OpExpr}, "hlsl.wave.active.max");
   }
+  case Builtin::BI__builtin_hlsl_wave_active_min: {
+    // Due to the use of variadic arguments, explicitly retreive argument
+    Value *OpExpr = EmitScalarExpr(E->getArg(0));
+    Intrinsic::ID IID = getWaveActiveMinIntrinsic(
+        getTarget().getTriple().getArch(), CGM.getHLSLRuntime(),
+        E->getArg(0)->getType());
+
+    return EmitRuntimeCall(Intrinsic::getOrInsertDeclaration(
+                               &CGM.getModule(), IID, {OpExpr->getType()}),
+                           ArrayRef{OpExpr}, "hlsl.wave.active.min");
+  }
   case Builtin::BI__builtin_hlsl_wave_get_lane_index: {
     // We don't define a SPIR-V intrinsic, instead it is a SPIR-V built-in
     // defined in SPIRVBuiltins.td. So instead we manually get the matching name
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 66fea920812c2..121de42248e3b 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -3731,6 +3731,7 @@ CGOpenMPRuntime::emitTaskInit(CodeGenFunction &CGF, SourceLocation Loc,
     DestructorsFlag = 0x8,
     PriorityFlag = 0x20,
     DetachableFlag = 0x40,
+    FreeAgentFlag = 0x80,
   };
   unsigned Flags = Data.Tied ? TiedFlag : 0;
   bool NeedsCleanup = false;
@@ -3740,6 +3741,11 @@ CGOpenMPRuntime::emitTaskInit(CodeGenFunction &CGF, SourceLocation Loc,
     if (NeedsCleanup)
       Flags = Flags | DestructorsFlag;
   }
+  if (const auto *Clause = D.getSingleClause<OMPThreadsetClause>()) {
+    OpenMPThreadsetKind Kind = Clause->getThreadsetKind();
+    if (Kind == OMPC_THREADSET_omp_pool)
+      Flags = Flags | FreeAgentFlag;
+  }
   if (Data.Priority.getInt())
     Flags = Flags | PriorityFlag;
   if (D.hasClausesOfKind<OMPDetachClause>())
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index f49a5af2c9587..9eab70955b6b9 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -647,8 +647,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_ballot_w64: {
     llvm::Type *ResultType = ConvertType(E->getType());
     llvm::Value *Src = EmitScalarExpr(E->getArg(0));
-    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, { ResultType });
-    return Builder.CreateCall(F, { Src });
+    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, {ResultType});
+    return Builder.CreateCall(F, {Src});
   }
   case AMDGPU::BI__builtin_amdgcn_inverse_ballot_w32:
   case AMDGPU::BI__builtin_amdgcn_inverse_ballot_w64: {
@@ -1139,6 +1139,83 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_image_sample_cube_v4f16_f32:
     return emitAMDGCNImageOverloadedReturnType(
         *this, E, Intrinsic::amdgcn_image_sample_cube, false);
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_1d_v4f32_f32:
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_1d_v4f16_f32:
+    return emitAMDGCNImageOverloadedReturnType(
+        *this, E, Intrinsic::amdgcn_image_sample_lz_1d, false);
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_1d_v4f32_f32:
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_1d_v4f16_f32:
+    return emitAMDGCNImageOverloadedReturnType(
+        *this, E, Intrinsic::amdgcn_image_sample_l_1d, false);
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_1d_v4f32_f32:
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_1d_v4f16_f32:
+    return emitAMDGCNImageOverloadedReturnType(
+        *this, E, Intrinsic::amdgcn_image_sample_d_1d, false);
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_2d_v4f32_f32:
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_2d_v4f16_f32:
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_2d_f32_f32:
+    return emitAMDGCNImageOverloadedReturnType(
+        *this, E, Intrinsic::amdgcn_image_sample_lz_2d, false);
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_2d_v4f32_f32:
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_2d_v4f16_f32:
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_2d_f32_f32:
+    return emitAMDGCNImageOverloadedReturnType(
+        *this, E, Intrinsic::amdgcn_image_sample_l_2d, false);
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_2d_v4f32_f32:
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_2d_v4f16_f32:
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_2d_f32_f32:
+    return emitAMDGCNImageOverloadedReturnType(
+        *this, E, Intrinsic::amdgcn_image_sample_d_2d, false);
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_3d_v4f32_f32:
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_3d_v4f16_f32:
+    return emitAMDGCNImageOverloadedReturnType(
+        *this, E, Intrinsic::amdgcn_image_sample_lz_3d, false);
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_3d_v4f32_f32:
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_3d_v4f16_f32:
+    return emitAMDGCNImageOverloadedReturnType(
+        *this, E, Intrinsic::amdgcn_image_sample_l_3d, false);
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_3d_v4f32_f32:
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_3d_v4f16_f32:
+    return emitAMDGCNImageOverloadedReturnType(
+        *this, E, Intrinsic::amdgcn_image_sample_d_3d, false);
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_cube_v4f32_f32:
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_cube_v4f16_f32:
+    return emitAMDGCNImageOverloadedReturnType(
+        *this, E, Intrinsic::amdgcn_image_sample_lz_cube, false);
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_cube_v4f32_f32:
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_cube_v4f16_f32:
+    return emitAMDGCNImageOverloadedReturnType(
+        *this, E, Intrinsic::amdgcn_image_sample_l_cube, false);
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_1darray_v4f32_f32:
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_1darray_v4f16_f32:
+    return emitAMDGCNImageOverloadedReturnType(
+        *this, E, Intrinsic::amdgcn_image_sample_lz_1darray, false);
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_1darray_v4f32_f32:
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_1darray_v4f16_f32:
+    return emitAMDGCNImageOverloadedReturnType(
+        *this, E, Intrinsic::amdgcn_image_sample_l_1darray, false);
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_1darray_v4f32_f32:
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_1darray_v4f16_f32:
+    return emitAMDGCNImageOverloadedReturnType(
+        *this, E, Intrinsic::amdgcn_image_sample_d_1darray, false);
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_2darray_v4f32_f32:
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_2darray_v4f16_f32:
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_2darray_f32_f32:
+    return emitAMDGCNImageOverloadedReturnType(
+        *this, E, Intrinsic::amdgcn_image_sample_lz_2darray, false);
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_2darray_v4f32_f32:
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_2darray_v4f16_f32:
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_2darray_f32_f32:
+    return emitAMDGCNImageOverloadedReturnType(
+        *this, E, Intrinsic::amdgcn_image_sample_l_2darray, false);
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_2darray_v4f32_f32:
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_2darray_v4f16_f32:
+  case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_2darray_f32_f32:
+    return emitAMDGCNImageOverloadedReturnType(
+        *this, E, Intrinsic::amdgcn_image_sample_d_2darray, false);
+  case clang::AMDGPU::BI__builtin_amdgcn_image_gather4_lz_2d_v4f32_f32:
+    return emitAMDGCNImageOverloadedReturnType(
+        *this, E, Intrinsic::amdgcn_image_gather4_lz_2d, false);
   case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
   case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
     llvm::FixedVectorType *VT = FixedVectorType::get(Builder.getInt32Ty(), 8);
diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index 60f9b86333670..15fa78ddba715 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -1193,14 +1193,22 @@ static const ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = {
   NEONMAP1(vaddlvq_s32, aarch64_neon_saddlv, AddRetType | Add1ArgType),
   NEONMAP1(vaddlvq_u32, aarch64_neon_uaddlv, AddRetType | Add1ArgType),
   NEONMAP1(vaddv_f32, aarch64_neon_faddv, AddRetType | Add1ArgType),
-  NEONMAP1(vaddv_s32, aarch64_neon_saddv, AddRetType | Add1ArgType),
-  NEONMAP1(vaddv_u32, aarch64_neon_uaddv, AddRetType | Add1ArgType),
+  NEONMAP1(vaddv_s16, vector_reduce_add, Add1ArgType),
+  NEONMAP1(vaddv_s32, vector_reduce_add, Add1ArgType),
+  NEONMAP1(vaddv_s8, vector_reduce_add, Add1ArgType),
+  NEONMAP1(vaddv_u16, vector_reduce_add, Add1ArgType),
+  NEONMAP1(vaddv_u32, vector_reduce_add, Add1ArgType),
+  NEONMAP1(vaddv_u8, vector_reduce_add, Add1ArgType),
   NEONMAP1(vaddvq_f32, aarch64_neon_faddv, AddRetType | Add1ArgType),
   NEONMAP1(vaddvq_f64, aarch64_neon_faddv, AddRetType | Add1ArgType),
-  NEONMAP1(vaddvq_s32, aarch64_neon_saddv, AddRetType | Add1ArgType),
-  NEONMAP1(vaddvq_s64, aarch64_neon_saddv, AddRetType | Add1ArgType),
-  NEONMAP1(vaddvq_u32, aarch64_neon_uaddv, AddRetType | Add1ArgType),
-  NEONMAP1(vaddvq_u64, aarch64_neon_uaddv, AddRetType | Add1ArgType),
+  NEONMAP1(vaddvq_s16, vector_reduce_add, Add1ArgType),
+  NEONMAP1(vaddvq_s32, vector_reduce_add, Add1ArgType),
+  NEONMAP1(vaddvq_s64, vector_reduce_add, Add1ArgType),
+  NEONMAP1(vaddvq_s8, vector_reduce_add, Add1ArgType),
+  NEONMAP1(vaddvq_u16, vector_reduce_add, Add1ArgType),
+  NEONMAP1(vaddvq_u32, vector_reduce_add, Add1ArgType),
+  NEONMAP1(vaddvq_u64, vector_reduce_add, Add1ArgType),
+  NEONMAP1(vaddvq_u8, vector_reduce_add, Add1ArgType),
   NEONMAP1(vcaged_f64, aarch64_neon_facge, AddRetType | Add1ArgType),
   NEONMAP1(vcages_f32, aarch64_neon_facge, AddRetType | Add1ArgType),
   NEONMAP1(vcagtd_f64, aarch64_neon_facgt, AddRetType | Add1ArgType),
@@ -1243,27 +1251,43 @@ static const ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = {
   NEONMAP1(vmaxnmvq_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
   NEONMAP1(vmaxnmvq_f64, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
   NEONMAP1(vmaxv_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
-  NEONMAP1(vmaxv_s32, aarch64_neon_smaxv, AddRetType | Add1ArgType),
-  NEONMAP1(vmaxv_u32, aarch64_neon_umaxv, AddRetType | Add1ArgType),
+  NEONMAP1(vmaxv_s16, vector_reduce_smax, Add1ArgType),
+  NEONMAP1(vmaxv_s32, vector_reduce_smax, Add1ArgType),
+  NEONMAP1(vmaxv_s8, vector_reduce_smax, Add1ArgType),
+  NEONMAP1(vmaxv_u16, vector_reduce_umax, Add1ArgType),
+  NEONMAP1(vmaxv_u32, vector_reduce_umax, Add1ArgType),
+  NEONMAP1(vmaxv_u8, vector_reduce_umax, Add1ArgType),
   NEONMAP1(vmaxvq_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
   NEONMAP1(vmaxvq_f64, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
-  NEONMAP1(vmaxvq_s32, aarch64_neon_smaxv, AddRetType | Add1ArgType),
-  NEONMAP1(vmaxvq_u32, aarch64_neon_umaxv, AddRetType | Add1ArgType),
+  NEONMAP1(vmaxvq_s16, vector_reduce_smax, Add1ArgType),
+  NEONMAP1(vmaxvq_s32, vector_reduce_smax, Add1ArgType),
+  NEONMAP1(vmaxvq_s8, vector_reduce_smax, Add1ArgType),
+  NEONMAP1(vmaxvq_u16, vector_reduce_umax, Add1ArgType),
+  NEONMAP1(vmaxvq_u32, vector_reduce_umax, Add1ArgType),
+  NEONMAP1(vmaxvq_u8, vector_reduce_umax, Add1ArgType),
   NEONMAP1(vminnmv_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
   NEONMAP1(vminnmvq_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
   NEONMAP1(vminnmvq_f64, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
   NEONMAP1(vminv_f32, aarch64_neon_fminv, AddRetType | Add1ArgType),
-  NEONMAP1(vminv_s32, aarch64_neon_sminv, AddRetType | Add1ArgType),
-  NEONMAP1(vminv_u32, aarch64_neon_uminv, AddRetType | Add1ArgType),
+  NEONMAP1(vminv_s16, vector_reduce_smin, Add1ArgType),
+  NEONMAP1(vminv_s32, vector_reduce_smin, Add1ArgType),
+  NEONMAP1(vminv_s8, vector_reduce_smin, Add1ArgType),
+  NEONMAP1(vminv_u16, vector_reduce_umin, Add1ArgType),
+  NEONMAP1(vminv_u32, vector_reduce_umin, Add1ArgType),
+  NEONMAP1(vminv_u8, vector_reduce_umin, Add1ArgType),
   NEONMAP1(vminvq_f32, aarch64_neon_fminv, AddRetType | Add1ArgType),
   NEONMAP1(vminvq_f64, aarch64_neon_fminv, AddRetType | Add1ArgType),
-  NEONMAP1(vminvq_s32, aarch64_neon_sminv, AddRetType | Add1ArgType),
-  NEONMAP1(vminvq_u32, aarch64_neon_uminv, AddRetType | Add1ArgType),
+  NEONMAP1(vminvq_s16, vector_reduce_smin, Add1ArgType),
+  NEONMAP1(vminvq_s32, vector_reduce_smin, Add1ArgType),
+  NEONMAP1(vminvq_s8, vector_reduce_smin, Add1ArgType),
+  NEONMAP1(vminvq_u16, vector_reduce_umin, Add1ArgType),
+  NEONMAP1(vminvq_u32, vector_reduce_umin, Add1ArgType),
+  NEONMAP1(vminvq_u8, vector_reduce_umin, Add1ArgType),
   NEONMAP1(vmull_p64, aarch64_neon_pmull64, 0),
   NEONMAP1(vmulxd_f64, aarch64_neon_fmulx, Add1ArgType),
   NEONMAP1(vmulxs_f32, aarch64_neon_fmulx, Add1ArgType),
-  NEONMAP1(vpaddd_s64, aarch64_neon_uaddv, AddRetType | Add1ArgType),
-  NEONMAP1(vpaddd_u64, aarch64_neon_uaddv, AddRetType | Add1ArgType),
+  NEONMAP1(vpaddd_s64, vector_reduce_add, Add1ArgType),
+  NEONMAP1(vpaddd_u64, vector_reduce_add, Add1ArgType),
   NEONMAP1(vpmaxnmqd_f64, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
   NEONMAP1(vpmaxnms_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
   NEONMAP1(vpmaxqd_f64, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
@@ -7067,127 +7091,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Int = Intrinsic::bitreverse;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrbit");
   }
-  case NEON::BI__builtin_neon_vaddv_u8:
-    // FIXME: These are handled by the AArch64 scalar code.
-    usgn = true;
-    [[fallthrough]];
-  case NEON::BI__builtin_neon_vaddv_s8: {
-    Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
-    Ty = Int32Ty;
-    VTy = llvm::FixedVectorType::get(Int8Ty, 8);
-    llvm::Type *Tys[2] = { Ty, VTy };
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
-    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
-    return Builder.CreateTrunc(Ops[0], Int8Ty);
-  }
-  case NEON::BI__builtin_neon_vaddv_u16:
-    usgn = true;
-    [[fallthrough]];
-  case NEON::BI__builtin_neon_vaddv_s16: {
-    Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
-    Ty = Int32Ty;
-    VTy = llvm::FixedVectorType::get(Int16Ty, 4);
-    llvm::Type *Tys[2] = { Ty, VTy };
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
-    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
-    return Builder.CreateTrunc(Ops[0], Int16Ty);
-  }
-  case NEON::BI__builtin_neon_vaddvq_u8:
-    usgn = true;
-    [[fallthrough]];
-  case NEON::BI__builtin_neon_vaddvq_s8: {
-    Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
-    Ty = Int32Ty;
-    VTy = llvm::FixedVectorType::get(Int8Ty, 16);
-    llvm::Type *Tys[2] = { Ty, VTy };
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
-    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
-    return Builder.CreateTrunc(Ops[0], Int8Ty);
-  }
-  case NEON::BI__builtin_neon_vaddvq_u16:
-    usgn = true;
-    [[fallthrough]];
-  case NEON::BI__builtin_neon_vaddvq_s16: {
-    Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
-    Ty = Int32Ty;
-    VTy = llvm::FixedVectorType::get(Int16Ty, 8);
-    llvm::Type *Tys[2] = { Ty, VTy };
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
-    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
-    return Builder.CreateTrunc(Ops[0], Int16Ty);
-  }
-  case NEON::BI__builtin_neon_vmaxv_u8: {
-    Int = Intrinsic::aarch64_neon_umaxv;
-    Ty = Int32Ty;
-    VTy = llvm::FixedVectorType::get(Int8Ty, 8);
-    llvm::Type *Tys[2] = { Ty, VTy };
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
-    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
-    return Builder.CreateTrunc(Ops[0], Int8Ty);
-  }
-  case NEON::BI__builtin_neon_vmaxv_u16: {
-    Int = Intrinsic::aarch64_neon_umaxv;
-    Ty = Int32Ty;
-    VTy = llvm::FixedVectorType::get(Int16Ty, 4);
-    llvm::Type *Tys[2] = { Ty, VTy };
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
-    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
-    return Builder.CreateTrunc(Ops[0], Int16Ty);
-  }
-  case NEON::BI__builtin_neon_vmaxvq_u8: {
-    Int = Intrinsic::aarch64_neon_umaxv;
-    Ty = Int32Ty;
-    VTy = llvm::FixedVectorType::get(Int8Ty, 16);
-    llvm::Type *Tys[2] = { Ty, VTy };
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
-    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
-    return Builder.CreateTrunc(Ops[0], Int8Ty);
-  }
-  case NEON::BI__builtin_neon_vmaxvq_u16: {
-    Int = Intrinsic::aarch64_neon_umaxv;
-    Ty = Int32Ty;
-    VTy = llvm::FixedVectorType::get(Int16Ty, 8);
-    llvm::Type *Tys[2] = { Ty, VTy };
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
-    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
-    return Builder.CreateTrunc(Ops[0], Int16Ty);
-  }
-  case NEON::BI__builtin_neon_vmaxv_s8: {
-    Int = Intrinsic::aarch64_neon_smaxv;
-    Ty = Int32Ty;
-    VTy = llvm::FixedVectorType::get(Int8Ty, 8);
-    llvm::Type *Tys[2] = { Ty, VTy };
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
-    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
-    return Builder.CreateTrunc(Ops[0], Int8Ty);
-  }
-  case NEON::BI__builtin_neon_vmaxv_s16: {
-    Int = Intrinsic::aarch64_neon_smaxv;
-    Ty = Int32Ty;
-    VTy = llvm::FixedVectorType::get(Int16Ty, 4);
-    llvm::Type *Tys[2] = { Ty, VTy };
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
-    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
-    return Builder.CreateTrunc(Ops[0], Int16Ty);
-  }
-  case NEON::BI__builtin_neon_vmaxvq_s8: {
-    Int = Intrinsic::aarch64_neon_smaxv;
-    Ty = Int32Ty;
-    VTy = llvm::FixedVectorType::get(Int8Ty, 16);
-    llvm::Type *Tys[2] = { Ty, VTy };
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
-    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
-    return Builder.CreateTrunc(Ops[0], Int8Ty);
-  }
-  case NEON::BI__builtin_neon_vmaxvq_s16: {
-    Int = Intrinsic::aarch64_neon_smaxv;
-    Ty = Int32Ty;
-    VTy = llvm::FixedVectorType::get(Int16Ty, 8);
-    llvm::Type *Tys[2] = { Ty, VTy };
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
-    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
-    return Builder.CreateTrunc(Ops[0], Int16Ty);
-  }
   case NEON::BI__builtin_neon_vmaxv_f16: {
     Int = Intrinsic::aarch64_neon_fmaxv;
     Ty = HalfTy;
@@ -7206,78 +7109,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
     return Builder.CreateTrunc(Ops[0], HalfTy);
   }
-  case NEON::BI__builtin_neon_vminv_u8: {
-    Int = Intrinsic::aarch64_neon_uminv;
-    Ty = Int32Ty;
-    VTy = llvm::FixedVectorType::get(Int8Ty, 8);
-    llvm::Type *Tys[2] = { Ty, VTy };
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
-    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
-    return Builder.CreateTrunc(Ops[0], Int8Ty);
-  }
-  case NEON::BI__builtin_neon_vminv_u16: {
-    Int = Intrinsic::aarch64_neon_uminv;
-    Ty = Int32Ty;
-    VTy = llvm::FixedVectorType::get(Int16Ty, 4);
-    llvm::Type *Tys[2] = { Ty, VTy };
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
-    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
-    return Builder.CreateTrunc(Ops[0], Int16Ty);
-  }
-  case NEON::BI__builtin_neon_vminvq_u8: {
-    Int = Intrinsic::aarch64_neon_uminv;
-    Ty = Int32Ty;
-    VTy = llvm::FixedVectorType::get(Int8Ty, 16);
-    llvm::Type *Tys[2] = { Ty, VTy };
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
-    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
-    return Builder.CreateTrunc(Ops[0], Int8Ty);
-  }
-  case NEON::BI__builtin_neon_vminvq_u16: {
-    Int = Intrinsic::aarch64_neon_uminv;
-    Ty = Int32Ty;
-    VTy = llvm::FixedVectorType::get(Int16Ty, 8);
-    llvm::Type *Tys[2] = { Ty, VTy };
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
-    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
-    return Builder.CreateTrunc(Ops[0], Int16Ty);
-  }
-  case NEON::BI__builtin_neon_vminv_s8: {
-    Int = Intrinsic::aarch64_neon_sminv;
-    Ty = Int32Ty;
-    VTy = llvm::FixedVectorType::get(Int8Ty, 8);
-    llvm::Type *Tys[2] = { Ty, VTy };
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
-    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
-    return Builder.CreateTrunc(Ops[0], Int8Ty);
-  }
-  case NEON::BI__builtin_neon_vminv_s16: {
-    Int = Intrinsic::aarch64_neon_sminv;
-    Ty = Int32Ty;
-    VTy = llvm::FixedVectorType::get(Int16Ty, 4);
-    llvm::Type *Tys[2] = { Ty, VTy };
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
-    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
-    return Builder.CreateTrunc(Ops[0], Int16Ty);
-  }
-  case NEON::BI__builtin_neon_vminvq_s8: {
-    Int = Intrinsic::aarch64_neon_sminv;
-    Ty = Int32Ty;
-    VTy = llvm::FixedVectorType::get(Int8Ty, 16);
-    llvm::Type *Tys[2] = { Ty, VTy };
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
-    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
-    return Builder.CreateTrunc(Ops[0], Int8Ty);
-  }
-  case NEON::BI__builtin_neon_vminvq_s16: {
-    Int = Intrinsic::aarch64_neon_sminv;
-    Ty = Int32Ty;
-    VTy = llvm::FixedVectorType::get(Int16Ty, 8);
-    llvm::Type *Tys[2] = { Ty, VTy };
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
-    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
-    return Builder.CreateTrunc(Ops[0], Int16Ty);
-  }
   case NEON::BI__builtin_neon_vminv_f16: {
     Int = Intrinsic::aarch64_neon_fminv;
     Ty = HalfTy;
diff --git a/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp b/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp
index 6da65b681df1e..8a1cab3417d98 100644
--- a/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp
@@ -375,28 +375,28 @@ static Value *MakeCpAsync(unsigned IntrinsicID, unsigned IntrinsicIDS,
                                        CGF.EmitScalarExpr(E->getArg(1))});
 }
 
-static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID,
-                           const CallExpr *E, CodeGenFunction &CGF) {
+static bool EnsureNativeHalfSupport(unsigned BuiltinID, const CallExpr *E,
+                                    CodeGenFunction &CGF) {
   auto &C = CGF.CGM.getContext();
-  if (!(C.getLangOpts().NativeHalfType ||
-        !C.getTargetInfo().useFP16ConversionIntrinsics())) {
+  if (!C.getLangOpts().NativeHalfType &&
+      C.getTargetInfo().useFP16ConversionIntrinsics()) {
     CGF.CGM.Error(E->getExprLoc(), C.BuiltinInfo.getQuotedName(BuiltinID) +
                                        " requires native half type support.");
-    return nullptr;
+    return false;
   }
+  return true;
+}
 
-  if (BuiltinID == NVPTX::BI__nvvm_ldg_h || BuiltinID == NVPTX::BI__nvvm_ldg_h2)
-    return MakeLdg(CGF, E);
-
-  if (IntrinsicID == Intrinsic::nvvm_ldu_global_f)
-    return MakeLdu(IntrinsicID, CGF, E);
+static Value *MakeHalfType(Function *Intrinsic, unsigned BuiltinID,
+                           const CallExpr *E, CodeGenFunction &CGF) {
+  if (!EnsureNativeHalfSupport(BuiltinID, E, CGF))
+    return nullptr;
 
   SmallVector<Value *, 16> Args;
-  auto *F = CGF.CGM.getIntrinsic(IntrinsicID);
-  auto *FTy = F->getFunctionType();
+  auto *FTy = Intrinsic->getFunctionType();
   unsigned ICEArguments = 0;
   ASTContext::GetBuiltinTypeError Error;
-  C.GetBuiltinType(BuiltinID, Error, &ICEArguments);
+  CGF.CGM.getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
   assert(Error == ASTContext::GE_None && "Should not codegen an error");
   for (unsigned i = 0, e = E->getNumArgs(); i != e; ++i) {
     assert((ICEArguments & (1 << i)) == 0);
@@ -407,8 +407,14 @@ static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID,
     Args.push_back(ArgValue);
   }
 
-  return CGF.Builder.CreateCall(F, Args);
+  return CGF.Builder.CreateCall(Intrinsic, Args);
 }
+
+static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID,
+                           const CallExpr *E, CodeGenFunction &CGF) {
+  return MakeHalfType(CGF.CGM.getIntrinsic(IntrinsicID), BuiltinID, E, CGF);
+}
+
 } // namespace
 
 Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
@@ -913,9 +919,14 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
   }
   // The following builtins require half type support
   case NVPTX::BI__nvvm_ex2_approx_f16:
-    return MakeHalfType(Intrinsic::nvvm_ex2_approx_f16, BuiltinID, E, *this);
+    return MakeHalfType(
+        CGM.getIntrinsic(Intrinsic::nvvm_ex2_approx, Builder.getHalfTy()),
+        BuiltinID, E, *this);
   case NVPTX::BI__nvvm_ex2_approx_f16x2:
-    return MakeHalfType(Intrinsic::nvvm_ex2_approx_f16x2, BuiltinID, E, *this);
+    return MakeHalfType(
+        CGM.getIntrinsic(Intrinsic::nvvm_ex2_approx,
+                         FixedVectorType::get(Builder.getHalfTy(), 2)),
+        BuiltinID, E, *this);
   case NVPTX::BI__nvvm_ff2f16x2_rn:
     return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rn, BuiltinID, E, *this);
   case NVPTX::BI__nvvm_ff2f16x2_rn_relu:
@@ -1049,12 +1060,22 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
   case NVPTX::BI__nvvm_fabs_d:
     return Builder.CreateUnaryIntrinsic(Intrinsic::fabs,
                                         EmitScalarExpr(E->getArg(0)));
+  case NVPTX::BI__nvvm_ex2_approx_d:
+  case NVPTX::BI__nvvm_ex2_approx_f:
+    return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_ex2_approx,
+                                        EmitScalarExpr(E->getArg(0)));
+  case NVPTX::BI__nvvm_ex2_approx_ftz_f:
+    return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_ex2_approx_ftz,
+                                        EmitScalarExpr(E->getArg(0)));
   case NVPTX::BI__nvvm_ldg_h:
   case NVPTX::BI__nvvm_ldg_h2:
-    return MakeHalfType(Intrinsic::not_intrinsic, BuiltinID, E, *this);
+    return EnsureNativeHalfSupport(BuiltinID, E, *this) ? MakeLdg(*this, E)
+                                                        : nullptr;
   case NVPTX::BI__nvvm_ldu_h:
   case NVPTX::BI__nvvm_ldu_h2:
-    return MakeHalfType(Intrinsic::nvvm_ldu_global_f, BuiltinID, E, *this);
+    return EnsureNativeHalfSupport(BuiltinID, E, *this)
+               ? MakeLdu(Intrinsic::nvvm_ldu_global_f, *this, E)
+               : nullptr;
   case NVPTX::BI__nvvm_cp_async_ca_shared_global_4:
     return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_4,
                        Intrinsic::nvvm_cp_async_ca_shared_global_4_s, *this, E,
diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
index b924407b6ddd7..2381b2e7cf2cf 100644
--- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -2931,74 +2931,6 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
     // instruction, but it will create a memset that won't be optimized away.
     return Builder.CreateMemSet(Ops[0], Ops[1], Ops[2], Align(1), true);
   }
-  // Corresponding to intrisics which will return 2 tiles (tile0_tile1).
-  case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
-  case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal:
-  case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
-  case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal:
-  case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
-  case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal:
-  case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal:
-  case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal: {
-    Intrinsic::ID IID;
-    switch (BuiltinID) {
-    default:
-      llvm_unreachable("Unsupported intrinsic!");
-    case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
-      IID = Intrinsic::x86_t2rpntlvwz0_internal;
-      break;
-    case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal:
-      IID = Intrinsic::x86_t2rpntlvwz0rs_internal;
-      break;
-    case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
-      IID = Intrinsic::x86_t2rpntlvwz0t1_internal;
-      break;
-    case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal:
-      IID = Intrinsic::x86_t2rpntlvwz0rst1_internal;
-      break;
-    case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
-      IID = Intrinsic::x86_t2rpntlvwz1_internal;
-      break;
-    case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal:
-      IID = Intrinsic::x86_t2rpntlvwz1rs_internal;
-      break;
-    case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal:
-      IID = Intrinsic::x86_t2rpntlvwz1t1_internal;
-      break;
-    case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal:
-      IID = Intrinsic::x86_t2rpntlvwz1rst1_internal;
-      break;
-    }
-
-    // Ops = (Row0, Col0, Col1, DstPtr0, DstPtr1, SrcPtr, Stride)
-    Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID),
-                                     {Ops[0], Ops[1], Ops[2], Ops[5], Ops[6]});
-
-    auto *PtrTy = E->getArg(3)->getType()->getAs<PointerType>();
-    assert(PtrTy && "arg3 must be of pointer type");
-    QualType PtreeTy = PtrTy->getPointeeType();
-    llvm::Type *TyPtee = ConvertType(PtreeTy);
-
-    // Bitcast amx type (x86_amx) to vector type (256 x i32)
-    // Then store tile0 into DstPtr0
-    Value *T0 = Builder.CreateExtractValue(Call, 0);
-    Value *VecT0 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector,
-                                           {TyPtee}, {T0});
-    Builder.CreateDefaultAlignedStore(VecT0, Ops[3]);
-
-    // Then store tile1 into DstPtr1
-    Value *T1 = Builder.CreateExtractValue(Call, 1);
-    Value *VecT1 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector,
-                                           {TyPtee}, {T1});
-    Value *Store = Builder.CreateDefaultAlignedStore(VecT1, Ops[4]);
-
-    // Note: Here we escape directly use x86_tilestored64_internal to store
-    // the results due to it can't make sure the Mem written scope. This may
-    // cause shapes reloads after first amx intrinsic, which current amx reg-
-    // ister allocation has no ability to handle it.
-
-    return Store;
-  }
   case X86::BI__ud2:
     // llvm.trap makes a ud2a instruction on x86.
     return EmitTrapCall(Intrinsic::trap);
diff --git a/clang/lib/CodeGen/Targets/SPIR.cpp b/clang/lib/CodeGen/Targets/SPIR.cpp
index 15d0b353d748c..abd049aca0ed7 100644
--- a/clang/lib/CodeGen/Targets/SPIR.cpp
+++ b/clang/lib/CodeGen/Targets/SPIR.cpp
@@ -260,7 +260,8 @@ CommonSPIRTargetCodeGenInfo::getNullPointer(const CodeGen::CodeGenModule &CGM,
   LangAS AS = QT->getUnqualifiedDesugaredType()->isNullPtrType()
                   ? LangAS::Default
                   : QT->getPointeeType().getAddressSpace();
-  if (AS == LangAS::Default || AS == LangAS::opencl_generic)
+  if (AS == LangAS::Default || AS == LangAS::opencl_generic ||
+      AS == LangAS::opencl_constant)
     return llvm::ConstantPointerNull::get(PT);
 
   auto &Ctx = CGM.getContext();
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 40ea513e85427..51618d17a4180 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -308,9 +308,18 @@ InputArgList Driver::ParseArgStrings(ArrayRef<const char *> ArgStrings,
     auto ArgString = A->getAsString(Args);
     std::string Nearest;
     if (getOpts().findNearest(ArgString, Nearest, VisibilityMask) > 1) {
-      if (!IsCLMode() &&
-          getOpts().findExact(ArgString, Nearest,
-                              llvm::opt::Visibility(options::CC1Option))) {
+      if (IsFlangMode()) {
+        if (getOpts().findExact(ArgString, Nearest,
+                                llvm::opt::Visibility(options::FC1Option))) {
+          DiagID = diag::err_drv_unknown_argument_with_suggestion;
+          Diags.Report(DiagID) << ArgString << "-Xflang " + Nearest;
+        } else {
+          DiagID = diag::err_drv_unknown_argument;
+          Diags.Report(DiagID) << ArgString;
+        }
+      } else if (!IsCLMode() && getOpts().findExact(ArgString, Nearest,
+                                                    llvm::opt::Visibility(
+                                                        options::CC1Option))) {
         DiagID = diag::err_drv_unknown_argument_with_suggestion;
         Diags.Report(DiagID) << ArgString << "-Xclang " + Nearest;
       } else {
@@ -2531,10 +2540,14 @@ bool Driver::HandleImmediateArgs(Compilation &C) {
   }
 
   if (C.getArgs().hasArg(options::OPT_print_runtime_dir)) {
-    if (std::optional<std::string> RuntimePath = TC.getRuntimePath())
-      llvm::outs() << *RuntimePath << '\n';
-    else
-      llvm::outs() << TC.getCompilerRTPath() << '\n';
+    for (auto RuntimePath :
+         {TC.getRuntimePath(), std::make_optional(TC.getCompilerRTPath())}) {
+      if (RuntimePath && getVFS().exists(*RuntimePath)) {
+        llvm::outs() << *RuntimePath << '\n';
+        return false;
+      }
+    }
+    llvm::outs() << "(runtime dir is not present)" << '\n';
     return false;
   }
 
diff --git a/clang/lib/Driver/ToolChains/Arch/M68k.cpp b/clang/lib/Driver/ToolChains/Arch/M68k.cpp
index 1037c0ea80bf6..708ec84a37cfb 100644
--- a/clang/lib/Driver/ToolChains/Arch/M68k.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/M68k.cpp
@@ -36,12 +36,12 @@ std::string m68k::getM68kTargetCPU(const ArgList &Args) {
       return "generic";
 
     return llvm::StringSwitch<std::string>(CPUName)
-        .Cases("m68000", "68000", "M68000")
-        .Cases("m68010", "68010", "M68010")
-        .Cases("m68020", "68020", "M68020")
-        .Cases("m68030", "68030", "M68030")
-        .Cases("m68040", "68040", "M68040")
-        .Cases("m68060", "68060", "M68060")
+        .Cases({"m68000", "68000"}, "M68000")
+        .Cases({"m68010", "68010"}, "M68010")
+        .Cases({"m68020", "68020"}, "M68020")
+        .Cases({"m68030", "68030"}, "M68030")
+        .Cases({"m68040", "68040"}, "M68040")
+        .Cases({"m68060", "68060"}, "M68060")
         .Default(CPUName.str());
   }
   // FIXME: Throw error when multiple sub-architecture flag exist
diff --git a/clang/lib/Driver/ToolChains/Arch/Mips.cpp b/clang/lib/Driver/ToolChains/Arch/Mips.cpp
index 6a6a4ee1a647b..8d7b85dbeed99 100644
--- a/clang/lib/Driver/ToolChains/Arch/Mips.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/Mips.cpp
@@ -117,7 +117,7 @@ void mips::getMipsCPUAndABI(const ArgList &Args, const llvm::Triple &Triple,
     // Deduce CPU name from ABI name.
     CPUName = llvm::StringSwitch<const char *>(ABIName)
                   .Case("o32", DefMips32CPU)
-                  .Cases("n32", "n64", DefMips64CPU)
+                  .Cases({"n32", "n64"}, DefMips64CPU)
                   .Default("");
   }
 
@@ -467,7 +467,7 @@ bool mips::isNaN2008(const Driver &D, const ArgList &Args,
 
   // NaN2008 is the default for MIPS32r6/MIPS64r6.
   return llvm::StringSwitch<bool>(getCPUName(D, Args, Triple))
-      .Cases("mips32r6", "mips64r6", true)
+      .Cases({"mips32r6", "mips64r6"}, true)
       .Default(false);
 }
 
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 79edc561c551f..d3ab6f1261ad6 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -1414,17 +1414,18 @@ static void CollectARMPACBTIOptions(const ToolChain &TC, const ArgList &Args,
     GuardedControlStack = PBP.GuardedControlStack;
   }
 
-  bool HasPtrauthReturns = llvm::any_of(CmdArgs, [](const char *Arg) {
-    return StringRef(Arg) == "-fptrauth-returns";
-  });
+  Arg *PtrauthReturnsArg = Args.getLastArg(options::OPT_fptrauth_returns,
+                                           options::OPT_fno_ptrauth_returns);
+  bool HasPtrauthReturns =
+      PtrauthReturnsArg &&
+      PtrauthReturnsArg->getOption().matches(options::OPT_fptrauth_returns);
   // GCS is currently untested with ptrauth-returns, but enabling this could be
   // allowed in future after testing with a suitable system.
-  if (HasPtrauthReturns &&
-      (Scope != "none" || BranchProtectionPAuthLR || GuardedControlStack)) {
+  if (Scope != "none" || BranchProtectionPAuthLR || GuardedControlStack) {
     if (Triple.getEnvironment() == llvm::Triple::PAuthTest)
       D.Diag(diag::err_drv_unsupported_opt_for_target)
           << A->getAsString(Args) << Triple.getTriple();
-    else
+    else if (HasPtrauthReturns)
       D.Diag(diag::err_drv_incompatible_options)
           << A->getAsString(Args) << "-fptrauth-returns";
   }
@@ -1670,34 +1671,42 @@ void Clang::AddAArch64TargetArgs(const ArgList &Args,
 
   AddUnalignedAccessWarning(CmdArgs);
 
-  Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_intrinsics,
-                    options::OPT_fno_ptrauth_intrinsics);
-  Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_calls,
-                    options::OPT_fno_ptrauth_calls);
-  Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_returns,
-                    options::OPT_fno_ptrauth_returns);
-  Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_auth_traps,
-                    options::OPT_fno_ptrauth_auth_traps);
-  Args.addOptInFlag(
-      CmdArgs, options::OPT_fptrauth_vtable_pointer_address_discrimination,
-      options::OPT_fno_ptrauth_vtable_pointer_address_discrimination);
-  Args.addOptInFlag(
-      CmdArgs, options::OPT_fptrauth_vtable_pointer_type_discrimination,
-      options::OPT_fno_ptrauth_vtable_pointer_type_discrimination);
-  Args.addOptInFlag(
-      CmdArgs, options::OPT_fptrauth_type_info_vtable_pointer_discrimination,
-      options::OPT_fno_ptrauth_type_info_vtable_pointer_discrimination);
-  Args.addOptInFlag(
-      CmdArgs, options::OPT_fptrauth_function_pointer_type_discrimination,
-      options::OPT_fno_ptrauth_function_pointer_type_discrimination);
-
-  Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_indirect_gotos,
-                    options::OPT_fno_ptrauth_indirect_gotos);
-  Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_init_fini,
-                    options::OPT_fno_ptrauth_init_fini);
-  Args.addOptInFlag(CmdArgs,
-                    options::OPT_fptrauth_init_fini_address_discrimination,
-                    options::OPT_fno_ptrauth_init_fini_address_discrimination);
+  if (Triple.isOSDarwin() ||
+      (Triple.isOSLinux() &&
+       Triple.getEnvironment() == llvm::Triple::PAuthTest)) {
+    Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_intrinsics,
+                      options::OPT_fno_ptrauth_intrinsics);
+    Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_calls,
+                      options::OPT_fno_ptrauth_calls);
+    Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_returns,
+                      options::OPT_fno_ptrauth_returns);
+    Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_auth_traps,
+                      options::OPT_fno_ptrauth_auth_traps);
+    Args.addOptInFlag(
+        CmdArgs, options::OPT_fptrauth_vtable_pointer_address_discrimination,
+        options::OPT_fno_ptrauth_vtable_pointer_address_discrimination);
+    Args.addOptInFlag(
+        CmdArgs, options::OPT_fptrauth_vtable_pointer_type_discrimination,
+        options::OPT_fno_ptrauth_vtable_pointer_type_discrimination);
+    Args.addOptInFlag(
+        CmdArgs, options::OPT_fptrauth_type_info_vtable_pointer_discrimination,
+        options::OPT_fno_ptrauth_type_info_vtable_pointer_discrimination);
+    Args.addOptInFlag(
+        CmdArgs, options::OPT_fptrauth_function_pointer_type_discrimination,
+        options::OPT_fno_ptrauth_function_pointer_type_discrimination);
+    Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_indirect_gotos,
+                      options::OPT_fno_ptrauth_indirect_gotos);
+  }
+  if (Triple.isOSLinux() &&
+      Triple.getEnvironment() == llvm::Triple::PAuthTest) {
+    Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_init_fini,
+                      options::OPT_fno_ptrauth_init_fini);
+    Args.addOptInFlag(
+        CmdArgs, options::OPT_fptrauth_init_fini_address_discrimination,
+        options::OPT_fno_ptrauth_init_fini_address_discrimination);
+    Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_elf_got,
+                      options::OPT_fno_ptrauth_elf_got);
+  }
   Args.addOptInFlag(CmdArgs, options::OPT_faarch64_jump_table_hardening,
                     options::OPT_fno_aarch64_jump_table_hardening);
 
@@ -3699,6 +3708,7 @@ static void RenderHLSLOptions(const ArgList &Args, ArgStringList &CmdArgs,
       options::OPT_emit_obj,
       options::OPT_disable_llvm_passes,
       options::OPT_fnative_half_type,
+      options::OPT_fnative_int16_type,
       options::OPT_hlsl_entrypoint,
       options::OPT_fdx_rootsignature_define,
       options::OPT_fdx_rootsignature_version,
diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp
index cc5bcd1816c52..2fb7652d64536 100644
--- a/clang/lib/Driver/ToolChains/Darwin.cpp
+++ b/clang/lib/Driver/ToolChains/Darwin.cpp
@@ -1035,12 +1035,12 @@ static const char *ArmMachOArchName(StringRef Arch) {
       .Case("xscale", "xscale")
       .Case("armv4t", "armv4t")
       .Case("armv7", "armv7")
-      .Cases("armv7a", "armv7-a", "armv7")
-      .Cases("armv7r", "armv7-r", "armv7")
-      .Cases("armv7em", "armv7e-m", "armv7em")
-      .Cases("armv7k", "armv7-k", "armv7k")
-      .Cases("armv7m", "armv7-m", "armv7m")
-      .Cases("armv7s", "armv7-s", "armv7s")
+      .Cases({"armv7a", "armv7-a"}, "armv7")
+      .Cases({"armv7r", "armv7-r"}, "armv7")
+      .Cases({"armv7em", "armv7e-m"}, "armv7em")
+      .Cases({"armv7k", "armv7-k"}, "armv7k")
+      .Cases({"armv7m", "armv7-m"}, "armv7m")
+      .Cases({"armv7s", "armv7-s"}, "armv7s")
       .Default(nullptr);
 }
 
diff --git a/clang/lib/Driver/ToolChains/Fuchsia.cpp b/clang/lib/Driver/ToolChains/Fuchsia.cpp
index 31c2f3f7e1be4..507cc03b27513 100644
--- a/clang/lib/Driver/ToolChains/Fuchsia.cpp
+++ b/clang/lib/Driver/ToolChains/Fuchsia.cpp
@@ -483,7 +483,8 @@ SanitizerMask Fuchsia::getSupportedSanitizers() const {
   Res |= SanitizerKind::Leak;
   Res |= SanitizerKind::Scudo;
   Res |= SanitizerKind::Thread;
-  if (getTriple().getArch() == llvm::Triple::x86_64) {
+  if (getTriple().getArch() == llvm::Triple::x86_64 ||
+      getTriple().getArch() == llvm::Triple::x86) {
     Res |= SanitizerKind::SafeStack;
   }
   return Res;
@@ -496,6 +497,7 @@ SanitizerMask Fuchsia::getDefaultSanitizers() const {
   case llvm::Triple::riscv64:
     Res |= SanitizerKind::ShadowCallStack;
     break;
+  case llvm::Triple::x86:
   case llvm::Triple::x86_64:
     Res |= SanitizerKind::SafeStack;
     break;
diff --git a/clang/lib/Driver/ToolChains/HLSL.cpp b/clang/lib/Driver/ToolChains/HLSL.cpp
index 20a320ea233d4..8d3fba7137c7c 100644
--- a/clang/lib/Driver/ToolChains/HLSL.cpp
+++ b/clang/lib/Driver/ToolChains/HLSL.cpp
@@ -498,6 +498,15 @@ HLSLToolChain::TranslateArgs(const DerivedArgList &Args, StringRef BoundArch,
       continue;
     }
 
+    if (A->getOption().getID() == options::OPT_enable_16bit_types) {
+      // Translate -enable-16bit-types into -fnative-half-type and
+      // -fnative-int16-type
+      DAL->AddFlagArg(nullptr, Opts.getOption(options::OPT_fnative_half_type));
+      DAL->AddFlagArg(nullptr, Opts.getOption(options::OPT_fnative_int16_type));
+      A->claim();
+      continue;
+    }
+
     DAL->append(A);
   }
 
diff --git a/clang/lib/Driver/ToolChains/Solaris.cpp b/clang/lib/Driver/ToolChains/Solaris.cpp
index 02aa59817449d..64c7d1ceb3a36 100644
--- a/clang/lib/Driver/ToolChains/Solaris.cpp
+++ b/clang/lib/Driver/ToolChains/Solaris.cpp
@@ -346,7 +346,7 @@ SanitizerMask Solaris::getSupportedSanitizers() const {
 const char *Solaris::getDefaultLinker() const {
   // FIXME: Only handle Solaris ld and GNU ld here.
   return llvm::StringSwitch<const char *>(getDriver().getPreferredLinker())
-      .Cases("bfd", "gld", "/usr/gnu/bin/ld")
+      .Cases({"bfd", "gld"}, "/usr/gnu/bin/ld")
       .Default("/usr/bin/ld");
 }
 
diff --git a/clang/lib/Driver/ToolChains/ZOS.cpp b/clang/lib/Driver/ToolChains/ZOS.cpp
index 57bcb3c306cef..9a3c45323a3cf 100644
--- a/clang/lib/Driver/ToolChains/ZOS.cpp
+++ b/clang/lib/Driver/ToolChains/ZOS.cpp
@@ -75,7 +75,7 @@ void zos::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
 
   const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("as"));
   C.addCommand(std::make_unique<Command>(JA, *this, ResponseFileSupport::None(),
-                                         Exec, CmdArgs, Inputs));
+                                         Exec, CmdArgs, Inputs, Output));
 }
 
 static std::string getLEHLQ(const ArgList &Args) {
@@ -213,7 +213,7 @@ void zos::Linker::ConstructJob(Compilation &C, const JobAction &JA,
 
   const char *Exec = Args.MakeArgString(ToolChain.GetLinkerPath());
   C.addCommand(std::make_unique<Command>(JA, *this, ResponseFileSupport::None(),
-                                         Exec, CmdArgs, Inputs));
+                                         Exec, CmdArgs, Inputs, Output));
 }
 
 ToolChain::RuntimeLibType ZOS::GetDefaultRuntimeLibType() const {
diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp
index e5abf833194d4..9ab024a03fbd7 100644
--- a/clang/lib/Format/ContinuationIndenter.cpp
+++ b/clang/lib/Format/ContinuationIndenter.cpp
@@ -356,9 +356,11 @@ bool ContinuationIndenter::canBreak(const LineState &State) {
     return CurrentState.BreakBeforeClosingBrace;
   }
 
-  // Allow breaking before the right parens with block indentation if there was
-  // a break after the left parens, which is tracked by BreakBeforeClosingParen.
-  if (Style.AlignAfterOpenBracket == FormatStyle::BAS_BlockIndent &&
+  // Check need to break before the right parens if there was a break after
+  // the left parens, which is tracked by BreakBeforeClosingParen.
+  if ((Style.BreakBeforeCloseBracketFunction ||
+       Style.BreakBeforeCloseBracketIf || Style.BreakBeforeCloseBracketLoop ||
+       Style.BreakBeforeCloseBracketSwitch) &&
       Current.is(tok::r_paren)) {
     return CurrentState.BreakBeforeClosingParen;
   }
@@ -837,32 +839,38 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun,
       return Tok.is(tok::l_brace) && Tok.isNot(BK_Block) &&
              Style.Cpp11BracedListStyle != FormatStyle::BLS_Block;
     };
-    if (Tok.isNoneOf(tok::l_paren, TT_TemplateOpener, tok::l_square) &&
-        !IsStartOfBracedList()) {
+    if (IsStartOfBracedList())
+      return Style.BreakAfterOpenBracketBracedList;
+    if (Tok.isNoneOf(tok::l_paren, TT_TemplateOpener, tok::l_square))
       return false;
-    }
     if (!Tok.Previous)
       return true;
     if (Tok.Previous->isIf())
-      return Style.AlignAfterOpenBracket == FormatStyle::BAS_AlwaysBreak;
-    return Tok.Previous->isNoneOf(TT_CastRParen, tok::kw_for, tok::kw_while,
-                                  tok::kw_switch) &&
-           !(Style.isJavaScript() && Tok.Previous->is(Keywords.kw_await));
+      return Style.BreakAfterOpenBracketIf;
+    if (Tok.Previous->isLoop(Style))
+      return Style.BreakAfterOpenBracketLoop;
+    if (Tok.Previous->is(tok::kw_switch))
+      return Style.BreakAfterOpenBracketSwitch;
+    if (Style.BreakAfterOpenBracketFunction) {
+      return !Tok.Previous->is(TT_CastRParen) &&
+             !(Style.isJavaScript() && Tok.is(Keywords.kw_await));
+    }
+    return false;
   };
   auto IsFunctionCallParen = [](const FormatToken &Tok) {
     return Tok.is(tok::l_paren) && Tok.ParameterCount > 0 && Tok.Previous &&
            Tok.Previous->is(tok::identifier);
   };
-  auto IsInTemplateString = [this](const FormatToken &Tok) {
+  auto IsInTemplateString = [this](const FormatToken &Tok, bool NestBlocks) {
     if (!Style.isJavaScript())
       return false;
     for (const auto *Prev = &Tok; Prev; Prev = Prev->Previous) {
       if (Prev->is(TT_TemplateString) && Prev->opensScope())
         return true;
-      if (Prev->opensScope() ||
-          (Prev->is(TT_TemplateString) && Prev->closesScope())) {
-        break;
-      }
+      if (Prev->opensScope() && !NestBlocks)
+        return false;
+      if (Prev->is(TT_TemplateString) && Prev->closesScope())
+        return false;
     }
     return false;
   };
@@ -884,21 +892,25 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun,
          Tok.isOneOf(tok::ellipsis, Keywords.kw_await))) {
       return true;
     }
-    if (const auto *Previous = Tok.Previous;
-        !Previous || (Previous->isNoneOf(TT_FunctionDeclarationLParen,
-                                         TT_LambdaDefinitionLParen) &&
-                      !IsFunctionCallParen(*Previous))) {
+    const auto *Previous = TokAfterLParen.Previous;
+    assert(Previous); // IsOpeningBracket(Previous)
+    if (Previous->Previous &&
+        (Previous->Previous->isIf() || Previous->Previous->isLoop(Style) ||
+         Previous->Previous->is(tok::kw_switch))) {
+      return false;
+    }
+    if (Previous->isNoneOf(TT_FunctionDeclarationLParen,
+                           TT_LambdaDefinitionLParen) &&
+        !IsFunctionCallParen(*Previous)) {
       return true;
     }
-    if (IsOpeningBracket(Tok) || IsInTemplateString(Tok))
+    if (IsOpeningBracket(Tok) || IsInTemplateString(Tok, true))
       return true;
     const auto *Next = Tok.Next;
     return !Next || Next->isMemberAccess() ||
            Next->is(TT_FunctionDeclarationLParen) || IsFunctionCallParen(*Next);
   };
-  if ((Style.AlignAfterOpenBracket == FormatStyle::BAS_AlwaysBreak ||
-       Style.AlignAfterOpenBracket == FormatStyle::BAS_BlockIndent) &&
-      IsOpeningBracket(Previous) && State.Column > getNewLineColumn(State) &&
+  if (IsOpeningBracket(Previous) && State.Column > getNewLineColumn(State) &&
       // Don't do this for simple (no expressions) one-argument function calls
       // as that feels like needlessly wasting whitespace, e.g.:
       //
@@ -920,7 +932,7 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun,
   // Note: This doesn't apply to macro expansion lines, which are MACRO( , , )
   // with args as children of the '(' and ',' tokens. It does not make sense to
   // align the commas with the opening paren.
-  if (Style.AlignAfterOpenBracket != FormatStyle::BAS_DontAlign &&
+  if (Style.AlignAfterOpenBracket &&
       !CurrentState.IsCSharpGenericTypeConstraint && Previous.opensScope() &&
       Previous.isNoneOf(TT_ObjCMethodExpr, TT_RequiresClause,
                         TT_TableGenDAGArgOpener,
@@ -933,7 +945,7 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun,
          Previous.Previous->isNoneOf(tok::identifier, tok::l_paren,
                                      BK_BracedInit))) ||
        Previous.is(TT_VerilogMultiLineListLParen)) &&
-      !IsInTemplateString(Current)) {
+      !IsInTemplateString(Current, false)) {
     CurrentState.Indent = State.Column + Spaces;
     CurrentState.IsAligned = true;
   }
@@ -1271,8 +1283,20 @@ unsigned ContinuationIndenter::addTokenOnNewLine(LineState &State,
   }
 
   if (PreviousNonComment && PreviousNonComment->is(tok::l_paren)) {
-    CurrentState.BreakBeforeClosingParen =
-        Style.AlignAfterOpenBracket == FormatStyle::BAS_BlockIndent;
+    if (auto Previous = PreviousNonComment->Previous) {
+      if (Previous->isIf()) {
+        CurrentState.BreakBeforeClosingParen = Style.BreakBeforeCloseBracketIf;
+      } else if (Previous->isLoop(Style)) {
+        CurrentState.BreakBeforeClosingParen =
+            Style.BreakBeforeCloseBracketLoop;
+      } else if (Previous->is(tok::kw_switch)) {
+        CurrentState.BreakBeforeClosingParen =
+            Style.BreakBeforeCloseBracketSwitch;
+      } else {
+        CurrentState.BreakBeforeClosingParen =
+            Style.BreakBeforeCloseBracketFunction;
+      }
+    }
   }
 
   if (PreviousNonComment && PreviousNonComment->is(TT_TemplateOpener))
@@ -1416,13 +1440,17 @@ unsigned ContinuationIndenter::getNewLineColumn(const LineState &State) {
       State.Stack.size() > 1) {
     return State.Stack[State.Stack.size() - 2].LastSpace;
   }
-  if (Style.AlignAfterOpenBracket == FormatStyle::BAS_BlockIndent &&
-      (Current.is(tok::r_paren) ||
-       (Current.is(tok::r_brace) && Current.MatchingParen &&
-        Current.MatchingParen->is(BK_BracedInit))) &&
+  if (Style.BreakBeforeCloseBracketBracedList && Current.is(tok::r_brace) &&
+      Current.MatchingParen && Current.MatchingParen->is(BK_BracedInit) &&
       State.Stack.size() > 1) {
     return State.Stack[State.Stack.size() - 2].LastSpace;
   }
+  if ((Style.BreakBeforeCloseBracketFunction ||
+       Style.BreakBeforeCloseBracketIf || Style.BreakBeforeCloseBracketLoop ||
+       Style.BreakBeforeCloseBracketSwitch) &&
+      Current.is(tok::r_paren) && State.Stack.size() > 1) {
+    return State.Stack[State.Stack.size() - 2].LastSpace;
+  }
   if (Style.BreakBeforeTemplateCloser && Current.is(TT_TemplateCloser) &&
       State.Stack.size() > 1) {
     return State.Stack[State.Stack.size() - 2].LastSpace;
@@ -1844,8 +1872,8 @@ void ContinuationIndenter::moveStatePastFakeLParens(LineState &State,
          PrecedenceLevel < prec::Assignment) &&
         (!Previous || Previous->isNot(tok::kw_return) ||
          (!Style.isJava() && PrecedenceLevel > 0)) &&
-        (Style.AlignAfterOpenBracket != FormatStyle::BAS_DontAlign ||
-         PrecedenceLevel > prec::Comma || Current.NestingLevel == 0) &&
+        (Style.AlignAfterOpenBracket || PrecedenceLevel > prec::Comma ||
+         Current.NestingLevel == 0) &&
         (!Style.isTableGen() ||
          (Previous && Previous->isOneOf(TT_TableGenDAGArgListComma,
                                         TT_TableGenDAGArgListCommaToBreak)))) {
@@ -1885,8 +1913,7 @@ void ContinuationIndenter::moveStatePastFakeLParens(LineState &State,
     if (PrecedenceLevel > prec::Unknown)
       NewParenState.LastSpace = std::max(NewParenState.LastSpace, State.Column);
     if (PrecedenceLevel != prec::Conditional &&
-        Current.isNot(TT_UnaryOperator) &&
-        Style.AlignAfterOpenBracket != FormatStyle::BAS_DontAlign) {
+        Current.isNot(TT_UnaryOperator) && Style.AlignAfterOpenBracket) {
       NewParenState.StartOfFunctionCall = State.Column;
     }
 
diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp
index edd126c7724b8..dd14fcd72922f 100644
--- a/clang/lib/Format/Format.cpp
+++ b/clang/lib/Format/Format.cpp
@@ -32,6 +32,13 @@ using clang::format::FormatStyle;
 
 LLVM_YAML_IS_SEQUENCE_VECTOR(FormatStyle::RawStringFormat)
 
+enum BracketAlignmentStyle : int8_t {
+  BAS_Align,
+  BAS_DontAlign,
+  BAS_AlwaysBreak,
+  BAS_BlockIndent
+};
+
 namespace llvm {
 namespace yaml {
 template <>
@@ -204,16 +211,16 @@ template <> struct MappingTraits<FormatStyle::BraceWrappingFlags> {
   }
 };
 
-template <> struct ScalarEnumerationTraits<FormatStyle::BracketAlignmentStyle> {
-  static void enumeration(IO &IO, FormatStyle::BracketAlignmentStyle &Value) {
-    IO.enumCase(Value, "Align", FormatStyle::BAS_Align);
-    IO.enumCase(Value, "DontAlign", FormatStyle::BAS_DontAlign);
-    IO.enumCase(Value, "AlwaysBreak", FormatStyle::BAS_AlwaysBreak);
-    IO.enumCase(Value, "BlockIndent", FormatStyle::BAS_BlockIndent);
+template <> struct ScalarEnumerationTraits<BracketAlignmentStyle> {
+  static void enumeration(IO &IO, BracketAlignmentStyle &Value) {
+    IO.enumCase(Value, "Align", BAS_Align);
+    IO.enumCase(Value, "DontAlign", BAS_DontAlign);
 
     // For backward compatibility.
-    IO.enumCase(Value, "true", FormatStyle::BAS_Align);
-    IO.enumCase(Value, "false", FormatStyle::BAS_DontAlign);
+    IO.enumCase(Value, "true", BAS_Align);
+    IO.enumCase(Value, "false", BAS_DontAlign);
+    IO.enumCase(Value, "AlwaysBreak", BAS_AlwaysBreak);
+    IO.enumCase(Value, "BlockIndent", BAS_BlockIndent);
   }
 };
 
@@ -979,6 +986,54 @@ template <> struct MappingTraits<FormatStyle> {
     bool SpacesInCStyleCastParentheses = false;
     bool SpacesInParentheses = false;
 
+    if (IO.outputting()) {
+      IO.mapOptional("AlignAfterOpenBracket", Style.AlignAfterOpenBracket);
+    } else {
+      // For backward compatibility.
+      BracketAlignmentStyle LocalBAS = BAS_Align;
+      if (IsGoogleOrChromium) {
+        FormatStyle::LanguageKind Language = Style.Language;
+        if (Language == FormatStyle::LK_None)
+          Language = ((FormatStyle *)IO.getContext())->Language;
+        if (Language == FormatStyle::LK_JavaScript)
+          LocalBAS = BAS_AlwaysBreak;
+        else if (Language == FormatStyle::LK_Java)
+          LocalBAS = BAS_DontAlign;
+      } else if (BasedOnStyle.equals_insensitive("webkit")) {
+        LocalBAS = BAS_DontAlign;
+      }
+      IO.mapOptional("AlignAfterOpenBracket", LocalBAS);
+      Style.BreakAfterOpenBracketBracedList = false;
+      Style.BreakAfterOpenBracketFunction = false;
+      Style.BreakAfterOpenBracketIf = false;
+      Style.BreakAfterOpenBracketLoop = false;
+      Style.BreakAfterOpenBracketSwitch = false;
+      Style.BreakBeforeCloseBracketBracedList = false;
+      Style.BreakBeforeCloseBracketFunction = false;
+      Style.BreakBeforeCloseBracketIf = false;
+      Style.BreakBeforeCloseBracketLoop = false;
+      Style.BreakBeforeCloseBracketSwitch = false;
+
+      switch (LocalBAS) {
+      case BAS_DontAlign:
+        Style.AlignAfterOpenBracket = false;
+        break;
+      case BAS_BlockIndent:
+        Style.BreakBeforeCloseBracketBracedList = true;
+        Style.BreakBeforeCloseBracketFunction = true;
+        Style.BreakBeforeCloseBracketIf = true;
+        [[fallthrough]];
+      case BAS_AlwaysBreak:
+        Style.BreakAfterOpenBracketBracedList = true;
+        Style.BreakAfterOpenBracketFunction = true;
+        Style.BreakAfterOpenBracketIf = true;
+        [[fallthrough]];
+      case BAS_Align:
+        Style.AlignAfterOpenBracket = true;
+        break;
+      }
+    }
+
     // For backward compatibility.
     if (!IO.outputting()) {
       IO.mapOptional("AlignEscapedNewlinesLeft", Style.AlignEscapedNewlines);
@@ -1014,7 +1069,6 @@ template <> struct MappingTraits<FormatStyle> {
     }
 
     IO.mapOptional("AccessModifierOffset", Style.AccessModifierOffset);
-    IO.mapOptional("AlignAfterOpenBracket", Style.AlignAfterOpenBracket);
     IO.mapOptional("AlignArrayOfStructures", Style.AlignArrayOfStructures);
     IO.mapOptional("AlignConsecutiveAssignments",
                    Style.AlignConsecutiveAssignments);
@@ -1079,10 +1133,29 @@ template <> struct MappingTraits<FormatStyle> {
     IO.mapOptional("BreakAfterAttributes", Style.BreakAfterAttributes);
     IO.mapOptional("BreakAfterJavaFieldAnnotations",
                    Style.BreakAfterJavaFieldAnnotations);
+    IO.mapOptional("BreakAfterOpenBracketBracedList",
+                   Style.BreakAfterOpenBracketBracedList);
+    IO.mapOptional("BreakAfterOpenBracketFunction",
+                   Style.BreakAfterOpenBracketFunction);
+    IO.mapOptional("BreakAfterOpenBracketIf", Style.BreakAfterOpenBracketIf);
+    IO.mapOptional("BreakAfterOpenBracketLoop",
+                   Style.BreakAfterOpenBracketLoop);
+    IO.mapOptional("BreakAfterOpenBracketSwitch",
+                   Style.BreakAfterOpenBracketSwitch);
     IO.mapOptional("BreakAfterReturnType", Style.BreakAfterReturnType);
     IO.mapOptional("BreakArrays", Style.BreakArrays);
     IO.mapOptional("BreakBeforeBinaryOperators",
                    Style.BreakBeforeBinaryOperators);
+    IO.mapOptional("BreakBeforeCloseBracketBracedList",
+                   Style.BreakBeforeCloseBracketBracedList);
+    IO.mapOptional("BreakBeforeCloseBracketFunction",
+                   Style.BreakBeforeCloseBracketFunction);
+    IO.mapOptional("BreakBeforeCloseBracketIf",
+                   Style.BreakBeforeCloseBracketIf);
+    IO.mapOptional("BreakBeforeCloseBracketLoop",
+                   Style.BreakBeforeCloseBracketLoop);
+    IO.mapOptional("BreakBeforeCloseBracketSwitch",
+                   Style.BreakBeforeCloseBracketSwitch);
     IO.mapOptional("BreakBeforeConceptDeclarations",
                    Style.BreakBeforeConceptDeclarations);
     IO.mapOptional("BreakBeforeBraces", Style.BreakBeforeBraces);
@@ -1561,7 +1634,7 @@ static void expandPresetsSpacesInParens(FormatStyle &Expanded) {
 FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) {
   FormatStyle LLVMStyle;
   LLVMStyle.AccessModifierOffset = -2;
-  LLVMStyle.AlignAfterOpenBracket = FormatStyle::BAS_Align;
+  LLVMStyle.AlignAfterOpenBracket = true;
   LLVMStyle.AlignArrayOfStructures = FormatStyle::AIAS_None;
   LLVMStyle.AlignConsecutiveAssignments = {};
   LLVMStyle.AlignConsecutiveAssignments.PadOperators = true;
@@ -1621,10 +1694,20 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) {
   LLVMStyle.BreakAdjacentStringLiterals = true;
   LLVMStyle.BreakAfterAttributes = FormatStyle::ABS_Leave;
   LLVMStyle.BreakAfterJavaFieldAnnotations = false;
+  LLVMStyle.BreakAfterOpenBracketBracedList = false;
+  LLVMStyle.BreakAfterOpenBracketFunction = false;
+  LLVMStyle.BreakAfterOpenBracketIf = false;
+  LLVMStyle.BreakAfterOpenBracketLoop = false;
+  LLVMStyle.BreakAfterOpenBracketSwitch = false;
   LLVMStyle.BreakAfterReturnType = FormatStyle::RTBS_None;
   LLVMStyle.BreakArrays = true;
   LLVMStyle.BreakBeforeBinaryOperators = FormatStyle::BOS_None;
   LLVMStyle.BreakBeforeBraces = FormatStyle::BS_Attach;
+  LLVMStyle.BreakBeforeCloseBracketBracedList = false;
+  LLVMStyle.BreakBeforeCloseBracketFunction = false;
+  LLVMStyle.BreakBeforeCloseBracketIf = false;
+  LLVMStyle.BreakBeforeCloseBracketLoop = false;
+  LLVMStyle.BreakBeforeCloseBracketSwitch = false;
   LLVMStyle.BreakBeforeConceptDeclarations = FormatStyle::BBCDS_Always;
   LLVMStyle.BreakBeforeInlineASMColon = FormatStyle::BBIAS_OnlyMultiline;
   LLVMStyle.BreakBeforeTemplateCloser = false;
@@ -1877,7 +1960,7 @@ FormatStyle getGoogleStyle(FormatStyle::LanguageKind Language) {
   GoogleStyle.PenaltyReturnTypeOnItsOwnLine = 200;
 
   if (Language == FormatStyle::LK_Java) {
-    GoogleStyle.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign;
+    GoogleStyle.AlignAfterOpenBracket = false;
     GoogleStyle.AlignOperands = FormatStyle::OAS_DontAlign;
     GoogleStyle.AlignTrailingComments = {};
     GoogleStyle.AlignTrailingComments.Kind = FormatStyle::TCAS_Never;
@@ -1889,7 +1972,9 @@ FormatStyle getGoogleStyle(FormatStyle::LanguageKind Language) {
     GoogleStyle.SpaceAfterCStyleCast = true;
     GoogleStyle.SpacesBeforeTrailingComments = 1;
   } else if (Language == FormatStyle::LK_JavaScript) {
-    GoogleStyle.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
+    GoogleStyle.BreakAfterOpenBracketBracedList = true;
+    GoogleStyle.BreakAfterOpenBracketFunction = true;
+    GoogleStyle.BreakAfterOpenBracketIf = true;
     GoogleStyle.AlignOperands = FormatStyle::OAS_DontAlign;
     GoogleStyle.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_Empty;
     // TODO: still under discussion whether to switch to SLS_All.
@@ -2026,7 +2111,7 @@ FormatStyle getMozillaStyle() {
 FormatStyle getWebKitStyle() {
   FormatStyle Style = getLLVMStyle();
   Style.AccessModifierOffset = -4;
-  Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign;
+  Style.AlignAfterOpenBracket = false;
   Style.AlignOperands = FormatStyle::OAS_DontAlign;
   Style.AlignTrailingComments = {};
   Style.AlignTrailingComments.Kind = FormatStyle::TCAS_Never;
diff --git a/clang/lib/Format/FormatToken.cpp b/clang/lib/Format/FormatToken.cpp
index d1c62642efd43..28fdbcbf0e47f 100644
--- a/clang/lib/Format/FormatToken.cpp
+++ b/clang/lib/Format/FormatToken.cpp
@@ -68,7 +68,7 @@ bool FormatToken::isBlockIndentedInitRBrace(const FormatStyle &Style) const {
   assert(MatchingParen);
   assert(MatchingParen->is(tok::l_brace));
   if (Style.Cpp11BracedListStyle == FormatStyle::BLS_Block ||
-      Style.AlignAfterOpenBracket != FormatStyle::BAS_BlockIndent) {
+      !Style.BreakBeforeCloseBracketBracedList) {
     return false;
   }
   const auto *LBrace = MatchingParen;
@@ -198,7 +198,7 @@ void CommaSeparatedList::precomputeFormattingInfos(const FormatToken *Token) {
     return;
 
   // Column format doesn't really make sense if we don't align after brackets.
-  if (Style.AlignAfterOpenBracket == FormatStyle::BAS_DontAlign)
+  if (!Style.AlignAfterOpenBracket)
     return;
 
   FormatToken *ItemBegin = Token->Next;
diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h
index 6f3d24aefc1ca..d833130a538f1 100644
--- a/clang/lib/Format/FormatToken.h
+++ b/clang/lib/Format/FormatToken.h
@@ -666,6 +666,12 @@ struct FormatToken {
            (endsSequence(tok::identifier, tok::kw_if) && AllowConstexprMacro);
   }
 
+  bool isLoop(const FormatStyle &Style) const {
+    return isOneOf(tok::kw_for, tok::kw_while) ||
+           (Style.isJavaScript() && isNot(tok::l_paren) && Previous &&
+            Previous->is(tok::kw_for));
+  }
+
   bool closesScopeAfterBlock() const {
     if (getBlockKind() == BK_Block)
       return true;
diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp
index ab3293841a2a4..a9ea5ec9009c4 100644
--- a/clang/lib/Format/FormatTokenLexer.cpp
+++ b/clang/lib/Format/FormatTokenLexer.cpp
@@ -318,14 +318,21 @@ void FormatTokenLexer::tryMergePreviousTokens() {
                            {tok::equal, tok::greater},
                            {tok::star, tok::greater},
                            {tok::pipeequal, tok::greater},
-                           {tok::pipe, tok::arrow},
-                           {tok::hash, tok::minus, tok::hash},
-                           {tok::hash, tok::equal, tok::hash}},
+                           {tok::pipe, tok::arrow}},
                           TT_BinaryOperator) ||
         Tokens.back()->is(tok::arrow)) {
       Tokens.back()->ForcedPrecedence = prec::Comma;
       return;
     }
+    if (Tokens.size() >= 3 &&
+        Tokens[Tokens.size() - 3]->is(Keywords.kw_verilogHash) &&
+        Tokens[Tokens.size() - 2]->isOneOf(tok::minus, tok::equal) &&
+        Tokens[Tokens.size() - 1]->is(Keywords.kw_verilogHash) &&
+        tryMergeTokens(3, TT_BinaryOperator)) {
+      Tokens.back()->setFinalizedType(TT_BinaryOperator);
+      Tokens.back()->ForcedPrecedence = prec::Comma;
+      return;
+    }
   } else if (Style.isTableGen()) {
     // TableGen's Multi line string starts with [{
     if (tryMergeTokens({tok::l_square, tok::l_brace},
diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index 1d0dfd0b9c151..cb41756c56bf7 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -358,11 +358,11 @@ class AnnotatingParser {
       Contexts.back().IsExpression = false;
     } else if (OpeningParen.Previous &&
                (OpeningParen.Previous->isOneOf(
-                    tok::kw_static_assert, tok::kw_noexcept, tok::kw_explicit,
-                    tok::kw_while, tok::l_paren, tok::comma, TT_CastRParen,
+                    tok::kw_noexcept, tok::kw_explicit, tok::kw_while,
+                    tok::l_paren, tok::comma, TT_CastRParen,
                     TT_BinaryOperator) ||
                 OpeningParen.Previous->isIf())) {
-      // static_assert, if and while usually contain expressions.
+      // if and while usually contain expressions.
       Contexts.back().IsExpression = true;
     } else if (Style.isJavaScript() && OpeningParen.Previous &&
                (OpeningParen.Previous->is(Keywords.kw_function) ||
@@ -454,6 +454,11 @@ class AnnotatingParser {
     if (StartsObjCSelector)
       OpeningParen.setType(TT_ObjCSelector);
 
+    const bool IsStaticAssert =
+        PrevNonComment && PrevNonComment->is(tok::kw_static_assert);
+    if (IsStaticAssert)
+      Contexts.back().InStaticAssertFirstArgument = true;
+
     // MightBeFunctionType and ProbablyFunctionType are used for
     // function pointer and reference types as well as Objective-C
     // block types:
@@ -583,8 +588,12 @@ class AnnotatingParser {
       }
       // When we discover a 'new', we set CanBeExpression to 'false' in order to
       // parse the type correctly. Reset that after a comma.
-      if (CurrentToken->is(tok::comma))
-        Contexts.back().CanBeExpression = true;
+      if (CurrentToken->is(tok::comma)) {
+        if (IsStaticAssert)
+          Contexts.back().InStaticAssertFirstArgument = false;
+        else
+          Contexts.back().CanBeExpression = true;
+      }
 
       if (Style.isTableGen()) {
         if (CurrentToken->is(tok::comma)) {
@@ -2144,6 +2153,7 @@ class AnnotatingParser {
     bool CaretFound = false;
     bool InCpp11AttributeSpecifier = false;
     bool InCSharpAttributeSpecifier = false;
+    bool InStaticAssertFirstArgument = false;
     bool VerilogAssignmentFound = false;
     // Whether the braces may mean concatenation instead of structure or array
     // literal.
@@ -2440,7 +2450,8 @@ class AnnotatingParser {
     } else if (Current.isPointerOrReference()) {
       Current.setType(determineStarAmpUsage(
           Current,
-          Contexts.back().CanBeExpression && Contexts.back().IsExpression,
+          (Contexts.back().CanBeExpression && Contexts.back().IsExpression) ||
+              Contexts.back().InStaticAssertFirstArgument,
           Contexts.back().ContextType == Context::TemplateArgument));
     } else if (Current.isOneOf(tok::minus, tok::plus, tok::caret) ||
                (Style.isVerilog() && Current.is(tok::pipe))) {
@@ -2674,8 +2685,11 @@ class AnnotatingParser {
     }
 
     // *a or &a or &&a.
-    if (PreviousNotConst->is(TT_PointerOrReference))
+    if (PreviousNotConst->is(TT_PointerOrReference) ||
+        PreviousNotConst->endsSequence(tok::coloncolon,
+                                       TT_PointerOrReference)) {
       return true;
+    }
 
     // MyClass a;
     if (PreviousNotConst->isTypeName(LangOpts))
@@ -4424,10 +4438,8 @@ unsigned TokenAnnotator::splitPenalty(const AnnotatedLine &Line,
 
   if (Left.is(tok::l_paren) && Style.PenaltyBreakOpenParenthesis != 0)
     return Style.PenaltyBreakOpenParenthesis;
-  if (Left.is(tok::l_paren) && InFunctionDecl &&
-      Style.AlignAfterOpenBracket != FormatStyle::BAS_DontAlign) {
+  if (Left.is(tok::l_paren) && InFunctionDecl && Style.AlignAfterOpenBracket)
     return 100;
-  }
   if (Left.is(tok::l_paren) && Left.Previous &&
       (Left.Previous->isOneOf(tok::kw_for, tok::kw__Generic) ||
        Left.Previous->isIf())) {
@@ -4443,7 +4455,7 @@ unsigned TokenAnnotator::splitPenalty(const AnnotatedLine &Line,
     // If we aren't aligning after opening parens/braces we can always break
     // here unless the style does not want us to place all arguments on the
     // next line.
-    if (Style.AlignAfterOpenBracket == FormatStyle::BAS_DontAlign &&
+    if (!Style.AlignAfterOpenBracket &&
         (Left.ParameterCount <= 1 || Style.AllowAllArgumentsOnNextLine)) {
       return 0;
     }
@@ -6223,24 +6235,31 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line,
                                    (Right.isBlockIndentedInitRBrace(Style)));
   }
 
-  // We only break before r_paren if we're in a block indented context.
+  // We can break before r_paren if we're in a block indented context or
+  // a control statement with an explicit style option.
   if (Right.is(tok::r_paren)) {
-    if (Style.AlignAfterOpenBracket != FormatStyle::BAS_BlockIndent ||
-        !Right.MatchingParen) {
+    if (!Right.MatchingParen)
       return false;
-    }
     auto Next = Right.Next;
     if (Next && Next->is(tok::r_paren))
       Next = Next->Next;
     if (Next && Next->is(tok::l_paren))
       return false;
     const FormatToken *Previous = Right.MatchingParen->Previous;
-    return !(Previous && (Previous->is(tok::kw_for) || Previous->isIf()));
+    if (!Previous)
+      return false;
+    if (Previous->isIf())
+      return Style.BreakBeforeCloseBracketIf;
+    if (Previous->isLoop(Style))
+      return Style.BreakBeforeCloseBracketLoop;
+    if (Previous->is(tok::kw_switch))
+      return Style.BreakBeforeCloseBracketSwitch;
+    return Style.BreakBeforeCloseBracketFunction;
   }
 
   if (Left.isOneOf(tok::r_paren, TT_TrailingAnnotation) &&
       Right.is(TT_TrailingAnnotation) &&
-      Style.AlignAfterOpenBracket == FormatStyle::BAS_BlockIndent) {
+      Style.BreakBeforeCloseBracketFunction) {
     return false;
   }
 
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index bd36eb4ecf9da..be7c1d367e082 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -4049,18 +4049,18 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
   // -cl-std only applies for OpenCL language standards.
   // Override the -std option in this case.
   if (const Arg *A = Args.getLastArg(OPT_cl_std_EQ)) {
-    LangStandard::Kind OpenCLLangStd
-      = llvm::StringSwitch<LangStandard::Kind>(A->getValue())
-        .Cases("cl", "CL", LangStandard::lang_opencl10)
-        .Cases("cl1.0", "CL1.0", LangStandard::lang_opencl10)
-        .Cases("cl1.1", "CL1.1", LangStandard::lang_opencl11)
-        .Cases("cl1.2", "CL1.2", LangStandard::lang_opencl12)
-        .Cases("cl2.0", "CL2.0", LangStandard::lang_opencl20)
-        .Cases("cl3.0", "CL3.0", LangStandard::lang_opencl30)
-        .Cases("clc++", "CLC++", LangStandard::lang_openclcpp10)
-        .Cases("clc++1.0", "CLC++1.0", LangStandard::lang_openclcpp10)
-        .Cases("clc++2021", "CLC++2021", LangStandard::lang_openclcpp2021)
-        .Default(LangStandard::lang_unspecified);
+    LangStandard::Kind OpenCLLangStd =
+        llvm::StringSwitch<LangStandard::Kind>(A->getValue())
+            .Cases({"cl", "CL"}, LangStandard::lang_opencl10)
+            .Cases({"cl1.0", "CL1.0"}, LangStandard::lang_opencl10)
+            .Cases({"cl1.1", "CL1.1"}, LangStandard::lang_opencl11)
+            .Cases({"cl1.2", "CL1.2"}, LangStandard::lang_opencl12)
+            .Cases({"cl2.0", "CL2.0"}, LangStandard::lang_opencl20)
+            .Cases({"cl3.0", "CL3.0"}, LangStandard::lang_opencl30)
+            .Cases({"clc++", "CLC++"}, LangStandard::lang_openclcpp10)
+            .Cases({"clc++1.0", "CLC++1.0"}, LangStandard::lang_openclcpp10)
+            .Cases({"clc++2021", "CLC++2021"}, LangStandard::lang_openclcpp2021)
+            .Default(LangStandard::lang_unspecified);
 
     if (OpenCLLangStd == LangStandard::lang_unspecified) {
       Diags.Report(diag::err_drv_invalid_value)
@@ -4600,7 +4600,8 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
         // Validate that if fnative-half-type is given, that
         // the language standard is at least hlsl2018, and that
         // the target shader model is at least 6.2.
-        if (Args.getLastArg(OPT_fnative_half_type)) {
+        if (Args.getLastArg(OPT_fnative_half_type) ||
+            Args.getLastArg(OPT_fnative_int16_type)) {
           const LangStandard &Std =
               LangStandard::getLangStandardForKind(Opts.LangStd);
           if (!(Opts.LangStd >= LangStandard::lang_hlsl2018 &&
@@ -4614,12 +4615,16 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
           Diags.Report(diag::err_drv_hlsl_bad_shader_unsupported)
               << VulkanEnv << T.getOSName() << T.str();
         }
-        if (Args.getLastArg(OPT_fnative_half_type)) {
+        if (Args.getLastArg(OPT_fnative_half_type) ||
+            Args.getLastArg(OPT_fnative_int16_type)) {
+          const char *Str = Args.getLastArg(OPT_fnative_half_type)
+                                ? "-fnative-half-type"
+                                : "-fnative-int16-type";
           const LangStandard &Std =
               LangStandard::getLangStandardForKind(Opts.LangStd);
           if (!(Opts.LangStd >= LangStandard::lang_hlsl2018))
             Diags.Report(diag::err_drv_hlsl_16bit_types_unsupported)
-                << "-fnative-half-type" << false << Std.getName();
+                << Str << false << Std.getName();
         }
       } else {
         llvm_unreachable("expected DXIL or SPIR-V target");
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index 47f1d5a6b636c..8602be1d8a173 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -399,7 +399,7 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI,
     Builder.defineMacro("__HLSL_202y",
                         Twine((unsigned)LangOptions::HLSLLangStd::HLSL_202y));
 
-    if (LangOpts.NativeHalfType)
+    if (LangOpts.NativeHalfType && LangOpts.NativeInt16Type)
       Builder.defineMacro("__HLSL_ENABLE_16_BIT", "1");
 
     // Shader target information
diff --git a/clang/lib/Frontend/TextDiagnostic.cpp b/clang/lib/Frontend/TextDiagnostic.cpp
index 58885712fbdcc..aea3e72d92a84 100644
--- a/clang/lib/Frontend/TextDiagnostic.cpp
+++ b/clang/lib/Frontend/TextDiagnostic.cpp
@@ -17,28 +17,21 @@
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Locale.h"
-#include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <optional>
 
 using namespace clang;
 
-static const enum raw_ostream::Colors noteColor = raw_ostream::CYAN;
-static const enum raw_ostream::Colors remarkColor =
-  raw_ostream::BLUE;
-static const enum raw_ostream::Colors fixitColor =
-  raw_ostream::GREEN;
-static const enum raw_ostream::Colors caretColor =
-  raw_ostream::GREEN;
-static const enum raw_ostream::Colors warningColor =
-  raw_ostream::MAGENTA;
-static const enum raw_ostream::Colors templateColor =
-  raw_ostream::CYAN;
-static const enum raw_ostream::Colors errorColor = raw_ostream::RED;
-static const enum raw_ostream::Colors fatalColor = raw_ostream::RED;
+static constexpr raw_ostream::Colors NoteColor = raw_ostream::CYAN;
+static constexpr raw_ostream::Colors RemarkColor = raw_ostream::BLUE;
+static constexpr raw_ostream::Colors FixitColor = raw_ostream::GREEN;
+static constexpr raw_ostream::Colors CaretColor = raw_ostream::GREEN;
+static constexpr raw_ostream::Colors WarningColor = raw_ostream::MAGENTA;
+static constexpr raw_ostream::Colors TemplateColor = raw_ostream::CYAN;
+static constexpr raw_ostream::Colors ErrorColor = raw_ostream::RED;
+static constexpr raw_ostream::Colors FatalColor = raw_ostream::RED;
 // Used for changing only the bold attribute.
-static const enum raw_ostream::Colors savedColor =
-  raw_ostream::SAVEDCOLOR;
+static constexpr raw_ostream::Colors SavedColor = raw_ostream::SAVEDCOLOR;
 
 // Magenta is taken for 'warning'. Red is already 'error' and 'cyan'
 // is already taken for 'note'. Green is already used to underline
@@ -48,6 +41,43 @@ static constexpr raw_ostream::Colors CommentColor = raw_ostream::YELLOW;
 static constexpr raw_ostream::Colors LiteralColor = raw_ostream::GREEN;
 static constexpr raw_ostream::Colors KeywordColor = raw_ostream::BLUE;
 
+namespace {
+template <typename Sub> class ColumnsOrBytes {
+public:
+  int V = 0;
+  ColumnsOrBytes(int V) : V(V) {}
+  bool isValid() const { return V != -1; }
+  Sub next() const { return Sub(V + 1); }
+  Sub prev() const { return Sub(V - 1); }
+
+  bool operator>(Sub O) const { return V > O.V; }
+  bool operator<(Sub O) const { return V < O.V; }
+  bool operator<=(Sub B) const { return V <= B.V; }
+  bool operator!=(Sub C) const { return C.V != V; }
+
+  Sub operator+(Sub B) const { return Sub(V + B.V); }
+  Sub &operator+=(Sub B) {
+    V += B.V;
+    return *static_cast<Sub *>(this);
+  }
+  Sub operator-(Sub B) const { return Sub(V - B.V); }
+  Sub &operator-=(Sub B) {
+    V -= B.V;
+    return *static_cast<Sub *>(this);
+  }
+};
+
+class Bytes final : public ColumnsOrBytes<Bytes> {
+public:
+  Bytes(int V) : ColumnsOrBytes(V) {}
+};
+
+class Columns final : public ColumnsOrBytes<Columns> {
+public:
+  Columns(int V) : ColumnsOrBytes(V) {}
+};
+} // namespace
+
 /// Add highlights to differences in template strings.
 static void applyTemplateHighlighting(raw_ostream &OS, StringRef Str,
                                       bool &Normal, bool Bold) {
@@ -59,11 +89,11 @@ static void applyTemplateHighlighting(raw_ostream &OS, StringRef Str,
 
     Str = Str.substr(Pos + 1);
     if (Normal)
-      OS.changeColor(templateColor, true);
+      OS.changeColor(TemplateColor, true);
     else {
       OS.resetColor();
       if (Bold)
-        OS.changeColor(savedColor, true);
+        OS.changeColor(SavedColor, true);
     }
     Normal = !Normal;
   }
@@ -110,8 +140,8 @@ printableTextForNextCharacter(StringRef SourceLine, size_t *I,
   if (SourceLine[*I] == '\t') {
     assert(0 < TabStop && TabStop <= DiagnosticOptions::MaxTabStop &&
            "Invalid -ftabstop value");
-    unsigned Col = bytesSincePreviousTabOrLineBegin(SourceLine, *I);
-    unsigned NumSpaces = TabStop - (Col % TabStop);
+    unsigned LineBytes = bytesSincePreviousTabOrLineBegin(SourceLine, *I);
+    unsigned NumSpaces = TabStop - (LineBytes % TabStop);
     assert(0 < NumSpaces && NumSpaces <= TabStop
            && "Invalid computation of space amt");
     ++(*I);
@@ -221,97 +251,99 @@ static void expandTabs(std::string &SourceLine, unsigned TabStop) {
 ///  (\\u3042 is represented in UTF-8 by three bytes and takes two columns to
 ///   display)
 static void genColumnByteMapping(StringRef SourceLine, unsigned TabStop,
-                                 SmallVectorImpl<int> &BytesOut,
-                                 SmallVectorImpl<int> &ColumnsOut) {
+                                 SmallVectorImpl<Bytes> &BytesOut,
+                                 SmallVectorImpl<Columns> &ColumnsOut) {
   assert(BytesOut.empty());
   assert(ColumnsOut.empty());
 
   if (SourceLine.empty()) {
-    BytesOut.resize(1u, 0);
-    ColumnsOut.resize(1u, 0);
+    BytesOut.resize(1u, Bytes(0));
+    ColumnsOut.resize(1u, Columns(0));
     return;
   }
 
   ColumnsOut.resize(SourceLine.size() + 1, -1);
 
-  int Columns = 0;
+  Columns NumColumns = 0;
   size_t I = 0;
   while (I < SourceLine.size()) {
-    ColumnsOut[I] = Columns;
-    BytesOut.resize(Columns + 1, -1);
-    BytesOut.back() = I;
+    ColumnsOut[I] = NumColumns;
+    BytesOut.resize(NumColumns.V + 1, -1);
+    BytesOut.back() = Bytes(I);
     auto [Str, Printable] =
         printableTextForNextCharacter(SourceLine, &I, TabStop);
-    Columns += llvm::sys::locale::columnWidth(Str);
+    NumColumns += Columns(llvm::sys::locale::columnWidth(Str));
   }
 
-  ColumnsOut.back() = Columns;
-  BytesOut.resize(Columns + 1, -1);
-  BytesOut.back() = I;
+  ColumnsOut.back() = NumColumns;
+  BytesOut.resize(NumColumns.V + 1, -1);
+  BytesOut.back() = Bytes(I);
 }
 
 namespace {
 struct SourceColumnMap {
   SourceColumnMap(StringRef SourceLine, unsigned TabStop)
-  : m_SourceLine(SourceLine) {
+      : SourceLine(SourceLine) {
 
-    genColumnByteMapping(SourceLine, TabStop, m_columnToByte, m_byteToColumn);
+    genColumnByteMapping(SourceLine, TabStop, ColumnToByte, ByteToColumn);
 
-    assert(m_byteToColumn.size()==SourceLine.size()+1);
-    assert(0 < m_byteToColumn.size() && 0 < m_columnToByte.size());
-    assert(m_byteToColumn.size()
-           == static_cast<unsigned>(m_columnToByte.back()+1));
-    assert(static_cast<unsigned>(m_byteToColumn.back()+1)
-           == m_columnToByte.size());
+    assert(ByteToColumn.size() == SourceLine.size() + 1);
+    assert(0 < ByteToColumn.size() && 0 < ColumnToByte.size());
+    assert(ByteToColumn.size() ==
+           static_cast<unsigned>(ColumnToByte.back().V + 1));
+    assert(static_cast<unsigned>(ByteToColumn.back().V + 1) ==
+           ColumnToByte.size());
   }
-  int columns() const { return m_byteToColumn.back(); }
-  int bytes() const { return m_columnToByte.back(); }
+  Columns columns() const { return ByteToColumn.back(); }
+  Bytes bytes() const { return ColumnToByte.back(); }
 
   /// Map a byte to the column which it is at the start of, or return -1
   /// if it is not at the start of a column (for a UTF-8 trailing byte).
-  int byteToColumn(int n) const {
-    assert(0<=n && n<static_cast<int>(m_byteToColumn.size()));
-    return m_byteToColumn[n];
+  Columns byteToColumn(Bytes N) const {
+    assert(0 <= N.V && N.V < static_cast<int>(ByteToColumn.size()));
+    return ByteToColumn[N.V];
   }
 
   /// Map a byte to the first column which contains it.
-  int byteToContainingColumn(int N) const {
-    assert(0 <= N && N < static_cast<int>(m_byteToColumn.size()));
-    while (m_byteToColumn[N] == -1)
-      --N;
-    return m_byteToColumn[N];
+  Columns byteToContainingColumn(Bytes N) const {
+    assert(0 <= N.V && N.V < static_cast<int>(ByteToColumn.size()));
+    while (!ByteToColumn[N.V].isValid())
+      --N.V;
+    return ByteToColumn[N.V];
   }
 
   /// Map a column to the byte which starts the column, or return -1 if
   /// the column the second or subsequent column of an expanded tab or similar
   /// multi-column entity.
-  int columnToByte(int n) const {
-    assert(0<=n && n<static_cast<int>(m_columnToByte.size()));
-    return m_columnToByte[n];
+  Bytes columnToByte(Columns N) const {
+    assert(0 <= N.V && N.V < static_cast<int>(ColumnToByte.size()));
+    return ColumnToByte[N.V];
   }
 
   /// Map from a byte index to the next byte which starts a column.
-  int startOfNextColumn(int N) const {
-    assert(0 <= N && N < static_cast<int>(m_byteToColumn.size() - 1));
-    while (byteToColumn(++N) == -1) {}
+  Bytes startOfNextColumn(Bytes N) const {
+    assert(0 <= N.V && N.V < static_cast<int>(ByteToColumn.size() - 1));
+    N = N.next();
+    while (!byteToColumn(N).isValid())
+      N = N.next();
     return N;
   }
 
   /// Map from a byte index to the previous byte which starts a column.
-  int startOfPreviousColumn(int N) const {
-    assert(0 < N && N < static_cast<int>(m_byteToColumn.size()));
-    while (byteToColumn(--N) == -1) {}
+  Bytes startOfPreviousColumn(Bytes N) const {
+    assert(0 < N.V && N.V < static_cast<int>(ByteToColumn.size()));
+    N = N.prev();
+    while (!byteToColumn(N).isValid())
+      N = N.prev();
     return N;
   }
 
-  StringRef getSourceLine() const {
-    return m_SourceLine;
-  }
+  StringRef getSourceLine() const { return SourceLine; }
 
 private:
-  const std::string m_SourceLine;
-  SmallVector<int,200> m_byteToColumn;
-  SmallVector<int,200> m_columnToByte;
+  StringRef SourceLine;
+  SmallVector<Columns, 200> ByteToColumn;
+  SmallVector<Bytes, 200> ColumnToByte;
 };
 } // end anonymous namespace
 
@@ -320,14 +352,15 @@ struct SourceColumnMap {
 static void selectInterestingSourceRegion(std::string &SourceLine,
                                           std::string &CaretLine,
                                           std::string &FixItInsertionLine,
-                                          unsigned Columns,
-                                          const SourceColumnMap &map) {
-  unsigned CaretColumns = CaretLine.size();
-  unsigned FixItColumns = llvm::sys::locale::columnWidth(FixItInsertionLine);
-  unsigned MaxColumns = std::max(static_cast<unsigned>(map.columns()),
-                                 std::max(CaretColumns, FixItColumns));
+                                          Columns NonGutterColumns,
+                                          const SourceColumnMap &Map) {
+  Columns CaretColumns = Columns(CaretLine.size());
+  Columns FixItColumns =
+      Columns(llvm::sys::locale::columnWidth(FixItInsertionLine));
+  Columns MaxColumns =
+      std::max({Map.columns().V, CaretColumns.V, FixItColumns.V});
   // if the number of columns is less than the desired number we're done
-  if (MaxColumns <= Columns)
+  if (MaxColumns <= NonGutterColumns)
     return;
 
   // No special characters are allowed in CaretLine.
@@ -335,13 +368,13 @@ static void selectInterestingSourceRegion(std::string &SourceLine,
 
   // Find the slice that we need to display the full caret line
   // correctly.
-  unsigned CaretStart = 0, CaretEnd = CaretLine.size();
-  for (; CaretStart != CaretEnd; ++CaretStart)
-    if (!isWhitespace(CaretLine[CaretStart]))
+  Columns CaretStart = 0, CaretEnd = CaretLine.size();
+  for (; CaretStart != CaretEnd; CaretStart = CaretStart.next())
+    if (!isWhitespace(CaretLine[CaretStart.V]))
       break;
 
-  for (; CaretEnd != CaretStart; --CaretEnd)
-    if (!isWhitespace(CaretLine[CaretEnd - 1]))
+  for (; CaretEnd != CaretStart; CaretEnd = CaretEnd.prev())
+    if (!isWhitespace(CaretLine[CaretEnd.V - 1]))
       break;
 
   // caret has already been inserted into CaretLine so the above whitespace
@@ -350,39 +383,38 @@ static void selectInterestingSourceRegion(std::string &SourceLine,
   // If we have a fix-it line, make sure the slice includes all of the
   // fix-it information.
   if (!FixItInsertionLine.empty()) {
-    unsigned FixItStart = 0, FixItEnd = FixItInsertionLine.size();
-    for (; FixItStart != FixItEnd; ++FixItStart)
-      if (!isWhitespace(FixItInsertionLine[FixItStart]))
-        break;
-
-    for (; FixItEnd != FixItStart; --FixItEnd)
-      if (!isWhitespace(FixItInsertionLine[FixItEnd - 1]))
-        break;
-
     // We can safely use the byte offset FixItStart as the column offset
     // because the characters up until FixItStart are all ASCII whitespace
     // characters.
-    unsigned FixItStartCol = FixItStart;
-    unsigned FixItEndCol
-      = llvm::sys::locale::columnWidth(FixItInsertionLine.substr(0, FixItEnd));
-
-    CaretStart = std::min(FixItStartCol, CaretStart);
-    CaretEnd = std::max(FixItEndCol, CaretEnd);
+    Bytes FixItStart = 0;
+    Bytes FixItEnd = Bytes(FixItInsertionLine.size());
+    while (FixItStart != FixItEnd &&
+           isWhitespace(FixItInsertionLine[FixItStart.V]))
+      FixItStart = FixItStart.next();
+
+    while (FixItEnd != FixItStart &&
+           isWhitespace(FixItInsertionLine[FixItEnd.V - 1]))
+      FixItEnd = FixItEnd.prev();
+
+    Columns FixItStartCol = Columns(FixItStart.V);
+    Columns FixItEndCol = Columns(llvm::sys::locale::columnWidth(
+        FixItInsertionLine.substr(0, FixItEnd.V)));
+
+    CaretStart = std::min(FixItStartCol.V, CaretStart.V);
+    CaretEnd = std::max(FixItEndCol.V, CaretEnd.V);
   }
 
   // CaretEnd may have been set at the middle of a character
   // If it's not at a character's first column then advance it past the current
   //   character.
-  while (static_cast<int>(CaretEnd) < map.columns() &&
-         -1 == map.columnToByte(CaretEnd))
-    ++CaretEnd;
-
-  assert((static_cast<int>(CaretStart) > map.columns() ||
-          -1!=map.columnToByte(CaretStart)) &&
-         "CaretStart must not point to a column in the middle of a source"
-         " line character");
-  assert((static_cast<int>(CaretEnd) > map.columns() ||
-          -1!=map.columnToByte(CaretEnd)) &&
+  while (CaretEnd < Map.columns() && !Map.columnToByte(CaretEnd).isValid())
+    CaretEnd = CaretEnd.next();
+
+  assert(
+      (CaretStart > Map.columns() || Map.columnToByte(CaretStart).isValid()) &&
+      "CaretStart must not point to a column in the middle of a source"
+      " line character");
+  assert((CaretEnd > Map.columns() || Map.columnToByte(CaretEnd).isValid()) &&
          "CaretEnd must not point to a column in the middle of a source line"
          " character");
 
@@ -391,70 +423,69 @@ static void selectInterestingSourceRegion(std::string &SourceLine,
   // number of columns we have, try to grow the slice to encompass
   // more context.
 
-  unsigned SourceStart = map.columnToByte(std::min<unsigned>(CaretStart,
-                                                             map.columns()));
-  unsigned SourceEnd = map.columnToByte(std::min<unsigned>(CaretEnd,
-                                                           map.columns()));
+  Bytes SourceStart = Map.columnToByte(std::min(CaretStart.V, Map.columns().V));
+  Bytes SourceEnd = Map.columnToByte(std::min(CaretEnd.V, Map.columns().V));
 
-  unsigned CaretColumnsOutsideSource = CaretEnd-CaretStart
-    - (map.byteToColumn(SourceEnd)-map.byteToColumn(SourceStart));
+  Columns CaretColumnsOutsideSource =
+      CaretEnd - CaretStart -
+      (Map.byteToColumn(SourceEnd) - Map.byteToColumn(SourceStart));
 
-  char const *front_ellipse = "  ...";
-  char const *front_space   = "     ";
-  char const *back_ellipse = "...";
-  unsigned ellipses_space = strlen(front_ellipse) + strlen(back_ellipse);
+  constexpr StringRef FrontEllipse = "  ...";
+  constexpr StringRef FrontSpace = "     ";
+  constexpr StringRef BackEllipse = "...";
+  Columns EllipsesColumns = Columns(FrontEllipse.size() + BackEllipse.size());
 
-  unsigned TargetColumns = Columns;
+  Columns TargetColumns = NonGutterColumns;
   // Give us extra room for the ellipses
   //  and any of the caret line that extends past the source
-  if (TargetColumns > ellipses_space+CaretColumnsOutsideSource)
-    TargetColumns -= ellipses_space+CaretColumnsOutsideSource;
+  if (TargetColumns > EllipsesColumns + CaretColumnsOutsideSource)
+    TargetColumns -= EllipsesColumns + CaretColumnsOutsideSource;
 
-  while (SourceStart>0 || SourceEnd<SourceLine.size()) {
+  while (SourceStart > 0 || SourceEnd < SourceLine.size()) {
     bool ExpandedRegion = false;
 
-    if (SourceStart>0) {
-      unsigned NewStart = map.startOfPreviousColumn(SourceStart);
+    if (SourceStart > 0) {
+      Bytes NewStart = Map.startOfPreviousColumn(SourceStart);
 
       // Skip over any whitespace we see here; we're looking for
       // another bit of interesting text.
       // FIXME: Detect non-ASCII whitespace characters too.
-      while (NewStart && isWhitespace(SourceLine[NewStart]))
-        NewStart = map.startOfPreviousColumn(NewStart);
+      while (NewStart > 0 && isWhitespace(SourceLine[NewStart.V]))
+        NewStart = Map.startOfPreviousColumn(NewStart);
 
       // Skip over this bit of "interesting" text.
-      while (NewStart) {
-        unsigned Prev = map.startOfPreviousColumn(NewStart);
-        if (isWhitespace(SourceLine[Prev]))
+      while (NewStart > 0) {
+        Bytes Prev = Map.startOfPreviousColumn(NewStart);
+        if (isWhitespace(SourceLine[Prev.V]))
           break;
         NewStart = Prev;
       }
 
-      assert(map.byteToColumn(NewStart) != -1);
-      unsigned NewColumns = map.byteToColumn(SourceEnd) -
-                              map.byteToColumn(NewStart);
+      assert(Map.byteToColumn(NewStart).isValid());
+      Columns NewColumns =
+          Map.byteToColumn(SourceEnd) - Map.byteToColumn(NewStart);
       if (NewColumns <= TargetColumns) {
         SourceStart = NewStart;
         ExpandedRegion = true;
       }
     }
 
-    if (SourceEnd<SourceLine.size()) {
-      unsigned NewEnd = map.startOfNextColumn(SourceEnd);
+    if (SourceEnd < SourceLine.size()) {
+      Bytes NewEnd = Map.startOfNextColumn(SourceEnd);
 
       // Skip over any whitespace we see here; we're looking for
       // another bit of interesting text.
       // FIXME: Detect non-ASCII whitespace characters too.
-      while (NewEnd < SourceLine.size() && isWhitespace(SourceLine[NewEnd]))
-        NewEnd = map.startOfNextColumn(NewEnd);
+      while (NewEnd < SourceLine.size() && isWhitespace(SourceLine[NewEnd.V]))
+        NewEnd = Map.startOfNextColumn(NewEnd);
 
       // Skip over this bit of "interesting" text.
-      while (NewEnd < SourceLine.size() && isWhitespace(SourceLine[NewEnd]))
-        NewEnd = map.startOfNextColumn(NewEnd);
+      while (NewEnd < SourceLine.size() && isWhitespace(SourceLine[NewEnd.V]))
+        NewEnd = Map.startOfNextColumn(NewEnd);
 
-      assert(map.byteToColumn(NewEnd) != -1);
-      unsigned NewColumns = map.byteToColumn(NewEnd) -
-                              map.byteToColumn(SourceStart);
+      assert(Map.byteToColumn(NewEnd).isValid());
+      Columns NewColumns =
+          Map.byteToColumn(NewEnd) - Map.byteToColumn(SourceStart);
       if (NewColumns <= TargetColumns) {
         SourceEnd = NewEnd;
         ExpandedRegion = true;
@@ -465,39 +496,41 @@ static void selectInterestingSourceRegion(std::string &SourceLine,
       break;
   }
 
-  CaretStart = map.byteToColumn(SourceStart);
-  CaretEnd = map.byteToColumn(SourceEnd) + CaretColumnsOutsideSource;
+  CaretStart = Map.byteToColumn(SourceStart);
+  CaretEnd = Map.byteToColumn(SourceEnd) + CaretColumnsOutsideSource;
 
   // [CaretStart, CaretEnd) is the slice we want. Update the various
   // output lines to show only this slice.
-  assert(CaretStart!=(unsigned)-1 && CaretEnd!=(unsigned)-1 &&
-         SourceStart!=(unsigned)-1 && SourceEnd!=(unsigned)-1);
+  assert(CaretStart.isValid() && CaretEnd.isValid() && SourceStart.isValid() &&
+         SourceEnd.isValid());
   assert(SourceStart <= SourceEnd);
   assert(CaretStart <= CaretEnd);
 
-  unsigned BackColumnsRemoved
-    = map.byteToColumn(SourceLine.size())-map.byteToColumn(SourceEnd);
-  unsigned FrontColumnsRemoved = CaretStart;
-  unsigned ColumnsKept = CaretEnd-CaretStart;
+  Columns BackColumnsRemoved =
+      Map.byteToColumn(Bytes{static_cast<int>(SourceLine.size())}) -
+      Map.byteToColumn(SourceEnd);
+  Columns FrontColumnsRemoved = CaretStart;
+  Columns ColumnsKept = CaretEnd - CaretStart;
 
   // We checked up front that the line needed truncation
-  assert(FrontColumnsRemoved+ColumnsKept+BackColumnsRemoved > Columns);
+  assert(FrontColumnsRemoved + ColumnsKept + BackColumnsRemoved >
+         NonGutterColumns);
 
   // The line needs some truncation, and we'd prefer to keep the front
   //  if possible, so remove the back
-  if (BackColumnsRemoved > strlen(back_ellipse))
-    SourceLine.replace(SourceEnd, std::string::npos, back_ellipse);
+  if (BackColumnsRemoved > Columns(BackEllipse.size()))
+    SourceLine.replace(SourceEnd.V, std::string::npos, BackEllipse);
 
   // If that's enough then we're done
-  if (FrontColumnsRemoved+ColumnsKept <= Columns)
+  if (FrontColumnsRemoved + ColumnsKept <= Columns(NonGutterColumns))
     return;
 
   // Otherwise remove the front as well
-  if (FrontColumnsRemoved > strlen(front_ellipse)) {
-    SourceLine.replace(0, SourceStart, front_ellipse);
-    CaretLine.replace(0, CaretStart, front_space);
+  if (FrontColumnsRemoved > Columns(FrontEllipse.size())) {
+    SourceLine.replace(0, SourceStart.V, FrontEllipse);
+    CaretLine.replace(0, CaretStart.V, FrontSpace);
     if (!FixItInsertionLine.empty())
-      FixItInsertionLine.replace(0, CaretStart, front_space);
+      FixItInsertionLine.replace(0, CaretStart.V, FrontSpace);
   }
 }
 
@@ -662,7 +695,7 @@ void TextDiagnostic::emitDiagnosticMessage(
     FullSourceLoc Loc, PresumedLoc PLoc, DiagnosticsEngine::Level Level,
     StringRef Message, ArrayRef<clang::CharSourceRange> Ranges,
     DiagOrStoredDiag D) {
-  uint64_t StartOfLocationInfo = OS.tell();
+  uint64_t StartOfLocationInfo = OS.getColumn();
 
   // Emit the location of this particular diagnostic.
   if (Loc.isValid())
@@ -675,8 +708,11 @@ void TextDiagnostic::emitDiagnosticMessage(
     printDiagnosticLevel(OS, Level, DiagOpts.ShowColors);
   printDiagnosticMessage(OS,
                          /*IsSupplemental*/ Level == DiagnosticsEngine::Note,
-                         Message, OS.tell() - StartOfLocationInfo,
+                         Message, OS.getColumn() - StartOfLocationInfo,
                          DiagOpts.MessageLength, DiagOpts.ShowColors);
+  // We use a formatted ostream, which does its own buffering. Flush here
+  // so we keep the proper order of output.
+  OS.flush();
 }
 
 /*static*/ void
@@ -688,11 +724,21 @@ TextDiagnostic::printDiagnosticLevel(raw_ostream &OS,
     switch (Level) {
     case DiagnosticsEngine::Ignored:
       llvm_unreachable("Invalid diagnostic type");
-    case DiagnosticsEngine::Note:    OS.changeColor(noteColor, true); break;
-    case DiagnosticsEngine::Remark:  OS.changeColor(remarkColor, true); break;
-    case DiagnosticsEngine::Warning: OS.changeColor(warningColor, true); break;
-    case DiagnosticsEngine::Error:   OS.changeColor(errorColor, true); break;
-    case DiagnosticsEngine::Fatal:   OS.changeColor(fatalColor, true); break;
+    case DiagnosticsEngine::Note:
+      OS.changeColor(NoteColor, true);
+      break;
+    case DiagnosticsEngine::Remark:
+      OS.changeColor(RemarkColor, true);
+      break;
+    case DiagnosticsEngine::Warning:
+      OS.changeColor(WarningColor, true);
+      break;
+    case DiagnosticsEngine::Error:
+      OS.changeColor(ErrorColor, true);
+      break;
+    case DiagnosticsEngine::Fatal:
+      OS.changeColor(FatalColor, true);
+      break;
     }
   }
 
@@ -720,7 +766,7 @@ void TextDiagnostic::printDiagnosticMessage(raw_ostream &OS,
   if (ShowColors && !IsSupplemental) {
     // Print primary diagnostic messages in bold and without color, to visually
     // indicate the transition from continuation notes and other output.
-    OS.changeColor(savedColor, true);
+    OS.changeColor(SavedColor, true);
     Bold = true;
   }
 
@@ -798,7 +844,7 @@ void TextDiagnostic::emitDiagnosticLoc(FullSourceLoc Loc, PresumedLoc PLoc,
     return;
 
   if (DiagOpts.ShowColors)
-    OS.changeColor(savedColor, true);
+    OS.changeColor(SavedColor, true);
 
   emitFilename(PLoc.getFilename(), Loc.getManager());
   switch (DiagOpts.getFormat()) {
@@ -959,41 +1005,40 @@ maybeAddRange(std::pair<unsigned, unsigned> A, std::pair<unsigned, unsigned> B,
 
 struct LineRange {
   unsigned LineNo;
-  unsigned StartCol;
-  unsigned EndCol;
+  Bytes StartByte;
+  Bytes EndByte;
 };
 
 /// Highlight \p R (with ~'s) on the current source line.
 static void highlightRange(const LineRange &R, const SourceColumnMap &Map,
                            std::string &CaretLine) {
   // Pick the first non-whitespace column.
-  unsigned StartColNo = R.StartCol;
-  while (StartColNo < Map.getSourceLine().size() &&
-         (Map.getSourceLine()[StartColNo] == ' ' ||
-          Map.getSourceLine()[StartColNo] == '\t'))
-    StartColNo = Map.startOfNextColumn(StartColNo);
+  Bytes StartByte = R.StartByte;
+  while (StartByte < Map.bytes() && (Map.getSourceLine()[StartByte.V] == ' ' ||
+                                     Map.getSourceLine()[StartByte.V] == '\t'))
+    StartByte = Map.startOfNextColumn(StartByte);
 
   // Pick the last non-whitespace column.
-  unsigned EndColNo =
-      std::min(static_cast<size_t>(R.EndCol), Map.getSourceLine().size());
-  while (EndColNo && (Map.getSourceLine()[EndColNo - 1] == ' ' ||
-                      Map.getSourceLine()[EndColNo - 1] == '\t'))
-    EndColNo = Map.startOfPreviousColumn(EndColNo);
+  Bytes EndByte = std::min(R.EndByte.V, Map.bytes().V);
+  while (EndByte.V != 0 && (Map.getSourceLine()[EndByte.V - 1] == ' ' ||
+                            Map.getSourceLine()[EndByte.V - 1] == '\t'))
+    EndByte = Map.startOfPreviousColumn(EndByte);
 
   // If the start/end passed each other, then we are trying to highlight a
   // range that just exists in whitespace. That most likely means we have
   // a multi-line highlighting range that covers a blank line.
-  if (StartColNo > EndColNo)
+  if (StartByte > EndByte)
     return;
 
+  assert(StartByte <= EndByte && "Invalid range!");
   // Fill the range with ~'s.
-  StartColNo = Map.byteToContainingColumn(StartColNo);
-  EndColNo = Map.byteToContainingColumn(EndColNo);
+  Columns StartCol = Map.byteToContainingColumn(StartByte);
+  Columns EndCol = Map.byteToContainingColumn(EndByte);
+
+  if (CaretLine.size() < static_cast<size_t>(EndCol.V))
+    CaretLine.resize(EndCol.V, ' ');
 
-  assert(StartColNo <= EndColNo && "Invalid range!");
-  if (CaretLine.size() < EndColNo)
-    CaretLine.resize(EndColNo, ' ');
-  std::fill(CaretLine.begin() + StartColNo, CaretLine.begin() + EndColNo, '~');
+  std::fill(CaretLine.begin() + StartCol.V, CaretLine.begin() + EndCol.V, '~');
 }
 
 static std::string buildFixItInsertionLine(FileID FID, unsigned LineNo,
@@ -1004,7 +1049,7 @@ static std::string buildFixItInsertionLine(FileID FID, unsigned LineNo,
   std::string FixItInsertionLine;
   if (Hints.empty() || !DiagOpts.ShowFixits)
     return FixItInsertionLine;
-  unsigned PrevHintEndCol = 0;
+  Columns PrevHintEndCol = 0;
 
   for (const auto &H : Hints) {
     if (H.CodeToInsert.empty())
@@ -1022,12 +1067,13 @@ static std::string buildFixItInsertionLine(FileID FID, unsigned LineNo,
       // Note: When modifying this function, be very careful about what is a
       // "column" (printed width, platform-dependent) and what is a
       // "byte offset" (SourceManager "column").
-      unsigned HintByteOffset =
-          SM.getColumnNumber(HintLocInfo.first, HintLocInfo.second) - 1;
+      Bytes HintByteOffset =
+          Bytes(SM.getColumnNumber(HintLocInfo.first, HintLocInfo.second))
+              .prev();
 
       // The hint must start inside the source or right at the end
-      assert(HintByteOffset < static_cast<unsigned>(map.bytes()) + 1);
-      unsigned HintCol = map.byteToContainingColumn(HintByteOffset);
+      assert(HintByteOffset < map.bytes().next());
+      Columns HintCol = map.byteToContainingColumn(HintByteOffset);
 
       // If we inserted a long previous hint, push this one forwards, and add
       // an extra space to show that this is not part of the previous
@@ -1041,11 +1087,11 @@ static std::string buildFixItInsertionLine(FileID FID, unsigned LineNo,
 
       // This should NOT use HintByteOffset, because the source might have
       // Unicode characters in earlier columns.
-      unsigned NewFixItLineSize = FixItInsertionLine.size() +
-                                  (HintCol - PrevHintEndCol) +
-                                  H.CodeToInsert.size();
+      Columns NewFixItLineSize = Columns(FixItInsertionLine.size()) +
+                                 (HintCol - PrevHintEndCol) +
+                                 Columns(H.CodeToInsert.size());
       if (NewFixItLineSize > FixItInsertionLine.size())
-        FixItInsertionLine.resize(NewFixItLineSize, ' ');
+        FixItInsertionLine.resize(NewFixItLineSize.V, ' ');
 
       std::copy(H.CodeToInsert.begin(), H.CodeToInsert.end(),
                 FixItInsertionLine.end() - H.CodeToInsert.size());
@@ -1093,28 +1139,29 @@ prepareAndFilterRanges(const SmallVectorImpl<CharSourceRange> &Ranges,
     if (EndLineNo < Lines.first || SM.getFileID(End) != FID)
       continue;
 
-    unsigned StartColumn = SM.getExpansionColumnNumber(Begin);
-    unsigned EndColumn = SM.getExpansionColumnNumber(End);
-    assert(StartColumn && "StartColumn must be valid, 0 is invalid");
-    assert(EndColumn && "EndColumn must be valid, 0 is invalid");
+    Bytes StartByte = SM.getExpansionColumnNumber(Begin);
+    Bytes EndByte = SM.getExpansionColumnNumber(End);
+    assert(StartByte.V != 0 && "StartByte must be valid, 0 is invalid");
+    assert(EndByte.V != 0 && "EndByte must be valid, 0 is invalid");
     if (R.isTokenRange())
-      EndColumn += Lexer::MeasureTokenLength(End, SM, LangOpts);
+      EndByte += Bytes(Lexer::MeasureTokenLength(End, SM, LangOpts));
 
     // Only a single line.
     if (StartLineNo == EndLineNo) {
-      LineRanges.push_back({StartLineNo, StartColumn - 1, EndColumn - 1});
+      LineRanges.push_back({StartLineNo, StartByte.prev(), EndByte.prev()});
       continue;
     }
 
     // Start line.
-    LineRanges.push_back({StartLineNo, StartColumn - 1, ~0u});
+    LineRanges.push_back(
+        {StartLineNo, StartByte.prev(), std::numeric_limits<int>::max()});
 
     // Middle lines.
     for (unsigned S = StartLineNo + 1; S != EndLineNo; ++S)
-      LineRanges.push_back({S, 0, ~0u});
+      LineRanges.push_back({S, 0, std::numeric_limits<int>::max()});
 
     // End line.
-    LineRanges.push_back({EndLineNo, 0, EndColumn - 1});
+    LineRanges.push_back({EndLineNo, 0, EndByte.prev()});
   }
 
   return LineRanges;
@@ -1224,8 +1271,7 @@ highlightLines(StringRef FileData, unsigned StartLineNumber,
     if (TokenStartLine > EndLineNumber)
       break;
 
-    unsigned StartCol =
-        SM.getSpellingColumnNumber(T.getLocation(), &Invalid) - 1;
+    Bytes StartCol = SM.getSpellingColumnNumber(T.getLocation(), &Invalid) - 1;
     if (Invalid)
       continue;
 
@@ -1233,14 +1279,14 @@ highlightLines(StringRef FileData, unsigned StartLineNumber,
     if (TokenStartLine == TokenEndLine) {
       SmallVector<TextDiagnostic::StyleRange> &LineRanges =
           SnippetRanges[TokenStartLine - StartLineNumber];
-      appendStyle(LineRanges, T, StartCol, T.getLength());
+      appendStyle(LineRanges, T, StartCol.V, T.getLength());
       continue;
     }
     assert((TokenEndLine - TokenStartLine) >= 1);
 
     // For tokens that span multiple lines (think multiline comments), we
     // divide them into multiple StyleRanges.
-    unsigned EndCol = SM.getSpellingColumnNumber(T.getEndLoc(), &Invalid) - 1;
+    Bytes EndCol = SM.getSpellingColumnNumber(T.getEndLoc(), &Invalid) - 1;
     if (Invalid)
       continue;
 
@@ -1256,9 +1302,9 @@ highlightLines(StringRef FileData, unsigned StartLineNumber,
               SnippetRanges[L - StartLineNumber];
 
           if (L == TokenStartLine) // First line
-            appendStyle(LineRanges, T, StartCol, LineLength);
+            appendStyle(LineRanges, T, StartCol.V, LineLength);
           else if (L == TokenEndLine) // Last line
-            appendStyle(LineRanges, T, 0, EndCol);
+            appendStyle(LineRanges, T, 0, EndCol.V);
           else
             appendStyle(LineRanges, T, 0, LineLength);
         }
@@ -1313,11 +1359,11 @@ void TextDiagnostic::emitSnippetAndCaret(
   const char *BufEnd = BufStart + BufData.size();
 
   unsigned CaretLineNo = Loc.getLineNumber();
-  unsigned CaretColNo = Loc.getColumnNumber();
+  Bytes CaretByte = Loc.getColumnNumber();
 
   // Arbitrarily stop showing snippets when the line is too long.
   static const size_t MaxLineLengthToPrint = 4096;
-  if (CaretColNo > MaxLineLengthToPrint)
+  if (CaretByte > MaxLineLengthToPrint)
     return;
 
   // Find the set of lines to include.
@@ -1377,35 +1423,37 @@ void TextDiagnostic::emitSnippetAndCaret(
     std::string SourceLine(LineStart, LineEnd);
     // Remove trailing null bytes.
     while (!SourceLine.empty() && SourceLine.back() == '\0' &&
-           (LineNo != CaretLineNo || SourceLine.size() > CaretColNo))
+           (LineNo != CaretLineNo ||
+            SourceLine.size() > static_cast<size_t>(CaretByte.V)))
       SourceLine.pop_back();
 
     // Build the byte to column map.
-    const SourceColumnMap sourceColMap(SourceLine, DiagOpts.TabStop);
+    const SourceColumnMap SourceColMap(SourceLine, DiagOpts.TabStop);
 
     std::string CaretLine;
     // Highlight all of the characters covered by Ranges with ~ characters.
     for (const auto &LR : LineRanges) {
       if (LR.LineNo == LineNo)
-        highlightRange(LR, sourceColMap, CaretLine);
+        highlightRange(LR, SourceColMap, CaretLine);
     }
 
     // Next, insert the caret itself.
     if (CaretLineNo == LineNo) {
-      size_t Col = sourceColMap.byteToContainingColumn(CaretColNo - 1);
-      CaretLine.resize(std::max(Col + 1, CaretLine.size()), ' ');
-      CaretLine[Col] = '^';
+      Columns Col = SourceColMap.byteToContainingColumn(CaretByte.prev());
+      CaretLine.resize(
+          std::max(static_cast<size_t>(Col.V) + 1, CaretLine.size()), ' ');
+      CaretLine[Col.V] = '^';
     }
 
     std::string FixItInsertionLine =
-        buildFixItInsertionLine(FID, LineNo, sourceColMap, Hints, SM, DiagOpts);
+        buildFixItInsertionLine(FID, LineNo, SourceColMap, Hints, SM, DiagOpts);
 
     // If the source line is too long for our terminal, select only the
     // "interesting" source region within that line.
-    unsigned Columns = DiagOpts.MessageLength;
-    if (Columns)
+    Columns MessageLength = DiagOpts.MessageLength;
+    if (MessageLength.V != 0)
       selectInterestingSourceRegion(SourceLine, CaretLine, FixItInsertionLine,
-                                    Columns, sourceColMap);
+                                    MessageLength, SourceColMap);
 
     // If we are in -fdiagnostics-print-source-range-info mode, we are trying
     // to produce easily machine parsable output.  Add a space before the
@@ -1423,7 +1471,7 @@ void TextDiagnostic::emitSnippetAndCaret(
     if (!CaretLine.empty()) {
       indentForLineNumbers();
       if (DiagOpts.ShowColors)
-        OS.changeColor(caretColor, true);
+        OS.changeColor(CaretColor, true);
       OS << CaretLine << '\n';
       if (DiagOpts.ShowColors)
         OS.resetColor();
@@ -1433,7 +1481,7 @@ void TextDiagnostic::emitSnippetAndCaret(
       indentForLineNumbers();
       if (DiagOpts.ShowColors)
         // Print fixit line in color
-        OS.changeColor(fixitColor, false);
+        OS.changeColor(FixitColor, false);
       if (DiagOpts.ShowSourceRanges)
         OS << ' ';
       OS << FixItInsertionLine << '\n';
@@ -1485,7 +1533,7 @@ void TextDiagnostic::emitSnippet(StringRef SourceLine,
       if (CharStyle != Styles.end()) {
         if (!CurrentColor ||
             (CurrentColor && *CurrentColor != CharStyle->Color)) {
-          OS.changeColor(CharStyle->Color, false);
+          OS.changeColor(CharStyle->Color);
           CurrentColor = CharStyle->Color;
         }
       } else if (CurrentColor) {
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 18589125697b0..33fff7645df65 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -162,18 +162,12 @@ set(x86_files
   adxintrin.h
   ammintrin.h
   amxavx512intrin.h
-  amxbf16transposeintrin.h
   amxcomplexintrin.h
-  amxcomplextransposeintrin.h
   amxfp16intrin.h
-  amxfp16transposeintrin.h
   amxfp8intrin.h
   amxintrin.h
   amxmovrsintrin.h
-  amxmovrstransposeintrin.h
   amxtf32intrin.h
-  amxtf32transposeintrin.h
-  amxtransposeintrin.h
   avx10_2_512bf16intrin.h
   avx10_2_512convertintrin.h
   avx10_2_512minmaxintrin.h
diff --git a/clang/lib/Headers/amxbf16transposeintrin.h b/clang/lib/Headers/amxbf16transposeintrin.h
deleted file mode 100644
index 86f09f2ad8db2..0000000000000
--- a/clang/lib/Headers/amxbf16transposeintrin.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/*===----- amxbf16transposeintrin.h - AMX-BF16 and AMX-TRANSPOSE ------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===------------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error                                                                         \
-    "Never use <amxbf16transposeintrin.h> directly; use <immintrin.h> instead."
-#endif /* __IMMINTRIN_H */
-
-#ifndef __AMX_BF16TRANSPOSEINTRIN_H
-#define __AMX_BF16TRANSPOSEINTRIN_H
-#ifdef __x86_64__
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("amx-bf16,amx-transpose")))
-
-/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in
-///    tiles \a a and \a b, accumulating the intermediate single-precision
-///    (32-bit) floating-point elements with elements in \a dst, and store the
-///    32-bit result back to tile \a dst.
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// void _tile_tdpbf16ps (__tile dst, __tile a, __tile b)
-/// \endcode
-///
-/// \code{.operation}
-/// FOR m := 0 TO dst.rows - 1
-///	tmp := dst.row[m]
-///	FOR k := 0 TO (a.colsb / 4) - 1
-///		FOR n := 0 TO (dst.colsb / 4) - 1
-///			tmp.bf32[n] += FP32(a.row[m].bf16[2*k+0]) *
-///					FP32(b.row[k].bf16[2*n+0])
-///			tmp.bf32[n] += FP32(a.row[m].bf16[2*k+1]) *
-///					FP32(b.row[k].bf16[2*n+1])
-///		ENDFOR
-///	ENDFOR
-///	write_row_and_zero(dst, m, tmp, dst.colsb)
-/// ENDFOR
-/// zero_upper_rows(dst, dst.rows)
-/// zero_tileconfig_start()
-/// \endcode
-///
-/// This intrinsic corresponds to the \c TTDPBF16PS instruction.
-///
-/// \param dst
-///    The destination tile. Max size is 1024 Bytes.
-/// \param a
-///    The 1st source tile. Max size is 1024 Bytes.
-/// \param b
-///    The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_tdpbf16ps(dst, a, b) __builtin_ia32_ttdpbf16ps((dst), (a), (b))
-
-/// This is internal intrinsic. C/C++ user should avoid calling it directly.
-static __inline__ _tile1024i __DEFAULT_FN_ATTRS
-_tile_tdpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
-                         _tile1024i dst, _tile1024i src1, _tile1024i src2) {
-  return __builtin_ia32_ttdpbf16ps_internal(m, n, k, dst, src1, src2);
-}
-
-/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in
-///    tiles src0 and src1, accumulating the intermediate single-precision
-///    (32-bit) floating-point elements with elements in "dst", and store the
-///    32-bit result back to tile "dst".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TTDPBF16PS </c> instruction.
-///
-/// \param dst
-///    The destination tile. Max size is 1024 Bytes.
-/// \param src0
-///    The 1st source tile. Max size is 1024 Bytes.
-/// \param src1
-///    The 2nd source tile. Max size is 1024 Bytes.
-__DEFAULT_FN_ATTRS
-static __inline__ void __tile_tdpbf16ps(__tile1024i *dst, __tile1024i src0,
-                                        __tile1024i src1) {
-  dst->tile = _tile_tdpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
-                                       src0.tile, src1.tile);
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __x86_64__ */
-#endif /* __AMX_BF16TRANSPOSEINTRIN_H */
diff --git a/clang/lib/Headers/amxcomplextransposeintrin.h b/clang/lib/Headers/amxcomplextransposeintrin.h
deleted file mode 100644
index 11abaf98e9371..0000000000000
--- a/clang/lib/Headers/amxcomplextransposeintrin.h
+++ /dev/null
@@ -1,303 +0,0 @@
-/*===----- amxcomplextransposeintrin.h - AMX-COMPLEX and AMX-TRANSPOSE ------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===------------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error                                                                         \
-    "Never use <amxcomplextransposeintrin.h> directly; include <immintrin.h> instead."
-#endif // __IMMINTRIN_H
-
-#ifndef __AMX_COMPLEXTRANSPOSEINTRIN_H
-#define __AMX_COMPLEXTRANSPOSEINTRIN_H
-#ifdef __x86_64__
-
-#define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("amx-complex,amx-transpose")))
-
-/// Perform matrix multiplication of two tiles containing complex elements and
-///    accumulate the results into a packed single precision tile. Each dword
-///    element in input tiles \a a and \a b is interpreted as a complex number
-///    with FP16 real part and FP16 imaginary part.
-/// Calculates the imaginary part of the result. For each possible combination
-///    of (transposed column of \a a, column of \a b), it performs a set of
-///    multiplication and accumulations on all corresponding complex numbers
-///    (one from \a a and one from \a b). The imaginary part of the \a a element
-///    is multiplied with the real part of the corresponding \a b element, and
-///    the real part of the \a a element is multiplied with the imaginary part
-///    of the corresponding \a b elements. The two accumulated results are
-///    added, and then accumulated into the corresponding row and column of
-///    \a dst.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// void _tile_tcmmimfp16ps(__tile dst, __tile a, __tile b);
-/// \endcode
-///
-/// \code{.operation}
-/// FOR m := 0 TO dst.rows - 1
-///	tmp := dst.row[m]
-///	FOR k := 0 TO a.rows - 1
-///		FOR n := 0 TO (dst.colsb / 4) - 1
-///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
-///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
-///		ENDFOR
-///	ENDFOR
-///	write_row_and_zero(dst, m, tmp, dst.colsb)
-/// ENDFOR
-/// zero_upper_rows(dst, dst.rows)
-/// zero_tileconfig_start()
-/// \endcode
-///
-/// This intrinsic corresponds to the \c TTCMMIMFP16PS instruction.
-///
-/// \param dst
-///    The destination tile. Max size is 1024 Bytes.
-/// \param a
-///    The 1st source tile. Max size is 1024 Bytes.
-/// \param b
-///    The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_tcmmimfp16ps(dst, a, b)                                          \
-  __builtin_ia32_ttcmmimfp16ps((dst), (a), (b))
-
-/// Perform matrix multiplication of two tiles containing complex elements and
-///    accumulate the results into a packed single precision tile. Each dword
-///    element in input tiles \a a and \a b is interpreted as a complex number
-///    with FP16 real part and FP16 imaginary part.
-/// Calculates the real part of the result. For each possible combination
-///    of (rtransposed colum of \a a, column of \a b), it performs a set of
-///    multiplication and accumulations on all corresponding complex numbers
-///    (one from \a a and one from \a b). The real part of the \a a element is
-///    multiplied with the real part of the corresponding \a b element, and the
-///    negated imaginary part of the \a a element is multiplied with the
-///    imaginary part of the corresponding \a b elements. The two accumulated
-///    results are added, and then accumulated into the corresponding row and
-///    column of \a dst.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// void _tile_tcmmrlfp16ps(__tile dst, __tile a, __tile b);
-/// \endcode
-///
-/// \code{.operation}
-/// FOR m := 0 TO dst.rows - 1
-///	tmp := dst.row[m]
-///	FOR k := 0 TO a.rows - 1
-///		FOR n := 0 TO (dst.colsb / 4) - 1
-///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0])
-///			tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1])
-///		ENDFOR
-///	ENDFOR
-///	write_row_and_zero(dst, m, tmp, dst.colsb)
-/// ENDFOR
-/// zero_upper_rows(dst, dst.rows)
-/// zero_tileconfig_start()
-/// \endcode
-///
-/// This intrinsic corresponds to the \c TTCMMIMFP16PS instruction.
-///
-/// \param dst
-///    The destination tile. Max size is 1024 Bytes.
-/// \param a
-///    The 1st source tile. Max size is 1024 Bytes.
-/// \param b
-///    The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_tcmmrlfp16ps(dst, a, b)                                          \
-  __builtin_ia32_ttcmmrlfp16ps((dst), (a), (b))
-
-/// Perform matrix conjugate transpose and multiplication of two tiles
-///    containing complex elements and accumulate the results into a packed
-///    single precision tile. Each dword element in input tiles \a a and \a b
-///    is interpreted as a complex number with FP16 real part and FP16 imaginary
-///    part.
-/// Calculates the imaginary part of the result. For each possible combination
-///    of (transposed column of \a a, column of \a b), it performs a set of
-///    multiplication and accumulations on all corresponding complex numbers
-///    (one from \a a and one from \a b). The negated imaginary part of the \a a
-///    element is multiplied with the real part of the corresponding \a b
-///    element, and the real part of the \a a element is multiplied with the
-///    imaginary part of the corresponding \a b elements. The two accumulated
-///    results are added, and then accumulated into the corresponding row and
-///    column of \a dst.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// void _tile_conjtcmmimfp16ps(__tile dst, __tile a, __tile b);
-/// \endcode
-///
-/// \code{.operation}
-/// FOR m := 0 TO dst.rows - 1
-///	tmp := dst.row[m]
-///	FOR k := 0 TO a.rows - 1
-///		FOR n := 0 TO (dst.colsb / 4) - 1
-///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
-///			tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
-///		ENDFOR
-///	ENDFOR
-///	write_row_and_zero(dst, m, tmp, dst.colsb)
-/// ENDFOR
-/// zero_upper_rows(dst, dst.rows)
-/// zero_tileconfig_start()
-/// \endcode
-///
-/// This intrinsic corresponds to the \c TCONJTCMMIMFP16PS instruction.
-///
-/// \param dst
-///    The destination tile. Max size is 1024 Bytes.
-/// \param a
-///    The 1st source tile. Max size is 1024 Bytes.
-/// \param b
-///    The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_conjtcmmimfp16ps(dst, a, b)                                      \
-  __builtin_ia32_tconjtcmmimfp16ps((dst), (a), (b))
-
-/// Perform conjugate transpose of an FP16-pair of complex elements from \a a
-///    and writes the result to \a dst.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// void _tile_conjtfp16(__tile dst, __tile a);
-/// \endcode
-///
-/// \code{.operation}
-/// FOR i := 0 TO dst.rows - 1
-///	FOR j := 0 TO (dst.colsb / 4) - 1
-///		tmp.fp16[2*j+0] := a.row[j].fp16[2*i+0]
-///		tmp.fp16[2*j+1] := -a.row[j].fp16[2*i+1]
-///	ENDFOR
-///	write_row_and_zero(dst, i, tmp, dst.colsb)
-/// ENDFOR
-/// zero_upper_rows(dst, dst.rows)
-/// zero_tileconfig_start()
-/// \endcode
-///
-/// This intrinsic corresponds to the \c TCONJTFP16 instruction.
-///
-/// \param dst
-///    The destination tile. Max size is 1024 Bytes.
-/// \param a
-///    The source tile. Max size is 1024 Bytes.
-#define _tile_conjtfp16(dst, a) __builtin_ia32_tconjtfp16((dst), (a))
-
-static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_tcmmimfp16ps_internal(
-    unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,
-    _tile1024i src1, _tile1024i src2) {
-  return __builtin_ia32_ttcmmimfp16ps_internal(m, n, k, dst, src1, src2);
-}
-
-static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_tcmmrlfp16ps_internal(
-    unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,
-    _tile1024i src1, _tile1024i src2) {
-  return __builtin_ia32_ttcmmrlfp16ps_internal(m, n, k, dst, src1, src2);
-}
-
-static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_conjtcmmimfp16ps_internal(
-    unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,
-    _tile1024i src1, _tile1024i src2) {
-  return __builtin_ia32_tconjtcmmimfp16ps_internal(m, n, k, dst, src1, src2);
-}
-
-static __inline__ _tile1024i __DEFAULT_FN_ATTRS
-_tile_conjtfp16_internal(unsigned short m, unsigned short n, _tile1024i src) {
-  return __builtin_ia32_tconjtfp16_internal(m, n, src);
-}
-
-/// Perform matrix multiplication of two tiles containing complex elements and
-///    accumulate the results into a packed single precision tile. Each dword
-///    element in input tiles src0 and src1 is interpreted as a complex number
-///    with FP16 real part and FP16 imaginary part.
-///    This function calculates the imaginary part of the result.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TTCMMIMFP16PS </c> instruction.
-///
-/// \param dst
-///    The destination tile. Max size is 1024 Bytes.
-/// \param src0
-///    The 1st source tile. Max size is 1024 Bytes.
-/// \param src1
-///    The 2nd source tile. Max size is 1024 Bytes.
-__DEFAULT_FN_ATTRS
-static void __tile_tcmmimfp16ps(__tile1024i *dst, __tile1024i src0,
-                                __tile1024i src1) {
-  dst->tile = _tile_tcmmimfp16ps_internal(src0.row, src1.col, src0.col,
-                                          dst->tile, src0.tile, src1.tile);
-}
-
-/// Perform matrix multiplication of two tiles containing complex elements and
-///    accumulate the results into a packed single precision tile. Each dword
-///    element in input tiles src0 and src1 is interpreted as a complex number
-///    with FP16 real part and FP16 imaginary part.
-///    This function calculates the real part of the result.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TTCMMRLFP16PS </c> instruction.
-///
-/// \param dst
-///    The destination tile. Max size is 1024 Bytes.
-/// \param src0
-///    The 1st source tile. Max size is 1024 Bytes.
-/// \param src1
-///    The 2nd source tile. Max size is 1024 Bytes.
-__DEFAULT_FN_ATTRS
-static void __tile_tcmmrlfp16ps(__tile1024i *dst, __tile1024i src0,
-                                __tile1024i src1) {
-  dst->tile = _tile_tcmmrlfp16ps_internal(src0.row, src1.col, src0.col,
-                                          dst->tile, src0.tile, src1.tile);
-}
-
-/// Perform matrix conjugate transpose and multiplication of two tiles
-///    containing complex elements and accumulate the results into a packed
-///    single precision tile. Each dword element in input tiles src0 and src1
-///    is interpreted as a complex number with FP16 real part and FP16 imaginary
-///    part.
-///    This function calculates the imaginary part of the result.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TCONJTCMMIMFP16PS </c> instruction.
-///
-/// \param dst
-///    The destination tile. Max size is 1024 Bytes.
-/// \param src0
-///    The 1st source tile. Max size is 1024 Bytes.
-/// \param src1
-///    The 2nd source tile. Max size is 1024 Bytes.
-__DEFAULT_FN_ATTRS
-static void __tile_conjtcmmimfp16ps(__tile1024i *dst, __tile1024i src0,
-                                    __tile1024i src1) {
-  dst->tile = _tile_conjtcmmimfp16ps_internal(src0.row, src1.col, src0.col,
-                                              dst->tile, src0.tile, src1.tile);
-}
-
-/// Perform conjugate transpose of an FP16-pair of complex elements from src and
-///    writes the result to dst.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TCONJTFP16 </c> instruction.
-///
-/// \param dst
-///    The destination tile. Max size is 1024 Bytes.
-/// \param src
-///    The source tile. Max size is 1024 Bytes.
-__DEFAULT_FN_ATTRS
-static void __tile_conjtfp16(__tile1024i *dst, __tile1024i src) {
-  dst->tile = _tile_conjtfp16_internal(src.row, src.col, src.tile);
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif // __x86_64__
-#endif // __AMX_COMPLEXTRANSPOSEINTRIN_H
diff --git a/clang/lib/Headers/amxfp16transposeintrin.h b/clang/lib/Headers/amxfp16transposeintrin.h
deleted file mode 100644
index 191f8c6097a2c..0000000000000
--- a/clang/lib/Headers/amxfp16transposeintrin.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/*===----- amxfp16transposeintrin.h - AMX-FP16 and AMX-TRANSPOSE ------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===------------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error                                                                         \
-    "Never use <amxfp16transposeintrin.h> directly; use <immintrin.h> instead."
-#endif /* __IMMINTRIN_H */
-
-#ifndef __AMX_FP16TRANSPOSEINTRIN_H
-#define __AMX_FP16TRANSPOSEINTRIN_H
-#ifdef __x86_64__
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("amx-fp16,amx-transpose")))
-
-/// Compute transpose and dot-product of FP16 (16-bit) floating-point pairs in
-///    tiles \a a and \a b, accumulating the intermediate single-precision
-///    (32-bit) floating-point elements with elements in \a dst, and store the
-///    32-bit result back to tile \a dst.
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// void _tile_tdpfp16ps (__tile dst, __tile a, __tile b)
-/// \endcode
-///
-/// \code{.operation}
-/// FOR m := 0 TO dst.rows - 1
-///	tmp := dst.row[m]
-///	FOR k := 0 TO (a.colsb / 4) - 1
-///		FOR n := 0 TO (dst.colsb / 4) - 1
-///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) *
-///					FP32(b.row[k].fp16[2*n+0])
-///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) *
-///					FP32(b.row[k].fp16[2*n+1])
-///		ENDFOR
-///	ENDFOR
-///	write_row_and_zero(dst, m, tmp, dst.colsb)
-/// ENDFOR
-/// zero_upper_rows(dst, dst.rows)
-/// zero_tileconfig_start()
-/// \endcode
-///
-/// This intrinsic corresponds to the \c TTDPFP16PS instruction.
-///
-/// \param dst
-///    The destination tile. Max size is 1024 Bytes.
-/// \param a
-///    The 1st source tile. Max size is 1024 Bytes.
-/// \param b
-///    The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_tdpfp16ps(dst, a, b) __builtin_ia32_ttdpfp16ps((dst), (a), (b))
-
-/// This is internal intrinsic. C/C++ user should avoid calling it directly.
-static __inline__ _tile1024i __DEFAULT_FN_ATTRS
-_tile_tdpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
-                         _tile1024i dst, _tile1024i src1, _tile1024i src2) {
-  return __builtin_ia32_ttdpfp16ps_internal(m, n, k, dst, src1, src2);
-}
-
-/// Compute transpose and dot-product of FP16 (16-bit) floating-point pairs in
-///    tiles src0 and src1, accumulating the intermediate single-precision
-///    (32-bit) floating-point elements with elements in "dst", and store the
-///    32-bit result back to tile "dst".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TTDPFP16PS </c> instruction.
-///
-/// \param dst
-///    The destination tile. Max size is 1024 Bytes.
-/// \param src0
-///    The 1st source tile. Max size is 1024 Bytes.
-/// \param src1
-///    The 2nd source tile. Max size is 1024 Bytes.
-__DEFAULT_FN_ATTRS
-static __inline__ void __tile_tdpfp16ps(__tile1024i *dst, __tile1024i src0,
-                                        __tile1024i src1) {
-  dst->tile = _tile_tdpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,
-                                       src0.tile, src1.tile);
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __x86_64__ */
-#endif /* __AMX_FP16TRANSPOSEINTRIN_H */
diff --git a/clang/lib/Headers/amxintrin.h b/clang/lib/Headers/amxintrin.h
index a7da10d9951e7..208aa3580625f 100644
--- a/clang/lib/Headers/amxintrin.h
+++ b/clang/lib/Headers/amxintrin.h
@@ -230,8 +230,6 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
 /// bytes. Since there is no 2D type in llvm IR, we use vector type to
 /// represent 2D tile and the fixed size is maximum amx tile register size.
 typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
-typedef int _tile1024i_1024a
-    __attribute__((__vector_size__(1024), __aligned__(1024)));
 
 /// This is internal intrinsic. C/C++ user should avoid calling it directly.
 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TILE
diff --git a/clang/lib/Headers/amxmovrstransposeintrin.h b/clang/lib/Headers/amxmovrstransposeintrin.h
deleted file mode 100644
index 5f48cba949f34..0000000000000
--- a/clang/lib/Headers/amxmovrstransposeintrin.h
+++ /dev/null
@@ -1,200 +0,0 @@
-/* ===--- amxmovrstransposeintrin.h - AMX_MOVRS_TRANSPOSE intrinsics --------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- * ===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error                                                                         \
-    "Never use <amxmovrstransposeintrin.h> directly; use <immintrin.h> instead."
-#endif /* __IMMINTRIN_H */
-
-#ifndef __AMX_MOVRS_TRANSPOSEINTRIN_H
-#define __AMX_MOVRS_TRANSPOSEINTRIN_H
-#ifdef __x86_64__
-
-#define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("amx-transpose,amx-movrs")))
-
-#define _tile_2rpntlvwz0rs(tdst, base, stride)                                 \
-  __builtin_ia32_t2rpntlvwz0rs(tdst, base, stride)
-#define _tile_2rpntlvwz0rst1(tdst, base, stride)                               \
-  __builtin_ia32_t2rpntlvwz0rst1(tdst, base, stride)
-#define _tile_2rpntlvwz1rs(tdst, base, stride)                                 \
-  __builtin_ia32_t2rpntlvwz1rs(tdst, base, stride)
-#define _tile_2rpntlvwz1rst1(tdst, base, stride)                               \
-  __builtin_ia32_t2rpntlvwz1rst1(tdst, base, stride)
-
-static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz0rs_internal(
-    unsigned short row, unsigned short col0, unsigned short col1,
-    _tile1024i *dst0, _tile1024i *dst1, const void *base,
-    __SIZE_TYPE__ stride) {
-  // Use __tile1024i_1024a* to escape the alignment check in
-  // clang/test/Headers/x86-intrinsics-headers-clean.cpp
-  __builtin_ia32_t2rpntlvwz0rs_internal(
-      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
-      (__SIZE_TYPE__)(stride));
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz0rst1_internal(
-    unsigned short row, unsigned short col0, unsigned short col1,
-    _tile1024i *dst0, _tile1024i *dst1, const void *base,
-    __SIZE_TYPE__ stride) {
-  __builtin_ia32_t2rpntlvwz0rst1_internal(
-      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
-      (__SIZE_TYPE__)(stride));
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz1rs_internal(
-    unsigned short row, unsigned short col0, unsigned short col1,
-    _tile1024i *dst0, _tile1024i *dst1, const void *base,
-    __SIZE_TYPE__ stride) {
-  __builtin_ia32_t2rpntlvwz1rs_internal(
-      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
-      (__SIZE_TYPE__)(stride));
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz1rst1_internal(
-    unsigned short row, unsigned short col0, unsigned short col1,
-    _tile1024i *dst0, _tile1024i *dst1, const void *base,
-    __SIZE_TYPE__ stride) {
-  __builtin_ia32_t2rpntlvwz1rst1_internal(
-      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
-      (__SIZE_TYPE__)(stride));
-}
-
-/// Converts a pair of tiles from memory into VNNI format, and places the
-/// results in a pair of destinations specified by dst. The pair of tiles
-/// in memory is specified via a tsib; the second tile is after the first
-/// one, separated by the same stride that separates each row.
-/// The tile configuration for the destination tiles indicates the amount
-/// of data to read from memory. The instruction will load a number of rows
-/// that is equal to twice the number of rows in tmm1. The size of each row
-/// is equal to the average width of the destination tiles. If the second
-/// tile is configured with zero rows and columns, only the first tile will
-/// be written.
-/// Provides a hint to the implementation that the data will likely become
-/// read shared in the near future and the data caching can be optimized.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> T2RPNTLVWZ0RS </c> instruction.
-///
-/// \param dst0
-///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
-/// \param dst1
-///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
-/// \param base
-///    A pointer to base address.
-/// \param stride
-///    The stride between the rows' data to be loaded in memory.
-__DEFAULT_FN_ATTRS
-static void __tile_2rpntlvwz0rs(__tile1024i *dst0, __tile1024i *dst1,
-                                const void *base, __SIZE_TYPE__ stride) {
-  _tile_2rpntlvwz0rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
-                              &dst1->tile, base, stride);
-}
-
-/// Converts a pair of tiles from memory into VNNI format, and places the
-/// results in a pair of destinations specified by dst. The pair of tiles
-/// in memory is specified via a tsib; the second tile is after the first
-/// one, separated by the same stride that separates each row.
-/// The tile configuration for the destination tiles indicates the amount
-/// of data to read from memory. The instruction will load a number of rows
-/// that is equal to twice the number of rows in tmm1. The size of each row
-/// is equal to the average width of the destination tiles. If the second
-/// tile is configured with zero rows and columns, only the first tile will
-/// be written.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> T2RPNTLVWZ0T1RS </c> instruction.
-///
-/// \param dst0
-///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
-/// \param dst1
-///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
-/// \param base
-///    A pointer to base address.
-/// \param stride
-///    The stride between the rows' data to be loaded in memory.
-__DEFAULT_FN_ATTRS
-static void __tile_2rpntlvwz0rst1(__tile1024i *dst0, __tile1024i *dst1,
-                                  const void *base, __SIZE_TYPE__ stride) {
-  _tile_2rpntlvwz0rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
-                                &dst1->tile, base, stride);
-}
-
-/// Converts a pair of tiles from memory into VNNI format, and places the
-/// results in a pair of destinations specified by dst. The pair of tiles
-/// in memory is specified via a tsib; the second tile is after the first
-/// one, separated by the same stride that separates each row.
-/// The tile configuration for the destination tiles indicates the amount
-/// of data to read from memory. The instruction will load a number of rows
-/// that is equal to twice the number of rows in tmm1. The size of each row
-/// is equal to the average width of the destination tiles. If the second
-/// tile is configured with zero rows and columns, only the first tile will
-/// be written. The last row will be not be read from memory but instead
-/// filled with zeros.
-/// Provides a hint to the implementation that the data will likely become
-/// read shared in the near future and the data caching can be optimized.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> T2RPNTLVWZ1 </c> instruction.
-///
-/// \param dst0
-///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
-/// \param dst1
-///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
-/// \param base
-///    A pointer to base address.
-/// \param stride
-///    The stride between the rows' data to be loaded in memory.
-__DEFAULT_FN_ATTRS
-static void __tile_2rpntlvwz1rs(__tile1024i *dst0, __tile1024i *dst1,
-                                const void *base, __SIZE_TYPE__ stride) {
-  _tile_2rpntlvwz1rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
-                              &dst1->tile, base, stride);
-}
-
-/// Converts a pair of tiles from memory into VNNI format, and places the
-/// results in a pair of destinations specified by dst. The pair of tiles
-/// in memory is specified via a tsib; the second tile is after the first
-/// one, separated by the same stride that separates each row.
-/// The tile configuration for the destination tiles indicates the amount
-/// of data to read from memory. The instruction will load a number of rows
-/// that is equal to twice the number of rows in tmm1. The size of each row
-/// is equal to the average width of the destination tiles. If the second
-/// tile is configured with zero rows and columns, only the first tile will
-/// be written. The last row will be not be read from memory but instead
-/// filled with zeros.
-/// Provides a hint to the implementation that the data will likely become
-/// read shared in the near future and the data caching can be optimized.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> T2RPNTLVWZ1T1RS </c> instruction.
-///
-/// \param dst0
-///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
-/// \param dst1
-///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
-/// \param base
-///    A pointer to base address.
-/// \param stride
-///    The stride between the rows' data to be loaded in memory.
-__DEFAULT_FN_ATTRS
-static void __tile_2rpntlvwz1rst1(__tile1024i *dst0, __tile1024i *dst1,
-                                  const void *base, __SIZE_TYPE__ stride) {
-  _tile_2rpntlvwz1rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
-                                &dst1->tile, base, stride);
-}
-
-#undef __DEFAULT_FN_ATTRS
-#endif /* __x86_64__ */
-#endif /* __AMX_MOVRS_TRANSPOSEINTRIN_H */
diff --git a/clang/lib/Headers/amxtf32transposeintrin.h b/clang/lib/Headers/amxtf32transposeintrin.h
deleted file mode 100644
index e1b90c1adfb22..0000000000000
--- a/clang/lib/Headers/amxtf32transposeintrin.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/*===--------- amxtf32transposeintrin.h - AMX-TF32 and AMX-TRANSPOSE --------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===------------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error                                                                         \
-    "Never use <amxtf32transposeintrin.h> directly; include <immintrin.h> instead."
-#endif // __IMMINTRIN_H
-
-#ifndef __AMX_TF32TRANSPOSEINTRIN_H
-#define __AMX_TF32TRANSPOSEINTRIN_H
-#ifdef __x86_64__
-
-#define __DEFAULT_FN_ATTRS_TF32_TRANSPOSE                                      \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("amx-tf32,amx-transpose")))
-
-/// \code
-/// void _tile_tmmultf32ps(constexpr int srcdst, constexpr int a, \
-///                        constexpr int b);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> TTMMULTF32PS </c> instruction.
-///
-/// \param srcdst
-/// 	The destination tile. Max size is 1024 Bytes.
-/// \param a
-/// 	The 1st source tile. Max size is 1024 Bytes.
-/// \param b
-/// 	The 2nd source tile. Max size is 1024 Bytes.
-///
-/// \code{.operation}
-/// DEFINE zero_lower_mantissa_bits_fp32(x[31:0]) {
-/// 	dword[12:0] := 0
-/// 	dword[31:13] := x[31:13]
-/// 	return dword
-/// }
-///
-/// DEFINE silence_snan_fp32(x[31:0]) {
-/// 	IF (x.exponent == 255 and x.fraction != 0 and x.fraction[22] == 0)
-/// 		x.fraction[22] := 1
-/// 	return x
-/// }
-///
-/// elements_dest:= srcdst.colsb/4
-///
-/// FOR m := 0 TO (srcdst.rows-1)
-/// 	tmp[511:0] := 0
-/// 	FOR k := 0 TO (a.rows-1)
-/// 		FOR n := 0 TO (elements_dest-1)
-/// 			a1e := silence_snan_fp32(a.row[k].fp32[m])
-/// 			a2e := silence_snan_fp32(b.row[k].fp32[n])
-/// 			s1e := zero_lower_mantissa_bits_fp32(a1e)
-/// 			s2e := zero_lower_mantissa_bits_fp32(a2e)
-/// 			tmp.fp32[n] += s1e * s2e
-/// 		ENDFOR
-/// 	ENDFOR
-///
-/// 	FOR n := 0 TO (elements_dest-1)
-/// 		tmp.fp32[n] += srcdst.row[m].fp32[n]
-/// 	ENDFOR
-///	write_row_and_zero(srcdst, m, tmp, srcdst.colsb)
-///
-/// ENDFOR
-///
-/// zero_upper_rows(srcdst, srcdst.rows)
-/// zero_tileconfig_start()
-/// \endcode
-#define _tile_tmmultf32ps(srcdst, a, b)                                        \
-  __builtin_ia32_ttmmultf32ps((srcdst), (a), (b))
-
-// dst = m x n (srcdest), src1 = k x m, src2 = k x n
-static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TF32_TRANSPOSE
-_tile_tmmultf32ps_internal(unsigned short m, unsigned short n, unsigned short k,
-                           _tile1024i dst, _tile1024i src1, _tile1024i src2) {
-  return __builtin_ia32_ttmmultf32ps_internal(m, n, k, dst, src1, src2);
-}
-
-/// Compute transpose and do Matrix Multiplication of src0 and src1, and then do
-/// Matrix Plus with dst. All the calculation is base on float32 but with the
-/// lower 13-bit set to 0.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TTMMULTF32PS </c> instruction.
-///
-/// \param dst
-///    The destination tile. Max size is 1024 Bytes.
-/// \param src0
-///    The 1st source tile. Max size is 1024 Bytes.
-/// \param src1
-///    The 2nd source tile. Max size is 1024 Bytes.
-__DEFAULT_FN_ATTRS_TF32_TRANSPOSE
-static void __tile_tmmultf32ps(__tile1024i *dst, __tile1024i src0,
-                               __tile1024i src1) {
-  dst->tile = _tile_tmmultf32ps_internal(src0.row, src1.col, src0.col,
-                                         dst->tile, src0.tile, src1.tile);
-}
-
-#endif // __x86_64__
-#endif // __AMX_TF32TRANSPOSEINTRIN_H
diff --git a/clang/lib/Headers/amxtransposeintrin.h b/clang/lib/Headers/amxtransposeintrin.h
deleted file mode 100644
index b3fa37d766c45..0000000000000
--- a/clang/lib/Headers/amxtransposeintrin.h
+++ /dev/null
@@ -1,248 +0,0 @@
-/* ===--- amxtransposeintrin.h - AMX_TRANSPOSE intrinsics -*- C++ -*---------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- * ===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <amxtransposeintrin.h> directly; use <immintrin.h> instead."
-#endif /* __IMMINTRIN_H */
-
-#ifndef __AMX_TRANSPOSEINTRIN_H
-#define __AMX_TRANSPOSEINTRIN_H
-#ifdef __x86_64__
-
-#define __DEFAULT_FN_ATTRS_TRANSPOSE                                           \
-  __attribute__((__always_inline__, __nodebug__, __target__("amx-transpose")))
-
-#define _tile_2rpntlvwz0(tdst, base, stride)                                   \
-  __builtin_ia32_t2rpntlvwz0(tdst, base, stride)
-#define _tile_2rpntlvwz0t1(tdst, base, stride)                                 \
-  __builtin_ia32_t2rpntlvwz0t1(tdst, base, stride)
-#define _tile_2rpntlvwz1(tdst, base, stride)                                   \
-  __builtin_ia32_t2rpntlvwz1(tdst, base, stride)
-#define _tile_2rpntlvwz1t1(tdst, base, stride)                                 \
-  __builtin_ia32_t2rpntlvwz1t1(tdst, base, stride)
-
-/// Transpose 32-bit elements from \a src and write the result to \a dst.
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// void _tile_transposed(__tile dst, __tile src);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction.
-///
-/// \param dst
-/// 	The destination tile. Max size is 1024 Bytes.
-/// \param src
-/// 	The source tile. Max size is 1024 Bytes.
-///
-/// \code{.operation}
-///
-/// FOR i := 0 TO (dst.rows-1)
-/// 	tmp[511:0] := 0
-/// 	FOR j := 0 TO (dst.colsb/4-1)
-/// 		tmp.dword[j] := src.row[j].dword[i]
-/// 	ENDFOR
-/// 	dst.row[i] := tmp
-/// ENDFOR
-///
-/// zero_upper_rows(dst, dst.rows)
-/// zero_tileconfig_start()
-/// \endcode
-#define _tile_transposed(dst, src) __builtin_ia32_ttransposed(dst, src)
-
-static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0_internal(
-    unsigned short row, unsigned short col0, unsigned short col1,
-    _tile1024i *dst0, _tile1024i *dst1, const void *base,
-    __SIZE_TYPE__ stride) {
-  // Use __tile1024i_1024a* to escape the alignment check in
-  // clang/test/Headers/x86-intrinsics-headers-clean.cpp
-  __builtin_ia32_t2rpntlvwz0_internal(row, col0, col1, (_tile1024i_1024a *)dst0,
-                                      (_tile1024i_1024a *)dst1, base,
-                                      (__SIZE_TYPE__)(stride));
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0t1_internal(
-    unsigned short row, unsigned short col0, unsigned short col1,
-    _tile1024i *dst0, _tile1024i *dst1, const void *base,
-    __SIZE_TYPE__ stride) {
-  __builtin_ia32_t2rpntlvwz0t1_internal(
-      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
-      (__SIZE_TYPE__)(stride));
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1_internal(
-    unsigned short row, unsigned short col0, unsigned short col1,
-    _tile1024i *dst0, _tile1024i *dst1, const void *base,
-    __SIZE_TYPE__ stride) {
-  __builtin_ia32_t2rpntlvwz1_internal(row, col0, col1, (_tile1024i_1024a *)dst0,
-                                      (_tile1024i_1024a *)dst1, base,
-                                      (__SIZE_TYPE__)(stride));
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1t1_internal(
-    unsigned short row, unsigned short col0, unsigned short col1,
-    _tile1024i *dst0, _tile1024i *dst1, const void *base,
-    __SIZE_TYPE__ stride) {
-  __builtin_ia32_t2rpntlvwz1t1_internal(
-      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
-      (__SIZE_TYPE__)(stride));
-}
-
-// This is internal intrinsic. C/C++ user should avoid calling it directly.
-static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TRANSPOSE
-_tile_transposed_internal(unsigned short m, unsigned short n, _tile1024i src) {
-  return __builtin_ia32_ttransposed_internal(m, n, src);
-}
-
-/// Converts a pair of tiles from memory into VNNI format, and places the
-/// results in a pair of destinations specified by dst. The pair of tiles
-/// in memory is specified via a tsib; the second tile is after the first
-/// one, separated by the same stride that separates each row.
-/// The tile configuration for the destination tiles indicates the amount
-/// of data to read from memory. The instruction will load a number of rows
-/// that is equal to twice the number of rows in tmm1. The size of each row
-/// is equal to the average width of the destination tiles. If the second
-/// tile is configured with zero rows and columns, only the first tile will
-/// be written.
-/// Provides a hint to the implementation that the data will likely not be
-/// reused in the near future and the data caching can be optimized.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> T2RPNTLVWZ0 </c> instruction.
-///
-/// \param dst0
-///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
-/// \param dst1
-///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
-/// \param base
-///    A pointer to base address.
-/// \param stride
-///    The stride between the rows' data to be loaded in memory.
-__DEFAULT_FN_ATTRS_TRANSPOSE
-static void __tile_2rpntlvwz0(__tile1024i *dst0, __tile1024i *dst1,
-                              const void *base, __SIZE_TYPE__ stride) {
-  _tile_2rpntlvwz0_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
-                            &dst1->tile, base, stride);
-}
-
-/// Converts a pair of tiles from memory into VNNI format, and places the
-/// results in a pair of destinations specified by dst. The pair of tiles
-/// in memory is specified via a tsib; the second tile is after the first
-/// one, separated by the same stride that separates each row.
-/// The tile configuration for the destination tiles indicates the amount
-/// of data to read from memory. The instruction will load a number of rows
-/// that is equal to twice the number of rows in tmm1. The size of each row
-/// is equal to the average width of the destination tiles. If the second
-/// tile is configured with zero rows and columns, only the first tile will
-/// be written.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> T2RPNTLVWZ0T1 </c> instruction.
-///
-/// \param dst0
-///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
-/// \param dst1
-///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
-/// \param base
-///    A pointer to base address.
-/// \param stride
-///    The stride between the rows' data to be loaded in memory.
-__DEFAULT_FN_ATTRS_TRANSPOSE
-static void __tile_2rpntlvwz0t1(__tile1024i *dst0, __tile1024i *dst1,
-                                const void *base, __SIZE_TYPE__ stride) {
-  _tile_2rpntlvwz0t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
-                              &dst1->tile, base, stride);
-}
-
-/// Converts a pair of tiles from memory into VNNI format, and places the
-/// results in a pair of destinations specified by dst. The pair of tiles
-/// in memory is specified via a tsib; the second tile is after the first
-/// one, separated by the same stride that separates each row.
-/// The tile configuration for the destination tiles indicates the amount
-/// of data to read from memory. The instruction will load a number of rows
-/// that is equal to twice the number of rows in tmm1. The size of each row
-/// is equal to the average width of the destination tiles. If the second
-/// tile is configured with zero rows and columns, only the first tile will
-/// be written. The last row will be not be read from memory but instead
-/// filled with zeros.
-/// Provides a hint to the implementation that the data will likely not be
-/// reused in the near future and the data caching can be optimized.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> T2RPNTLVWZ1 </c> instruction.
-///
-/// \param dst0
-///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
-/// \param dst1
-///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
-/// \param base
-///    A pointer to base address.
-/// \param stride
-///    The stride between the rows' data to be loaded in memory.
-__DEFAULT_FN_ATTRS_TRANSPOSE
-static void __tile_2rpntlvwz1(__tile1024i *dst0, __tile1024i *dst1,
-                              const void *base, __SIZE_TYPE__ stride) {
-  _tile_2rpntlvwz1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
-                            &dst1->tile, base, stride);
-}
-
-/// Converts a pair of tiles from memory into VNNI format, and places the
-/// results in a pair of destinations specified by dst. The pair of tiles
-/// in memory is specified via a tsib; the second tile is after the first
-/// one, separated by the same stride that separates each row.
-/// The tile configuration for the destination tiles indicates the amount
-/// of data to read from memory. The instruction will load a number of rows
-/// that is equal to twice the number of rows in tmm1. The size of each row
-/// is equal to the average width of the destination tiles. If the second
-/// tile is configured with zero rows and columns, only the first tile will
-/// be written. The last row will be not be read from memory but instead
-/// filled with zeros.
-/// Provides a hint to the implementation that the data will likely not be
-/// reused in the near future and the data caching can be optimized.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> T2RPNTLVWZ1T1 </c> instruction.
-///
-/// \param dst0
-///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
-/// \param dst1
-///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
-/// \param base
-///    A pointer to base address.
-/// \param stride
-///    The stride between the rows' data to be loaded in memory.
-__DEFAULT_FN_ATTRS_TRANSPOSE
-static void __tile_2rpntlvwz1t1(__tile1024i *dst0, __tile1024i *dst1,
-                                const void *base, __SIZE_TYPE__ stride) {
-  _tile_2rpntlvwz1t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
-                              &dst1->tile, base, stride);
-}
-
-/// Transpose 32-bit elements from src and write the result to dst.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction.
-///
-/// \param dst
-///    The destination tile. Max size is 1024 Bytes.
-/// \param src
-///    The source tile. Max size is 1024 Bytes.
-__DEFAULT_FN_ATTRS_TRANSPOSE
-static void __tile_transposed(__tile1024i *dst, __tile1024i src) {
-  dst->tile = _tile_transposed_internal(dst->row, dst->col, src.tile);
-}
-
-#endif /* __x86_64__ */
-#endif /* __AMX_TRANSPOSEINTRIN_H */
diff --git a/clang/lib/Headers/avx512vlbwintrin.h b/clang/lib/Headers/avx512vlbwintrin.h
index 0fcfe3779fa19..263a1079b26d5 100644
--- a/clang/lib/Headers/avx512vlbwintrin.h
+++ b/clang/lib/Headers/avx512vlbwintrin.h
@@ -2385,22 +2385,19 @@ _mm256_mask_storeu_epi8 (void *__P, __mmask32 __U, __m256i __A)
              (__mmask32) __U);
 }
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
-_mm_test_epi8_mask (__m128i __A, __m128i __B)
-{
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_test_epi8_mask(__m128i __A, __m128i __B) {
   return _mm_cmpneq_epi8_mask (_mm_and_si128(__A, __B), _mm_setzero_si128());
 }
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
-_mm_mask_test_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B)
-{
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_mask_test_epi8_mask(__mmask16 __U, __m128i __A, __m128i __B) {
   return _mm_mask_cmpneq_epi8_mask (__U, _mm_and_si128 (__A, __B),
                                     _mm_setzero_si128());
 }
 
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
-_mm256_test_epi8_mask (__m256i __A, __m256i __B)
-{
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_test_epi8_mask(__m256i __A, __m256i __B) {
   return _mm256_cmpneq_epi8_mask (_mm256_and_si256(__A, __B),
                                   _mm256_setzero_si256());
 }
@@ -2439,9 +2436,8 @@ _mm256_mask_test_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B)
                                         _mm256_setzero_si256());
 }
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
-_mm_testn_epi8_mask (__m128i __A, __m128i __B)
-{
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_testn_epi8_mask(__m128i __A, __m128i __B) {
   return _mm_cmpeq_epi8_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128());
 }
 
diff --git a/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h
index d973371312701..a918af39e4074 100644
--- a/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h
+++ b/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h
@@ -2597,6 +2597,129 @@ __attribute__((convergent)) double3 WaveActiveMax(double3);
 _HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_max)
 __attribute__((convergent)) double4 WaveActiveMax(double4);
 
+//===----------------------------------------------------------------------===//
+// WaveActiveMin builtins
+//===----------------------------------------------------------------------===//
+
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min)
+__attribute__((convergent)) half WaveActiveMin(half);
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min)
+__attribute__((convergent)) half2 WaveActiveMin(half2);
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min)
+__attribute__((convergent)) half3 WaveActiveMin(half3);
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min)
+__attribute__((convergent)) half4 WaveActiveMin(half4);
+
+#ifdef __HLSL_ENABLE_16_BIT
+_HLSL_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min)
+__attribute__((convergent)) int16_t WaveActiveMin(int16_t);
+_HLSL_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min)
+__attribute__((convergent)) int16_t2 WaveActiveMin(int16_t2);
+_HLSL_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min)
+__attribute__((convergent)) int16_t3 WaveActiveMin(int16_t3);
+_HLSL_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min)
+__attribute__((convergent)) int16_t4 WaveActiveMin(int16_t4);
+
+_HLSL_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min)
+__attribute__((convergent)) uint16_t WaveActiveMin(uint16_t);
+_HLSL_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min)
+__attribute__((convergent)) uint16_t2 WaveActiveMin(uint16_t2);
+_HLSL_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min)
+__attribute__((convergent)) uint16_t3 WaveActiveMin(uint16_t3);
+_HLSL_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min)
+__attribute__((convergent)) uint16_t4 WaveActiveMin(uint16_t4);
+#endif
+
+_HLSL_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min)
+__attribute__((convergent)) int WaveActiveMin(int);
+_HLSL_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min)
+__attribute__((convergent)) int2 WaveActiveMin(int2);
+_HLSL_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min)
+__attribute__((convergent)) int3 WaveActiveMin(int3);
+_HLSL_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min)
+__attribute__((convergent)) int4 WaveActiveMin(int4);
+
+_HLSL_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min)
+__attribute__((convergent)) uint WaveActiveMin(uint);
+_HLSL_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min)
+__attribute__((convergent)) uint2 WaveActiveMin(uint2);
+_HLSL_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min)
+__attribute__((convergent)) uint3 WaveActiveMin(uint3);
+_HLSL_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min)
+__attribute__((convergent)) uint4 WaveActiveMin(uint4);
+
+_HLSL_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min)
+__attribute__((convergent)) int64_t WaveActiveMin(int64_t);
+_HLSL_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min)
+__attribute__((convergent)) int64_t2 WaveActiveMin(int64_t2);
+_HLSL_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min)
+__attribute__((convergent)) int64_t3 WaveActiveMin(int64_t3);
+_HLSL_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min)
+__attribute__((convergent)) int64_t4 WaveActiveMin(int64_t4);
+
+_HLSL_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min)
+__attribute__((convergent)) uint64_t WaveActiveMin(uint64_t);
+_HLSL_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min)
+__attribute__((convergent)) uint64_t2 WaveActiveMin(uint64_t2);
+_HLSL_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min)
+__attribute__((convergent)) uint64_t3 WaveActiveMin(uint64_t3);
+_HLSL_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min)
+__attribute__((convergent)) uint64_t4 WaveActiveMin(uint64_t4);
+
+_HLSL_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min)
+__attribute__((convergent)) float WaveActiveMin(float);
+_HLSL_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min)
+__attribute__((convergent)) float2 WaveActiveMin(float2);
+_HLSL_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min)
+__attribute__((convergent)) float3 WaveActiveMin(float3);
+_HLSL_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min)
+__attribute__((convergent)) float4 WaveActiveMin(float4);
+
+_HLSL_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min)
+__attribute__((convergent)) double WaveActiveMin(double);
+_HLSL_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min)
+__attribute__((convergent)) double2 WaveActiveMin(double2);
+_HLSL_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min)
+__attribute__((convergent)) double3 WaveActiveMin(double3);
+_HLSL_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min)
+__attribute__((convergent)) double4 WaveActiveMin(double4);
+
 //===----------------------------------------------------------------------===//
 // WaveActiveSum builtins
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
index fe4277ed4a7d2..ee243abef6a41 100644
--- a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
+++ b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #ifndef _HLSL_COMPAT_OVERLOADS_H_
-#define _HLSl_COMPAT_OVERLOADS_H_
+#define _HLSL_COMPAT_OVERLOADS_H_
 
 namespace hlsl {
 
diff --git a/clang/lib/Headers/hvx_hexagon_protos.h b/clang/lib/Headers/hvx_hexagon_protos.h
index fd120a589f64f..19309a40d6dd1 100644
--- a/clang/lib/Headers/hvx_hexagon_protos.h
+++ b/clang/lib/Headers/hvx_hexagon_protos.h
@@ -5605,6 +5605,399 @@
   __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vsub_hf_f8)(Vu, Vv)
 #endif /* __HEXAGON_ARCH___ >= 79 */
 
+#if __HVX_ARCH__ >= 81
+/* ==========================================================================
+   Assembly Syntax:       Vd32.qf16=vabs(Vu32.hf)
+   C Intrinsic Prototype: HVX_Vector Q6_Vqf16_vabs_Vhf(HVX_Vector Vu)
+   Instruction Type:      CVI_VS
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+
+#define Q6_Vqf16_vabs_Vhf(Vu)                                                  \
+  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vabs_qf16_hf)(Vu)
+#endif /* __HEXAGON_ARCH___ >= 81 */
+
+#if __HVX_ARCH__ >= 81
+/* ==========================================================================
+   Assembly Syntax:       Vd32.qf16=vabs(Vu32.qf16)
+   C Intrinsic Prototype: HVX_Vector Q6_Vqf16_vabs_Vqf16(HVX_Vector Vu)
+   Instruction Type:      CVI_VS
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+
+#define Q6_Vqf16_vabs_Vqf16(Vu)                                                \
+  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vabs_qf16_qf16)(Vu)
+#endif /* __HEXAGON_ARCH___ >= 81 */
+
+#if __HVX_ARCH__ >= 81
+/* ==========================================================================
+   Assembly Syntax:       Vd32.qf32=vabs(Vu32.qf32)
+   C Intrinsic Prototype: HVX_Vector Q6_Vqf32_vabs_Vqf32(HVX_Vector Vu)
+   Instruction Type:      CVI_VS
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+
+#define Q6_Vqf32_vabs_Vqf32(Vu)                                                \
+  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vabs_qf32_qf32)(Vu)
+#endif /* __HEXAGON_ARCH___ >= 81 */
+
+#if __HVX_ARCH__ >= 81
+/* ==========================================================================
+   Assembly Syntax:       Vd32.qf32=vabs(Vu32.sf)
+   C Intrinsic Prototype: HVX_Vector Q6_Vqf32_vabs_Vsf(HVX_Vector Vu)
+   Instruction Type:      CVI_VS
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+
+#define Q6_Vqf32_vabs_Vsf(Vu)                                                  \
+  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vabs_qf32_sf)(Vu)
+#endif /* __HEXAGON_ARCH___ >= 81 */
+
+#if __HVX_ARCH__ >= 81
+/* ==========================================================================
+   Assembly Syntax:       Vd32=valign4(Vu32,Vv32,Rt8)
+   C Intrinsic Prototype: HVX_Vector Q6_V_valign4_VVR(HVX_Vector Vu, HVX_Vector
+   Vv, Word32 Rt) Instruction Type:      CVI_VA Execution Slots:       SLOT0123
+   ========================================================================== */
+
+#define Q6_V_valign4_VVR(Vu, Vv, Rt)                                           \
+  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_valign4)(Vu, Vv, Rt)
+#endif /* __HEXAGON_ARCH___ >= 81 */
+
+#if __HVX_ARCH__ >= 81
+/* ==========================================================================
+   Assembly Syntax:       Vd32.bf=Vuu32.qf32
+   C Intrinsic Prototype: HVX_Vector Q6_Vbf_equals_Wqf32(HVX_VectorPair Vuu)
+   Instruction Type:      CVI_VS
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+
+#define Q6_Vbf_equals_Wqf32(Vuu)                                               \
+  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_bf_qf32)(Vuu)
+#endif /* __HEXAGON_ARCH___ >= 81 */
+
+#if __HVX_ARCH__ >= 81
+/* ==========================================================================
+   Assembly Syntax:       Vd32.f8=Vu32.qf16
+   C Intrinsic Prototype: HVX_Vector Q6_V_equals_Vqf16(HVX_Vector Vu)
+   Instruction Type:      CVI_VS
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+
+#define Q6_V_equals_Vqf16(Vu)                                                  \
+  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_f8_qf16)(Vu)
+#endif /* __HEXAGON_ARCH___ >= 81 */
+
+#if __HVX_ARCH__ >= 81
+/* ==========================================================================
+   Assembly Syntax:       Vd32.h=Vu32.hf:rnd
+   C Intrinsic Prototype: HVX_Vector Q6_Vh_equals_Vhf_rnd(HVX_Vector Vu)
+   Instruction Type:      CVI_VS
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+
+#define Q6_Vh_equals_Vhf_rnd(Vu)                                               \
+  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_h_hf_rnd)(Vu)
+#endif /* __HEXAGON_ARCH___ >= 81 */
+
+#if __HVX_ARCH__ >= 81
+/* ==========================================================================
+   Assembly Syntax:       Vdd32.qf16=Vu32.f8
+   C Intrinsic Prototype: HVX_VectorPair Q6_Wqf16_equals_V(HVX_Vector Vu)
+   Instruction Type:      CVI_VP_VS
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+
+#define Q6_Wqf16_equals_V(Vu)                                                  \
+  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_qf16_f8)(Vu)
+#endif /* __HEXAGON_ARCH___ >= 81 */
+
+#if __HVX_ARCH__ >= 81
+/* ==========================================================================
+   Assembly Syntax:       Vd32.qf16=Vu32.hf
+   C Intrinsic Prototype: HVX_Vector Q6_Vqf16_equals_Vhf(HVX_Vector Vu)
+   Instruction Type:      CVI_VS
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+
+#define Q6_Vqf16_equals_Vhf(Vu)                                                \
+  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_qf16_hf)(Vu)
+#endif /* __HEXAGON_ARCH___ >= 81 */
+
+#if __HVX_ARCH__ >= 81
+/* ==========================================================================
+   Assembly Syntax:       Vd32.qf16=Vu32.qf16
+   C Intrinsic Prototype: HVX_Vector Q6_Vqf16_equals_Vqf16(HVX_Vector Vu)
+   Instruction Type:      CVI_VS
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+
+#define Q6_Vqf16_equals_Vqf16(Vu)                                              \
+  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_qf16_qf16)(Vu)
+#endif /* __HEXAGON_ARCH___ >= 81 */
+
+#if __HVX_ARCH__ >= 81
+/* ==========================================================================
+   Assembly Syntax:       Vd32.qf32=Vu32.qf32
+   C Intrinsic Prototype: HVX_Vector Q6_Vqf32_equals_Vqf32(HVX_Vector Vu)
+   Instruction Type:      CVI_VS
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+
+#define Q6_Vqf32_equals_Vqf32(Vu)                                              \
+  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_qf32_qf32)(Vu)
+#endif /* __HEXAGON_ARCH___ >= 81 */
+
+#if __HVX_ARCH__ >= 81
+/* ==========================================================================
+   Assembly Syntax:       Vd32.qf32=Vu32.sf
+   C Intrinsic Prototype: HVX_Vector Q6_Vqf32_equals_Vsf(HVX_Vector Vu)
+   Instruction Type:      CVI_VS
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+
+#define Q6_Vqf32_equals_Vsf(Vu)                                                \
+  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_qf32_sf)(Vu)
+#endif /* __HEXAGON_ARCH___ >= 81 */
+
+#if __HVX_ARCH__ >= 81
+/* ==========================================================================
+   Assembly Syntax:       Qd4=vcmp.eq(Vu32.hf,Vv32.hf)
+   C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eq_VhfVhf(HVX_Vector Vu,
+   HVX_Vector Vv) Instruction Type:      CVI_VA Execution Slots:       SLOT0123
+   ========================================================================== */
+
+#define Q6_Q_vcmp_eq_VhfVhf(Vu, Vv)                                            \
+  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)(                         \
+      (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqhf)(Vu, Vv)), -1)
+#endif /* __HEXAGON_ARCH___ >= 81 */
+
+#if __HVX_ARCH__ >= 81
+/* ==========================================================================
+   Assembly Syntax:       Qx4&=vcmp.eq(Vu32.hf,Vv32.hf)
+   C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eqand_QVhfVhf(HVX_VectorPred
+   Qx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type:      CVI_VA Execution
+   Slots:       SLOT0123
+   ========================================================================== */
+
+#define Q6_Q_vcmp_eqand_QVhfVhf(Qx, Vu, Vv)                                    \
+  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)(                         \
+      (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqhf_and)(                  \
+          __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu,   \
+          Vv)),                                                                \
+      -1)
+#endif /* __HEXAGON_ARCH___ >= 81 */
+
+#if __HVX_ARCH__ >= 81
+/* ==========================================================================
+   Assembly Syntax:       Qx4|=vcmp.eq(Vu32.hf,Vv32.hf)
+   C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eqor_QVhfVhf(HVX_VectorPred
+   Qx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type:      CVI_VA Execution
+   Slots:       SLOT0123
+   ========================================================================== */
+
+#define Q6_Q_vcmp_eqor_QVhfVhf(Qx, Vu, Vv)                                     \
+  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)(                         \
+      (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqhf_or)(                   \
+          __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu,   \
+          Vv)),                                                                \
+      -1)
+#endif /* __HEXAGON_ARCH___ >= 81 */
+
+#if __HVX_ARCH__ >= 81
+/* ==========================================================================
+   Assembly Syntax:       Qx4^=vcmp.eq(Vu32.hf,Vv32.hf)
+   C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eqxacc_QVhfVhf(HVX_VectorPred
+   Qx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type:      CVI_VA Execution
+   Slots:       SLOT0123
+   ========================================================================== */
+
+#define Q6_Q_vcmp_eqxacc_QVhfVhf(Qx, Vu, Vv)                                   \
+  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)(                         \
+      (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqhf_xor)(                  \
+          __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu,   \
+          Vv)),                                                                \
+      -1)
+#endif /* __HEXAGON_ARCH___ >= 81 */
+
+#if __HVX_ARCH__ >= 81
+/* ==========================================================================
+   Assembly Syntax:       Qd4=vcmp.eq(Vu32.sf,Vv32.sf)
+   C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eq_VsfVsf(HVX_Vector Vu,
+   HVX_Vector Vv) Instruction Type:      CVI_VA Execution Slots:       SLOT0123
+   ========================================================================== */
+
+#define Q6_Q_vcmp_eq_VsfVsf(Vu, Vv)                                            \
+  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)(                         \
+      (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqsf)(Vu, Vv)), -1)
+#endif /* __HEXAGON_ARCH___ >= 81 */
+
+#if __HVX_ARCH__ >= 81
+/* ==========================================================================
+   Assembly Syntax:       Qx4&=vcmp.eq(Vu32.sf,Vv32.sf)
+   C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eqand_QVsfVsf(HVX_VectorPred
+   Qx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type:      CVI_VA Execution
+   Slots:       SLOT0123
+   ========================================================================== */
+
+#define Q6_Q_vcmp_eqand_QVsfVsf(Qx, Vu, Vv)                                    \
+  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)(                         \
+      (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqsf_and)(                  \
+          __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu,   \
+          Vv)),                                                                \
+      -1)
+#endif /* __HEXAGON_ARCH___ >= 81 */
+
+#if __HVX_ARCH__ >= 81
+/* ==========================================================================
+   Assembly Syntax:       Qx4|=vcmp.eq(Vu32.sf,Vv32.sf)
+   C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eqor_QVsfVsf(HVX_VectorPred
+   Qx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type:      CVI_VA Execution
+   Slots:       SLOT0123
+   ========================================================================== */
+
+#define Q6_Q_vcmp_eqor_QVsfVsf(Qx, Vu, Vv)                                     \
+  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)(                         \
+      (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqsf_or)(                   \
+          __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu,   \
+          Vv)),                                                                \
+      -1)
+#endif /* __HEXAGON_ARCH___ >= 81 */
+
+#if __HVX_ARCH__ >= 81
+/* ==========================================================================
+   Assembly Syntax:       Qx4^=vcmp.eq(Vu32.sf,Vv32.sf)
+   C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eqxacc_QVsfVsf(HVX_VectorPred
+   Qx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type:      CVI_VA Execution
+   Slots:       SLOT0123
+   ========================================================================== */
+
+#define Q6_Q_vcmp_eqxacc_QVsfVsf(Qx, Vu, Vv)                                   \
+  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)(                         \
+      (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqsf_xor)(                  \
+          __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu,   \
+          Vv)),                                                                \
+      -1)
+#endif /* __HEXAGON_ARCH___ >= 81 */
+
+#if __HVX_ARCH__ >= 81
+/* ==========================================================================
+   Assembly Syntax:       Vd32.w=vilog2(Vu32.hf)
+   C Intrinsic Prototype: HVX_Vector Q6_Vw_vilog2_Vhf(HVX_Vector Vu)
+   Instruction Type:      CVI_VS
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+
+#define Q6_Vw_vilog2_Vhf(Vu)                                                   \
+  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vilog2_hf)(Vu)
+#endif /* __HEXAGON_ARCH___ >= 81 */
+
+#if __HVX_ARCH__ >= 81
+/* ==========================================================================
+   Assembly Syntax:       Vd32.w=vilog2(Vu32.qf16)
+   C Intrinsic Prototype: HVX_Vector Q6_Vw_vilog2_Vqf16(HVX_Vector Vu)
+   Instruction Type:      CVI_VS
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+
+#define Q6_Vw_vilog2_Vqf16(Vu)                                                 \
+  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vilog2_qf16)(Vu)
+#endif /* __HEXAGON_ARCH___ >= 81 */
+
+#if __HVX_ARCH__ >= 81
+/* ==========================================================================
+   Assembly Syntax:       Vd32.w=vilog2(Vu32.qf32)
+   C Intrinsic Prototype: HVX_Vector Q6_Vw_vilog2_Vqf32(HVX_Vector Vu)
+   Instruction Type:      CVI_VS
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+
+#define Q6_Vw_vilog2_Vqf32(Vu)                                                 \
+  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vilog2_qf32)(Vu)
+#endif /* __HEXAGON_ARCH___ >= 81 */
+
+#if __HVX_ARCH__ >= 81
+/* ==========================================================================
+   Assembly Syntax:       Vd32.w=vilog2(Vu32.sf)
+   C Intrinsic Prototype: HVX_Vector Q6_Vw_vilog2_Vsf(HVX_Vector Vu)
+   Instruction Type:      CVI_VS
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+
+#define Q6_Vw_vilog2_Vsf(Vu)                                                   \
+  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vilog2_sf)(Vu)
+#endif /* __HEXAGON_ARCH___ >= 81 */
+
+#if __HVX_ARCH__ >= 81
+/* ==========================================================================
+   Assembly Syntax:       Vd32.qf16=vneg(Vu32.hf)
+   C Intrinsic Prototype: HVX_Vector Q6_Vqf16_vneg_Vhf(HVX_Vector Vu)
+   Instruction Type:      CVI_VS
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+
+#define Q6_Vqf16_vneg_Vhf(Vu)                                                  \
+  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vneg_qf16_hf)(Vu)
+#endif /* __HEXAGON_ARCH___ >= 81 */
+
+#if __HVX_ARCH__ >= 81
+/* ==========================================================================
+   Assembly Syntax:       Vd32.qf16=vneg(Vu32.qf16)
+   C Intrinsic Prototype: HVX_Vector Q6_Vqf16_vneg_Vqf16(HVX_Vector Vu)
+   Instruction Type:      CVI_VS
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+
+#define Q6_Vqf16_vneg_Vqf16(Vu)                                                \
+  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vneg_qf16_qf16)(Vu)
+#endif /* __HEXAGON_ARCH___ >= 81 */
+
+#if __HVX_ARCH__ >= 81
+/* ==========================================================================
+   Assembly Syntax:       Vd32.qf32=vneg(Vu32.qf32)
+   C Intrinsic Prototype: HVX_Vector Q6_Vqf32_vneg_Vqf32(HVX_Vector Vu)
+   Instruction Type:      CVI_VS
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+
+#define Q6_Vqf32_vneg_Vqf32(Vu)                                                \
+  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vneg_qf32_qf32)(Vu)
+#endif /* __HEXAGON_ARCH___ >= 81 */
+
+#if __HVX_ARCH__ >= 81
+/* ==========================================================================
+   Assembly Syntax:       Vd32.qf32=vneg(Vu32.sf)
+   C Intrinsic Prototype: HVX_Vector Q6_Vqf32_vneg_Vsf(HVX_Vector Vu)
+   Instruction Type:      CVI_VS
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+
+#define Q6_Vqf32_vneg_Vsf(Vu)                                                  \
+  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vneg_qf32_sf)(Vu)
+#endif /* __HEXAGON_ARCH___ >= 81 */
+
+#if __HVX_ARCH__ >= 81
+/* ==========================================================================
+   Assembly Syntax:       Vd32.qf16=vsub(Vu32.hf,Vv32.qf16)
+   C Intrinsic Prototype: HVX_Vector Q6_Vqf16_vsub_VhfVqf16(HVX_Vector Vu,
+   HVX_Vector Vv) Instruction Type:      CVI_VS Execution Slots:       SLOT0123
+   ========================================================================== */
+
+#define Q6_Vqf16_vsub_VhfVqf16(Vu, Vv)                                         \
+  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vsub_hf_mix)(Vu, Vv)
+#endif /* __HEXAGON_ARCH___ >= 81 */
+
+#if __HVX_ARCH__ >= 81
+/* ==========================================================================
+   Assembly Syntax:       Vd32.qf32=vsub(Vu32.sf,Vv32.qf32)
+   C Intrinsic Prototype: HVX_Vector Q6_Vqf32_vsub_VsfVqf32(HVX_Vector Vu,
+   HVX_Vector Vv) Instruction Type:      CVI_VS Execution Slots:       SLOT0123
+   ========================================================================== */
+
+#define Q6_Vqf32_vsub_VsfVqf32(Vu, Vv)                                         \
+  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vsub_sf_mix)(Vu, Vv)
+#endif /* __HEXAGON_ARCH___ >= 81 */
+
 #endif /* __HVX__ */
 
 #endif
diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
index 35f012cc70043..19064a4ff5cea 100644
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -475,24 +475,12 @@ _storebe_i64(void * __P, long long __D) {
 
 #include <amxfp8intrin.h>
 
-#include <amxtransposeintrin.h>
-
 #include <amxmovrsintrin.h>
 
-#include <amxmovrstransposeintrin.h>
-
 #include <amxavx512intrin.h>
 
 #include <amxtf32intrin.h>
 
-#include <amxtf32transposeintrin.h>
-
-#include <amxbf16transposeintrin.h>
-
-#include <amxfp16transposeintrin.h>
-
-#include <amxcomplextransposeintrin.h>
-
 #include <avx512vp2intersectintrin.h>
 
 #include <avx512vlvp2intersectintrin.h>
diff --git a/clang/lib/Interpreter/InterpreterValuePrinter.cpp b/clang/lib/Interpreter/InterpreterValuePrinter.cpp
index 0ed02f3bfabe8..cfa50ee908bf8 100644
--- a/clang/lib/Interpreter/InterpreterValuePrinter.cpp
+++ b/clang/lib/Interpreter/InterpreterValuePrinter.cpp
@@ -411,7 +411,8 @@ class InterfaceKindVisitor
   }
 
   InterfaceKind VisitReferenceType(const ReferenceType *Ty) {
-    ExprResult AddrOfE = S.CreateBuiltinUnaryOp(SourceLocation(), UO_AddrOf, E);
+    ExprResult AddrOfE = S.CreateBuiltinUnaryOp(SourceLocation(), UO_AddrOf,
+                                                E->IgnoreImpCasts());
     assert(!AddrOfE.isInvalid() && "Can not create unary expression");
     Args.push_back(AddrOfE.get());
     return InterfaceKind::NoAlloc;
@@ -537,7 +538,7 @@ llvm::Expected<Expr *> Interpreter::convertExprToValue(Expr *E) {
   QualType DesugaredTy = Ty.getDesugaredType(Ctx);
 
   // For lvalue struct, we treat it as a reference.
-  if (DesugaredTy->isRecordType() && E->isLValue()) {
+  if (DesugaredTy->isRecordType() && E->IgnoreImpCasts()->isLValue()) {
     DesugaredTy = Ctx.getLValueReferenceType(DesugaredTy);
     Ty = Ctx.getLValueReferenceType(Ty);
   }
diff --git a/clang/lib/Lex/HeaderSearch.cpp b/clang/lib/Lex/HeaderSearch.cpp
index 65c324c10ca5d..f05c28fd7a123 100644
--- a/clang/lib/Lex/HeaderSearch.cpp
+++ b/clang/lib/Lex/HeaderSearch.cpp
@@ -221,7 +221,7 @@ std::string HeaderSearch::getPrebuiltModuleFileName(StringRef ModuleName,
   // file.
   for (const std::string &Dir : HSOpts.PrebuiltModulePaths) {
     SmallString<256> Result(Dir);
-    llvm::sys::fs::make_absolute(Result);
+    FileMgr.makeAbsolutePath(Result);
     if (ModuleName.contains(':'))
       // The separator of C++20 modules partitions (':') is not good for file
       // systems, here clang and gcc choose '-' by default since it is not a
@@ -246,7 +246,7 @@ std::string HeaderSearch::getPrebuiltImplicitModuleFileName(Module *Module) {
   StringRef ModuleCacheHash = HSOpts.DisableModuleHash ? "" : getModuleHash();
   for (const std::string &Dir : HSOpts.PrebuiltModulePaths) {
     SmallString<256> CachePath(Dir);
-    llvm::sys::fs::make_absolute(CachePath);
+    FileMgr.makeAbsolutePath(CachePath);
     llvm::sys::path::append(CachePath, ModuleCacheHash);
     std::string FileName =
         getCachedModuleFileNameImpl(ModuleName, ModuleMapPath, CachePath);
diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp
index e4b158e4a6248..7e4a164e34eda 100644
--- a/clang/lib/Parse/ParseDecl.cpp
+++ b/clang/lib/Parse/ParseDecl.cpp
@@ -4248,6 +4248,13 @@ void Parser::ParseDeclarationSpecifiers(
 
     // type-specifier
     case tok::kw_short:
+      if (!getLangOpts().NativeInt16Type) {
+        Diag(Tok, diag::err_unknown_typename) << Tok.getName();
+        DS.SetTypeSpecError();
+        DS.SetRangeEnd(Tok.getLocation());
+        ConsumeToken();
+        goto DoneWithDeclSpec;
+      }
       isInvalid = DS.SetTypeSpecWidth(TypeSpecifierWidth::Short, Loc, PrevSpec,
                                       DiagID, Policy);
       break;
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index 25199c739ace9..31bc941e6a015 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -3221,6 +3221,7 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind,
     else
       Clause = ParseOpenMPSingleExprClause(CKind, WrongDirective);
     break;
+  case OMPC_threadset:
   case OMPC_fail:
   case OMPC_proc_bind:
   case OMPC_atomic_default_mem_order:
diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp
index 140b709dbb651..ca74c637bb92f 100644
--- a/clang/lib/Sema/AnalysisBasedWarnings.cpp
+++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp
@@ -30,6 +30,7 @@
 #include "clang/Analysis/Analyses/CalledOnceCheck.h"
 #include "clang/Analysis/Analyses/Consumed.h"
 #include "clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h"
+#include "clang/Analysis/Analyses/LifetimeSafety/Origins.h"
 #include "clang/Analysis/Analyses/ReachableCode.h"
 #include "clang/Analysis/Analyses/ThreadSafety.h"
 #include "clang/Analysis/Analyses/UninitializedValues.h"
@@ -53,6 +54,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <deque>
 #include <iterator>
@@ -3132,6 +3134,7 @@ void clang::sema::AnalysisBasedWarnings::IssueWarnings(
 }
 
 void clang::sema::AnalysisBasedWarnings::PrintStats() const {
+  clang::lifetimes::internal::LifetimeSafetyAnalysis::PrintStats(llvm::errs());
   llvm::errs() << "\n*** Analysis Based Warnings Stats:\n";
 
   unsigned NumCFGsBuilt = NumFunctionsAnalyzed - NumFunctionsWithBadCFGs;
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index e32f4376a5ebf..139c4abc040df 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -153,7 +153,48 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_image_sample_3d_v4f32_f32:
   case AMDGPU::BI__builtin_amdgcn_image_sample_3d_v4f16_f32:
   case AMDGPU::BI__builtin_amdgcn_image_sample_cube_v4f32_f32:
-  case AMDGPU::BI__builtin_amdgcn_image_sample_cube_v4f16_f32: {
+  case AMDGPU::BI__builtin_amdgcn_image_sample_cube_v4f16_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_lz_1d_v4f32_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_lz_1d_v4f16_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_lz_1darray_v4f32_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_lz_1darray_v4f16_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_lz_2d_f32_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_lz_2d_v4f32_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_lz_2d_v4f16_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_lz_2darray_f32_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_lz_2darray_v4f32_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_lz_2darray_v4f16_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_lz_3d_v4f32_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_lz_3d_v4f16_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_lz_cube_v4f32_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_lz_cube_v4f16_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_l_1d_v4f32_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_l_1d_v4f16_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_l_1darray_v4f32_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_l_1darray_v4f16_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_l_2d_f32_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_l_2d_v4f16_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_l_2d_v4f32_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_l_2darray_f32_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_l_2darray_v4f32_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_l_2darray_v4f16_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_l_3d_v4f32_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_l_3d_v4f16_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_l_cube_v4f32_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_l_cube_v4f16_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_d_1d_v4f32_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_d_1d_v4f16_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_d_1darray_v4f32_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_d_1darray_v4f16_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_d_2d_f32_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_d_2d_v4f32_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_d_2d_v4f16_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_d_2darray_f32_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_d_2darray_v4f32_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_d_2darray_v4f16_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_d_3d_v4f32_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_sample_d_3d_v4f16_f32:
+  case AMDGPU::BI__builtin_amdgcn_image_gather4_lz_2d_v4f32_f32: {
     StringRef FeatureList(
         getASTContext().BuiltinInfo.getRequiredFeatures(BuiltinID));
     if (!Builtin::evaluateRequiredTargetFeatures(FeatureList,
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index f4517877b04c8..ad2c2e4a97bb9 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -3542,9 +3542,7 @@ bool Sema::ValueIsRunOfOnes(CallExpr *TheCall, unsigned ArgNum) {
 
 bool Sema::getFormatStringInfo(const Decl *D, unsigned FormatIdx,
                                unsigned FirstArg, FormatStringInfo *FSI) {
-  bool IsCXXMember = false;
-  if (const auto *MD = dyn_cast<CXXMethodDecl>(D))
-    IsCXXMember = MD->isInstance();
+  bool HasImplicitThisParam = hasImplicitObjectParameter(D);
   bool IsVariadic = false;
   if (const FunctionType *FnTy = D->getFunctionType())
     IsVariadic = cast<FunctionProtoType>(FnTy)->isVariadic();
@@ -3553,11 +3551,12 @@ bool Sema::getFormatStringInfo(const Decl *D, unsigned FormatIdx,
   else if (const auto *OMD = dyn_cast<ObjCMethodDecl>(D))
     IsVariadic = OMD->isVariadic();
 
-  return getFormatStringInfo(FormatIdx, FirstArg, IsCXXMember, IsVariadic, FSI);
+  return getFormatStringInfo(FormatIdx, FirstArg, HasImplicitThisParam,
+                             IsVariadic, FSI);
 }
 
 bool Sema::getFormatStringInfo(unsigned FormatIdx, unsigned FirstArg,
-                               bool IsCXXMember, bool IsVariadic,
+                               bool HasImplicitThisParam, bool IsVariadic,
                                FormatStringInfo *FSI) {
   if (FirstArg == 0)
     FSI->ArgPassingKind = FAPK_VAList;
@@ -3571,7 +3570,7 @@ bool Sema::getFormatStringInfo(unsigned FormatIdx, unsigned FirstArg,
   // The way the format attribute works in GCC, the implicit this argument
   // of member functions is counted. However, it doesn't appear in our own
   // lists, so decrement format_idx in that case.
-  if (IsCXXMember) {
+  if (HasImplicitThisParam) {
     if(FSI->FormatIdx == 0)
       return false;
     --FSI->FormatIdx;
diff --git a/clang/lib/Sema/SemaCodeComplete.cpp b/clang/lib/Sema/SemaCodeComplete.cpp
index 0514d1033f74f..aa93507ab5c30 100644
--- a/clang/lib/Sema/SemaCodeComplete.cpp
+++ b/clang/lib/Sema/SemaCodeComplete.cpp
@@ -10208,6 +10208,24 @@ void SemaCodeCompletion::CodeCompletePreprocessorDirective(bool InConditional) {
   Builder.AddPlaceholderChunk("message");
   Results.AddResult(Builder.TakeString());
 
+  if (getLangOpts().C23) {
+    // #embed "file"
+    Builder.AddTypedTextChunk("embed");
+    Builder.AddChunk(CodeCompletionString::CK_HorizontalSpace);
+    Builder.AddTextChunk("\"");
+    Builder.AddPlaceholderChunk("file");
+    Builder.AddTextChunk("\"");
+    Results.AddResult(Builder.TakeString());
+
+    // #embed <file>
+    Builder.AddTypedTextChunk("embed");
+    Builder.AddChunk(CodeCompletionString::CK_HorizontalSpace);
+    Builder.AddTextChunk("<");
+    Builder.AddPlaceholderChunk("file");
+    Builder.AddTextChunk(">");
+    Results.AddResult(Builder.TakeString());
+  }
+
   // Note: #ident and #sccs are such crazy anachronisms that we don't provide
   // completions for them. And __include_macros is a Clang-internal extension
   // that we don't want to encourage anyone to use.
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 964a2a791e18f..a9e7b44ac9d73 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -3785,7 +3785,7 @@ static bool handleFormatAttrCommon(Sema &S, Decl *D, const ParsedAttr &AL,
 
   // In C++ the implicit 'this' function parameter also counts, and they are
   // counted from one.
-  bool HasImplicitThisParam = isInstanceMethod(D);
+  bool HasImplicitThisParam = hasImplicitObjectParameter(D);
   Info->NumArgs = getFunctionOrMethodNumParams(D) + HasImplicitThisParam;
 
   Info->Identifier = AL.getArgAsIdent(0)->getIdentifierInfo();
@@ -3926,7 +3926,7 @@ static void handleCallbackAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
     return;
   }
 
-  bool HasImplicitThisParam = isInstanceMethod(D);
+  bool HasImplicitThisParam = hasImplicitObjectParameter(D);
   int32_t NumArgs = getFunctionOrMethodNumParams(D);
 
   FunctionDecl *FD = D->getAsFunction();
@@ -4110,7 +4110,7 @@ static void handleLifetimeCaptureByAttr(Sema &S, Decl *D,
 }
 
 void Sema::LazyProcessLifetimeCaptureByParams(FunctionDecl *FD) {
-  bool HasImplicitThisParam = isInstanceMethod(FD);
+  bool HasImplicitThisParam = hasImplicitObjectParameter(FD);
   SmallVector<LifetimeCaptureByAttr *, 1> Attrs;
   for (ParmVarDecl *PVD : FD->parameters())
     if (auto *A = PVD->getAttr<LifetimeCaptureByAttr>())
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index 96d51426e0b5c..94a490a8f68dc 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -3279,6 +3279,7 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
     break;
   }
   case Builtin::BI__builtin_hlsl_wave_active_max:
+  case Builtin::BI__builtin_hlsl_wave_active_min:
   case Builtin::BI__builtin_hlsl_wave_active_sum: {
     if (SemaRef.checkArgCount(TheCall, 1))
       return true;
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 6d5cb0fcaea24..256f9521b3a7e 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -17216,6 +17216,10 @@ OMPClause *SemaOpenMP::ActOnOpenMPSimpleClause(
         static_cast<OpenMPSeverityClauseKind>(Argument), ArgumentLoc, StartLoc,
         LParenLoc, EndLoc);
     break;
+  case OMPC_threadset:
+    Res = ActOnOpenMPThreadsetClause(static_cast<OpenMPThreadsetKind>(Argument),
+                                     ArgumentLoc, StartLoc, LParenLoc, EndLoc);
+    break;
   case OMPC_if:
   case OMPC_final:
   case OMPC_num_threads:
@@ -17355,6 +17359,23 @@ OMPClause *SemaOpenMP::ActOnOpenMPDefaultClause(
       OMPDefaultClause(M, MLoc, VCKind, VCKindLoc, StartLoc, LParenLoc, EndLoc);
 }
 
+OMPClause *SemaOpenMP::ActOnOpenMPThreadsetClause(OpenMPThreadsetKind Kind,
+                                                  SourceLocation KindLoc,
+                                                  SourceLocation StartLoc,
+                                                  SourceLocation LParenLoc,
+                                                  SourceLocation EndLoc) {
+  if (Kind == OMPC_THREADSET_unknown) {
+    Diag(KindLoc, diag::err_omp_unexpected_clause_value)
+        << getListOfPossibleValues(OMPC_threadset, /*First=*/0,
+                                   /*Last=*/unsigned(OMPC_THREADSET_unknown))
+        << getOpenMPClauseName(OMPC_threadset);
+    return nullptr;
+  }
+
+  return new (getASTContext())
+      OMPThreadsetClause(Kind, KindLoc, StartLoc, LParenLoc, EndLoc);
+}
+
 OMPClause *SemaOpenMP::ActOnOpenMPProcBindClause(ProcBindKind Kind,
                                                  SourceLocation KindKwLoc,
                                                  SourceLocation StartLoc,
diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp
index 850bcb17bece1..2f61bdd9a6540 100644
--- a/clang/lib/Sema/SemaX86.cpp
+++ b/clang/lib/Sema/SemaX86.cpp
@@ -489,14 +489,6 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) {
   case X86::BI__builtin_ia32_tileloaddrst164:
   case X86::BI__builtin_ia32_tilestored64:
   case X86::BI__builtin_ia32_tilezero:
-  case X86::BI__builtin_ia32_t2rpntlvwz0:
-  case X86::BI__builtin_ia32_t2rpntlvwz0t1:
-  case X86::BI__builtin_ia32_t2rpntlvwz1:
-  case X86::BI__builtin_ia32_t2rpntlvwz1t1:
-  case X86::BI__builtin_ia32_t2rpntlvwz0rst1:
-  case X86::BI__builtin_ia32_t2rpntlvwz1rs:
-  case X86::BI__builtin_ia32_t2rpntlvwz1rst1:
-  case X86::BI__builtin_ia32_t2rpntlvwz0rs:
   case X86::BI__builtin_ia32_tcvtrowps2bf16h:
   case X86::BI__builtin_ia32_tcvtrowps2bf16l:
   case X86::BI__builtin_ia32_tcvtrowps2phh:
@@ -516,17 +508,8 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) {
   case X86::BI__builtin_ia32_tdpbhf8ps:
   case X86::BI__builtin_ia32_tdphbf8ps:
   case X86::BI__builtin_ia32_tdphf8ps:
-  case X86::BI__builtin_ia32_ttdpbf16ps:
-  case X86::BI__builtin_ia32_ttdpfp16ps:
-  case X86::BI__builtin_ia32_ttcmmimfp16ps:
-  case X86::BI__builtin_ia32_ttcmmrlfp16ps:
-  case X86::BI__builtin_ia32_tconjtcmmimfp16ps:
   case X86::BI__builtin_ia32_tmmultf32ps:
-  case X86::BI__builtin_ia32_ttmmultf32ps:
     return CheckBuiltinTileRangeAndDuplicate(TheCall, {0, 1, 2});
-  case X86::BI__builtin_ia32_ttransposed:
-  case X86::BI__builtin_ia32_tconjtfp16:
-    return CheckBuiltinTileArgumentsRange(TheCall, {0, 1});
   }
 }
 static bool isX86_32Builtin(unsigned BuiltinID) {
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 0c8c1d18d317e..8c20078e97a13 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -10622,6 +10622,13 @@ TreeTransform<Derived>::TransformOMPDefaultClause(OMPDefaultClause *C) {
       C->getEndLoc());
 }
 
+template <typename Derived>
+OMPClause *
+TreeTransform<Derived>::TransformOMPThreadsetClause(OMPThreadsetClause *C) {
+  // No need to rebuild this clause, no template-dependent parameters.
+  return C;
+}
+
 template <typename Derived>
 OMPClause *
 TreeTransform<Derived>::TransformOMPProcBindClause(OMPProcBindClause *C) {
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index c1b5cb730e4a4..e3106f8d8e13c 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -11255,6 +11255,9 @@ OMPClause *OMPClauseReader::readClause() {
   case llvm::omp::OMPC_mergeable:
     C = new (Context) OMPMergeableClause();
     break;
+  case llvm::omp::OMPC_threadset:
+    C = new (Context) OMPThreadsetClause();
+    break;
   case llvm::omp::OMPC_read:
     C = new (Context) OMPReadClause();
     break;
@@ -11658,6 +11661,17 @@ void OMPClauseReader::VisitOMPDefaultClause(OMPDefaultClause *C) {
   C->setDefaultVariableCategoryLocation(Record.readSourceLocation());
 }
 
+// Read the parameter of threadset clause. This will have been saved when
+// OMPClauseWriter is called.
+void OMPClauseReader::VisitOMPThreadsetClause(OMPThreadsetClause *C) {
+  C->setLParenLoc(Record.readSourceLocation());
+  SourceLocation ThreadsetKindLoc = Record.readSourceLocation();
+  C->setThreadsetKindLoc(ThreadsetKindLoc);
+  OpenMPThreadsetKind TKind =
+      static_cast<OpenMPThreadsetKind>(Record.readInt());
+  C->setThreadsetKind(TKind);
+}
+
 void OMPClauseReader::VisitOMPProcBindClause(OMPProcBindClause *C) {
   C->setProcBindKind(static_cast<llvm::omp::ProcBindKind>(Record.readInt()));
   C->setLParenLoc(Record.readSourceLocation());
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 377e3966874f3..3ac338e013deb 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -7913,6 +7913,12 @@ void OMPClauseWriter::VisitOMPDefaultClause(OMPDefaultClause *C) {
   Record.AddSourceLocation(C->getDefaultVCLoc());
 }
 
+void OMPClauseWriter::VisitOMPThreadsetClause(OMPThreadsetClause *C) {
+  Record.AddSourceLocation(C->getLParenLoc());
+  Record.AddSourceLocation(C->getThreadsetKindLoc());
+  Record.writeEnum(C->getThreadsetKind());
+}
+
 void OMPClauseWriter::VisitOMPProcBindClause(OMPProcBindClause *C) {
   Record.push_back(unsigned(C->getProcBindKind()));
   Record.AddSourceLocation(C->getLParenLoc());
diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.cpp
index 42f52d0ff6241..eebecdbdbb122 100644
--- a/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.cpp
+++ b/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.cpp
@@ -350,7 +350,7 @@ void sanitizeDiagOpts(DiagnosticOptions &DiagOpts) {
   //       See `test/ClangScanDeps/diagnostic-pragmas.c` for an example.
   llvm::erase_if(DiagOpts.Warnings, [](StringRef Warning) {
     return llvm::StringSwitch<bool>(Warning)
-        .Cases("pch-vfs-diff", "error=pch-vfs-diff", false)
+        .Cases({"pch-vfs-diff", "error=pch-vfs-diff"}, false)
         .StartsWith("no-error=", false)
         .Default(true);
   });
diff --git a/clang/lib/Tooling/Transformer/RangeSelector.cpp b/clang/lib/Tooling/Transformer/RangeSelector.cpp
index 171c786bc366f..b4bdec1fcdd69 100644
--- a/clang/lib/Tooling/Transformer/RangeSelector.cpp
+++ b/clang/lib/Tooling/Transformer/RangeSelector.cpp
@@ -205,8 +205,12 @@ RangeSelector transformer::name(std::string ID) {
       // `foo<int>` for which this range will be too short.  Doing so will
       // require subcasing `NamedDecl`, because it doesn't provide virtual
       // access to the \c DeclarationNameInfo.
-      if (tooling::getText(R, *Result.Context) != D->getName())
-        return CharSourceRange();
+      StringRef Text = tooling::getText(R, *Result.Context);
+      if (Text != D->getName())
+        return llvm::make_error<StringError>(
+            llvm::errc::not_supported,
+            "range selected by name(node id=" + ID + "): '" + Text +
+                "' is different from decl name '" + D->getName() + "'");
       return R;
     }
     if (const auto *E = Node.get<DeclRefExpr>()) {
diff --git a/clang/test/AST/ByteCode/builtin-functions.cpp b/clang/test/AST/ByteCode/builtin-functions.cpp
index e9093b2f23f74..a90f636b5134b 100644
--- a/clang/test/AST/ByteCode/builtin-functions.cpp
+++ b/clang/test/AST/ByteCode/builtin-functions.cpp
@@ -1856,7 +1856,8 @@ namespace InitParam {
 
 #endif
 
-namespace SAddOverflowInt {
+namespace NonBlockPointerStore {
   int a;
   void foo(void) { a *= __builtin_sadd_overflow(1, 2, 0); }
+  void foo2(void) { a *= __builtin_addc(1, 2, 0, 0); }
 }
diff --git a/clang/test/AST/HLSL/packoffset.hlsl b/clang/test/AST/HLSL/packoffset.hlsl
index 4d18a9ca631f1..05b927279e198 100644
--- a/clang/test/AST/HLSL/packoffset.hlsl
+++ b/clang/test/AST/HLSL/packoffset.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple dxil-unknown-shadermodel6.3-library -S -finclude-default-header -fnative-half-type -ast-dump  -x hlsl %s | FileCheck %s
+// RUN: %clang_cc1 -triple dxil-unknown-shadermodel6.3-library -S -finclude-default-header -fnative-half-type -fnative-int16-type -ast-dump  -x hlsl %s | FileCheck %s
 
 
 // CHECK: HLSLBufferDecl {{.*}} cbuffer A
diff --git a/clang/test/AST/HLSL/vk.spec-constant.usage.hlsl b/clang/test/AST/HLSL/vk.spec-constant.usage.hlsl
index 733c4e2ee5a36..5654974b26d2d 100644
--- a/clang/test/AST/HLSL/vk.spec-constant.usage.hlsl
+++ b/clang/test/AST/HLSL/vk.spec-constant.usage.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-compute -x hlsl -ast-dump -o - %s | FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -fnative-int16-type -triple spirv-unknown-vulkan-compute -x hlsl -ast-dump -o - %s | FileCheck %s
 
 // CHECK: VarDecl {{.*}} bool_const 'const hlsl_private bool' static cinit
 // CHECK-NEXT: CallExpr {{.*}} 'bool'
diff --git a/clang/test/CIR/CodeGen/builtin_prefetech.c b/clang/test/CIR/CodeGen/builtin_prefetch.c
similarity index 100%
rename from clang/test/CIR/CodeGen/builtin_prefetech.c
rename to clang/test/CIR/CodeGen/builtin_prefetch.c
diff --git a/clang/test/CIR/CodeGen/switch.cpp b/clang/test/CIR/CodeGen/switch.cpp
index e13aa8f4f4953..3824be0d08c2f 100644
--- a/clang/test/CIR/CodeGen/switch.cpp
+++ b/clang/test/CIR/CodeGen/switch.cpp
@@ -1183,3 +1183,90 @@ int nested_switch(int a) {
 // OGCG: [[IFEND10]]:
 // OGCG:   br label %[[EPILOG]]
 // OGCG: [[EPILOG]]:
+
+int sw_return_multi_cases(int x) {
+  switch (x) {
+  case 0:
+    return 0;
+  case 1:
+    return 1;
+  case 2:
+    return 2;
+  default:
+    return -1;
+  }
+}
+
+// CIR-LABEL: cir.func{{.*}} @_Z21sw_return_multi_casesi
+// CIR:       cir.switch (%{{.*}} : !s32i) {
+// CIR-NEXT:  cir.case(equal, [#cir.int<0> : !s32i]) {
+// CIR:         %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
+// CIR:         cir.store{{.*}} %[[ZERO]], %{{.*}} : !s32i, !cir.ptr<!s32i>
+// CIR:         %[[RET0:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr<!s32i>, !s32i
+// CIR-NEXT:    cir.return %[[RET0]] : !s32i
+// CIR-NEXT:  }
+// CIR-NEXT:  cir.case(equal, [#cir.int<1> : !s32i]) {
+// CIR:         %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
+// CIR:         cir.store{{.*}} %[[ONE]], %{{.*}} : !s32i, !cir.ptr<!s32i>
+// CIR:         %[[RET1:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr<!s32i>, !s32i
+// CIR-NEXT:    cir.return %[[RET1]] : !s32i
+// CIR-NEXT:  }
+// CIR-NEXT:  cir.case(equal, [#cir.int<2> : !s32i]) {
+// CIR:         %[[TWO:.*]] = cir.const #cir.int<2> : !s32i
+// CIR:         cir.store{{.*}} %[[TWO]], %{{.*}} : !s32i, !cir.ptr<!s32i>
+// CIR:         %[[RET2:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr<!s32i>, !s32i
+// CIR-NEXT:    cir.return %[[RET2]] : !s32i
+// CIR-NEXT:  }
+// CIR-NEXT:  cir.case(default, []) {
+// CIR:         %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
+// CIR:         %[[NEG:.*]] = cir.unary(minus, %[[ONE]]) {{.*}} : !s32i, !s32i
+// CIR:         cir.store{{.*}} %[[NEG]], %{{.*}} : !s32i, !cir.ptr<!s32i>
+// CIR:         %[[RETDEF:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr<!s32i>, !s32i
+// CIR-NEXT:    cir.return %[[RETDEF]] : !s32i
+// CIR-NEXT:  }
+// CIR-NEXT:  cir.yield
+
+// LLVM-LABEL: define{{.*}} i32 @_Z21sw_return_multi_casesi
+// LLVM:   switch i32 %{{.*}}, label %[[DEFAULT:.*]] [
+// LLVM-DAG:   i32 0, label %[[CASE0:.*]]
+// LLVM-DAG:   i32 1, label %[[CASE1:.*]]
+// LLVM-DAG:   i32 2, label %[[CASE2:.*]]
+// LLVM:   ]
+// LLVM: [[CASE0]]:
+// LLVM:   store i32 0, ptr %{{.*}}, align 4
+// LLVM:   %{{.*}} = load i32, ptr %{{.*}}, align 4
+// LLVM:   ret i32 %{{.*}}
+// LLVM: [[CASE1]]:
+// LLVM:   store i32 1, ptr %{{.*}}, align 4
+// LLVM:   %{{.*}} = load i32, ptr %{{.*}}, align 4
+// LLVM:   ret i32 %{{.*}}
+// LLVM: [[CASE2]]:
+// LLVM:   store i32 2, ptr %{{.*}}, align 4
+// LLVM:   %{{.*}} = load i32, ptr %{{.*}}, align 4
+// LLVM:   ret i32 %{{.*}}
+// LLVM: [[DEFAULT]]:
+// LLVM:   store i32 -1, ptr %{{.*}}, align 4
+// LLVM:   %{{.*}} = load i32, ptr %{{.*}}, align 4
+// LLVM:   ret i32 %{{.*}}
+
+// OGCG-LABEL: define{{.*}} i32 @_Z21sw_return_multi_casesi
+// OGCG: entry:
+// OGCG:   %[[RETVAL:.*]] = alloca i32, align 4
+// OGCG:   %[[X_ADDR:.*]] = alloca i32, align 4
+// OGCG:   %[[X_VAL:.*]] = load i32, ptr %[[X_ADDR]], align 4
+// OGCG:   switch i32 %[[X_VAL]], label %[[DEFAULT:.*]] [
+// OGCG-DAG:   i32 0, label %[[SW0:.*]]
+// OGCG-DAG:   i32 1, label %[[SW1:.*]]
+// OGCG-DAG:   i32 2, label %[[SW2:.*]]
+// OGCG:   ]
+// OGCG: [[SW0]]:
+// OGCG:   br label %[[RETURN:.*]]
+// OGCG: [[SW1]]:
+// OGCG:   br label %[[RETURN]]
+// OGCG: [[SW2]]:
+// OGCG:   br label %[[RETURN]]
+// OGCG: [[DEFAULT]]:
+// OGCG:   br label %[[RETURN]]
+// OGCG: [[RETURN]]:
+// OGCG:   %[[RETVAL_LOAD:.*]] = load i32, ptr %[[RETVAL]], align 4
+// OGCG:   ret i32 %[[RETVAL_LOAD]]
diff --git a/clang/test/CIR/CodeGen/try-catch.cpp b/clang/test/CIR/CodeGen/try-catch.cpp
index 1e4d2a63ada01..27e3d8ef41115 100644
--- a/clang/test/CIR/CodeGen/try-catch.cpp
+++ b/clang/test/CIR/CodeGen/try-catch.cpp
@@ -164,3 +164,33 @@ void try_catch_with_alloca() {
 // OGCG: %[[TMP_B:.*]] = load i32, ptr %[[B_ADDR]], align 4
 // OGCG: %[[RESULT:.*]] = add nsw i32 %[[TMP_A]], %[[TMP_B]]
 // OGCG: store i32 %[[RESULT]], ptr %[[C_ADDR]], align 4
+
+void function_with_noexcept() noexcept;
+
+void calling_noexcept_function_inside_try_block() {
+  try {
+    function_with_noexcept();
+  } catch (...) {
+  }
+}
+
+// CIR: cir.scope {
+// CIR:   cir.try {
+// CIR:     cir.call @_Z22function_with_noexceptv() nothrow : () -> ()
+// CIR:     cir.yield
+// CIR:   }
+// CIR: }
+
+// LLVM:   br label %[[LABEL_1:.*]]
+// LLVM: [[LABEL_1]]:
+// LLVM:   br label %[[LABEL_2:.*]]
+// LLVM: [[LABEL_2]]:
+// LLVM:   call void @_Z22function_with_noexceptv()
+// LLVM:   br label %[[LABEL_3:.*]]
+// LLVM: [[LABEL_3]]:
+// LLVM:   br label %[[LABEL_4:.*]]
+// LLVM: [[LABEL_4]]:
+// LLVM:   ret void
+
+// OGCG: call void @_Z22function_with_noexceptv()
+// OGCG: ret void
diff --git a/clang/test/CXX/drs/cwg0xx.cpp b/clang/test/CXX/drs/cwg0xx.cpp
index 805be67f2dc1a..10a4f1d6add3a 100644
--- a/clang/test/CXX/drs/cwg0xx.cpp
+++ b/clang/test/CXX/drs/cwg0xx.cpp
@@ -90,6 +90,8 @@ namespace cwg5 { // cwg5: 3.1
   const C c = e;
 } // namespace cwg5
 
+// cwg6 is in cwg6.cpp
+
 namespace cwg7 { // cwg7: 3.4
   class A { public: ~A(); };
   class B : virtual private A {}; // #cwg7-B
diff --git a/clang/test/CXX/drs/cwg28xx.cpp b/clang/test/CXX/drs/cwg28xx.cpp
index a6b2b99e0c3f1..d0ee191ef23d8 100644
--- a/clang/test/CXX/drs/cwg28xx.cpp
+++ b/clang/test/CXX/drs/cwg28xx.cpp
@@ -61,6 +61,24 @@ namespace cwg2819 { // cwg2819: 19 c++26
 #endif
 } // namespace cwg2819
 
+namespace cwg2823 { // cwg2823: no
+#if __cplusplus >= 201103L
+  constexpr int *p = 0;
+  constexpr int *q1 = &*p;
+  // expected-error@-1 {{constexpr variable 'q1' must be initialized by a constant expression}}
+  //   expected-note@-2 {{dereferencing a null pointer is not allowed in a constant expression}}
+  // FIXME: invalid: dereferencing a null pointer.
+  constexpr int *q2 = &p[0];
+
+  int arr[32];
+  constexpr int *r = arr;
+  // FIXME: invalid: dereferencing a past-the-end pointer.
+  constexpr int *s1 = &*(r + 32);
+  // FIXME: invalid: dereferencing a past-the-end pointer.
+  constexpr int *s2 = &r[32];
+#endif
+}
+
 namespace cwg2847 { // cwg2847: 19 review 2024-03-01
 
 #if __cplusplus >= 202002L
diff --git a/clang/test/CXX/drs/cwg2xx.cpp b/clang/test/CXX/drs/cwg2xx.cpp
index 37186e3c3f205..a4995ddc2c588 100644
--- a/clang/test/CXX/drs/cwg2xx.cpp
+++ b/clang/test/CXX/drs/cwg2xx.cpp
@@ -230,6 +230,38 @@ namespace cwg211 { // cwg211: 2.7
   };
 } // namespace cwg211
 
+namespace cwg212 { // cwg212: 2.7
+  template<typename T> struct Base;
+  template<typename T> struct Derived;
+
+  int *overload(void*);
+  float *overload(Base<int>*);
+  double *overload(Base<long>*);
+
+  void f(Derived<int> *p) {
+    // OK, calls void* overload.
+    int *a = overload(p);
+
+    Base<int> *q = p;
+    // expected-error@-1 {{cannot initialize a variable of type 'Base<int> *' with an lvalue of type 'Derived<int> *'}}
+  }
+
+  template<typename T> struct Base {};
+  template<typename T> struct Derived : Base<T> {};
+
+  void g(Derived<long> *p) {
+    // OK, instantiates and calls Base<long>* overlod.
+    double *b = overload(p);
+    (void)b;
+  }
+
+  void h(Derived<float> *p) {
+    // OK, instantiates and converts.
+    Base<float> *q = p;
+    (void)q;
+  }
+}
+
 namespace cwg213 { // cwg213: 2.7
   template <class T> struct A : T {
     void h(T t) {
@@ -593,6 +625,9 @@ namespace cwg231 { // cwg231: 2.7
   }
 } // namespace cwg231
 
+// 232 is NAD; the desired behavior is described in 2823.
+// cwg232: dup 2823
+
 // cwg234: na
 // cwg235: na
 
diff --git a/clang/test/CXX/drs/cwg6.cpp b/clang/test/CXX/drs/cwg6.cpp
new file mode 100644
index 0000000000000..4752e72034c78
--- /dev/null
+++ b/clang/test/CXX/drs/cwg6.cpp
@@ -0,0 +1,51 @@
+// RUN: %clang_cc1 -std=c++98 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++11 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++14 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++17 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++20 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++23 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++2c %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s --check-prefixes CHECK
+
+#if __cplusplus == 199711L
+#define static_assert(expr) __extension__ _Static_assert(expr)
+#define noexcept throw()
+#endif
+
+namespace cwg6 { // cwg6: 2.7
+#if __cplusplus >= 201103L
+struct Counter {
+  int copies;
+  constexpr Counter(int copies) : copies(copies) {}
+  constexpr Counter(const Counter& other) : copies(other.copies + 1) {}
+};
+
+// Passing an lvalue by value makes a non-elidable copy.
+constexpr int PassByValue(Counter c) { return c.copies; }
+constexpr int PassByValue2(Counter c) { return PassByValue(c); }
+constexpr int PassByValue3(Counter c) { return PassByValue2(c); }
+static_assert(PassByValue(Counter(0)) == 0, "expect no copies");
+static_assert(PassByValue2(Counter(0)) == 1, "expect 1 copy");
+static_assert(PassByValue3(Counter(0)) == 2, "expect 2 copies");
+#endif
+
+struct A {
+  A() noexcept;
+  A(const A&) noexcept;
+  ~A() noexcept;
+};
+
+inline void f(A a) noexcept {}
+
+// CHECK-LABEL: define {{.*}} @_ZN4cwg64callEv
+void call() {
+  A a;
+  // We copy the parameter here, even though object is not mutated by f and
+  // otherwise satisfies the criteria for the proposed CWG6 optimization.
+  // CHECK: call {{.*}} @_ZN4cwg61AC1ERKS0_(
+  // CHECK: call {{.*}} @_ZN4cwg61fENS_1AE(
+  f(a);
+  // CHECK: call {{.*}} @_ZN4cwg61AD1Ev(
+  // CHECK: call {{.*}} @_ZN4cwg61AD1Ev(
+}
+
+} // namespace cwg6
diff --git a/clang/test/CodeGen/AArch64/ext-vector-coercion.c b/clang/test/CodeGen/AArch64/ext-vector-coercion.c
new file mode 100644
index 0000000000000..354980afe06d7
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/ext-vector-coercion.c
@@ -0,0 +1,42 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// RUN: %clang_cc1 -fenable-matrix -triple arm64-apple-macosx %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+
+typedef float float3 __attribute__((ext_vector_type(3)));
+struct Vec3 {
+  union {
+    struct {
+      float x;
+      float y;
+      float z;
+    };
+    float vec __attribute__((ext_vector_type(3)));
+  };
+};
+
+// CHECK-LABEL: define i128 @add(
+// CHECK-SAME: i128 [[A_COERCE:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_VEC3:%.*]], align 16
+// CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_VEC3]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_VEC3]], ptr [[A]], i32 0, i32 0
+// CHECK-NEXT:    store i128 [[A_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_VEC3]], ptr [[A]], i32 0, i32 0
+// CHECK-NEXT:    [[LOADVECN:%.*]] = load <4 x float>, ptr [[TMP0]], align 16
+// CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_VEC3]], ptr [[A]], i32 0, i32 0
+// CHECK-NEXT:    [[LOADVECN1:%.*]] = load <4 x float>, ptr [[TMP1]], align 16
+// CHECK-NEXT:    [[EXTRACTVEC2:%.*]] = shufflevector <4 x float> [[LOADVECN1]], <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-NEXT:    [[ADD:%.*]] = fadd <3 x float> [[EXTRACTVEC]], [[EXTRACTVEC2]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_VEC3]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-NEXT:    [[EXTRACTVEC3:%.*]] = shufflevector <3 x float> [[ADD]], <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT:    store <4 x float> [[EXTRACTVEC3]], ptr [[TMP2]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE4:%.*]] = getelementptr inbounds nuw [[STRUCT_VEC3]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load i128, ptr [[COERCE_DIVE4]], align 16
+// CHECK-NEXT:    ret i128 [[TMP3]]
+//
+struct Vec3 add(struct Vec3 a) {
+  struct Vec3 res;
+  res.vec = a.vec + a.vec;
+  return res;
+}
+
diff --git a/clang/test/CodeGen/AArch64/neon-across.c b/clang/test/CodeGen/AArch64/neon-across.c
index d365975593559..aae5097da7789 100644
--- a/clang/test/CodeGen/AArch64/neon-across.c
+++ b/clang/test/CodeGen/AArch64/neon-across.c
@@ -49,7 +49,7 @@ uint32_t test_vaddlv_u16(uint16x4_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vaddlvq_s8
-// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8> [[A]])
 // CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
@@ -60,7 +60,7 @@ int16_t test_vaddlvq_s8(int8x16_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vaddlvq_s16
-// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v8i16(<8 x i16> [[A]])
 // CHECK-NEXT:    ret i32 [[VADDLV_I]]
@@ -70,7 +70,7 @@ int32_t test_vaddlvq_s16(int16x8_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vaddlvq_s32
-// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[VADDLVQ_S32_I:%.*]] = call i64 @llvm.aarch64.neon.saddlv.i64.v4i32(<4 x i32> [[A]])
 // CHECK-NEXT:    ret i64 [[VADDLVQ_S32_I]]
@@ -80,7 +80,7 @@ int64_t test_vaddlvq_s32(int32x4_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vaddlvq_u8
-// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8> [[A]])
 // CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
@@ -91,7 +91,7 @@ uint16_t test_vaddlvq_u8(uint8x16_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vaddlvq_u16
-// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> [[A]])
 // CHECK-NEXT:    ret i32 [[VADDLV_I]]
@@ -101,7 +101,7 @@ uint32_t test_vaddlvq_u16(uint16x8_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vaddlvq_u32
-// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[VADDLVQ_U32_I:%.*]] = call i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32> [[A]])
 // CHECK-NEXT:    ret i64 [[VADDLVQ_U32_I]]
@@ -113,9 +113,8 @@ uint64_t test_vaddlvq_u32(uint32x4_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vmaxv_s8
 // CHECK-SAME: (<8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v8i8(<8 x i8> [[A]])
-// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
-// CHECK-NEXT:    ret i8 [[TMP0]]
+// CHECK-NEXT:    [[VMAXV_S8_I:%.*]] = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT:    ret i8 [[VMAXV_S8_I]]
 //
 int8_t test_vmaxv_s8(int8x8_t a) {
   return vmaxv_s8(a);
@@ -124,9 +123,8 @@ int8_t test_vmaxv_s8(int8x8_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vmaxv_s16
 // CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v4i16(<4 x i16> [[A]])
-// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i16
-// CHECK-NEXT:    ret i16 [[TMP0]]
+// CHECK-NEXT:    [[VMAXV_S16_I:%.*]] = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> [[A]])
+// CHECK-NEXT:    ret i16 [[VMAXV_S16_I]]
 //
 int16_t test_vmaxv_s16(int16x4_t a) {
   return vmaxv_s16(a);
@@ -135,9 +133,8 @@ int16_t test_vmaxv_s16(int16x4_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vmaxv_u8
 // CHECK-SAME: (<8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8> [[A]])
-// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
-// CHECK-NEXT:    ret i8 [[TMP0]]
+// CHECK-NEXT:    [[VMAXV_U8_I:%.*]] = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT:    ret i8 [[VMAXV_U8_I]]
 //
 uint8_t test_vmaxv_u8(uint8x8_t a) {
   return vmaxv_u8(a);
@@ -146,40 +143,37 @@ uint8_t test_vmaxv_u8(uint8x8_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vmaxv_u16
 // CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16> [[A]])
-// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i16
-// CHECK-NEXT:    ret i16 [[TMP0]]
+// CHECK-NEXT:    [[VMAXV_U16_I:%.*]] = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> [[A]])
+// CHECK-NEXT:    ret i16 [[VMAXV_U16_I]]
 //
 uint16_t test_vmaxv_u16(uint16x4_t a) {
   return vmaxv_u16(a);
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_s8
-// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v16i8(<16 x i8> [[A]])
-// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
-// CHECK-NEXT:    ret i8 [[TMP0]]
+// CHECK-NEXT:    [[VMAXVQ_S8_I:%.*]] = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT:    ret i8 [[VMAXVQ_S8_I]]
 //
 int8_t test_vmaxvq_s8(int8x16_t a) {
   return vmaxvq_s8(a);
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_s16
-// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v8i16(<8 x i16> [[A]])
-// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i16
-// CHECK-NEXT:    ret i16 [[TMP0]]
+// CHECK-NEXT:    [[VMAXVQ_S16_I:%.*]] = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> [[A]])
+// CHECK-NEXT:    ret i16 [[VMAXVQ_S16_I]]
 //
 int16_t test_vmaxvq_s16(int16x8_t a) {
   return vmaxvq_s16(a);
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_s32
-// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VMAXVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v4i32(<4 x i32> [[A]])
+// CHECK-NEXT:    [[VMAXVQ_S32_I:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[A]])
 // CHECK-NEXT:    ret i32 [[VMAXVQ_S32_I]]
 //
 int32_t test_vmaxvq_s32(int32x4_t a) {
@@ -187,31 +181,29 @@ int32_t test_vmaxvq_s32(int32x4_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_u8
-// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8> [[A]])
-// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
-// CHECK-NEXT:    ret i8 [[TMP0]]
+// CHECK-NEXT:    [[VMAXVQ_U8_I:%.*]] = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT:    ret i8 [[VMAXVQ_U8_I]]
 //
 uint8_t test_vmaxvq_u8(uint8x16_t a) {
   return vmaxvq_u8(a);
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_u16
-// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v8i16(<8 x i16> [[A]])
-// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i16
-// CHECK-NEXT:    ret i16 [[TMP0]]
+// CHECK-NEXT:    [[VMAXVQ_U16_I:%.*]] = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> [[A]])
+// CHECK-NEXT:    ret i16 [[VMAXVQ_U16_I]]
 //
 uint16_t test_vmaxvq_u16(uint16x8_t a) {
   return vmaxvq_u16(a);
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_u32
-// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VMAXVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v4i32(<4 x i32> [[A]])
+// CHECK-NEXT:    [[VMAXVQ_U32_I:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[A]])
 // CHECK-NEXT:    ret i32 [[VMAXVQ_U32_I]]
 //
 uint32_t test_vmaxvq_u32(uint32x4_t a) {
@@ -221,9 +213,8 @@ uint32_t test_vmaxvq_u32(uint32x4_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vminv_s8
 // CHECK-SAME: (<8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8> [[A]])
-// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8
-// CHECK-NEXT:    ret i8 [[TMP0]]
+// CHECK-NEXT:    [[VMINV_S8_I:%.*]] = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT:    ret i8 [[VMINV_S8_I]]
 //
 int8_t test_vminv_s8(int8x8_t a) {
   return vminv_s8(a);
@@ -232,9 +223,8 @@ int8_t test_vminv_s8(int8x8_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vminv_s16
 // CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v4i16(<4 x i16> [[A]])
-// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i16
-// CHECK-NEXT:    ret i16 [[TMP0]]
+// CHECK-NEXT:    [[VMINV_S16_I:%.*]] = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> [[A]])
+// CHECK-NEXT:    ret i16 [[VMINV_S16_I]]
 //
 int16_t test_vminv_s16(int16x4_t a) {
   return vminv_s16(a);
@@ -243,9 +233,8 @@ int16_t test_vminv_s16(int16x4_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vminv_u8
 // CHECK-SAME: (<8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8> [[A]])
-// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8
-// CHECK-NEXT:    ret i8 [[TMP0]]
+// CHECK-NEXT:    [[VMINV_U8_I:%.*]] = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT:    ret i8 [[VMINV_U8_I]]
 //
 uint8_t test_vminv_u8(uint8x8_t a) {
   return vminv_u8(a);
@@ -254,40 +243,37 @@ uint8_t test_vminv_u8(uint8x8_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vminv_u16
 // CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16> [[A]])
-// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i16
-// CHECK-NEXT:    ret i16 [[TMP0]]
+// CHECK-NEXT:    [[VMINV_U16_I:%.*]] = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> [[A]])
+// CHECK-NEXT:    ret i16 [[VMINV_U16_I]]
 //
 uint16_t test_vminv_u16(uint16x4_t a) {
   return vminv_u16(a);
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vminvq_s8
-// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v16i8(<16 x i8> [[A]])
-// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8
-// CHECK-NEXT:    ret i8 [[TMP0]]
+// CHECK-NEXT:    [[VMINVQ_S8_I:%.*]] = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT:    ret i8 [[VMINVQ_S8_I]]
 //
 int8_t test_vminvq_s8(int8x16_t a) {
   return vminvq_s8(a);
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vminvq_s16
-// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16> [[A]])
-// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i16
-// CHECK-NEXT:    ret i16 [[TMP0]]
+// CHECK-NEXT:    [[VMINVQ_S16_I:%.*]] = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> [[A]])
+// CHECK-NEXT:    ret i16 [[VMINVQ_S16_I]]
 //
 int16_t test_vminvq_s16(int16x8_t a) {
   return vminvq_s16(a);
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vminvq_s32
-// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VMINVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v4i32(<4 x i32> [[A]])
+// CHECK-NEXT:    [[VMINVQ_S32_I:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[A]])
 // CHECK-NEXT:    ret i32 [[VMINVQ_S32_I]]
 //
 int32_t test_vminvq_s32(int32x4_t a) {
@@ -295,31 +281,29 @@ int32_t test_vminvq_s32(int32x4_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vminvq_u8
-// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> [[A]])
-// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8
-// CHECK-NEXT:    ret i8 [[TMP0]]
+// CHECK-NEXT:    [[VMINVQ_U8_I:%.*]] = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT:    ret i8 [[VMINVQ_U8_I]]
 //
 uint8_t test_vminvq_u8(uint8x16_t a) {
   return vminvq_u8(a);
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vminvq_u16
-// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16> [[A]])
-// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i16
-// CHECK-NEXT:    ret i16 [[TMP0]]
+// CHECK-NEXT:    [[VMINVQ_U16_I:%.*]] = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> [[A]])
+// CHECK-NEXT:    ret i16 [[VMINVQ_U16_I]]
 //
 uint16_t test_vminvq_u16(uint16x8_t a) {
   return vminvq_u16(a);
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vminvq_u32
-// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VMINVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v4i32(<4 x i32> [[A]])
+// CHECK-NEXT:    [[VMINVQ_U32_I:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[A]])
 // CHECK-NEXT:    ret i32 [[VMINVQ_U32_I]]
 //
 uint32_t test_vminvq_u32(uint32x4_t a) {
@@ -329,9 +313,8 @@ uint32_t test_vminvq_u32(uint32x4_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vaddv_s8
 // CHECK-SAME: (<8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> [[A]])
-// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8
-// CHECK-NEXT:    ret i8 [[TMP0]]
+// CHECK-NEXT:    [[VADDV_S8_I:%.*]] = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT:    ret i8 [[VADDV_S8_I]]
 //
 int8_t test_vaddv_s8(int8x8_t a) {
   return vaddv_s8(a);
@@ -340,9 +323,8 @@ int8_t test_vaddv_s8(int8x8_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vaddv_s16
 // CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> [[A]])
-// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i16
-// CHECK-NEXT:    ret i16 [[TMP0]]
+// CHECK-NEXT:    [[VADDV_S16_I:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[A]])
+// CHECK-NEXT:    ret i16 [[VADDV_S16_I]]
 //
 int16_t test_vaddv_s16(int16x4_t a) {
   return vaddv_s16(a);
@@ -351,9 +333,8 @@ int16_t test_vaddv_s16(int16x4_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vaddv_u8
 // CHECK-SAME: (<8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> [[A]])
-// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8
-// CHECK-NEXT:    ret i8 [[TMP0]]
+// CHECK-NEXT:    [[VADDV_U8_I:%.*]] = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT:    ret i8 [[VADDV_U8_I]]
 //
 uint8_t test_vaddv_u8(uint8x8_t a) {
   return vaddv_u8(a);
@@ -362,40 +343,37 @@ uint8_t test_vaddv_u8(uint8x8_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vaddv_u16
 // CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> [[A]])
-// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i16
-// CHECK-NEXT:    ret i16 [[TMP0]]
+// CHECK-NEXT:    [[VADDV_U16_I:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[A]])
+// CHECK-NEXT:    ret i16 [[VADDV_U16_I]]
 //
 uint16_t test_vaddv_u16(uint16x4_t a) {
   return vaddv_u16(a);
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vaddvq_s8
-// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> [[A]])
-// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8
-// CHECK-NEXT:    ret i8 [[TMP0]]
+// CHECK-NEXT:    [[VADDVQ_S8_I:%.*]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT:    ret i8 [[VADDVQ_S8_I]]
 //
 int8_t test_vaddvq_s8(int8x16_t a) {
   return vaddvq_s8(a);
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vaddvq_s16
-// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> [[A]])
-// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i16
-// CHECK-NEXT:    ret i16 [[TMP0]]
+// CHECK-NEXT:    [[VADDVQ_S16_I:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[A]])
+// CHECK-NEXT:    ret i16 [[VADDVQ_S16_I]]
 //
 int16_t test_vaddvq_s16(int16x8_t a) {
   return vaddvq_s16(a);
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vaddvq_s32
-// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VADDVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> [[A]])
+// CHECK-NEXT:    [[VADDVQ_S32_I:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[A]])
 // CHECK-NEXT:    ret i32 [[VADDVQ_S32_I]]
 //
 int32_t test_vaddvq_s32(int32x4_t a) {
@@ -403,31 +381,29 @@ int32_t test_vaddvq_s32(int32x4_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vaddvq_u8
-// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8> [[A]])
-// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8
-// CHECK-NEXT:    ret i8 [[TMP0]]
+// CHECK-NEXT:    [[VADDVQ_U8_I:%.*]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT:    ret i8 [[VADDVQ_U8_I]]
 //
 uint8_t test_vaddvq_u8(uint8x16_t a) {
   return vaddvq_u8(a);
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vaddvq_u16
-// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16> [[A]])
-// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i16
-// CHECK-NEXT:    ret i16 [[TMP0]]
+// CHECK-NEXT:    [[VADDVQ_U16_I:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[A]])
+// CHECK-NEXT:    ret i16 [[VADDVQ_U16_I]]
 //
 uint16_t test_vaddvq_u16(uint16x8_t a) {
   return vaddvq_u16(a);
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vaddvq_u32
-// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VADDVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32> [[A]])
+// CHECK-NEXT:    [[VADDVQ_U32_I:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[A]])
 // CHECK-NEXT:    ret i32 [[VADDVQ_U32_I]]
 //
 uint32_t test_vaddvq_u32(uint32x4_t a) {
@@ -435,7 +411,7 @@ uint32_t test_vaddvq_u32(uint32x4_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_f32
-// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[VMAXVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v4f32(<4 x float> [[A]])
 // CHECK-NEXT:    ret float [[VMAXVQ_F32_I]]
@@ -445,7 +421,7 @@ float32_t test_vmaxvq_f32(float32x4_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vminvq_f32
-// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[VMINVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v4f32(<4 x float> [[A]])
 // CHECK-NEXT:    ret float [[VMINVQ_F32_I]]
@@ -455,7 +431,7 @@ float32_t test_vminvq_f32(float32x4_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vmaxnmvq_f32
-// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[VMAXNMVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v4f32(<4 x float> [[A]])
 // CHECK-NEXT:    ret float [[VMAXNMVQ_F32_I]]
@@ -465,7 +441,7 @@ float32_t test_vmaxnmvq_f32(float32x4_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vminnmvq_f32
-// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[VMINNMVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v4f32(<4 x float> [[A]])
 // CHECK-NEXT:    ret float [[VMINNMVQ_F32_I]]
diff --git a/clang/test/CodeGen/AArch64/neon-intrinsics.c b/clang/test/CodeGen/AArch64/neon-intrinsics.c
index 035e1ca1b45e8..1c628bbba483f 100644
--- a/clang/test/CodeGen/AArch64/neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/neon-intrinsics.c
@@ -12643,7 +12643,7 @@ uint64_t test_vqrshld_u64(uint64_t a, int64_t b) {
 // CHECK-LABEL: define dso_local i64 @test_vpaddd_s64(
 // CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VPADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> [[A]])
+// CHECK-NEXT:    [[VPADDD_S64_I:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[A]])
 // CHECK-NEXT:    ret i64 [[VPADDD_S64_I]]
 //
 int64_t test_vpaddd_s64(int64x2_t a) {
@@ -23227,7 +23227,7 @@ uint64x2_t test_vpaddq_u64(uint64x2_t a, uint64x2_t b) {
 // CHECK-LABEL: define dso_local i64 @test_vpaddd_u64(
 // CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VPADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> [[A]])
+// CHECK-NEXT:    [[VPADDD_U64_I:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[A]])
 // CHECK-NEXT:    ret i64 [[VPADDD_U64_I]]
 //
 uint64_t test_vpaddd_u64(uint64x2_t a) {
@@ -23237,7 +23237,7 @@ uint64_t test_vpaddd_u64(uint64x2_t a) {
 // CHECK-LABEL: define dso_local i64 @test_vaddvq_s64(
 // CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VADDVQ_S64_I:%.*]] = call i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64> [[A]])
+// CHECK-NEXT:    [[VADDVQ_S64_I:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[A]])
 // CHECK-NEXT:    ret i64 [[VADDVQ_S64_I]]
 //
 int64_t test_vaddvq_s64(int64x2_t a) {
@@ -23247,7 +23247,7 @@ int64_t test_vaddvq_s64(int64x2_t a) {
 // CHECK-LABEL: define dso_local i64 @test_vaddvq_u64(
 // CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VADDVQ_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> [[A]])
+// CHECK-NEXT:    [[VADDVQ_U64_I:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[A]])
 // CHECK-NEXT:    ret i64 [[VADDVQ_U64_I]]
 //
 uint64_t test_vaddvq_u64(uint64x2_t a) {
@@ -23878,7 +23878,7 @@ float64x1_t test_vrsqrts_f64(float64x1_t a, float64x1_t b) {
 // CHECK-LABEL: define dso_local i32 @test_vminv_s32(
 // CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VMINV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT:    [[VMINV_S32_I:%.*]] = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> [[A]])
 // CHECK-NEXT:    ret i32 [[VMINV_S32_I]]
 //
 int32_t test_vminv_s32(int32x2_t a) {
@@ -23888,7 +23888,7 @@ int32_t test_vminv_s32(int32x2_t a) {
 // CHECK-LABEL: define dso_local i32 @test_vminv_u32(
 // CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VMINV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT:    [[VMINV_U32_I:%.*]] = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> [[A]])
 // CHECK-NEXT:    ret i32 [[VMINV_U32_I]]
 //
 uint32_t test_vminv_u32(uint32x2_t a) {
@@ -23898,7 +23898,7 @@ uint32_t test_vminv_u32(uint32x2_t a) {
 // CHECK-LABEL: define dso_local i32 @test_vmaxv_s32(
 // CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VMAXV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT:    [[VMAXV_S32_I:%.*]] = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> [[A]])
 // CHECK-NEXT:    ret i32 [[VMAXV_S32_I]]
 //
 int32_t test_vmaxv_s32(int32x2_t a) {
@@ -23908,7 +23908,7 @@ int32_t test_vmaxv_s32(int32x2_t a) {
 // CHECK-LABEL: define dso_local i32 @test_vmaxv_u32(
 // CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VMAXV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT:    [[VMAXV_U32_I:%.*]] = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> [[A]])
 // CHECK-NEXT:    ret i32 [[VMAXV_U32_I]]
 //
 uint32_t test_vmaxv_u32(uint32x2_t a) {
@@ -23918,7 +23918,7 @@ uint32_t test_vmaxv_u32(uint32x2_t a) {
 // CHECK-LABEL: define dso_local i32 @test_vaddv_s32(
 // CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VADDV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT:    [[VADDV_S32_I:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[A]])
 // CHECK-NEXT:    ret i32 [[VADDV_S32_I]]
 //
 int32_t test_vaddv_s32(int32x2_t a) {
@@ -23928,7 +23928,7 @@ int32_t test_vaddv_s32(int32x2_t a) {
 // CHECK-LABEL: define dso_local i32 @test_vaddv_u32(
 // CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VADDV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT:    [[VADDV_U32_I:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[A]])
 // CHECK-NEXT:    ret i32 [[VADDV_U32_I]]
 //
 uint32_t test_vaddv_u32(uint32x2_t a) {
diff --git a/clang/test/CodeGen/PowerPC/ppc64-abi-override-datalayout.c b/clang/test/CodeGen/PowerPC/ppc64-abi-override-datalayout.c
new file mode 100644
index 0000000000000..30b85d24a56fd
--- /dev/null
+++ b/clang/test/CodeGen/PowerPC/ppc64-abi-override-datalayout.c
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -triple powerpc64-unknown-linux-gnu -target-abi elfv2 %s -o - -emit-llvm | FileCheck %s
+
+// REQUIRES: powerpc-registered-target
+
+// Make sure that overriding the ABI to ELFv2 on a target that defaults to
+// ELFv1 changes the data layout:
+
+// CHECK: target datalayout = "E-m:e-Fn32-i64:64-i128:128-n32:64-S128-v256:256:256-v512:512:512"
diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c
index d5d15b4dea966..35fde8733f375 100644
--- a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c
+++ b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c
@@ -3584,13 +3584,13 @@ void test_integer(void) {
   // CHECK-ASM: vsrlb
 
   vsc = vec_abs(vsc);
-  // CHECK-ASM: vlcb
+  // CHECK-ASM: vlpb
   vss = vec_abs(vss);
-  // CHECK-ASM: vlch
+  // CHECK-ASM: vlph
   vsi = vec_abs(vsi);
-  // CHECK-ASM: vlcf
+  // CHECK-ASM: vlpf
   vsl = vec_abs(vsl);
-  // CHECK-ASM: vlcg
+  // CHECK-ASM: vlpg
 
   vsc = vec_max(vsc, vsc);
   // CHECK-ASM: vmxb
diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector5.c b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector5.c
index 6ee9e1ee3a117..cd0fafdb7435f 100644
--- a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector5.c
+++ b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector5.c
@@ -246,7 +246,7 @@ void test_integer(void) {
   // CHECK-ASM: vctzq
 
   vslll = vec_abs(vslll);
-  // CHECK-ASM: vlcq
+  // CHECK-ASM: vlpq
 
   vslll = vec_avg(vslll, vslll);
   // CHECK: call i128 @llvm.s390.vavgq(i128 %{{.*}}, i128 %{{.*}})
diff --git a/clang/test/CodeGen/X86/amx_movrs_tranpose.c b/clang/test/CodeGen/X86/amx_movrs_tranpose.c
deleted file mode 100755
index 192c153835e1e..0000000000000
--- a/clang/test/CodeGen/X86/amx_movrs_tranpose.c
+++ /dev/null
@@ -1,53 +0,0 @@
-// RUN:  %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
-// RUN:  -target-feature +amx-movrs  -emit-llvm -o - -Wall -Werror -pedantic \
-// RUN:  -target-feature +amx-transpose -Wno-gnu-statement-expression| FileCheck %s
-
-#include <immintrin.h>
-#include <stddef.h>
-
-char buf[2048];
-#define STRIDE 32
-
-// CHECK-LABEL:  define dso_local void @test_tile_2rpntlvwz0rs_internal(
-// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}})
-// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0
-// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
-// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024
-// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1
-// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
-void test_tile_2rpntlvwz0rs_internal(int row, int col0, int col1, void *D0, void *D1, void *B) {
-  _tile_2rpntlvwz0rs_internal(row, col0, col1, D0, D1, B, 1);
-}
-
-// CHECK-LABEL:  define dso_local void @test_tile_2rpntlvwz0rst1_internal(
-// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}})
-// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0
-// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
-// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024
-// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1
-// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
-void test_tile_2rpntlvwz0rst1_internal(int row, int col0, int col1, void *D0, void *D1, void *B) {
-  _tile_2rpntlvwz0rst1_internal(row, col0, col1, D0, D1, B, 1);
-}
-
-// CHECK-LABEL:  define dso_local void @test_tile_2rpntlvwz1rs_internal(
-// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}})
-// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0
-// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
-// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024
-// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1
-// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
-void test_tile_2rpntlvwz1rs_internal(int row, int col0, int col1, void *D0, void *D1, void *B) {
-  _tile_2rpntlvwz1rs_internal(row, col0, col1, D0, D1, B, 1);
-}
-
-// CHECK-LABEL:  define dso_local void @test_tile_2rpntlvwz1rst1_internal(
-// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}})
-// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0
-// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
-// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024
-// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1
-// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
-void test_tile_2rpntlvwz1rst1_internal(int row, int col0, int col1, void *D0, void *D1, void *B) {
-  _tile_2rpntlvwz1rst1_internal(row, col0, col1, D0, D1, B, 1);
-}
diff --git a/clang/test/CodeGen/X86/amx_movrs_tranpose_api.c b/clang/test/CodeGen/X86/amx_movrs_tranpose_api.c
deleted file mode 100755
index b174cc5067bf3..0000000000000
--- a/clang/test/CodeGen/X86/amx_movrs_tranpose_api.c
+++ /dev/null
@@ -1,81 +0,0 @@
-// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
-// RUN: -target-feature +amx-movrs  -emit-llvm -o - -Wall -Werror -pedantic \
-// RUN: -target-feature +amx-transpose -Wno-gnu-statement-expression| FileCheck %s
-
-#include <immintrin.h>
-#include <stddef.h>
-
-char buf[2048];
-#define STRIDE 32
-
-void test_tile_2rpntlvwz0rs(const void *A, size_t B) {
-  // CHECK-LABEL: @test_tile_2rpntlvwz0rs
-  // CHECK: call void @llvm.x86.t2rpntlvwz0rs(i8 1, ptr %{{.*}}, i64 %{{.*}})
-  _tile_2rpntlvwz0rs(1, A, B);
-}
-
-void test_tile_2rpntlvwz0rst1(const void *A, size_t B) {
-  // CHECK-LABEL: @test_tile_2rpntlvwz0rst1
-  // CHECK: call void @llvm.x86.t2rpntlvwz0rst1(i8 1, ptr %{{.*}}, i64 %{{.*}})
-  _tile_2rpntlvwz0rst1(1, A, B);
-}
-
-void test_tile_2rpntlvwz1rs(const void *A, size_t B) {
-  // CHECK-LABEL: @test_tile_2rpntlvwz1rs
-  // CHECK: call void @llvm.x86.t2rpntlvwz1rs(i8 1, ptr %{{.*}}, i64 %{{.*}})
-  _tile_2rpntlvwz1rs(1, A, B);
-}
-
-void test_tile_2rpntlvwz1rst1(const void *A, size_t B) {
-  // CHECK-LABEL: @test_tile_2rpntlvwz1rst1
-  // CHECK: call void @llvm.x86.t2rpntlvwz1rst1(i8 1, ptr %{{.*}}, i64 %{{.*}})
-  _tile_2rpntlvwz1rst1(1, A, B);
-}
-
-void test__tile_2rpntlvwz0rs(__tile1024i dst0, __tile1024i dst1) {
-  //CHECK-LABEL: @test__tile_2rpntlvwz0rs
-  //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal
-  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
-  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
-  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
-  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
-  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
-  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
-  __tile_2rpntlvwz0rs(&dst0, &dst1, buf, STRIDE);
-}
-
-void test__tile_2rpntlvwz0rst1(__tile1024i dst0, __tile1024i dst1) {
-  //CHECK-LABEL: @test__tile_2rpntlvwz0rst1
-  //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal
-  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
-  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
-  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
-  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
-  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
-  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
-  __tile_2rpntlvwz0rst1(&dst0, &dst1, buf, STRIDE);
-}
-
-void test__tile_2rpntlvwz1rs(__tile1024i dst0, __tile1024i dst1) {
-  //CHECK-LABEL: @test__tile_2rpntlvwz1rs
-  //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal
-  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
-  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
-  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
-  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
-  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
-  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
-  __tile_2rpntlvwz1rs(&dst0, &dst1, buf, STRIDE);
-}
-
-void test__tile_2rpntlvwz1rst1(__tile1024i dst0, __tile1024i dst1) {
-  //CHECK-LABEL: @test__tile_2rpntlvwz1rst1
-  //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal
-  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
-  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
-  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
-  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
-  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
-  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
-  __tile_2rpntlvwz1rst1(&dst0, &dst1, buf, STRIDE);
-}
diff --git a/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c b/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c
deleted file mode 100755
index 840b52bbb29bb..0000000000000
--- a/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c
+++ /dev/null
@@ -1,22 +0,0 @@
-// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
-// RUN: -target-feature +amx-int8 -target-feature +amx-transpose -target-feature +amx-movrs \
-// RUN: -verify
-
-#include <immintrin.h>
-#include <stddef.h>
-
-void test_tile_2rpntlvwz0rs(const void *A, size_t B) {
-  _tile_2rpntlvwz0rs(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
-}
-
-void test_tile_2rpntlvwz0rst1(const void *A, size_t B) {
-  _tile_2rpntlvwz0rst1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
-}
-
-void test_tile_2rpntlvwz1rs(const void *A, size_t B) {
-  _tile_2rpntlvwz1rs(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
-}
-
-void test_tile_2rpntlvwz1rst1(const void *A, size_t B) {
-  _tile_2rpntlvwz1rst1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
-}
diff --git a/clang/test/CodeGen/X86/amx_tf32.c b/clang/test/CodeGen/X86/amx_tf32.c
index 661a9dfbc673b..54ad6bb714933 100644
--- a/clang/test/CodeGen/X86/amx_tf32.c
+++ b/clang/test/CodeGen/X86/amx_tf32.c
@@ -10,8 +10,3 @@ void test_tile_mmultf32ps(void) {
   _tile_mmultf32ps(1, 2, 3);
 }
 
-void test_tile_tmmultf32ps(void) {
-  // CHECK-LABEL: @test_tile_tmmultf32ps(
-  // CHECK: call void @llvm.x86.ttmmultf32ps(i8 1, i8 2, i8 3)
-  _tile_tmmultf32ps(1, 2, 3);
-}
diff --git a/clang/test/CodeGen/X86/amx_tf32_api.c b/clang/test/CodeGen/X86/amx_tf32_api.c
index 2ac8489e3e0ba..8f574b7bc71dc 100644
--- a/clang/test/CodeGen/X86/amx_tf32_api.c
+++ b/clang/test/CodeGen/X86/amx_tf32_api.c
@@ -18,10 +18,3 @@ void test_tile_mmultf32ps(__tile1024i a, __tile1024i b, __tile1024i c) {
   __tile_mmultf32ps(&c, a, b);
 }
 
-void test_tile_tmmultf32ps(__tile1024i a, __tile1024i b, __tile1024i c) {
-  //CHECK-LABEL: @test_tile_tmmultf32ps
-  //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
-  //CHECK-DAG: call x86_amx @llvm.x86.ttmmultf32ps.internal
-  //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
-  __tile_tmmultf32ps(&c, a, b);
-}
diff --git a/clang/test/CodeGen/X86/amx_tf32_errors.c b/clang/test/CodeGen/X86/amx_tf32_errors.c
index 4502130692115..f0fdd060363cf 100644
--- a/clang/test/CodeGen/X86/amx_tf32_errors.c
+++ b/clang/test/CodeGen/X86/amx_tf32_errors.c
@@ -13,11 +13,3 @@ void test_tile_mmultf32ps() {
   _tile_mmultf32ps(1, 3, 3);  // expected-error {{tile arguments must refer to different tiles}}
 }
 
-void test_tile_tmmultf32ps() {
-  _tile_tmmultf32ps(16, 2, 3); // expected-error {{argument value 16 is outside the valid range [0, 7]}}
-  _tile_tmmultf32ps(1, 26, 3); // expected-error {{argument value 26 is outside the valid range [0, 7]}}
-  _tile_tmmultf32ps(1, 2, 36); // expected-error {{argument value 36 is outside the valid range [0, 7]}}
-  _tile_tmmultf32ps(1, 1, 3);  // expected-error {{tile arguments must refer to different tiles}}
-  _tile_tmmultf32ps(1, 2, 1);  // expected-error {{tile arguments must refer to different tiles}}
-  _tile_tmmultf32ps(1, 2, 2);  // expected-error {{tile arguments must refer to different tiles}}
-}
diff --git a/clang/test/CodeGen/X86/amx_transpose.c b/clang/test/CodeGen/X86/amx_transpose.c
deleted file mode 100644
index 7e88fd80592d6..0000000000000
--- a/clang/test/CodeGen/X86/amx_transpose.c
+++ /dev/null
@@ -1,75 +0,0 @@
-// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +amx-transpose \
-// RUN: -target-feature +amx-bf16 -target-feature +amx-fp16 -target-feature +amx-complex \
-// RUN: -target-feature +avx512f -emit-llvm -o - -Wall -Werror -pedantic -Wno-gnu-statement-expression| FileCheck %s
-
-#include <immintrin.h>
-#include <stddef.h>
-
-void test_tile_2rpntlvwz0(const void *A, size_t B) {
-  // CHECK-LABEL: @test_tile_2rpntlvwz0
-  // CHECK: call void @llvm.x86.t2rpntlvwz0(i8 1, ptr %{{.*}}, i64 %{{.*}})
-  _tile_2rpntlvwz0(1, A, B);
-}
-
-void test_tile_2rpntlvwz0t1(const void *A, size_t B) {
-  // CHECK-LABEL: @test_tile_2rpntlvwz0t1
-  // CHECK: call void @llvm.x86.t2rpntlvwz0t1(i8 1, ptr %{{.*}}, i64 %{{.*}})
-  _tile_2rpntlvwz0t1(1, A, B);
-}
-
-void test_tile_2rpntlvwz1(const void *A, size_t B) {
-  // CHECK-LABEL: @test_tile_2rpntlvwz1
-  // CHECK: call void @llvm.x86.t2rpntlvwz1(i8 1, ptr %{{.*}}, i64 %{{.*}})
-  _tile_2rpntlvwz1(1, A, B);
-}
-
-void test_tile_2rpntlvwz1t1(const void *A, size_t B) {
-  // CHECK-LABEL: @test_tile_2rpntlvwz1t1
-  // CHECK: call void @llvm.x86.t2rpntlvwz1t1(i8 1, ptr %{{.*}}, i64 %{{.*}})
-  _tile_2rpntlvwz1t1(1, A, B);
-}
-
-void test_tile_transposed(void)
-{
-  // CHECK-LABEL: @test_tile_transposed
-  // CHECK: call void @llvm.x86.ttransposed(i8 1, i8 2)
-  _tile_transposed(1, 2);
-}
-
-void test_tile_tdpbf16ps(void)
-{
-  // CHECK-LABEL: @test_tile_tdpbf16ps
-  // CHECK: call void @llvm.x86.ttdpbf16ps(i8 1, i8 2, i8 3)
-  _tile_tdpbf16ps(1, 2, 3);
-}
-
-void test_tile_tdpfp16ps(void)
-{
-  // CHECK-LABEL: @test_tile_tdpfp16ps
-  // CHECK: call void @llvm.x86.ttdpfp16ps(i8 4, i8 5, i8 6)
-  _tile_tdpfp16ps(4, 5, 6);
-}
-
-void test_tile_tcmmimfp16ps(void) {
-  // CHECK-LABEL: @test_tile_tcmmimfp16ps
-  // CHECK: call void @llvm.x86.ttcmmimfp16ps(i8 1, i8 2, i8 3)
-  _tile_tcmmimfp16ps(1, 2, 3);
-}
-
-void test_tile_tcmmrlfp16ps(void) {
-  // CHECK-LABEL: @test_tile_tcmmrlfp16ps
-  // CHECK: call void @llvm.x86.ttcmmrlfp16ps(i8 1, i8 2, i8 3)
-  _tile_tcmmrlfp16ps(1, 2, 3);
-}
-
-void test_tile_conjtcmmimfp16ps(void) {
-  // CHECK-LABEL: @test_tile_conjtcmmimfp16ps
-  // CHECK: call void @llvm.x86.tconjtcmmimfp16ps(i8 1, i8 2, i8 3)
-  _tile_conjtcmmimfp16ps(1, 2, 3);
-}
-
-void test_tile_conjtfp16(void) {
-  // CHECK-LABEL: @test_tile_conjtfp16
-  // CHECK: call void @llvm.x86.tconjtfp16(i8 1, i8 2)
-  _tile_conjtfp16(1, 2);
-}
diff --git a/clang/test/CodeGen/X86/amx_transpose_api.c b/clang/test/CodeGen/X86/amx_transpose_api.c
deleted file mode 100644
index dc3ef5104252c..0000000000000
--- a/clang/test/CodeGen/X86/amx_transpose_api.c
+++ /dev/null
@@ -1,114 +0,0 @@
-// RUN: %clang_cc1 %s -flax-vector-conversions=none -ffreestanding -triple=x86_64-unknown-unknown -target-feature +avx512f \
-// RUN: -target-feature +amx-transpose -target-feature +amx-bf16 -target-feature +amx-fp16 -target-feature +amx-complex \
-// RUN: -emit-llvm -o - -Werror -pedantic | FileCheck %s --check-prefixes=CHECK
-
-#include <immintrin.h>
-
-char buf[2048];
-#define STRIDE 32
-
-char buf2[2048];
-
-void test_tile_2rpntlvwz0(__tile1024i dst0, __tile1024i dst1) {
-  //CHECK-LABEL: @test_tile_2rpntlvwz0
-  //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal
-  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
-  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
-  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
-  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
-  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
-  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
-  __tile_2rpntlvwz0(&dst0, &dst1, buf, STRIDE);
-}
-
-void test_tile_2rpntlvwz0t1(__tile1024i dst0, __tile1024i dst1) {
-  //CHECK-LABEL: @test_tile_2rpntlvwz0t1
-  //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal
-  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
-  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
-  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
-  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
-  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
-  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
-  __tile_2rpntlvwz0t1(&dst0, &dst1, buf, STRIDE);
-}
-
-void test_tile_2rpntlvwz1(__tile1024i dst0, __tile1024i dst1) {
-  //CHECK-LABEL: @test_tile_2rpntlvwz1
-  //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal
-  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
-  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
-  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
-  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
-  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
-  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
-  __tile_2rpntlvwz1(&dst0, &dst1, buf, STRIDE);
-}
-
-void test_tile_2rpntlvwz1t1(__tile1024i dst0, __tile1024i dst1) {
-  //CHECK-LABEL: @test_tile_2rpntlvwz1t1
-  //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal
-  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
-  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
-  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
-  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
-  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
-  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
-  __tile_2rpntlvwz1t1(&dst0, &dst1, buf, STRIDE);
-}
-
-void test_tile_transposed(__tile1024i dst, __tile1024i src) {
-  //CHECK-LABEL: @test_tile_transposed
-  //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
-  //CHECK-DAG: call x86_amx @llvm.x86.ttransposed.internal
-  //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
-  __tile_transposed(&dst, src);
-}
-
-void test_tile_tdpbf16ps(__tile1024i a, __tile1024i b, __tile1024i c) {
-  //CHECK-LABEL: @test_tile_tdpbf16ps
-  //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
-  //CHECK-DAG: call x86_amx @llvm.x86.ttdpbf16ps.internal
-  //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
-  __tile_tdpbf16ps(&c, a, b);
-}
-
-void test_tile_tdpfp16ps(__tile1024i a, __tile1024i b, __tile1024i c) {
-  //CHECK-LABEL: @test_tile_tdpfp16ps
-  //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
-  //CHECK-DAG: call x86_amx @llvm.x86.ttdpfp16ps.internal
-  //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
-  __tile_tdpfp16ps(&c, a, b);
-}
-
-void test_tile_tcmmimfp16ps(__tile1024i a, __tile1024i b, __tile1024i c) {
-  //CHECK-LABEL: @test_tile_tcmmimfp16ps
-  //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
-  //CHECK-DAG: call x86_amx @llvm.x86.ttcmmimfp16ps.internal
-  //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
-  __tile_tcmmimfp16ps(&c, a, b);
-}
-
-void test_tile_tcmmrlfp16ps(__tile1024i a, __tile1024i b, __tile1024i c) {
-  //CHECK-LABEL: @test_tile_tcmmrlfp16ps
-  //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
-  //CHECK-DAG: call x86_amx @llvm.x86.ttcmmrlfp16ps.internal
-  //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
-  __tile_tcmmrlfp16ps(&c, a, b);
-}
-
-void test_tile_conjtcmmimfp16ps(__tile1024i a, __tile1024i b, __tile1024i c) {
-  //CHECK-LABEL: @test_tile_conjtcmmimfp16ps
-  //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
-  //CHECK-DAG: call x86_amx @llvm.x86.tconjtcmmimfp16ps.internal
-  //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
-  __tile_conjtcmmimfp16ps(&c, a, b);
-}
-
-void test_tile_conjtfp16(__tile1024i dst, __tile1024i src) {
-  //CHECK-LABEL: @test_tile_conjtfp16
-  //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
-  //CHECK-DAG: call x86_amx @llvm.x86.tconjtfp16.internal
-  //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
-  __tile_conjtfp16(&dst, src);
-}
diff --git a/clang/test/CodeGen/X86/amx_transpose_errors.c b/clang/test/CodeGen/X86/amx_transpose_errors.c
deleted file mode 100644
index 80368c580c793..0000000000000
--- a/clang/test/CodeGen/X86/amx_transpose_errors.c
+++ /dev/null
@@ -1,75 +0,0 @@
-// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
-// RUN: -target-feature +amx-int8 -target-feature +amx-bf16 -target-feature +amx-transpose \
-// RUN: -target-feature +avx512f -target-feature +amx-fp16 -target-feature +amx-complex -verify
-
-#include <immintrin.h>
-#include <stddef.h>
-
-// Transpose
-void test_tile_2rpntlvwz0(const void *A, size_t B) {
-  _tile_2rpntlvwz0(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
-}
-
-void test_tile_2rpntlvwz0t1(const void *A, size_t B) {
-  _tile_2rpntlvwz0t1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
-}
-
-void test_tile_2rpntlvwz1(const void *A, size_t B) {
-  _tile_2rpntlvwz1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
-}
-
-void test_tile_2rpntlvwz1t1(const void *A, size_t B) {
-  _tile_2rpntlvwz1t1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
-}
-
-void test_tile_tdpbf16ps()
-{
-  _tile_tdpbf16ps(8, 2, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
-  _tile_tdpbf16ps(1, 8, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
-  _tile_tdpbf16ps(1, 2, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
-  _tile_tdpbf16ps(1, 1, 3);  // expected-error {{tile arguments must refer to different tiles}}
-  _tile_tdpbf16ps(1, 2, 1);  // expected-error {{tile arguments must refer to different tiles}}
-  _tile_tdpbf16ps(1, 2, 2);  // expected-error {{tile arguments must refer to different tiles}}
-}
-
-void test_tile_tdpfp16ps()
-{
-  _tile_tdpfp16ps(8, 5, 6); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
-  _tile_tdpfp16ps(1, 8, 6); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
-  _tile_tdpfp16ps(1, 5, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
-  _tile_tdpfp16ps(1, 1, 3);  // expected-error {{tile arguments must refer to different tiles}}
-  _tile_tdpfp16ps(1, 2, 1);  // expected-error {{tile arguments must refer to different tiles}}
-  _tile_tdpfp16ps(1, 2, 2);  // expected-error {{tile arguments must refer to different tiles}}
-}
-
-void test_tile_transposed()
-{
-  _tile_transposed(8, 2); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
-  _tile_transposed(1, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
-}
-
-void test_tile_tcmmimfp16ps() {
-  _tile_tcmmimfp16ps(16, 2, 3); // expected-error {{argument value 16 is outside the valid range [0, 7]}}
-  _tile_tcmmimfp16ps(1, 26, 3); // expected-error {{argument value 26 is outside the valid range [0, 7]}}
-  _tile_tcmmimfp16ps(1, 2, 36); // expected-error {{argument value 36 is outside the valid range [0, 7]}}
-  _tile_tcmmimfp16ps(1, 1, 3);  // expected-error {{tile arguments must refer to different tiles}}
-}
-
-void test_tile_tcmmrlfp16ps() {
-  _tile_tcmmrlfp16ps(16, 2, 3); // expected-error {{argument value 16 is outside the valid range [0, 7]}}
-  _tile_tcmmrlfp16ps(1, 26, 3); // expected-error {{argument value 26 is outside the valid range [0, 7]}}
-  _tile_tcmmrlfp16ps(1, 2, 36); // expected-error {{argument value 36 is outside the valid range [0, 7]}}
-  _tile_tcmmrlfp16ps(1, 1, 3);  // expected-error {{tile arguments must refer to different tiles}}
-}
-
-void test_tile_conjtcmmimfp16ps() {
-  _tile_conjtcmmimfp16ps(16, 2, 3); // expected-error {{argument value 16 is outside the valid range [0, 7]}}
-  _tile_conjtcmmimfp16ps(1, 26, 3); // expected-error {{argument value 26 is outside the valid range [0, 7]}}
-  _tile_conjtcmmimfp16ps(1, 2, 36); // expected-error {{argument value 36 is outside the valid range [0, 7]}}
-  _tile_conjtcmmimfp16ps(1, 2, 1);  // expected-error {{tile arguments must refer to different tiles}}
-}
-
-void test_tile_conjtfp16() {
-  _tile_conjtfp16(16, 2); // expected-error {{argument value 16 is outside the valid range [0, 7]}}
-  _tile_conjtfp16(1, 26); // expected-error {{argument value 26 is outside the valid range [0, 7]}}
-}
diff --git a/clang/test/CodeGen/X86/avx512vlbw-builtins.c b/clang/test/CodeGen/X86/avx512vlbw-builtins.c
index 116d86fcd597d..febef46458ae9 100644
--- a/clang/test/CodeGen/X86/avx512vlbw-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vlbw-builtins.c
@@ -645,6 +645,21 @@ __mmask16 test_mm_cmp_epi8_mask(__m128i __a, __m128i __b) {
   return (__mmask16)_mm_cmp_epi8_mask(__a, __b, 0);
 }
 
+TEST_CONSTEXPR(_mm_cmpeq_epi8_mask(
+    ((__m128i)(__v16qi){5, 3, 7, 2, 9, 3, 7, 1, 5, 4, 8, 2, 9, 6, 7, 5}),
+    ((__m128i)(__v16qi){5, 2, 7, 3, 9, 4, 6, 1, 5, 3, 8, 1, 9, 5, 7, 5})
+) == (__mmask16)0xd595);
+
+TEST_CONSTEXPR(_mm_cmplt_epi8_mask(
+    ((__m128i)(__v16qi){1, 5, 3, 7, 2, 8, 4, 6, 9, 5, 3, 11, 2, 6, 15, 8}),
+    ((__m128i)(__v16qi){2, 4, 6, 8, 3, 5, 7, 9, 4, 6, 8, 10, 5, 7, 9, 11})
+) == (__mmask16)0xb6dd);
+
+TEST_CONSTEXPR(_mm_cmple_epi8_mask(
+    ((__m128i)(__v16qi){1, 3, 5, 7, 2, 6, 6, 8, 1, 3, 9, 7, 2, 4, 6, 10}),
+    ((__m128i)(__v16qi){2, 3, 4, 7, 3, 4, 5, 8, 2, 3, 4, 7, 3, 4, 5, 8})
+) == (__mmask16)0x3b9b);
+
 __mmask16 test_mm_mask_cmp_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: test_mm_mask_cmp_epi8_mask
   // CHECK: icmp eq <16 x i8> %{{.*}}, %{{.*}}
@@ -2894,6 +2909,12 @@ __mmask16 test_mm_test_epi8_mask(__m128i __A, __m128i __B) {
   return _mm_test_epi8_mask(__A, __B); 
 }
 
+TEST_CONSTEXPR(_mm_test_epi8_mask(
+    (__m128i)(__v16qi){1, 2, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+    (__m128i)(__v16qi){1, 2, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
+)
+== (__mmask16)0xfffb);
+
 __mmask16 test_mm_mask_test_epi8_mask(__mmask16 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_mask_test_epi8_mask
   // CHECK: and <2 x i64> %{{.*}}, %{{.*}}
@@ -2901,6 +2922,12 @@ __mmask16 test_mm_mask_test_epi8_mask(__mmask16 __U, __m128i __A, __m128i __B) {
   // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return _mm_mask_test_epi8_mask(__U, __A, __B); 
 }
+TEST_CONSTEXPR(_mm_mask_test_epi8_mask(
+    0xFFFF,
+    (__m128i)(__v16qi){1, 2, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+    (__m128i)(__v16qi){1, 2, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
+)
+== (__mmask16)0xfffb);
 
 __mmask32 test_mm256_test_epi8_mask(__m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_test_epi8_mask
@@ -2908,6 +2935,11 @@ __mmask32 test_mm256_test_epi8_mask(__m256i __A, __m256i __B) {
   // CHECK: icmp ne <32 x i8> %{{.*}}, %{{.*}}
   return _mm256_test_epi8_mask(__A, __B); 
 }
+TEST_CONSTEXPR(_mm256_test_epi8_mask(
+    (__m256i)(__v32qi){1, 2, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1, 2, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+    (__m256i)(__v32qi){1, 2, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1, 2, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
+)
+== (__mmask32)0xfffbfffb);
 
 __mmask32 test_mm256_mask_test_epi8_mask(__mmask32 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_test_epi8_mask
@@ -2954,6 +2986,12 @@ __mmask16 test_mm_testn_epi8_mask(__m128i __A, __m128i __B) {
   return _mm_testn_epi8_mask(__A, __B); 
 }
 
+TEST_CONSTEXPR(_mm_testn_epi8_mask(
+    (__m128i)(__v16qi){1, 2, 77, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1, 16, 16},
+    (__m128i)(__v16qi){2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15}
+)
+== (__mmask16)0xe001);
+
 __mmask16 test_mm_mask_testn_epi8_mask(__mmask16 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_mask_testn_epi8_mask
   // CHECK: and <2 x i64> %{{.*}}, %{{.*}}
diff --git a/clang/test/CodeGen/X86/sse41-builtins.c b/clang/test/CodeGen/X86/sse41-builtins.c
index 62cd392824bb2..35fa65a99836b 100644
--- a/clang/test/CodeGen/X86/sse41-builtins.c
+++ b/clang/test/CodeGen/X86/sse41-builtins.c
@@ -307,6 +307,16 @@ __m128 test_mm_insert_ps(__m128 x, __m128 y) {
   return _mm_insert_ps(x, y, 4);
 }
 
+TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x10), 1.0f, 10.0f, 3.0f, 4.0f))); // Insert Y[0] into X[1]
+TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x00), 10.0f, 2.0f, 3.0f, 4.0f))); // Insert Y[0] into X[0]
+TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x20), 1.0f, 2.0f, 10.0f, 4.0f))); // Insert Y[0] into X[2]
+TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x30), 1.0f, 2.0f, 3.0f, 10.0f))); // Insert Y[0] into X[3]
+TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x80), 30.0f, 2.0f, 3.0f, 4.0f))); // Insert Y[2] into X[0]
+TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x01), 0.0f, 2.0f, 3.0f, 4.0f))); // Insert Y[0] into X[0], zero X[0]
+TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x0A), 10.0f, 0.0f, 3.0f, 0.0f))); // Insert Y[0] into X[0], zero X[1] and X[3]
+TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x0F), 0.0f, 0.0f, 0.0f, 0.0f))); // Insert Y[0] into X[0], zero all
+TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0xCF), 0.0f, 0.0f, 0.0f, 0.0f))); // Insert Y[3] into X[0], zero all
+
 __m128i test_mm_max_epi8(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_max_epi8
   // CHECK: call <16 x i8> @llvm.smax.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
diff --git a/clang/test/CodeGen/builtins-extended-image.c b/clang/test/CodeGen/builtins-extended-image.c
new file mode 100644
index 0000000000000..0dbf81dabd77b
--- /dev/null
+++ b/clang/test/CodeGen/builtins-extended-image.c
@@ -0,0 +1,1528 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1100 -target-feature +extended-image-insts %s -emit-llvm -o - | FileCheck %s
+
+typedef int int4 __attribute__((ext_vector_type(4)));
+typedef float float4 __attribute__((ext_vector_type(4)));
+typedef _Float16 half4 __attribute__((ext_vector_type(4)));
+
+// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_gather4_lz_2d_v4f32_f32_r(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32.v8i32.v4i32(i32 1, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x float> [[TMP4]]
+//
+float4 test_amdgcn_image_gather4_lz_2d_v4f32_f32_r(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_gather4_lz_2d_v4f32_f32(1, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_gather4_lz_2d_v4f32_f32_g(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32.v8i32.v4i32(i32 2, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x float> [[TMP4]]
+//
+float4 test_amdgcn_image_gather4_lz_2d_v4f32_f32_g(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_gather4_lz_2d_v4f32_f32(2, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_gather4_lz_2d_v4f32_f32_b(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32.v8i32.v4i32(i32 4, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x float> [[TMP4]]
+//
+float4 test_amdgcn_image_gather4_lz_2d_v4f32_f32_b(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_gather4_lz_2d_v4f32_f32(4, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_gather4_lz_2d_v4f32_f32_a(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32.v8i32.v4i32(i32 8, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x float> [[TMP4]]
+//
+float4 test_amdgcn_image_gather4_lz_2d_v4f32_f32_a(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_gather4_lz_2d_v4f32_f32(8, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_lz_1d_v4f32_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32
+// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP2]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x float> [[TMP3]]
+//
+float4 test_amdgcn_image_sample_lz_1d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_1d_v4f32_f32(100, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_l_1d_v4f32_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x float> [[TMP4]]
+//
+float4 test_amdgcn_image_sample_l_1d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_1d_v4f32_f32(100, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_d_1d_v4f32_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x float> [[TMP5]]
+//
+float4 test_amdgcn_image_sample_d_1d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_d_1d_v4f32_f32(100, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_lz_2d_v4f32_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x float> [[TMP4]]
+//
+float4 test_amdgcn_image_sample_lz_2d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_2d_v4f32_f32(100, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_l_2d_v4f32_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32.v8i32.v4i32(i32 10, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x float> [[TMP5]]
+//
+float4 test_amdgcn_image_sample_l_2d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_2d_v4f32_f32(10, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_d_2d_v4f32_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP6]], align 32
+// CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], float [[TMP5]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP7]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x float> [[TMP8]]
+//
+float4 test_amdgcn_image_sample_d_2d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_d_2d_v4f32_f32(100, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_lz_3d_v4f32_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.3d.v4f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x float> [[TMP5]]
+//
+float4 test_amdgcn_image_sample_lz_3d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_3d_v4f32_f32(100, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_l_3d_v4f32_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32
+// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.l.3d.v4f32.f32.v8i32.v4i32(i32 1, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP5]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x float> [[TMP6]]
+//
+float4 test_amdgcn_image_sample_l_3d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_3d_v4f32_f32(1, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_d_3d_v4f32_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP6:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP8:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP9]], align 32
+// CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP11:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f32.f32.v8i32.v4i32(i32 1, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], float [[TMP5]], float [[TMP6]], float [[TMP7]], float [[TMP8]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP10]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x float> [[TMP11]]
+//
+float4 test_amdgcn_image_sample_d_3d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_d_3d_v4f32_f32(1, f32, f32, f32, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_lz_cube_v4f32_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.cube.v4f32.f32.v8i32.v4i32(i32 1, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x float> [[TMP5]]
+//
+float4 test_amdgcn_image_sample_lz_cube_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_cube_v4f32_f32(1, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_l_cube_v4f32_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32
+// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.l.cube.v4f32.f32.v8i32.v4i32(i32 1, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP5]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x float> [[TMP6]]
+//
+float4 test_amdgcn_image_sample_l_cube_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_cube_v4f32_f32(1, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_lz_1darray_v4f32_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.1darray.v4f32.f32.v8i32.v4i32(i32 1, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x float> [[TMP4]]
+//
+float4 test_amdgcn_image_sample_lz_1darray_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_1darray_v4f32_f32(1, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_l_1darray_v4f32_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.l.1darray.v4f32.f32.v8i32.v4i32(i32 1, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x float> [[TMP5]]
+//
+float4 test_amdgcn_image_sample_l_1darray_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_1darray_v4f32_f32(1, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_d_1darray_v4f32_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32
+// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.1darray.v4f32.f32.f32.v8i32.v4i32(i32 1, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP5]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x float> [[TMP6]]
+//
+float4 test_amdgcn_image_sample_d_1darray_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_d_1darray_v4f32_f32(1, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_lz_2darray_v4f32_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.2darray.v4f32.f32.v8i32.v4i32(i32 1, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x float> [[TMP5]]
+//
+float4 test_amdgcn_image_sample_lz_2darray_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_2darray_v4f32_f32(1, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_l_2darray_v4f32_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32
+// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.l.2darray.v4f32.f32.v8i32.v4i32(i32 1, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP5]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x float> [[TMP6]]
+//
+float4 test_amdgcn_image_sample_l_2darray_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_2darray_v4f32_f32(1, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_d_2darray_v4f32_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP6:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP7]], align 32
+// CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.2darray.v4f32.f32.f32.v8i32.v4i32(i32 1, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], float [[TMP5]], float [[TMP6]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP8]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x float> [[TMP9]]
+//
+float4 test_amdgcn_image_sample_d_2darray_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_d_2darray_v4f32_f32(1, f32, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_lz_1d_v4f16_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32
+// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.lz.1d.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP2]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x half> [[TMP3]]
+//
+half4 test_amdgcn_image_sample_lz_1d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_1d_v4f16_f32(100, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_l_1d_v4f16_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.l.1d.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x half> [[TMP4]]
+//
+half4 test_amdgcn_image_sample_l_1d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_1d_v4f16_f32(100, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_d_1d_v4f16_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.d.1d.v4f16.f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x half> [[TMP5]]
+//
+half4 test_amdgcn_image_sample_d_1d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_d_1d_v4f16_f32(100, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_lz_2d_v4f16_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.lz.2d.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x half> [[TMP4]]
+//
+half4 test_amdgcn_image_sample_lz_2d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_2d_v4f16_f32(100, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_l_2d_v4f16_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.l.2d.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x half> [[TMP5]]
+//
+half4 test_amdgcn_image_sample_l_2d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_2d_v4f16_f32(100, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_d_2d_v4f16_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP6]], align 32
+// CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.d.2d.v4f16.f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], float [[TMP5]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP7]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x half> [[TMP8]]
+//
+half4 test_amdgcn_image_sample_d_2d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_d_2d_v4f16_f32(100, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_lz_3d_v4f16_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.lz.3d.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x half> [[TMP5]]
+//
+half4 test_amdgcn_image_sample_lz_3d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_3d_v4f16_f32(100, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_l_3d_v4f16_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32
+// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.l.3d.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP5]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x half> [[TMP6]]
+//
+half4 test_amdgcn_image_sample_l_3d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_3d_v4f16_f32(100, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_d_3d_v4f16_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP6:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP8:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP9]], align 32
+// CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP11:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.d.3d.v4f16.f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], float [[TMP5]], float [[TMP6]], float [[TMP7]], float [[TMP8]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP10]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x half> [[TMP11]]
+//
+half4 test_amdgcn_image_sample_d_3d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_d_3d_v4f16_f32(100, f32, f32, f32, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_lz_cube_v4f16_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.lz.cube.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x half> [[TMP5]]
+//
+half4 test_amdgcn_image_sample_lz_cube_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_cube_v4f16_f32(100, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_l_cube_v4f16_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32
+// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.l.cube.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP5]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x half> [[TMP6]]
+//
+half4 test_amdgcn_image_sample_l_cube_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_cube_v4f16_f32(100, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_lz_1darray_v4f16_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.lz.1darray.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x half> [[TMP4]]
+//
+half4 test_amdgcn_image_sample_lz_1darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_1darray_v4f16_f32(100, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_l_1darray_v4f16_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.l.1darray.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x half> [[TMP5]]
+//
+half4 test_amdgcn_image_sample_l_1darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_1darray_v4f16_f32(100, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_d_1darray_v4f16_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32
+// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.d.1darray.v4f16.f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP5]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x half> [[TMP6]]
+//
+half4 test_amdgcn_image_sample_d_1darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_d_1darray_v4f16_f32(100, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_lz_2darray_v4f16_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.lz.2darray.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x half> [[TMP5]]
+//
+half4 test_amdgcn_image_sample_lz_2darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_2darray_v4f16_f32(100, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_l_2darray_v4f16_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32
+// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.l.2darray.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP5]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x half> [[TMP6]]
+//
+half4 test_amdgcn_image_sample_l_2darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_2darray_v4f16_f32(100, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_d_2darray_v4f16_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP6:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP7]], align 32
+// CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.d.2darray.v4f16.f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], float [[TMP5]], float [[TMP6]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP8]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret <4 x half> [[TMP9]]
+//
+half4 test_amdgcn_image_sample_d_2darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_d_2darray_v4f16_f32(100, f32, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local float @test_amdgcn_image_sample_lz_2d_f32_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret float [[TMP4]]
+//
+float test_amdgcn_image_sample_lz_2d_f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_2d_f32_f32(1, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local float @test_amdgcn_image_sample_l_2d_f32_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = call float @llvm.amdgcn.image.sample.l.2d.f32.f32.v8i32.v4i32(i32 1, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret float [[TMP5]]
+//
+float test_amdgcn_image_sample_l_2d_f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_2d_f32_f32(1, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local float @test_amdgcn_image_sample_d_2d_f32_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP6]], align 32
+// CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = call float @llvm.amdgcn.image.sample.d.2d.f32.f32.f32.v8i32.v4i32(i32 1, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], float [[TMP5]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP7]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret float [[TMP8]]
+//
+float test_amdgcn_image_sample_d_2d_f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_d_2d_f32_f32(1, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local float @test_amdgcn_image_sample_lz_2darray_f32_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = call float @llvm.amdgcn.image.sample.lz.2darray.f32.f32.v8i32.v4i32(i32 1, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret float [[TMP5]]
+//
+float test_amdgcn_image_sample_lz_2darray_f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_2darray_f32_f32(1, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local float @test_amdgcn_image_sample_l_2darray_f32_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32
+// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = call float @llvm.amdgcn.image.sample.l.2darray.f32.f32.v8i32.v4i32(i32 1, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP5]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret float [[TMP6]]
+//
+float test_amdgcn_image_sample_l_2darray_f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_2darray_f32_f32(1, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+}
+
+// CHECK-LABEL: define dso_local float @test_amdgcn_image_sample_d_2darray_f32_f32(
+// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT:    [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr
+// CHECK-NEXT:    [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr
+// CHECK-NEXT:    [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr
+// CHECK-NEXT:    [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr
+// CHECK-NEXT:    [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr
+// CHECK-NEXT:    store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP6:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
+// CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP7]], align 32
+// CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = call float @llvm.amdgcn.image.sample.d.2darray.f32.f32.f32.v8i32.v4i32(i32 1, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], float [[TMP5]], float [[TMP6]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP8]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    ret float [[TMP9]]
+//
+float test_amdgcn_image_sample_d_2darray_f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_d_2darray_f32_f32(1, f32, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+}
diff --git a/clang/test/CodeGen/builtins-nvptx-native-half-type-native.c b/clang/test/CodeGen/builtins-nvptx-native-half-type-native.c
index 035c4c6066be2..60a35f4fe0c37 100644
--- a/clang/test/CodeGen/builtins-nvptx-native-half-type-native.c
+++ b/clang/test/CodeGen/builtins-nvptx-native-half-type-native.c
@@ -8,7 +8,7 @@
 typedef __fp16 __fp16v2 __attribute__((ext_vector_type(2)));
 
 // CHECK: call half @llvm.nvvm.ex2.approx.f16(half {{.*}})
-// CHECK: call <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half> {{.*}})
+// CHECK: call <2 x half> @llvm.nvvm.ex2.approx.v2f16(<2 x half> {{.*}})
 // CHECK: call half @llvm.nvvm.fma.rn.relu.f16(half {{.*}}, half {{.*}}, half {{.*}})
 // CHECK: call half @llvm.nvvm.fma.rn.ftz.relu.f16(half {{.*}}, half {{.*}}, half {{.*}})
 // CHECK: call <2 x half> @llvm.nvvm.fma.rn.relu.f16x2(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}})
diff --git a/clang/test/CodeGen/builtins-nvptx-native-half-type.c b/clang/test/CodeGen/builtins-nvptx-native-half-type.c
index 01a004efd71e4..1f16c7e54b85d 100644
--- a/clang/test/CodeGen/builtins-nvptx-native-half-type.c
+++ b/clang/test/CodeGen/builtins-nvptx-native-half-type.c
@@ -41,7 +41,7 @@ __device__ void nvvm_ex2_sm75() {
 #if __CUDA_ARCH__ >= 750
   // CHECK_PTX70_SM75: call half @llvm.nvvm.ex2.approx.f16
   __nvvm_ex2_approx_f16(0.1f16);
-  // CHECK_PTX70_SM75: call <2 x half> @llvm.nvvm.ex2.approx.f16x2
+  // CHECK_PTX70_SM75: call <2 x half> @llvm.nvvm.ex2.approx.v2f16
   __nvvm_ex2_approx_f16x2({0.1f16, 0.7f16});
 #endif
   // CHECK: ret void
diff --git a/clang/test/CodeGen/lto-newpm-pipeline.c b/clang/test/CodeGen/lto-newpm-pipeline.c
index ea9784a76f923..dceaaf136ebfc 100644
--- a/clang/test/CodeGen/lto-newpm-pipeline.c
+++ b/clang/test/CodeGen/lto-newpm-pipeline.c
@@ -32,10 +32,12 @@
 // CHECK-FULL-O0-NEXT: Running pass: AlwaysInlinerPass
 // CHECK-FULL-O0-NEXT: Running analysis: ProfileSummaryAnalysis
 // CHECK-FULL-O0-NEXT: Running pass: CoroConditionalWrapper
+// CHECK-FULL-O0-NEXT: Running pass: AllocTokenPass
+// CHECK-FULL-O0-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis
+// CHECK-FULL-O0-NEXT: Running analysis: TargetLibraryAnalysis
 // CHECK-FULL-O0-NEXT: Running pass: CanonicalizeAliasesPass
 // CHECK-FULL-O0-NEXT: Running pass: NameAnonGlobalPass
 // CHECK-FULL-O0-NEXT: Running pass: AnnotationRemarksPass
-// CHECK-FULL-O0-NEXT: Running analysis: TargetLibraryAnalysis
 // CHECK-FULL-O0-NEXT: Running pass: VerifierPass
 // CHECK-FULL-O0-NEXT: Running pass: BitcodeWriterPass
 
@@ -46,10 +48,12 @@
 // CHECK-THIN-O0-NEXT: Running pass: AlwaysInlinerPass
 // CHECK-THIN-O0-NEXT: Running analysis: ProfileSummaryAnalysis
 // CHECK-THIN-O0-NEXT: Running pass: CoroConditionalWrapper
+// CHECK-THIN-O0-NEXT: Running pass: AllocTokenPass
+// CHECK-THIN-O0-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis
+// CHECK-THIN-O0-NEXT: Running analysis: TargetLibraryAnalysis
 // CHECK-THIN-O0-NEXT: Running pass: CanonicalizeAliasesPass
 // CHECK-THIN-O0-NEXT: Running pass: NameAnonGlobalPass
 // CHECK-THIN-O0-NEXT: Running pass: AnnotationRemarksPass
-// CHECK-THIN-O0-NEXT: Running analysis: TargetLibraryAnalysis
 // CHECK-THIN-O0-NEXT: Running pass: VerifierPass
 // CHECK-THIN-O0-NEXT: Running pass: ThinLTOBitcodeWriterPass
 
diff --git a/clang/test/CodeGenCXX/alloc-token-builtin.cpp b/clang/test/CodeGenCXX/alloc-token-builtin.cpp
new file mode 100644
index 0000000000000..adadf7bbe4174
--- /dev/null
+++ b/clang/test/CodeGenCXX/alloc-token-builtin.cpp
@@ -0,0 +1,97 @@
+// To test IR generation of the builtin without evaluating the LLVM intrinsic,
+// we set the mode to a stateful mode, which prohibits constant evaluation.
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -Werror -std=c++20 -emit-llvm -falloc-token-mode=random -disable-llvm-passes %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-CODEGEN
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -Werror -std=c++20 -emit-llvm -falloc-token-max=2 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LOWER
+
+extern "C" void *my_malloc(unsigned long, unsigned long);
+
+struct NoPtr {
+  int x;
+  long y;
+};
+
+struct WithPtr {
+  int a;
+  char *buf;
+};
+
+int unevaluated_fn();
+
+// CHECK-LABEL: @_Z16test_builtin_intv(
+// CHECK-CODEGEN: call i64 @llvm.alloc.token.id.i64(metadata ![[META_INT:[0-9]+]])
+// CHECK-LOWER: ret i64 0
+unsigned long test_builtin_int() {
+  return __builtin_infer_alloc_token(sizeof(1));
+}
+
+// CHECK-LABEL: @_Z16test_builtin_ptrv(
+// CHECK-CODEGEN: call i64 @llvm.alloc.token.id.i64(metadata ![[META_PTR:[0-9]+]])
+// CHECK-LOWER: ret i64 1
+unsigned long test_builtin_ptr() {
+  return __builtin_infer_alloc_token(sizeof(int *));
+}
+
+// CHECK-LABEL: @_Z25test_builtin_struct_noptrv(
+// CHECK-CODEGEN: call i64 @llvm.alloc.token.id.i64(metadata ![[META_NOPTR:[0-9]+]])
+// CHECK-LOWER: ret i64 0
+unsigned long test_builtin_struct_noptr() {
+  return __builtin_infer_alloc_token(sizeof(NoPtr));
+}
+
+// CHECK-LABEL: @_Z25test_builtin_struct_w_ptrv(
+// CHECK-CODEGEN: call i64 @llvm.alloc.token.id.i64(metadata ![[META_WITHPTR:[0-9]+]])
+// CHECK-LOWER: ret i64 1
+unsigned long test_builtin_struct_w_ptr() {
+  return __builtin_infer_alloc_token(sizeof(WithPtr), 123);
+}
+
+// CHECK-LABEL: @_Z24test_builtin_unevaluatedv(
+// CHECK-NOT: call{{.*}}unevaluated_fn
+// CHECK-CODEGEN: call i64 @llvm.alloc.token.id.i64(metadata ![[META_INT:[0-9]+]])
+// CHECK-LOWER: ret i64 0
+unsigned long test_builtin_unevaluated() {
+	return __builtin_infer_alloc_token(sizeof(int) * unevaluated_fn());
+}
+
+// CHECK-LABEL: @_Z36test_builtin_unsequenced_unevaluatedi(
+// CHECK:     add nsw
+// CHECK-NOT: add nsw
+// CHECK-CODEGEN: %[[REG:[0-9]+]] = call i64 @llvm.alloc.token.id.i64(metadata ![[META_UNKNOWN:[0-9]+]])
+// CHECK-CODEGEN: call{{.*}}@my_malloc({{.*}}, i64 noundef %[[REG]])
+// CHECK-LOWER: call{{.*}}@my_malloc({{.*}}, i64 noundef 0)
+void test_builtin_unsequenced_unevaluated(int x) {
+  my_malloc(++x, __builtin_infer_alloc_token(++x));
+}
+
+// CHECK-LABEL: @_Z20test_builtin_unknownv(
+// CHECK-CODEGEN: call i64 @llvm.alloc.token.id.i64(metadata ![[META_UNKNOWN:[0-9]+]])
+// CHECK-LOWER: ret i64 0
+unsigned long test_builtin_unknown() {
+  return __builtin_infer_alloc_token(4096);
+}
+
+// Test template instantiation.
+template <typename T>
+constexpr unsigned long get_token() {
+  return __builtin_infer_alloc_token(sizeof(T));
+}
+
+// CHECK-LABEL: @_Z13get_token_intv()
+// CHECK-CODEGEN: call i64 @llvm.alloc.token.id.i64(metadata ![[META_INT]])
+// CHECK-LOWER: ret i64 0
+unsigned long get_token_int() {
+  return get_token<int>();
+}
+
+// CHECK-LABEL: @_Z13get_token_ptrv()
+// CHECK-CODEGEN: call i64 @llvm.alloc.token.id.i64(metadata ![[META_PTR]])
+// CHECK-LOWER: ret i64 1
+unsigned long get_token_ptr() {
+  return get_token<int *>();
+}
+
+// CHECK-CODEGEN: ![[META_INT]] = !{!"int", i1 false}
+// CHECK-CODEGEN: ![[META_PTR]] = !{!"int *", i1 true}
+// CHECK-CODEGEN: ![[META_NOPTR]] = !{!"NoPtr", i1 false}
+// CHECK-CODEGEN: ![[META_WITHPTR]] = !{!"WithPtr", i1 true}
+// CHECK-CODEGEN: ![[META_UNKNOWN]] = !{}
diff --git a/clang/test/CodeGenCXX/attr-callback.cpp b/clang/test/CodeGenCXX/attr-callback.cpp
index c3456d6c430ff..efa705b9d06dc 100644
--- a/clang/test/CodeGenCXX/attr-callback.cpp
+++ b/clang/test/CodeGenCXX/attr-callback.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple i386-unknown-unknown %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -triple i386-unknown-unknown -std=c++23 %s -emit-llvm -o - | FileCheck %s
 
 struct Base {
 
@@ -47,9 +47,30 @@ struct Derived_2 : public Base {
 // CHECK-NOT: !callback
 void Derived_2::virtual_1(void (*callback)(void)) {}
 
+class ExplicitParameterObject {
+  __attribute__((callback(1, 0))) void implicit_this_idx(void (*callback)(ExplicitParameterObject*));
+  __attribute__((callback(1, this))) void implicit_this_identifier(void (*callback)(ExplicitParameterObject*));
+  __attribute__((callback(2, 1))) void explicit_this_idx(this ExplicitParameterObject* self, void (*callback)(ExplicitParameterObject*));
+  __attribute__((callback(2, self))) void explicit_this_identifier(this ExplicitParameterObject* self, void (*callback)(ExplicitParameterObject*));
+};
+
+// CHECK-DAG: define{{.*}} void @_ZN23ExplicitParameterObject17implicit_this_idxEPFvPS_E({{[^!]*!callback}} ![[cid3:[0-9]+]]
+void ExplicitParameterObject::implicit_this_idx(void (*callback)(ExplicitParameterObject*)) {}
+
+// CHECK-DAG: define{{.*}} void @_ZN23ExplicitParameterObject24implicit_this_identifierEPFvPS_E({{[^!]*!callback}} ![[cid3]]
+void ExplicitParameterObject::implicit_this_identifier(void (*callback)(ExplicitParameterObject*)) {}
+
+// CHECK-DAG: define{{.*}} void @_ZNH23ExplicitParameterObject17explicit_this_idxEPS_PFvS0_E({{[^!]*!callback}} ![[cid3]]
+void ExplicitParameterObject::explicit_this_idx(this ExplicitParameterObject* self, void (*callback)(ExplicitParameterObject*)) {}
+
+// CHECK-DAG: define{{.*}} void @_ZNH23ExplicitParameterObject24explicit_this_identifierEPS_PFvS0_E({{[^!]*!callback}} ![[cid3]]
+void ExplicitParameterObject::explicit_this_identifier(this ExplicitParameterObject* self, void (*callback)(ExplicitParameterObject*)) {}
+
 // CHECK-DAG: ![[cid0]] = !{![[cid0b:[0-9]+]]}
 // CHECK-DAG: ![[cid0b]] = !{i64 1, i1 false}
 // CHECK-DAG: ![[cid1]] = !{![[cid1b:[0-9]+]]}
 // CHECK-DAG: ![[cid1b]] = !{i64 2, i1 false}
 // CHECK-DAG: ![[cid2]] = !{![[cid2b:[0-9]+]]}
 // CHECK-DAG: ![[cid2b]] = !{i64 1, i64 0, i64 -1, i64 0, i1 false}
+// CHECK-DAG: ![[cid3]] = !{![[cid3b:[0-9]+]]}
+// CHECK-DAG: ![[cid3b]] = !{i64 1, i64 0, i1 false}
diff --git a/clang/test/CodeGenCXX/matrix-vector-bit-int.cpp b/clang/test/CodeGenCXX/matrix-vector-bit-int.cpp
index 2e7531b334ecb..4be1cb3067c2f 100644
--- a/clang/test/CodeGenCXX/matrix-vector-bit-int.cpp
+++ b/clang/test/CodeGenCXX/matrix-vector-bit-int.cpp
@@ -19,7 +19,7 @@ using i4x3x3 = _BitInt(4) __attribute__((matrix_type(3, 3)));
 // CHECK-NEXT:    store i32 [[A_COERCE]], ptr [[A]], align 4
 // CHECK-NEXT:    [[LOADVECN:%.*]] = load <4 x i8>, ptr [[A]], align 4
 // CHECK-NEXT:    [[A1:%.*]] = shufflevector <4 x i8> [[LOADVECN]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2>
-// CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x i8> [[A1]], <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x i8> [[A1]], <3 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 // CHECK-NEXT:    store <4 x i8> [[EXTRACTVEC]], ptr [[A_ADDR]], align 4
 // CHECK-NEXT:    [[LOADVECN2:%.*]] = load <4 x i8>, ptr [[A_ADDR]], align 4
 // CHECK-NEXT:    [[EXTRACTVEC3:%.*]] = shufflevector <4 x i8> [[LOADVECN2]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2>
@@ -38,7 +38,7 @@ i8x3 v1(i8x3 a) {
 // CHECK-SAME: <3 x i32> noundef [[A:%.*]]) #[[ATTR1:[0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <3 x i32>, align 16
-// CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x i32> [[A]], <3 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x i32> [[A]], <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 // CHECK-NEXT:    store <4 x i32> [[EXTRACTVEC]], ptr [[A_ADDR]], align 16
 // CHECK-NEXT:    [[LOADVECN:%.*]] = load <4 x i32>, ptr [[A_ADDR]], align 16
 // CHECK-NEXT:    [[EXTRACTVEC1:%.*]] = shufflevector <4 x i32> [[LOADVECN]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
@@ -57,7 +57,7 @@ i32x3 v2(i32x3 a) {
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <3 x i512>, align 256
 // CHECK-NEXT:    [[LOADVECN:%.*]] = load <4 x i512>, ptr [[TMP0]], align 256
 // CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x i512> [[LOADVECN]], <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2>
-// CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x i512> [[A]], <3 x i512> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x i512> [[A]], <3 x i512> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 // CHECK-NEXT:    store <4 x i512> [[EXTRACTVEC]], ptr [[A_ADDR]], align 256
 // CHECK-NEXT:    [[LOADVECN1:%.*]] = load <4 x i512>, ptr [[A_ADDR]], align 256
 // CHECK-NEXT:    [[EXTRACTVEC2:%.*]] = shufflevector <4 x i512> [[LOADVECN1]], <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2>
@@ -80,7 +80,7 @@ i512x3 v3(i512x3 a) {
 // CHECK-NEXT:    store i32 [[A_COERCE]], ptr [[A]], align 4
 // CHECK-NEXT:    [[LOADVECN:%.*]] = load <4 x i4>, ptr [[A]], align 4
 // CHECK-NEXT:    [[A1:%.*]] = shufflevector <4 x i4> [[LOADVECN]], <4 x i4> poison, <3 x i32> <i32 0, i32 1, i32 2>
-// CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x i4> [[A1]], <3 x i4> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x i4> [[A1]], <3 x i4> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 // CHECK-NEXT:    store <4 x i4> [[EXTRACTVEC]], ptr [[A_ADDR]], align 4
 // CHECK-NEXT:    [[LOADVECN2:%.*]] = load <4 x i4>, ptr [[A_ADDR]], align 4
 // CHECK-NEXT:    [[EXTRACTVEC3:%.*]] = shufflevector <4 x i4> [[LOADVECN2]], <4 x i4> poison, <3 x i32> <i32 0, i32 1, i32 2>
diff --git a/clang/test/CodeGenCXX/ubsan-coroutines.cpp b/clang/test/CodeGenCXX/ubsan-coroutines.cpp
index 04ab0505f1401..60c89a47f9046 100644
--- a/clang/test/CodeGenCXX/ubsan-coroutines.cpp
+++ b/clang/test/CodeGenCXX/ubsan-coroutines.cpp
@@ -1,6 +1,7 @@
 // This test merely verifies that emitting the object file does not cause a
 // crash when the LLVM coroutines passes are run.
 // RUN: %clang_cc1 -emit-obj -std=c++2a -fsanitize=null %s -o %t.o
+// UNSUPPORTED: target={{.*}}-zos{{.*}}
 
 namespace std {
 template <typename R, typename... T> struct coroutine_traits {
diff --git a/clang/test/CodeGenHIP/maybe_undef-attr-verify.hip b/clang/test/CodeGenHIP/maybe_undef-attr-verify.hip
index 571fba148f5cc..6dc57c4fcc5fc 100644
--- a/clang/test/CodeGenHIP/maybe_undef-attr-verify.hip
+++ b/clang/test/CodeGenHIP/maybe_undef-attr-verify.hip
@@ -20,7 +20,7 @@
 #define __maybe_undef __attribute__((maybe_undef))
 #define WARP_SIZE 64
 
-static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE__;
+static constexpr int warpSize = WARP_SIZE;
 
 __device__ static inline unsigned int __lane_id() {
     return  __builtin_amdgcn_mbcnt_hi(
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/StructElementwiseCast.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/StructElementwiseCast.hlsl
index 4e29994afd27e..bd9a62f4db359 100644
--- a/clang/test/CodeGenHLSL/BasicFeatures/StructElementwiseCast.hlsl
+++ b/clang/test/CodeGenHLSL/BasicFeatures/StructElementwiseCast.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -fnative-int16-type -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
 
 struct S {
   int X;
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/frem_modulo.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/frem_modulo.hlsl
index edc28c5c80b51..393efcc360d08 100644
--- a/clang/test/CodeGenHLSL/BasicFeatures/frem_modulo.hlsl
+++ b/clang/test/CodeGenHLSL/BasicFeatures/frem_modulo.hlsl
@@ -1,8 +1,8 @@
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
-// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
+// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s
 // RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-compute %s \
-// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
+// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s
 
  half2 half_vec_mod_by_int(half2 p1) {
diff --git a/clang/test/CodeGenHLSL/HLSLControlFlowHint.hlsl b/clang/test/CodeGenHLSL/HLSLControlFlowHint.hlsl
index aa13b27581850..6737cd3ee78ba 100644
--- a/clang/test/CodeGenHLSL/HLSLControlFlowHint.hlsl
+++ b/clang/test/CodeGenHLSL/HLSLControlFlowHint.hlsl
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple spirv-vulkan-library %s -fnative-half-type -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple spirv-vulkan-library %s -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s
 
 // CHECK: define {{.*}} i32 {{.*}}test_branch{{.*}}(i32 {{.*}} [[VALD:%.*]])
 // CHECK: [[PARAM:%.*]] = load i32, ptr [[VALD]].addr, align 4
diff --git a/clang/test/CodeGenHLSL/Operators/logical-not.hlsl b/clang/test/CodeGenHLSL/Operators/logical-not.hlsl
index 0f9d0677d8610..d5130ab88ea64 100644
--- a/clang/test/CodeGenHLSL/Operators/logical-not.hlsl
+++ b/clang/test/CodeGenHLSL/Operators/logical-not.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -disable-llvm-passes -emit-llvm -finclude-default-header -fnative-half-type -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -disable-llvm-passes -emit-llvm -finclude-default-header -fnative-half-type -fnative-int16-type -o - %s | FileCheck %s
 
 // CHECK-LABEL: case1
 // CHECK: [[ToBool:%.*]] = icmp ne <2 x i32> {{.*}}, zeroinitializer
diff --git a/clang/test/CodeGenHLSL/basic_types.hlsl b/clang/test/CodeGenHLSL/basic_types.hlsl
index 37fb5195e9768..8836126934957 100644
--- a/clang/test/CodeGenHLSL/basic_types.hlsl
+++ b/clang/test/CodeGenHLSL/basic_types.hlsl
@@ -1,8 +1,8 @@
 // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s
 // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - -DNAMESPACED| FileCheck %s
 
 
diff --git a/clang/test/CodeGenHLSL/builtins/WaveActiveAllTrue.hlsl b/clang/test/CodeGenHLSL/builtins/WaveActiveAllTrue.hlsl
index df530a9cee561..f499fc97f43fc 100644
--- a/clang/test/CodeGenHLSL/builtins/WaveActiveAllTrue.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/WaveActiveAllTrue.hlsl
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -triple \
+// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -fnative-int16-type -triple \
 // RUN:   dxil-pc-shadermodel6.3-compute %s -emit-llvm -disable-llvm-passes -o - | \
 // RUN:   FileCheck %s --check-prefixes=CHECK,CHECK-DXIL
-// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -triple \
+// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -fnative-int16-type -triple \
 // RUN:   spirv-pc-vulkan-compute %s -emit-llvm -disable-llvm-passes -o - | \
 // RUN:   FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV
 
diff --git a/clang/test/CodeGenHLSL/builtins/WaveActiveAnyTrue.hlsl b/clang/test/CodeGenHLSL/builtins/WaveActiveAnyTrue.hlsl
index 87bb1dee01905..3655cdb443fa9 100644
--- a/clang/test/CodeGenHLSL/builtins/WaveActiveAnyTrue.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/WaveActiveAnyTrue.hlsl
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -triple \
+// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -fnative-int16-type -triple \
 // RUN:   dxil-pc-shadermodel6.3-compute %s -emit-llvm -disable-llvm-passes -o - | \
 // RUN:   FileCheck %s --check-prefixes=CHECK,CHECK-DXIL
-// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -triple \
+// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -fnative-int16-type -triple \
 // RUN:   spirv-pc-vulkan-compute %s -emit-llvm -disable-llvm-passes -o - | \
 // RUN:   FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV
 
diff --git a/clang/test/CodeGenHLSL/builtins/WaveActiveMin.hlsl b/clang/test/CodeGenHLSL/builtins/WaveActiveMin.hlsl
new file mode 100644
index 0000000000000..1194f842deed6
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/WaveActiveMin.hlsl
@@ -0,0 +1,46 @@
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -triple \
+// RUN:   dxil-pc-shadermodel6.3-compute %s -emit-llvm -disable-llvm-passes -o - | \
+// RUN:   FileCheck %s --check-prefixes=CHECK,CHECK-DXIL
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -triple \
+// RUN:   spirv-pc-vulkan-compute %s -emit-llvm -disable-llvm-passes -o - | \
+// RUN:   FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV
+
+// Test basic lowering to runtime function call.
+
+// CHECK-LABEL: test_int
+int test_int(int expr) {
+  // CHECK-SPIRV:  %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.reduce.min.i32([[TY]] %[[#]])
+  // CHECK-DXIL:  %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.reduce.min.i32([[TY]] %[[#]])
+  // CHECK:  ret [[TY]] %[[RET]]
+  return WaveActiveMin(expr);
+}
+
+// CHECK-DXIL: declare [[TY]] @llvm.dx.wave.reduce.min.i32([[TY]]) #[[#attr:]]
+// CHECK-SPIRV: declare [[TY]] @llvm.spv.wave.reduce.min.i32([[TY]]) #[[#attr:]]
+
+// CHECK-LABEL: test_uint64_t
+uint64_t test_uint64_t(uint64_t expr) {
+  // CHECK-SPIRV:  %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.reduce.umin.i64([[TY]] %[[#]])
+  // CHECK-DXIL:  %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.reduce.umin.i64([[TY]] %[[#]])
+  // CHECK:  ret [[TY]] %[[RET]]
+  return WaveActiveMin(expr);
+}
+
+// CHECK-DXIL: declare [[TY]] @llvm.dx.wave.reduce.umin.i64([[TY]]) #[[#attr:]]
+// CHECK-SPIRV: declare [[TY]] @llvm.spv.wave.reduce.umin.i64([[TY]]) #[[#attr:]]
+
+// Test basic lowering to runtime function call with array and float value.
+
+// CHECK-LABEL: test_floatv4
+float4 test_floatv4(float4 expr) {
+  // CHECK-SPIRV:  %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn spir_func [[TY1:.*]] @llvm.spv.wave.reduce.min.v4f32([[TY1]] %[[#]]
+  // CHECK-DXIL:  %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn [[TY1:.*]] @llvm.dx.wave.reduce.min.v4f32([[TY1]] %[[#]])
+  // CHECK:  ret [[TY1]] %[[RET1]]
+  return WaveActiveMin(expr);
+}
+
+// CHECK-DXIL: declare [[TY1]] @llvm.dx.wave.reduce.min.v4f32([[TY1]]) #[[#attr]]
+// CHECK-SPIRV: declare [[TY1]] @llvm.spv.wave.reduce.min.v4f32([[TY1]]) #[[#attr]]
+
+// CHECK: attributes #[[#attr]] = {{{.*}} convergent {{.*}}}
+
diff --git a/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl b/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl
index 8c787a42618ac..da6cbc40a79bb 100644
--- a/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -fnative-half-type -triple \
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -fnative-half-type -fnative-int16-type -triple \
 // RUN:   dxil-pc-shadermodel6.3-compute %s -emit-llvm -disable-llvm-passes -o - | \
 // RUN:   FileCheck %s --check-prefixes=CHECK,CHECK-DXIL
-// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -fnative-half-type -triple \
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -fnative-half-type -fnative-int16-type -triple \
 // RUN:   spirv-pc-vulkan-compute %s -emit-llvm -disable-llvm-passes -o - | \
 // RUN:   FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV
 
diff --git a/clang/test/CodeGenHLSL/builtins/abs.hlsl b/clang/test/CodeGenHLSL/builtins/abs.hlsl
index 6abe2f816c844..45cc907c0ada9 100644
--- a/clang/test/CodeGenHLSL/builtins/abs.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/abs.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
-// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
+// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
diff --git a/clang/test/CodeGenHLSL/builtins/acos.hlsl b/clang/test/CodeGenHLSL/builtins/acos.hlsl
index 8152339a34e87..f710d1f738a48 100644
--- a/clang/test/CodeGenHLSL/builtins/acos.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/acos.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
diff --git a/clang/test/CodeGenHLSL/builtins/all.hlsl b/clang/test/CodeGenHLSL/builtins/all.hlsl
index 391fad0ef33f5..bfa3b903d66a8 100644
--- a/clang/test/CodeGenHLSL/builtins/all.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/all.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
+// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
 // RUN:   -DFNATTRS="hidden spir_func noundef" -DTARGET=spv
@@ -8,7 +8,7 @@
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
 // RUN:   -DFNATTRS="hidden spir_func noundef" -DTARGET=spv
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
 // RUN:   -DFNATTRS="hidden noundef" -DTARGET=dx
diff --git a/clang/test/CodeGenHLSL/builtins/any.hlsl b/clang/test/CodeGenHLSL/builtins/any.hlsl
index e4837876e2693..fa2cd2698b392 100644
--- a/clang/test/CodeGenHLSL/builtins/any.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/any.hlsl
@@ -1,19 +1,19 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
+// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
 // RUN:   -DFNATTRS="hidden spir_func noundef" -DTARGET=spv
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
+// RUN:   spirv-unknown-vulkan-compute %s -fnative-int16-type -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
 // RUN:   -DFNATTRS="hidden spir_func noundef" -DTARGET=spv
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
 // RUN:   -DFNATTRS="hidden noundef" -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-int16-type -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
 // RUN:   -DFNATTRS="hidden noundef" -DTARGET=dx
 
diff --git a/clang/test/CodeGenHLSL/builtins/asfloat.hlsl b/clang/test/CodeGenHLSL/builtins/asfloat.hlsl
index 59fc15fa60b1e..72802e8ef09be 100644
--- a/clang/test/CodeGenHLSL/builtins/asfloat.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/asfloat.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -O1 -o - | FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -O1 -o - | FileCheck %s
 
 // CHECK: define {{.*}}test_uint{{.*}}(i32 {{.*}} [[VAL:%.*]]){{.*}} 
 // CHECK: bitcast i32 [[VAL]] to float
diff --git a/clang/test/CodeGenHLSL/builtins/asin.hlsl b/clang/test/CodeGenHLSL/builtins/asin.hlsl
index 16efbba79670e..ccf704834116c 100644
--- a/clang/test/CodeGenHLSL/builtins/asin.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/asin.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
diff --git a/clang/test/CodeGenHLSL/builtins/asint.hlsl b/clang/test/CodeGenHLSL/builtins/asint.hlsl
index e1d80df5015c9..587d2bdc657d8 100644
--- a/clang/test/CodeGenHLSL/builtins/asint.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/asint.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -O1 -o - | FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -O1 -o - | FileCheck %s
 
 // CHECK: define {{.*}}test_int{{.*}}(i32 {{.*}} [[VAL:%.*]]){{.*}}
 // CHECK-NOT: bitcast
diff --git a/clang/test/CodeGenHLSL/builtins/asint16.hlsl b/clang/test/CodeGenHLSL/builtins/asint16.hlsl
index 8a1513012fd99..fd2cb8d10ee6b 100644
--- a/clang/test/CodeGenHLSL/builtins/asint16.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/asint16.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -emit-llvm -O1 -o - | FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -fnative-int16-type -emit-llvm -O1 -o - | FileCheck %s
 
 //CHECK-LABEL: define {{.*}}test_ints
 //CHECK-SAME: {{.*}}(i16 {{.*}} [[VAL:%.*]]){{.*}}
diff --git a/clang/test/CodeGenHLSL/builtins/asuint.hlsl b/clang/test/CodeGenHLSL/builtins/asuint.hlsl
index 252a434ccce0d..5fd1e62d66ddb 100644
--- a/clang/test/CodeGenHLSL/builtins/asuint.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/asuint.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -O1 -o - | FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -O1 -o - | FileCheck %s
 
 // CHECK: define {{.*}}test_uint{{.*}}(i32 {{.*}} [[VAL:%.*]]){{.*}}
 // CHECK-NOT: bitcast
diff --git a/clang/test/CodeGenHLSL/builtins/asuint16.hlsl b/clang/test/CodeGenHLSL/builtins/asuint16.hlsl
index 6d44377df2ffb..31e151e210d7e 100644
--- a/clang/test/CodeGenHLSL/builtins/asuint16.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/asuint16.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -emit-llvm -O1 -o - | FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -fnative-int16-type -emit-llvm -O1 -o - | FileCheck %s
 
 //CHECK-LABEL: define {{.*}}test_ints
 //CHECK-SAME: {{.*}}(i16 {{.*}} [[VAL:%.*]]){{.*}}
diff --git a/clang/test/CodeGenHLSL/builtins/atan.hlsl b/clang/test/CodeGenHLSL/builtins/atan.hlsl
index 437835a863703..91fe139ddf05b 100644
--- a/clang/test/CodeGenHLSL/builtins/atan.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/atan.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
diff --git a/clang/test/CodeGenHLSL/builtins/atan2.hlsl b/clang/test/CodeGenHLSL/builtins/atan2.hlsl
index 6c93f57be6b3d..512b44a5780db 100644
--- a/clang/test/CodeGenHLSL/builtins/atan2.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/atan2.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
diff --git a/clang/test/CodeGenHLSL/builtins/ceil.hlsl b/clang/test/CodeGenHLSL/builtins/ceil.hlsl
index 1a9c630b60e57..d87d56edd9443 100644
--- a/clang/test/CodeGenHLSL/builtins/ceil.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/ceil.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
-// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
+// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
diff --git a/clang/test/CodeGenHLSL/builtins/clamp-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/clamp-builtin.hlsl
index 356836b40e9c0..56a2b090bdeaf 100644
--- a/clang/test/CodeGenHLSL/builtins/clamp-builtin.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/clamp-builtin.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
 
 
 // CHECK-LABEL: builtin_clamp_half
diff --git a/clang/test/CodeGenHLSL/builtins/clamp-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/clamp-overloads.hlsl
index eaedfb419c195..8044047c5ef40 100644
--- a/clang/test/CodeGenHLSL/builtins/clamp-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/clamp-overloads.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
-// RUN:  -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF \
+// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF \
 // RUN:  -DTARGET=dx -DFNATTRS="hidden noundef" -DFFNATTRS="nofpclass(nan inf)"
 
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
@@ -7,7 +7,7 @@
 // RUN:  -DTARGET=dx -DFNATTRS="hidden noundef" -DFFNATTRS="nofpclass(nan inf)"
 
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple spirv-unknown-vulkan-compute %s \
-// RUN:  -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF \
+// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF \
 // RUN:  -DTARGET=spv -DFNATTRS="hidden spir_func noundef" -DFFNATTRS="nofpclass(nan inf)"
 
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple spirv-unknown-vulkan-compute %s \
diff --git a/clang/test/CodeGenHLSL/builtins/clamp.hlsl b/clang/test/CodeGenHLSL/builtins/clamp.hlsl
index 58db4423799be..10570e9b6ddb4 100644
--- a/clang/test/CodeGenHLSL/builtins/clamp.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/clamp.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
-// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
+// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF \
 // RUN:  -DTARGET=dx -DFNATTRS="hidden noundef" -DFFNATTRS="nofpclass(nan inf)"
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
@@ -7,7 +7,7 @@
 // RUN:  FileCheck %s --check-prefixes=CHECK,NO_HALF \
 // RUN:  -DTARGET=dx -DFNATTRS="hidden noundef" -DFFNATTRS="nofpclass(nan inf)"
 // RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-compute %s \
-// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
+// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF \
 // RUN:  -DTARGET=spv -DFNATTRS="hidden spir_func noundef" -DFFNATTRS="nofpclass(nan inf)"
 // RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-compute %s \
diff --git a/clang/test/CodeGenHLSL/builtins/clip-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/clip-builtin.hlsl
index aaeb2f026449b..0baf0db9bd0b6 100644
--- a/clang/test/CodeGenHLSL/builtins/clip-builtin.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/clip-builtin.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
 
 // CHECK:      define hidden void @{{.*}}builtin_clip_float{{.*}}(float {{.*}} [[P0:%.*]])
 // CHECK:      [[LOAD:%.*]] = load float, ptr [[P0]].addr, align 4
diff --git a/clang/test/CodeGenHLSL/builtins/clip.hlsl b/clang/test/CodeGenHLSL/builtins/clip.hlsl
index e067828c38bf6..bb21f084deba5 100644
--- a/clang/test/CodeGenHLSL/builtins/clip.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/clip.hlsl
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-pixel %s -fnative-half-type -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 -finclude-default-header -triple spirv-vulkan-pixel %s -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefix=SPIRV
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-pixel %s -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -triple spirv-vulkan-pixel %s -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s --check-prefix=SPIRV
 
 
 void test_scalar(float Buf) {
diff --git a/clang/test/CodeGenHLSL/builtins/cos.hlsl b/clang/test/CodeGenHLSL/builtins/cos.hlsl
index 79f9e1e6fbec2..1f8970096a349 100644
--- a/clang/test/CodeGenHLSL/builtins/cos.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/cos.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
-// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
+// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
diff --git a/clang/test/CodeGenHLSL/builtins/cosh.hlsl b/clang/test/CodeGenHLSL/builtins/cosh.hlsl
index 07c64206412db..80474d459fcbd 100644
--- a/clang/test/CodeGenHLSL/builtins/cosh.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/cosh.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
diff --git a/clang/test/CodeGenHLSL/builtins/countbits.hlsl b/clang/test/CodeGenHLSL/builtins/countbits.hlsl
index 218d8dcd10f8d..87524ae58a0d6 100644
--- a/clang/test/CodeGenHLSL/builtins/countbits.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/countbits.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -O3 -o - | FileCheck %s
 
 #ifdef __HLSL_ENABLE_16_BIT
diff --git a/clang/test/CodeGenHLSL/builtins/cross.hlsl b/clang/test/CodeGenHLSL/builtins/cross.hlsl
index 873cb6db30425..e53b34bb9dc42 100644
--- a/clang/test/CodeGenHLSL/builtins/cross.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/cross.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
 // RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
@@ -8,7 +8,7 @@
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
 // RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
+// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
 // RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
diff --git a/clang/test/CodeGenHLSL/builtins/degrees-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/degrees-builtin.hlsl
index 2e639f5577d20..3098ed242a492 100644
--- a/clang/test/CodeGenHLSL/builtins/degrees-builtin.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/degrees-builtin.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
 
 
 // CHECK-LABEL: builtin_degrees_half
diff --git a/clang/test/CodeGenHLSL/builtins/degrees.hlsl b/clang/test/CodeGenHLSL/builtins/degrees.hlsl
index f0fb12855e5f6..645e44eba3d95 100644
--- a/clang/test/CodeGenHLSL/builtins/degrees.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/degrees.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
 // RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
@@ -8,7 +8,7 @@
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
 // RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -triple \
-// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
+// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
 // RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
diff --git a/clang/test/CodeGenHLSL/builtins/distance.hlsl b/clang/test/CodeGenHLSL/builtins/distance.hlsl
index 0c24fbb9f1859..bf015415a7d2f 100644
--- a/clang/test/CodeGenHLSL/builtins/distance.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/distance.hlsl
@@ -1,9 +1,9 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -O1 -o - | FileCheck %s
 // RUN: %clang_cc1 -finclude-default-header -triple \
-// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \
+// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
 // RUN: -emit-llvm -O1 -o - | FileCheck %s --check-prefix=SPVCHECK
 
 // CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z18test_distance_halfDhDh(
diff --git a/clang/test/CodeGenHLSL/builtins/dot-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/dot-builtin.hlsl
index 716704a1bfdad..cbbf38aba3504 100644
--- a/clang/test/CodeGenHLSL/builtins/dot-builtin.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/dot-builtin.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
 
 
 // CHECK-LABEL: builtin_dot_half
diff --git a/clang/test/CodeGenHLSL/builtins/dot.hlsl b/clang/test/CodeGenHLSL/builtins/dot.hlsl
index c1fdb0740adc3..a496842281d6d 100644
--- a/clang/test/CodeGenHLSL/builtins/dot.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/dot.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,DXCHECK,NATIVE_HALF
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
@@ -7,7 +7,7 @@
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,DXCHECK,NO_HALF
 
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
+// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,SPVCHECK,NATIVE_HALF
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
diff --git a/clang/test/CodeGenHLSL/builtins/dot2add.hlsl b/clang/test/CodeGenHLSL/builtins/dot2add.hlsl
index e80ffba2bcfdb..3165c24f2a60e 100644
--- a/clang/test/CodeGenHLSL/builtins/dot2add.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/dot2add.hlsl
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -triple \
+// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -fnative-int16-type -triple \
 // RUN:   dxil-pc-shadermodel6.4-compute %s -emit-llvm -o - | \
 // RUN:   FileCheck %s --check-prefixes=CHECK,CHECK-DXIL
-// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -triple \
+// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -fnative-int16-type -triple \
 // RUN:   spirv-pc-vulkan-compute %s -emit-llvm -o - | \
 // RUN:   FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV
 
diff --git a/clang/test/CodeGenHLSL/builtins/dst.hlsl b/clang/test/CodeGenHLSL/builtins/dst.hlsl
index a0840c66e5da9..d8292d31fba7c 100644
--- a/clang/test/CodeGenHLSL/builtins/dst.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/dst.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
 
 
 // CHECK-LABEL: define {{.*}} <4 x float> @{{[A-Za-z1-9_]+}}dst_impl{{[A-Za-z1-9_]*}}(
diff --git a/clang/test/CodeGenHLSL/builtins/exp.hlsl b/clang/test/CodeGenHLSL/builtins/exp.hlsl
index 5a8f60528a84c..d50ef021eecb8 100644
--- a/clang/test/CodeGenHLSL/builtins/exp.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/exp.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
-// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
+// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
diff --git a/clang/test/CodeGenHLSL/builtins/exp2.hlsl b/clang/test/CodeGenHLSL/builtins/exp2.hlsl
index a9bbcb0d9bff9..ed8cfcf47b04b 100644
--- a/clang/test/CodeGenHLSL/builtins/exp2.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/exp2.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
-// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
+// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
diff --git a/clang/test/CodeGenHLSL/builtins/faceforward.hlsl b/clang/test/CodeGenHLSL/builtins/faceforward.hlsl
index d2ece57aba4ae..70459d81685a1 100644
--- a/clang/test/CodeGenHLSL/builtins/faceforward.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/faceforward.hlsl
@@ -1,8 +1,8 @@
 // RUN: %clang_cc1 -finclude-default-header -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -o - | FileCheck %s
 // RUN: %clang_cc1 -finclude-default-header -triple \
-// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
+// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -o - | FileCheck %s --check-prefix=SPVCHECK
 
 // CHECK-LABEL: test_faceforward_half
diff --git a/clang/test/CodeGenHLSL/builtins/firstbithigh.hlsl b/clang/test/CodeGenHLSL/builtins/firstbithigh.hlsl
index a71b1878f8b55..368d652a6f779 100644
--- a/clang/test/CodeGenHLSL/builtins/firstbithigh.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/firstbithigh.hlsl
@@ -1,8 +1,8 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
+// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
 // RUN: -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s -DTARGET=spv
 
diff --git a/clang/test/CodeGenHLSL/builtins/firstbitlow.hlsl b/clang/test/CodeGenHLSL/builtins/firstbitlow.hlsl
index 007db0c9c2ad5..a1d2a1b31c99a 100644
--- a/clang/test/CodeGenHLSL/builtins/firstbitlow.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/firstbitlow.hlsl
@@ -1,8 +1,8 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
+// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
 // RUN: -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s -DTARGET=spv
 
diff --git a/clang/test/CodeGenHLSL/builtins/floor.hlsl b/clang/test/CodeGenHLSL/builtins/floor.hlsl
index b3ff58317981a..4763e54f92b8e 100644
--- a/clang/test/CodeGenHLSL/builtins/floor.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/floor.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
-// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
+// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
diff --git a/clang/test/CodeGenHLSL/builtins/fmod.hlsl b/clang/test/CodeGenHLSL/builtins/fmod.hlsl
index cc91c0b67f6cc..527eb6020469e 100644
--- a/clang/test/CodeGenHLSL/builtins/fmod.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/fmod.hlsl
@@ -3,7 +3,7 @@
 // ---------- Native Half support test -----------
 //
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -o - | FileCheck %s -DFNATTRS="hidden noundef nofpclass(nan inf)" \
 // RUN:   -DTYPE=half -DINT_TYPE=f16 --check-prefixes=DXCHECK
 
@@ -21,7 +21,7 @@
 // ---------- Native Half support test -----------
 //
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
+// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -o - | FileCheck %s \
 // RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTYPE=half
 
diff --git a/clang/test/CodeGenHLSL/builtins/frac-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/frac-builtin.hlsl
index 9f144f470ed90..e41fd856c6a42 100644
--- a/clang/test/CodeGenHLSL/builtins/frac-builtin.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/frac-builtin.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
 
 
 // CHECK-LABEL: builtin_frac_half
diff --git a/clang/test/CodeGenHLSL/builtins/frac.hlsl b/clang/test/CodeGenHLSL/builtins/frac.hlsl
index d8397407cd013..3b61c482e86ad 100644
--- a/clang/test/CodeGenHLSL/builtins/frac.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/frac.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
 // RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
@@ -8,7 +8,7 @@
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
 // RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
+// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
 // RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
diff --git a/clang/test/CodeGenHLSL/builtins/isinf.hlsl b/clang/test/CodeGenHLSL/builtins/isinf.hlsl
index dc869a64a65b7..b778df38bc9b6 100644
--- a/clang/test/CodeGenHLSL/builtins/isinf.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/isinf.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,DXCHECK,NATIVE_HALF
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
@@ -7,7 +7,7 @@
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,DXCHECK,NO_HALF
 
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
+// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,SPVCHECK,NATIVE_HALF
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
diff --git a/clang/test/CodeGenHLSL/builtins/isnan.hlsl b/clang/test/CodeGenHLSL/builtins/isnan.hlsl
index ce7dbe1aedea4..cca3863557229 100644
--- a/clang/test/CodeGenHLSL/builtins/isnan.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/isnan.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,DXCHECK,NATIVE_HALF
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
@@ -7,7 +7,7 @@
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,DXCHECK,NO_HALF
 
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
+// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,SPVCHECK,NATIVE_HALF
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
diff --git a/clang/test/CodeGenHLSL/builtins/ldexp.hlsl b/clang/test/CodeGenHLSL/builtins/ldexp.hlsl
index f8fa06c39f2a1..012adc588ddfa 100644
--- a/clang/test/CodeGenHLSL/builtins/ldexp.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/ldexp.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
 
 // CHECK-LABEL: define linkonce_odr hidden noundef nofpclass(nan inf) half @_ZN4hlsl8__detail10ldexp_implIDhEET_S2_S2_
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn half @llvm.exp2.f16(half %{{.*}})
diff --git a/clang/test/CodeGenHLSL/builtins/length.hlsl b/clang/test/CodeGenHLSL/builtins/length.hlsl
index 9297c35abfd16..95edb20dacdac 100644
--- a/clang/test/CodeGenHLSL/builtins/length.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/length.hlsl
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -O1 -o - | FileCheck %s
 
 // RUN: %clang_cc1 -finclude-default-header -triple \
-// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \
+// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
 // RUN: -emit-llvm -O1 -o - | FileCheck %s --check-prefix=SPVCHECK
 
 
diff --git a/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl
index 96bcf2b49bf25..cb8634c9234e3 100644
--- a/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
 
 
 // CHECK-LABEL: builtin_lerp_half
diff --git a/clang/test/CodeGenHLSL/builtins/lerp-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/lerp-overloads.hlsl
index 3b13e43873c77..20f758b18218e 100644
--- a/clang/test/CodeGenHLSL/builtins/lerp-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/lerp-overloads.hlsl
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple  dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple  dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple  dxil-pc-shadermodel6.3-library %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
-// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple spirv-unknown-vulkan-compute %s -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple spirv-unknown-vulkan-compute %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 
 // CHECK: define [[FNATTRS]] float @_Z16test_lerp_doubled(
diff --git a/clang/test/CodeGenHLSL/builtins/lerp.hlsl b/clang/test/CodeGenHLSL/builtins/lerp.hlsl
index d7a7113de4878..02cf14c0e1772 100644
--- a/clang/test/CodeGenHLSL/builtins/lerp.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/lerp.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
 // RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
@@ -8,7 +8,7 @@
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
 // RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
+// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
 // RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
diff --git a/clang/test/CodeGenHLSL/builtins/lit.hlsl b/clang/test/CodeGenHLSL/builtins/lit.hlsl
index 44b3e96ef88bf..c0b109a75906b 100644
--- a/clang/test/CodeGenHLSL/builtins/lit.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/lit.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s
 
 // CHECK-LABEL: test_lit_half
 // CHECK: %cmp.i = fcmp reassoc nnan ninf nsz arcp afn olt half %{{.*}}, 0xH0000
diff --git a/clang/test/CodeGenHLSL/builtins/log.hlsl b/clang/test/CodeGenHLSL/builtins/log.hlsl
index 0136c1a052ed4..20e62120b64a6 100644
--- a/clang/test/CodeGenHLSL/builtins/log.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/log.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
-// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
+// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
diff --git a/clang/test/CodeGenHLSL/builtins/log10.hlsl b/clang/test/CodeGenHLSL/builtins/log10.hlsl
index 6a75444143b18..feeccf7cd7ab3 100644
--- a/clang/test/CodeGenHLSL/builtins/log10.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/log10.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
-// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
+// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
diff --git a/clang/test/CodeGenHLSL/builtins/log2.hlsl b/clang/test/CodeGenHLSL/builtins/log2.hlsl
index 84d73c1810890..a57fc44e09b70 100644
--- a/clang/test/CodeGenHLSL/builtins/log2.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/log2.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
-// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
+// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
diff --git a/clang/test/CodeGenHLSL/builtins/mad.hlsl b/clang/test/CodeGenHLSL/builtins/mad.hlsl
index e764e20748d58..1116c1419997d 100644
--- a/clang/test/CodeGenHLSL/builtins/mad.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/mad.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,DXIL_CHECK,DXIL_NATIVE_HALF,NATIVE_HALF
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
@@ -7,7 +7,7 @@
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,DXIL_CHECK,NO_HALF
 
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
+// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF,SPIR_NATIVE_HALF,SPIR_CHECK
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
diff --git a/clang/test/CodeGenHLSL/builtins/max-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/max-overloads.hlsl
index cd7013ba75825..a5ef87a822dd5 100644
--- a/clang/test/CodeGenHLSL/builtins/max-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/max-overloads.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
-// RUN:  -fnative-half-type -emit-llvm  -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
+// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm  -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
 // RUN:  -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
 
diff --git a/clang/test/CodeGenHLSL/builtins/max.hlsl b/clang/test/CodeGenHLSL/builtins/max.hlsl
index fab53a160c856..9c621e62b5336 100644
--- a/clang/test/CodeGenHLSL/builtins/max.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/max.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
-// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
+// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
diff --git a/clang/test/CodeGenHLSL/builtins/min-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/min-overloads.hlsl
index f81fa128ce9c7..c0e06b0d204b3 100644
--- a/clang/test/CodeGenHLSL/builtins/min-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/min-overloads.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
-// RUN:  -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
+// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
 // RUN:  -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
 
diff --git a/clang/test/CodeGenHLSL/builtins/min.hlsl b/clang/test/CodeGenHLSL/builtins/min.hlsl
index b3e8fedff9b1b..44d2063229cdb 100644
--- a/clang/test/CodeGenHLSL/builtins/min.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/min.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
-// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
+// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
diff --git a/clang/test/CodeGenHLSL/builtins/normalize-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/normalize-builtin.hlsl
index 3db64604a1319..46bfb44c9b2a1 100644
--- a/clang/test/CodeGenHLSL/builtins/normalize-builtin.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/normalize-builtin.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
 
 
 // CHECK-LABEL: builtin_normalize_half
diff --git a/clang/test/CodeGenHLSL/builtins/normalize.hlsl b/clang/test/CodeGenHLSL/builtins/normalize.hlsl
index 85937346ead65..bbea11a8b432f 100644
--- a/clang/test/CodeGenHLSL/builtins/normalize.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/normalize.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
 // RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
@@ -8,7 +8,7 @@
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
 // RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
+// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
 // RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
diff --git a/clang/test/CodeGenHLSL/builtins/pow.hlsl b/clang/test/CodeGenHLSL/builtins/pow.hlsl
index fcde755e15fcc..b11ded8c1d173 100644
--- a/clang/test/CodeGenHLSL/builtins/pow.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/pow.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
-// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
+// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
diff --git a/clang/test/CodeGenHLSL/builtins/radians-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/radians-builtin.hlsl
index 0c86357d5ecad..1f7e19055ee6b 100644
--- a/clang/test/CodeGenHLSL/builtins/radians-builtin.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/radians-builtin.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
 
 
 // CHECK-LABEL: builtin_radians_half
diff --git a/clang/test/CodeGenHLSL/builtins/radians.hlsl b/clang/test/CodeGenHLSL/builtins/radians.hlsl
index f281747fbf298..6521606a25c05 100644
--- a/clang/test/CodeGenHLSL/builtins/radians.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/radians.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
 // RUN:   -DTARGET=dx -DFNATTRS="hidden noundef nofpclass(nan inf)"
@@ -8,7 +8,7 @@
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
 // RUN:   -DTARGET=dx -DFNATTRS="hidden noundef nofpclass(nan inf)"
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
+// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
 // RUN:   -DTARGET=spv -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)"
diff --git a/clang/test/CodeGenHLSL/builtins/rcp-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/rcp-builtin.hlsl
index d81a49b8c6048..2cc38203bd060 100644
--- a/clang/test/CodeGenHLSL/builtins/rcp-builtin.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/rcp-builtin.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
 
 
 // CHECK-LABEL: builtin_rcp_half
diff --git a/clang/test/CodeGenHLSL/builtins/rcp.hlsl b/clang/test/CodeGenHLSL/builtins/rcp.hlsl
index cdfaa3c5f1ee3..c9c47c737114d 100644
--- a/clang/test/CodeGenHLSL/builtins/rcp.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/rcp.hlsl
@@ -1,12 +1,12 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
 // RUN:   --check-prefixes=CHECK,DXIL_CHECK,DXIL_NATIVE_HALF,NATIVE_HALF
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,DXIL_CHECK,NO_HALF,DXIL_NO_HALF
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
+// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF,SPIR_NATIVE_HALF,SPIR_CHECK
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
diff --git a/clang/test/CodeGenHLSL/builtins/reflect.hlsl b/clang/test/CodeGenHLSL/builtins/reflect.hlsl
index 65fefd801ffed..feb5a5b2ea78f 100644
--- a/clang/test/CodeGenHLSL/builtins/reflect.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/reflect.hlsl
@@ -1,9 +1,9 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // RUN: %clang_cc1 -finclude-default-header -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -O1 -o - | FileCheck %s
 // RUN: %clang_cc1 -finclude-default-header -triple \
-// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
+// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -O1 -o - | FileCheck %s --check-prefix=SPVCHECK
 
 // CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z17test_reflect_halfDhDh(
diff --git a/clang/test/CodeGenHLSL/builtins/refract.hlsl b/clang/test/CodeGenHLSL/builtins/refract.hlsl
index eda256451ee2b..ffeb2a78b2517 100644
--- a/clang/test/CodeGenHLSL/builtins/refract.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/refract.hlsl
@@ -1,8 +1,8 @@
 // RUN: %clang_cc1 -finclude-default-header -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -o - | FileCheck %s
 // RUN: %clang_cc1 -finclude-default-header -triple \
-// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
+// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -o - | FileCheck %s --check-prefix=SPVCHECK
 
 // CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z17test_refract_halfDhDhDh(
diff --git a/clang/test/CodeGenHLSL/builtins/reversebits.hlsl b/clang/test/CodeGenHLSL/builtins/reversebits.hlsl
index 91375c8f4eb8f..5fd8de9c95df8 100644
--- a/clang/test/CodeGenHLSL/builtins/reversebits.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/reversebits.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -O3 -o - | FileCheck %s
 
 #ifdef __HLSL_ENABLE_16_BIT
diff --git a/clang/test/CodeGenHLSL/builtins/round.hlsl b/clang/test/CodeGenHLSL/builtins/round.hlsl
index 755f2e86fb116..0d4afee6ba9a8 100644
--- a/clang/test/CodeGenHLSL/builtins/round.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/round.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
-// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
+// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
diff --git a/clang/test/CodeGenHLSL/builtins/rsqrt-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/rsqrt-builtin.hlsl
index 43ad9d0d0b844..d45f8cbbb5cf1 100644
--- a/clang/test/CodeGenHLSL/builtins/rsqrt-builtin.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/rsqrt-builtin.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
 
 
 // CHECK-LABEL: builtin_rsqrt_half
diff --git a/clang/test/CodeGenHLSL/builtins/rsqrt.hlsl b/clang/test/CodeGenHLSL/builtins/rsqrt.hlsl
index 9c398fd6f06cb..de2a222ae78d1 100644
--- a/clang/test/CodeGenHLSL/builtins/rsqrt.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/rsqrt.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
 // RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
@@ -8,7 +8,7 @@
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
 // RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
+// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
 // RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
diff --git a/clang/test/CodeGenHLSL/builtins/saturate-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/saturate-builtin.hlsl
index 7dbba72f3abb5..c407362c1c85f 100644
--- a/clang/test/CodeGenHLSL/builtins/saturate-builtin.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/saturate-builtin.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
 
 
 // CHECK-LABEL: builtin_saturate_half
diff --git a/clang/test/CodeGenHLSL/builtins/saturate.hlsl b/clang/test/CodeGenHLSL/builtins/saturate.hlsl
index 3304073d9b501..c583013d4b245 100644
--- a/clang/test/CodeGenHLSL/builtins/saturate.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/saturate.hlsl
@@ -1,12 +1,12 @@
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
-// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
+// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -Dtar=dx
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NO_HALF -Dtar=dx
 
 // RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-library %s \
-// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
+// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -Dtar=spv
 // RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-library %s \
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
diff --git a/clang/test/CodeGenHLSL/builtins/sign.hlsl b/clang/test/CodeGenHLSL/builtins/sign.hlsl
index cbdb929388934..ef8f7168b1002 100644
--- a/clang/test/CodeGenHLSL/builtins/sign.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/sign.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
 // RUN:   -DTARGET=dx -DFNATTRS="hidden noundef"
@@ -8,7 +8,7 @@
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
 // RUN:   -DTARGET=dx -DFNATTRS="hidden noundef"
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
+// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
 // RUN:   -DTARGET=spv -DFNATTRS="hidden spir_func noundef"
diff --git a/clang/test/CodeGenHLSL/builtins/sin.hlsl b/clang/test/CodeGenHLSL/builtins/sin.hlsl
index 9bbe97997aa33..5a900972c7ac9 100644
--- a/clang/test/CodeGenHLSL/builtins/sin.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/sin.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
-// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
+// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
diff --git a/clang/test/CodeGenHLSL/builtins/sinh.hlsl b/clang/test/CodeGenHLSL/builtins/sinh.hlsl
index d55d60515418c..ab0f814ecd694 100644
--- a/clang/test/CodeGenHLSL/builtins/sinh.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/sinh.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
diff --git a/clang/test/CodeGenHLSL/builtins/smoothstep.hlsl b/clang/test/CodeGenHLSL/builtins/smoothstep.hlsl
index bef64ce77d470..dcf9013045c07 100644
--- a/clang/test/CodeGenHLSL/builtins/smoothstep.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/smoothstep.hlsl
@@ -1,9 +1,9 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -O1 -o - | FileCheck %s
 // RUN: %clang_cc1 -finclude-default-header -triple \
-// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
+// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -O1 -o - | FileCheck %s --check-prefix=SPVCHECK
 
 // CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z20test_smoothstep_halfDhDhDh(
diff --git a/clang/test/CodeGenHLSL/builtins/splitdouble.hlsl b/clang/test/CodeGenHLSL/builtins/splitdouble.hlsl
index aeb2b79e90291..53f4f6aa2cb5f 100644
--- a/clang/test/CodeGenHLSL/builtins/splitdouble.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/splitdouble.hlsl
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -O1 -o - | FileCheck %s
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple spirv-vulkan-library %s -fnative-half-type -emit-llvm -O0 -o - | FileCheck %s --check-prefix=SPIRV
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -O1 -o - | FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple spirv-vulkan-library %s -fnative-half-type -fnative-int16-type -emit-llvm -O0 -o - | FileCheck %s --check-prefix=SPIRV
 
 
 
diff --git a/clang/test/CodeGenHLSL/builtins/sqrt.hlsl b/clang/test/CodeGenHLSL/builtins/sqrt.hlsl
index 31839f6bc177d..ce77459c77c41 100644
--- a/clang/test/CodeGenHLSL/builtins/sqrt.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/sqrt.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
-// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
+// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
diff --git a/clang/test/CodeGenHLSL/builtins/step.hlsl b/clang/test/CodeGenHLSL/builtins/step.hlsl
index 6f6588a026a45..5061f8126d7e2 100644
--- a/clang/test/CodeGenHLSL/builtins/step.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/step.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
 // RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
@@ -8,7 +8,7 @@
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
 // RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
+// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
 // RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
diff --git a/clang/test/CodeGenHLSL/builtins/tan.hlsl b/clang/test/CodeGenHLSL/builtins/tan.hlsl
index c8c948624a613..2a108bf97bd1f 100644
--- a/clang/test/CodeGenHLSL/builtins/tan.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/tan.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
diff --git a/clang/test/CodeGenHLSL/builtins/tanh.hlsl b/clang/test/CodeGenHLSL/builtins/tanh.hlsl
index f947c7f53b110..91345caad84c9 100644
--- a/clang/test/CodeGenHLSL/builtins/tanh.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/tanh.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
diff --git a/clang/test/CodeGenHLSL/builtins/transpose-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/transpose-builtin.hlsl
index 86aa7cd6985dd..ef282fc355b23 100644
--- a/clang/test/CodeGenHLSL/builtins/transpose-builtin.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/transpose-builtin.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
 
 // NOTE: This test is only to confirm we can do codgen with the matrix alias.
 
diff --git a/clang/test/CodeGenHLSL/builtins/trunc.hlsl b/clang/test/CodeGenHLSL/builtins/trunc.hlsl
index c1c6ee4119f0d..58cc78ed03596 100644
--- a/clang/test/CodeGenHLSL/builtins/trunc.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/trunc.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
-// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
+// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
diff --git a/clang/test/CodeGenHLSL/enable-16bit-types.hlsl b/clang/test/CodeGenHLSL/enable-16bit-types.hlsl
index 690404c4fde24..9e92eb04ada5b 100644
--- a/clang/test/CodeGenHLSL/enable-16bit-types.hlsl
+++ b/clang/test/CodeGenHLSL/enable-16bit-types.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.3-unknown-shadermodel6.3-library \
+// RUN: %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.3-unknown-shadermodel6.3-library \
 // RUN:  -finclude-default-header -emit-llvm -o - %s 2>&1 | FileCheck %s --check-prefix=FLAG
 // RUN: %clang_cc1 -std=hlsl202x -triple dxilv1.3-unknown-shadermodel6.3-library \
 // RUN:  -finclude-default-header -emit-llvm -o - %s 2>&1 | FileCheck %s --check-prefix=NOFLAG
diff --git a/clang/test/CodeGenHLSL/float3.hlsl b/clang/test/CodeGenHLSL/float3.hlsl
index 4f03464586bf0..4abd18713e718 100644
--- a/clang/test/CodeGenHLSL/float3.hlsl
+++ b/clang/test/CodeGenHLSL/float3.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s
 
 // Make sure float3 is not changed into float4.
diff --git a/clang/test/CodeGenHLSL/no_int_promotion.hlsl b/clang/test/CodeGenHLSL/no_int_promotion.hlsl
index b4ffcb477f1ba..adea165c1c864 100644
--- a/clang/test/CodeGenHLSL/no_int_promotion.hlsl
+++ b/clang/test/CodeGenHLSL/no_int_promotion.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -D__HLSL_ENABLE_16_BIT \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
 // RUN:   -emit-llvm -disable-llvm-passes -O3 -o - | FileCheck %s
 
 // FIXME: add test for char/int8_t/uint8_t when these types are supported in HLSL.
diff --git a/clang/test/CodeGenHLSL/resources/RasterizerOrderedStructuredBuffer-elementtype.hlsl b/clang/test/CodeGenHLSL/resources/RasterizerOrderedStructuredBuffer-elementtype.hlsl
index c97ad4237000f..843f14474a23f 100644
--- a/clang/test/CodeGenHLSL/resources/RasterizerOrderedStructuredBuffer-elementtype.hlsl
+++ b/clang/test/CodeGenHLSL/resources/RasterizerOrderedStructuredBuffer-elementtype.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -emit-llvm -o - %s | FileCheck %s -check-prefixes=DXIL
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -fnative-int16-type -emit-llvm -o - %s | FileCheck %s -check-prefixes=DXIL
 
 struct MyStruct {
   float4 a;
diff --git a/clang/test/CodeGenHLSL/resources/StructuredBuffers-elementtype.hlsl b/clang/test/CodeGenHLSL/resources/StructuredBuffers-elementtype.hlsl
index 2b286bde88468..43f2e9cb7f333 100644
--- a/clang/test/CodeGenHLSL/resources/StructuredBuffers-elementtype.hlsl
+++ b/clang/test/CodeGenHLSL/resources/StructuredBuffers-elementtype.hlsl
@@ -1,25 +1,25 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type \
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -fnative-int16-type \
 // RUN: -emit-llvm -o - -DRESOURCE=StructuredBuffer %s | FileCheck %s -DRESOURCE=StructuredBuffer -check-prefixes=DXIL-RO
 
-// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-compute -finclude-default-header -fnative-half-type \
+// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-compute -finclude-default-header -fnative-half-type -fnative-int16-type \
 // RUN: -emit-llvm -o - -DRESOURCE=StructuredBuffer %s | FileCheck %s -DRESOURCE=StructuredBuffer -check-prefixes=SPV-RO
 
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type \
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -fnative-int16-type \
 // RUN: -emit-llvm -o - -DRESOURCE=RWStructuredBuffer %s | FileCheck %s -DRESOURCE=RWStructuredBuffer -check-prefixes=DXIL-RW
 
-// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-compute -finclude-default-header -fnative-half-type \
+// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-compute -finclude-default-header -fnative-half-type -fnative-int16-type \
 // RUN: -emit-llvm -o - -DRESOURCE=RWStructuredBuffer %s | FileCheck %s -DRESOURCE=RWStructuredBuffer -check-prefixes=SPV-RW
 
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type \
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -fnative-int16-type \
 // RUN: -emit-llvm -o - -DRESOURCE=AppendStructuredBuffer %s | FileCheck %s -DRESOURCE=AppendStructuredBuffer -check-prefixes=DXIL-RW
 
-// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-compute -finclude-default-header -fnative-half-type \
+// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-compute -finclude-default-header -fnative-half-type -fnative-int16-type \
 // RUN: -emit-llvm -o - -DRESOURCE=AppendStructuredBuffer %s | FileCheck %s -DRESOURCE=AppendStructuredBuffer -check-prefixes=SPV-RW
 
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type \
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -fnative-int16-type \
 // RUN: -emit-llvm -o - -DRESOURCE=ConsumeStructuredBuffer %s | FileCheck %s -DRESOURCE=ConsumeStructuredBuffer -check-prefixes=DXIL-RW
 
-// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-compute -finclude-default-header -fnative-half-type \
+// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-compute -finclude-default-header -fnative-half-type -fnative-int16-type \
 // RUN: -emit-llvm -o - -DRESOURCE=ConsumeStructuredBuffer %s | FileCheck %s -DRESOURCE=ConsumeStructuredBuffer -check-prefixes=SPV-RW
 
 // DXIL-RO: %"class.hlsl::[[RESOURCE]]" = type { target("dx.RawBuffer", i16, 0, 0) }
diff --git a/clang/test/CodeGenHLSL/resources/TypedBuffers-elementtype.hlsl b/clang/test/CodeGenHLSL/resources/TypedBuffers-elementtype.hlsl
index d3dba8a69cc72..7d59bc5fed5ea 100644
--- a/clang/test/CodeGenHLSL/resources/TypedBuffers-elementtype.hlsl
+++ b/clang/test/CodeGenHLSL/resources/TypedBuffers-elementtype.hlsl
@@ -1,13 +1,13 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type \
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -fnative-int16-type  \
 // RUN: -emit-llvm -o - -DRESOURCE=Buffer %s | FileCheck %s -DRESOURCE=Buffer -DRW=0 -check-prefixes=DXIL
 
-// RUN: %clang_cc1 -triple spirv-pc-vulkan-compute -finclude-default-header -fnative-half-type \
+// RUN: %clang_cc1 -triple spirv-pc-vulkan-compute -finclude-default-header -fnative-half-type -fnative-int16-type \
 // RUN: -emit-llvm -o - -DRESOURCE=Buffer %s | FileCheck %s -DRESOURCE=Buffer -DRW=1 -check-prefixes=SPV-RO
 
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type \
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -fnative-int16-type \
 // RUN: -emit-llvm -o - -DRESOURCE=RWBuffer %s | FileCheck %s -DRESOURCE=RWBuffer -DRW=1 -check-prefixes=DXIL
 
-// RUN: %clang_cc1 -triple spirv-pc-vulkan-compute -finclude-default-header -fnative-half-type \
+// RUN: %clang_cc1 -triple spirv-pc-vulkan-compute -finclude-default-header -fnative-half-type -fnative-int16-type \
 // RUN: -emit-llvm -o - -DRESOURCE=RWBuffer %s | FileCheck %s -DRESOURCE=RWBuffer --DRW=2 -check-prefixes=SPV-RW
 
 // DXIL: %"class.hlsl::[[RESOURCE]]" = type { target("dx.TypedBuffer", i16, [[RW]], 0, 1) }
diff --git a/clang/test/CodeGenHLSL/resources/cbuffer.hlsl b/clang/test/CodeGenHLSL/resources/cbuffer.hlsl
index 8dcff5dad9d13..c8efe0d64c985 100644
--- a/clang/test/CodeGenHLSL/resources/cbuffer.hlsl
+++ b/clang/test/CodeGenHLSL/resources/cbuffer.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-compute -fnative-half-type -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-compute -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
 
 // CHECK: %__cblayout_CBScalars = type <{ float, double, half, i64, i32, i16, i32, i64 }>
 // CHECK: %__cblayout_CBVectors = type <{ <3 x float>, <3 x double>, <2 x half>, <3 x i64>, <4 x i32>, <3 x i16>, <3 x i64> }>
diff --git a/clang/test/CodeGenHLSL/vk-features/vk.spec-constant.hlsl b/clang/test/CodeGenHLSL/vk-features/vk.spec-constant.hlsl
index 15c54beb03d38..3f7c59916316d 100644
--- a/clang/test/CodeGenHLSL/vk-features/vk.spec-constant.hlsl
+++ b/clang/test/CodeGenHLSL/vk-features/vk.spec-constant.hlsl
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --version 5
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
+// RUN:   spirv-unknown-vulkan-compute %s -fnative-int16-type -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s
 
 [[vk::constant_id(1)]]
diff --git a/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl b/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl
deleted file mode 100644
index d0bcd1fccb7ce..0000000000000
--- a/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl
+++ /dev/null
@@ -1,633 +0,0 @@
-// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -include opencl-c.h -triple amdgcn -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 -no-enable-noundef-analysis %s -O0 -cl-std=CL2.0 -include opencl-c.h -triple amdgcn -emit-llvm -o - | FileCheck --check-prefix=NOOPT %s
-// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -include opencl-c.h -triple amdgcn---opencl -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -include opencl-c.h -triple amdgcn -fcommon -emit-llvm -o - | FileCheck %s --check-prefix=COMMON
-
-typedef struct {
-  private char *p1;
-  local char *p2;
-  constant char *p3;
-  global char *p4;
-  generic char *p5;
-} StructTy1;
-
-typedef struct {
-  constant char *p3;
-  global char *p4;
-  generic char *p5;
-} StructTy2;
-
-// Test 0 as initializer.
-
-// CHECK: @private_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
-private char *private_p = 0;
-
-// CHECK: @local_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
-local char *local_p = 0;
-
-// CHECK: @global_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(1) null, align 8
-global char *global_p = 0;
-
-// CHECK: @constant_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8
-constant char *constant_p = 0;
-
-// CHECK: @generic_p ={{.*}} local_unnamed_addr addrspace(1) global ptr null, align 8
-generic char *generic_p = 0;
-
-// Test NULL as initializer.
-
-// CHECK: @private_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
-private char *private_p_NULL = NULL;
-
-// CHECK: @local_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
-local char *local_p_NULL = NULL;
-
-// CHECK: @global_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(1) null, align 8
-global char *global_p_NULL = NULL;
-
-// CHECK: @constant_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8
-constant char *constant_p_NULL = NULL;
-
-// CHECK: @generic_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr null, align 8
-generic char *generic_p_NULL = NULL;
-
-// Test constant folding of null pointer.
-// A null pointer should be folded to a null pointer in the target address space.
-
-// CHECK: @fold_generic ={{.*}} local_unnamed_addr addrspace(1) global ptr null, align 8
-generic int *fold_generic = (global int*)(generic float*)(private char*)0;
-
-// CHECK: @fold_priv ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr addrspace(1) null to ptr addrspace(5)), align 4
-private short *fold_priv = (private short*)(generic int*)(global void*)0;
-
-// CHECK: @fold_priv_arith ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) inttoptr (i32 9 to ptr addrspace(5)), align 4
-private char *fold_priv_arith = (private char*)0 + 10;
-
-// CHECK: @fold_local_arith ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) inttoptr (i32 9 to ptr addrspace(3)), align 4
-local char *fold_local_arith = (local char*)0 + 10;
-
-// CHECK: @fold_int ={{.*}} local_unnamed_addr addrspace(1) global i32 13, align 4
-int fold_int = (int)(private void*)(generic char*)(global int*)0 + 14;
-
-// CHECK: @fold_int2 ={{.*}} local_unnamed_addr addrspace(1) global i32 12, align 4
-int fold_int2 = (int) ((private void*)0 + 13);
-
-// CHECK: @fold_int3 ={{.*}} local_unnamed_addr addrspace(1) global i32 -1, align 4
-int fold_int3 = (int) ((private int*)0);
-
-// CHECK: @fold_int4 ={{.*}} local_unnamed_addr addrspace(1) global i32 7, align 4
-int fold_int4 = (int) &((private int*)0)[2];
-
-// CHECK: @fold_int5 ={{.*}} local_unnamed_addr addrspace(1) global i32 3, align 4
-int fold_int5 = (int) &((private StructTy1*)0)->p2;
-
-
-// CHECK: @fold_int_local ={{.*}} local_unnamed_addr addrspace(1) global i32 13, align 4
-int fold_int_local = (int)(local void*)(generic char*)(global int*)0 + 14;
-
-// CHECK: @fold_int2_local ={{.*}} local_unnamed_addr addrspace(1) global i32 12, align 4
-int fold_int2_local = (int) ((local void*)0 + 13);
-
-// CHECK: @fold_int3_local ={{.*}} local_unnamed_addr addrspace(1) global i32 -1, align 4
-int fold_int3_local = (int) ((local int*)0);
-
-// CHECK: @fold_int4_local ={{.*}} local_unnamed_addr addrspace(1) global i32 7, align 4
-int fold_int4_local = (int) &((local int*)0)[2];
-
-// CHECK: @fold_int5_local ={{.*}} local_unnamed_addr addrspace(1) global i32 3, align 4
-int fold_int5_local = (int) &((local StructTy1*)0)->p2;
-
-
-// Test static variable initialization.
-
-// NOOPT: @test_static_var_private.sp1 = internal addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
-// NOOPT: @test_static_var_private.sp2 = internal addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
-// NOOPT: @test_static_var_private.sp3 = internal addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
-// NOOPT: @test_static_var_private.sp4 = internal addrspace(1) global ptr addrspace(5) null, align 4
-// NOOPT: @test_static_var_private.sp5 = internal addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
-// NOOPT: @test_static_var_private.SS1 = internal addrspace(1) global %struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }, align 8
-// NOOPT: @test_static_var_private.SS2 = internal addrspace(1) global %struct.StructTy2 zeroinitializer, align 8
-
-void test_static_var_private(void) {
-  static private char *sp1 = 0;
-  static private char *sp2 = NULL;
-  static private char *sp3;
-  static private char *sp4 = (private char*)((void)0, 0);
-  const int x = 0;
-  static private char *sp5 = (private char*)x;
-  static StructTy1 SS1;
-  static StructTy2 SS2;
-}
-
-// NOOPT: @test_static_var_local.sp1 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
-// NOOPT: @test_static_var_local.sp2 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
-// NOOPT: @test_static_var_local.sp3 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
-// NOOPT: @test_static_var_local.sp4 = internal addrspace(1) global ptr addrspace(3) null, align 4
-// NOOPT: @test_static_var_local.sp5 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
-// NOOPT: @test_static_var_local.SS1 = internal addrspace(1) global %struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }, align 8
-// NOOPT: @test_static_var_local.SS2 = internal addrspace(1) global %struct.StructTy2 zeroinitializer, align 8
-void test_static_var_local(void) {
-  static local char *sp1 = 0;
-  static local char *sp2 = NULL;
-  static local char *sp3;
-  static local char *sp4 = (local char*)((void)0, 0);
-  const int x = 0;
-  static local char *sp5 = (local char*)x;
-  static StructTy1 SS1;
-  static StructTy2 SS2;
-}
-
-// Test function-scope variable initialization.
-// NOOPT-LABEL: @test_func_scope_var_private(
-// NOOPT: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) %sp1{{.*}}, align 4
-// NOOPT: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) %sp2{{.*}}, align 4
-// NOOPT: store ptr addrspace(5) null, ptr addrspace(5) %sp3{{.*}}, align 4
-// NOOPT: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) %sp4{{.*}}, align 4
-// NOOPT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 %SS1{{.*}}, ptr addrspace(4) align 8 @__const.test_func_scope_var_private.SS1, i64 32, i1 false)
-// NOOPT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 %SS2{{.*}}, i8 0, i64 24, i1 false)
-void test_func_scope_var_private(void) {
-  private char *sp1 = 0;
-  private char *sp2 = NULL;
-  private char *sp3 = (private char*)((void)0, 0);
-  const int x = 0;
-  private char *sp4 = (private char*)x;
-  StructTy1 SS1 = {0, 0, 0, 0, 0};
-  StructTy2 SS2 = {0, 0, 0};
-}
-
-// Test function-scope variable initialization.
-// NOOPT-LABEL: @test_func_scope_var_local(
-// NOOPT: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(5) %sp1{{.*}}, align 4
-// NOOPT: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(5) %sp2{{.*}}, align 4
-// NOOPT: store ptr addrspace(3) null, ptr addrspace(5) %sp3{{.*}}, align 4
-// NOOPT: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(5) %sp4{{.*}}, align 4
-// NOOPT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 %SS1{{.*}}, ptr addrspace(4) align 8 @__const.test_func_scope_var_local.SS1, i64 32, i1 false)
-// NOOPT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 %SS2{{.*}}, i8 0, i64 24, i1 false)
-void test_func_scope_var_local(void) {
-  local char *sp1 = 0;
-  local char *sp2 = NULL;
-  local char *sp3 = (local char*)((void)0, 0);
-  const int x = 0;
-  local char *sp4 = (local char*)x;
-  StructTy1 SS1 = {0, 0, 0, 0, 0};
-  StructTy2 SS2 = {0, 0, 0};
-}
-
-
-// Test default initialization of pointers.
-
-// Tentative definition of global variables with non-zero initializer
-// cannot have common linkage since common linkage requires zero initialization
-// and does not have explicit section.
-
-// CHECK: @p1 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
-// COMMON: @p1 = weak local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
-private char *p1;
-
-// CHECK: @p2 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
-// COMMON: @p2 = weak local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
-local char *p2;
-
-// CHECK: @p3 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8
-// COMMON: @p3 = common local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8
-constant char *p3;
-
-// CHECK: @p4 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(1) null, align 8
-// COMMON: @p4 = common local_unnamed_addr addrspace(1) global ptr addrspace(1) null, align 8
-global char *p4;
-
-// CHECK: @p5 ={{.*}} local_unnamed_addr addrspace(1) global ptr null, align 8
-// COMMON: @p5 = common local_unnamed_addr addrspace(1) global ptr null, align 8
-generic char *p5;
-
-// Test default initialization of structure.
-
-// CHECK: @S1 ={{.*}} local_unnamed_addr addrspace(1) global %struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }, align 8
-StructTy1 S1;
-
-// CHECK: @S2 ={{.*}} local_unnamed_addr addrspace(1) global %struct.StructTy2 zeroinitializer, align 8
-StructTy2 S2;
-
-// Test default initialization of array.
-// CHECK: @A1 ={{.*}} local_unnamed_addr addrspace(1) global [2 x %struct.StructTy1] [%struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }, %struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }], align 8
-StructTy1 A1[2];
-
-// CHECK: @A2 ={{.*}} local_unnamed_addr addrspace(1) global [2 x %struct.StructTy2] zeroinitializer, align 8
-StructTy2 A2[2];
-
-// Test comparison with 0.
-
-// CHECK-LABEL: cmp_private
-// CHECK: icmp eq ptr addrspace(5) %p, addrspacecast (ptr null to ptr addrspace(5))
-void cmp_private(private char* p) {
-  if (p != 0)
-    *p = 0;
-}
-
-// CHECK-LABEL: cmp_local
-// CHECK: icmp eq ptr addrspace(3) %p, addrspacecast (ptr null to ptr addrspace(3))
-void cmp_local(local char* p) {
-  if (p != 0)
-    *p = 0;
-}
-
-// CHECK-LABEL: cmp_global
-// CHECK: icmp eq ptr addrspace(1) %p, null
-void cmp_global(global char* p) {
-  if (p != 0)
-    *p = 0;
-}
-
-// CHECK-LABEL: cmp_constant
-// CHECK: icmp eq ptr addrspace(4) %p, null
-char cmp_constant(constant char* p) {
-  if (p != 0)
-    return *p;
-  else
-    return 0;
-}
-
-// CHECK-LABEL: cmp_generic
-// CHECK: icmp eq ptr %p, null
-void cmp_generic(generic char* p) {
-  if (p != 0)
-    *p = 0;
-}
-
-// Test comparison with NULL.
-
-// CHECK-LABEL: cmp_NULL_private
-// CHECK: icmp eq ptr addrspace(5) %p, addrspacecast (ptr null to ptr addrspace(5))
-void cmp_NULL_private(private char* p) {
-  if (p != NULL)
-    *p = 0;
-}
-
-// CHECK-LABEL: cmp_NULL_local
-// CHECK: icmp eq ptr addrspace(3) %p, addrspacecast (ptr null to ptr addrspace(3))
-void cmp_NULL_local(local char* p) {
-  if (p != NULL)
-    *p = 0;
-}
-
-// CHECK-LABEL: cmp_NULL_global
-// CHECK: icmp eq ptr addrspace(1) %p, null
-void cmp_NULL_global(global char* p) {
-  if (p != NULL)
-    *p = 0;
-}
-
-// CHECK-LABEL: cmp_NULL_constant
-// CHECK: icmp eq ptr addrspace(4) %p, null
-char cmp_NULL_constant(constant char* p) {
-  if (p != NULL)
-    return *p;
-  else
-    return 0;
-}
-
-// CHECK-LABEL: cmp_NULL_generic
-// CHECK: icmp eq ptr %p, null
-void cmp_NULL_generic(generic char* p) {
-  if (p != NULL)
-    *p = 0;
-}
-
-// Test storage 0 as null pointer.
-// CHECK-LABEL: test_storage_null_pointer
-// CHECK: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr %arg_private
-// CHECK: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr %arg_local
-// CHECK: store ptr addrspace(1) null, ptr %arg_global
-// CHECK: store ptr addrspace(4) null, ptr %arg_constant
-// CHECK: store ptr null, ptr %arg_generic
-void test_storage_null_pointer(private char** arg_private,
-                               local char** arg_local,
-                               global char** arg_global,
-                               constant char** arg_constant,
-                               generic char** arg_generic) {
-   *arg_private = 0;
-   *arg_local = 0;
-   *arg_global = 0;
-   *arg_constant = 0;
-   *arg_generic = 0;
-}
-
-// Test storage NULL as null pointer.
-// CHECK-LABEL: test_storage_null_pointer_NULL
-// CHECK: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr %arg_private
-// CHECK: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr %arg_local
-// CHECK: store ptr addrspace(1) null, ptr %arg_global
-// CHECK: store ptr addrspace(4) null, ptr %arg_constant
-// CHECK: store ptr null, ptr %arg_generic
-void test_storage_null_pointer_NULL(private char** arg_private,
-                                    local char** arg_local,
-                                    global char** arg_global,
-                                    constant char** arg_constant,
-                                    generic char** arg_generic) {
-   *arg_private = NULL;
-   *arg_local = NULL;
-   *arg_global = NULL;
-   *arg_constant = NULL;
-   *arg_generic = NULL;
-}
-
-// Test pass null pointer to function as argument.
-void test_pass_null_pointer_arg_calee(private char* arg_private,
-                                      local char* arg_local,
-                                      global char* arg_global,
-                                      constant char* arg_constant,
-                                      generic char* arg_generic);
-
-// CHECK-LABEL: test_pass_null_pointer_arg
-// CHECK: call void @test_pass_null_pointer_arg_calee(ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(1) null, ptr addrspace(4) null, ptr null)
-// CHECK: call void @test_pass_null_pointer_arg_calee(ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(1) null, ptr addrspace(4) null, ptr null)
-void test_pass_null_pointer_arg(void) {
-  test_pass_null_pointer_arg_calee(0, 0, 0, 0, 0);
-  test_pass_null_pointer_arg_calee(NULL, NULL, NULL, NULL, NULL);
-}
-
-// Test cast null pointer to size_t.
-void test_cast_null_pointer_to_sizet_calee(size_t arg_private,
-                                           size_t arg_local,
-                                           size_t arg_global,
-                                           size_t arg_constant,
-                                           size_t arg_generic);
-
-// CHECK-LABEL: test_cast_null_pointer_to_sizet
-// CHECK: call void @test_cast_null_pointer_to_sizet_calee(i64 ptrtoint (ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)) to i64), i64 ptrtoint (ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)) to i64), i64 0, i64 0, i64 0)
-// CHECK: call void @test_cast_null_pointer_to_sizet_calee(i64 ptrtoint (ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)) to i64), i64 ptrtoint (ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)) to i64), i64 0, i64 0, i64 0)
-void test_cast_null_pointer_to_sizet(void) {
-  test_cast_null_pointer_to_sizet_calee((size_t)((private char*)0),
-                                        (size_t)((local char*)0),
-                                        (size_t)((global char*)0),
-                                        (size_t)((constant char*)0),
-                                        (size_t)((generic char*)0));
-  test_cast_null_pointer_to_sizet_calee((size_t)((private char*)NULL),
-                                        (size_t)((local char*)NULL),
-                                        (size_t)((global char*)NULL),
-                                        (size_t)((constant char*)0), // NULL cannot be casted to constant pointer since it is defined as a generic pointer
-                                        (size_t)((generic char*)NULL));
-}
-
-// Test comparison between null pointers.
-#define TEST_EQ00(addr1, addr2) int test_eq00_##addr1##_##addr2(void) { return (addr1 char*)0 == (addr2 char*)0; }
-#define TEST_EQ0N(addr1, addr2) int test_eq0N_##addr1##_##addr2(void) { return (addr1 char*)0 == (addr2 char*)NULL; }
-#define TEST_EQN0(addr1, addr2) int test_eqN0_##addr1##_##addr2(void) { return (addr1 char*)NULL == (addr2 char*)0; }
-#define TEST_EQNN(addr1, addr2) int test_eqNN_##addr1##_##addr2(void) { return (addr1 char*)0 == (addr2 char*)NULL; }
-#define TEST_NE00(addr1, addr2) int test_ne00_##addr1##_##addr2(void) { return (addr1 char*)0 != (addr2 char*)0; }
-#define TEST_NE0N(addr1, addr2) int test_ne0N_##addr1##_##addr2(void) { return (addr1 char*)0 != (addr2 char*)NULL; }
-#define TEST_NEN0(addr1, addr2) int test_neN0_##addr1##_##addr2(void) { return (addr1 char*)NULL != (addr2 char*)0; }
-#define TEST_NENN(addr1, addr2) int test_neNN_##addr1##_##addr2(void) { return (addr1 char*)0 != (addr2 char*)NULL; }
-#define TEST(addr1, addr2) \
-        TEST_EQ00(addr1, addr2) \
-        TEST_EQ0N(addr1, addr2) \
-        TEST_EQN0(addr1, addr2) \
-        TEST_EQNN(addr1, addr2) \
-        TEST_NE00(addr1, addr2) \
-        TEST_NE0N(addr1, addr2) \
-        TEST_NEN0(addr1, addr2) \
-        TEST_NENN(addr1, addr2)
-
-// CHECK-LABEL: test_eq00_generic_private
-// CHECK: ret i32 1
-// CHECK-LABEL: test_eq0N_generic_private
-// CHECK: ret i32 1
-// CHECK-LABEL: test_eqN0_generic_private
-// CHECK: ret i32 1
-// CHECK-LABEL: test_eqNN_generic_private
-// CHECK: ret i32 1
-// CHECK-LABEL: test_ne00_generic_private
-// CHECK: ret i32 0
-// CHECK-LABEL: test_ne0N_generic_private
-// CHECK: ret i32 0
-// CHECK-LABEL: test_neN0_generic_private
-// CHECK: ret i32 0
-// CHECK-LABEL: test_neNN_generic_private
-// CHECK: ret i32 0
-TEST(generic, private)
-
-// CHECK-LABEL: test_eq00_generic_local
-// CHECK: ret i32 1
-// CHECK-LABEL: test_eq0N_generic_local
-// CHECK: ret i32 1
-// CHECK-LABEL: test_eqN0_generic_local
-// CHECK: ret i32 1
-// CHECK-LABEL: test_eqNN_generic_local
-// CHECK: ret i32 1
-// CHECK-LABEL: test_ne00_generic_local
-// CHECK: ret i32 0
-// CHECK-LABEL: test_ne0N_generic_local
-// CHECK: ret i32 0
-// CHECK-LABEL: test_neN0_generic_local
-// CHECK: ret i32 0
-// CHECK-LABEL: test_neNN_generic_local
-// CHECK: ret i32 0
-TEST(generic, local)
-
-// CHECK-LABEL: test_eq00_generic_global
-// CHECK: ret i32 1
-// CHECK-LABEL: test_eq0N_generic_global
-// CHECK: ret i32 1
-// CHECK-LABEL: test_eqN0_generic_global
-// CHECK: ret i32 1
-// CHECK-LABEL: test_eqNN_generic_global
-// CHECK: ret i32 1
-// CHECK-LABEL: test_ne00_generic_global
-// CHECK: ret i32 0
-// CHECK-LABEL: test_ne0N_generic_global
-// CHECK: ret i32 0
-// CHECK-LABEL: test_neN0_generic_global
-// CHECK: ret i32 0
-// CHECK-LABEL: test_neNN_generic_global
-// CHECK: ret i32 0
-TEST(generic, global)
-
-// CHECK-LABEL: test_eq00_generic_generic
-// CHECK: ret i32 1
-// CHECK-LABEL: test_eq0N_generic_generic
-// CHECK: ret i32 1
-// CHECK-LABEL: test_eqN0_generic_generic
-// CHECK: ret i32 1
-// CHECK-LABEL: test_eqNN_generic_generic
-// CHECK: ret i32 1
-// CHECK-LABEL: test_ne00_generic_generic
-// CHECK: ret i32 0
-// CHECK-LABEL: test_ne0N_generic_generic
-// CHECK: ret i32 0
-// CHECK-LABEL: test_neN0_generic_generic
-// CHECK: ret i32 0
-// CHECK-LABEL: test_neNN_generic_generic
-// CHECK: ret i32 0
-TEST(generic, generic)
-
-// CHECK-LABEL: test_eq00_constant_constant
-// CHECK: ret i32 1
-TEST_EQ00(constant, constant)
-
-// Test cast to bool.
-
-// CHECK-LABEL: cast_bool_private
-// CHECK: icmp eq ptr addrspace(5) %p, addrspacecast (ptr null to ptr addrspace(5))
-void cast_bool_private(private char* p) {
-  if (p)
-    *p = 0;
-}
-
-// CHECK-LABEL: cast_bool_local
-// CHECK: icmp eq ptr addrspace(3) %p, addrspacecast (ptr null to ptr addrspace(3))
-void cast_bool_local(local char* p) {
-  if (p)
-    *p = 0;
-}
-
-// CHECK-LABEL: cast_bool_global
-// CHECK: icmp eq ptr addrspace(1) %p, null
-void cast_bool_global(global char* p) {
-  if (p)
-    *p = 0;
-}
-
-// CHECK-LABEL: cast_bool_constant
-// CHECK: icmp eq ptr addrspace(4) %p, null
-char cast_bool_constant(constant char* p) {
-  if (p)
-    return *p;
-  else
-    return 0;
-}
-
-// CHECK-LABEL: cast_bool_generic
-// CHECK: icmp eq ptr %p, null
-void cast_bool_generic(generic char* p) {
-  if (p)
-    *p = 0;
-}
-
-// Test initialize a struct using memset.
-// For large structures which is mostly zero, clang generats llvm.memset for
-// the zero part and store for non-zero members.
-typedef struct {
-  long a, b, c, d;
-  private char *p;
-} StructTy3;
-
-// CHECK-LABEL: test_memset_private
-// CHECK: call void @llvm.memset.p5.i64(ptr addrspace(5) noundef align 8 {{.*}}, i8 0, i64 32, i1 false)
-// CHECK: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) %ptr, i32 32
-// CHECK: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) [[GEP]]
-// CHECK: [[GEP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) {{.*}}, i32 36
-// CHECK: store i32 0, ptr addrspace(5) [[GEP1]], align 4
-void test_memset_private(private StructTy3 *ptr) {
-  StructTy3 S3 = {0, 0, 0, 0, 0};
-  *ptr = S3;
-}
-
-// Test casting literal 0 to pointer.
-// A 0 literal casted to pointer should become a null pointer.
-
-// CHECK-LABEL: test_cast_0_to_local_ptr
-// CHECK: ret ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3))
-local int* test_cast_0_to_local_ptr(void) {
-  return (local int*)0;
-}
-
-// CHECK-LABEL: test_cast_0_to_private_ptr
-// CHECK: ret ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5))
-private int* test_cast_0_to_private_ptr(void) {
-  return (private int*)0;
-}
-
-// Test casting non-literal integer with 0 value to pointer.
-// A non-literal integer expression with 0 value is casted to a pointer with
-// zero value.
-
-// CHECK-LABEL: test_cast_int_to_ptr1_private
-// CHECK: ret ptr addrspace(5) null
-private int* test_cast_int_to_ptr1_private(void) {
-  return (private int*)((void)0, 0);
-}
-
-// CHECK-LABEL: test_cast_int_to_ptr1_local
- // CHECK: ret ptr addrspace(3) null
-local int* test_cast_int_to_ptr1_local(void) {
-  return (local int*)((void)0, 0);
-}
-
-// CHECK-LABEL: test_cast_int_to_ptr2
-// CHECK: ret ptr addrspace(5) null
-private int* test_cast_int_to_ptr2(void) {
-  int x = 0;
-  return (private int*)x;
-}
-
-// Test logical operations.
-// CHECK-LABEL: test_not_nullptr
-// CHECK: ret i32 1
-int test_not_nullptr(void) {
-  return !(private char*)NULL;
-}
-
-// CHECK-LABEL: test_and_nullptr
-// CHECK: ret i32 0
-int test_and_nullptr(int a) {
-  return a && ((private char*)NULL);
-}
-
-// CHECK-LABEL: test_not_private_ptr
-// CHECK: %[[lnot:.*]] = icmp eq ptr addrspace(5) %p, addrspacecast (ptr null to ptr addrspace(5))
-// CHECK: %[[lnot_ext:.*]] = zext i1 %[[lnot]] to i32
-// CHECK: ret i32 %[[lnot_ext]]
-int test_not_private_ptr(private char* p) {
-  return !p;
-}
-
-// CHECK-LABEL: test_not_local_ptr
-// CHECK: %[[lnot:.*]] = icmp eq ptr addrspace(3) %p, addrspacecast (ptr null to ptr addrspace(3))
-// CHECK: %[[lnot_ext:.*]] = zext i1 %[[lnot]] to i32
-// CHECK: ret i32 %[[lnot_ext]]
-int test_not_local_ptr(local char* p) {
-  return !p;
-}
-
-
-// CHECK-LABEL: test_and_ptr
-// CHECK: %[[tobool:.*]] = icmp ne ptr addrspace(5) %p1, addrspacecast (ptr null to ptr addrspace(5))
-// CHECK: %[[tobool1:.*]] = icmp ne ptr addrspace(3) %p2, addrspacecast (ptr null to ptr addrspace(3))
-// CHECK: %[[res:.*]] = select i1 %[[tobool]], i1 %[[tobool1]], i1 false
-// CHECK: %[[land_ext:.*]] = zext i1 %[[res]] to i32
-// CHECK: ret i32 %[[land_ext]]
-int test_and_ptr(private char* p1, local char* p2) {
-  return p1 && p2;
-}
-
-// Test folding of null pointer in function scope.
-// NOOPT-LABEL: test_fold_private
-// NOOPT: call void @test_fold_callee
-// NOOPT: store ptr addrspace(1) null, ptr addrspace(5) %glob{{.*}}, align 8
-// NOOPT: %{{.*}} = sub i64 %{{.*}}, 0
-// NOOPT: call void @test_fold_callee
-// NOOPT: %[[SEXT:.*]] = sext i32 ptrtoint (ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)) to i32) to i64
-// NOOPT: %{{.*}} = add nsw i64 %1, %[[SEXT]]
-// NOOPT: %{{.*}} = sub nsw i64 %{{.*}}, 1
-void test_fold_callee(void);
-void test_fold_private(void) {
-  global int* glob = (test_fold_callee(), (global int*)(generic char*)0);
-  long x = glob - (global int*)(generic char*)0;
-  x = x + (int)(test_fold_callee(), (private int*)(generic char*)(global short*)0);
-  x = x - (int)((private int*)0 == (private int*)(generic char*)0);
-}
-
-// NOOPT-LABEL: test_fold_local
-// NOOPT: call void @test_fold_callee
-// NOOPT: store ptr addrspace(1) null, ptr addrspace(5) %glob{{.*}}, align 8
-// NOOPT: %{{.*}} = sub i64 %{{.*}}, 0
-// NOOPT: call void @test_fold_callee
-// NOOPT: %[[SEXT:.*]] = sext i32 ptrtoint (ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)) to i32) to i64
-// NOOPT: %{{.*}} = add nsw i64 %{{.*}}, %[[SEXT]]
-// NOOPT: %{{.*}} = sub nsw i64 %{{.*}}, 1
-void test_fold_local(void) {
-  global int* glob = (test_fold_callee(), (global int*)(generic char*)0);
-  long x = glob - (global int*)(generic char*)0;
-  x = x + (int)(test_fold_callee(), (local int*)(generic char*)(global short*)0);
-  x = x - (int)((local int*)0 == (local int*)(generic char*)0);
-}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl
index d390418523694..31fd0e7bceaf5 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl
@@ -1,5 +1,5 @@
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -D__AMDGCN_WAVEFRONT_SIZE=32 -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck -enable-var-scope %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck -enable-var-scope %s
 // RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck -enable-var-scope %s
 // RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck -enable-var-scope %s
 // RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck -enable-var-scope %s
@@ -48,7 +48,3 @@ void test_read_exec_lo(global uint* out) {
 void test_read_exec_hi(global uint* out) {
   *out = __builtin_amdgcn_read_exec_hi();
 }
-
-#if __AMDGCN_WAVEFRONT_SIZE != 32
-#error Wrong wavesize detected
-#endif
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl
index d851ec7e6734f..758b5aa532d73 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl
@@ -50,7 +50,3 @@ void test_read_exec_lo(global ulong* out) {
 void test_read_exec_hi(global ulong* out) {
   *out = __builtin_amdgcn_read_exec_hi();
 }
-
-#if defined(__AMDGCN_WAVEFRONT_SIZE__) && __AMDGCN_WAVEFRONT_SIZE__ != 64
-#error Wrong wavesize detected
-#endif
diff --git a/clang/test/CodeGenOpenCL/nullptr.cl b/clang/test/CodeGenOpenCL/nullptr.cl
new file mode 100644
index 0000000000000..976e12c0bef47
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/nullptr.cl
@@ -0,0 +1,735 @@
+// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -include opencl-c.h -triple spir64 -emit-llvm -o - -Wno-void-pointer-to-int-cast -Wno-pointer-to-int-cast -Wno-int-to-pointer-cast | FileCheck %s --check-prefixes=CHECK,SPIR64
+// RUN: %clang_cc1 -no-enable-noundef-analysis %s -O0 -cl-std=CL2.0 -include opencl-c.h -triple spir64 -emit-llvm -o - -Wno-void-pointer-to-int-cast -Wno-pointer-to-int-cast -Wno-int-to-pointer-cast | FileCheck --check-prefixes=CHECK-NOOPT,SPIR64-NOOPT %s
+// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -include opencl-c.h -triple amdgcn -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,AMDGCN
+// RUN: %clang_cc1 -no-enable-noundef-analysis %s -O0 -cl-std=CL2.0 -include opencl-c.h -triple amdgcn -emit-llvm -o - | FileCheck --check-prefixes=CHECK-NOOPT,AMDGCN-NOOPT %s
+// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -include opencl-c.h -triple amdgcn---opencl -emit-llvm -o - | FileCheck %s --check-prefix=AMDGCN
+// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -include opencl-c.h -triple amdgcn -fcommon -emit-llvm -o - | FileCheck %s --check-prefix=AMDGCN-COMMON
+
+typedef struct {
+  private char *p1;
+  local char *p2;
+  constant char *p3;
+  global char *p4;
+  generic char *p5;
+} StructTy1;
+
+typedef struct {
+  constant char *p3;
+  global char *p4;
+  generic char *p5;
+} StructTy2;
+
+// Test 0 as initializer.
+
+// SPIR64: @private_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspacecast (ptr addrspace(4) null to ptr), align 8
+// AMDGCN: @private_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
+private char *private_p = 0;
+
+// SPIR64: @local_p = local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), align 8
+// AMDGCN: @local_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
+local char *local_p = 0;
+
+// SPIR64: @global_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)), align 8
+// AMDGCN: @global_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(1) null, align 8
+global char *global_p = 0;
+
+// SPIR64: @constant_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(2) null, align 8
+// AMDGCN: @constant_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8
+constant char *constant_p = 0;
+
+// SPIR64: @generic_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8
+// AMDGCN: @generic_p ={{.*}} local_unnamed_addr addrspace(1) global ptr null, align 8
+generic char *generic_p = 0;
+
+// Test NULL as initializer.
+
+// SPIR64: @private_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspacecast (ptr addrspace(4) null to ptr), align 8
+// AMDGCN: @private_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
+private char *private_p_NULL = NULL;
+
+// SPIR64: @local_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), align 8
+// AMDGCN: @local_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
+local char *local_p_NULL = NULL;
+
+// SPIR64: @global_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)), align 8
+// AMDGCN: @global_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(1) null, align 8
+global char *global_p_NULL = NULL;
+
+// SPIR64: @constant_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(2) null, align 8
+// AMDGCN: @constant_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8
+constant char *constant_p_NULL = NULL;
+
+// SPIR64: @generic_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8
+// AMDGCN: @generic_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr null, align 8
+generic char *generic_p_NULL = NULL;
+
+// Test constant folding of null pointer.
+// A null pointer should be folded to a null pointer in the target address space.
+
+// SPIR64: @fold_generic ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8
+// AMDGCN: @fold_generic ={{.*}} local_unnamed_addr addrspace(1) global ptr null, align 8
+generic int *fold_generic = (global int*)(generic float*)(private char*)0;
+
+// SPIR64: @fold_priv ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspacecast (ptr addrspace(4) null to ptr), align 8
+// AMDGCN: @fold_priv ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr addrspace(1) null to ptr addrspace(5)), align 4
+private short *fold_priv = (private short*)(generic int*)(global void*)0;
+
+// SPIR64: @fold_priv_arith ={{.*}} local_unnamed_addr addrspace(1) global ptr inttoptr (i64 10 to ptr), align 8
+// AMDGCN: @fold_priv_arith ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) inttoptr (i32 9 to ptr addrspace(5)), align 4
+private char *fold_priv_arith = (private char*)0 + 10;
+
+// SPIR64: @fold_local_arith ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) inttoptr (i64 10 to ptr addrspace(3)), align 8
+// AMDGCN: @fold_local_arith ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) inttoptr (i32 9 to ptr addrspace(3)), align 4
+local char *fold_local_arith = (local char*)0 + 10;
+
+// SPIR64: @fold_int ={{.*}} local_unnamed_addr addrspace(1) global i32 14, align 4
+// AMDGCN: @fold_int ={{.*}} local_unnamed_addr addrspace(1) global i32 13, align 4
+int fold_int = (int)(private void*)(generic char*)(global int*)0 + 14;
+
+// SPIR64: @fold_int2 ={{.*}} local_unnamed_addr addrspace(1) global i32 13, align 4
+// AMDGCN: @fold_int2 ={{.*}} local_unnamed_addr addrspace(1) global i32 12, align 4
+int fold_int2 = (int) ((private void*)0 + 13);
+
+// SPIR64: @fold_int3 ={{.*}} local_unnamed_addr addrspace(1) global i32 0, align 4
+// AMDGCN: @fold_int3 ={{.*}} local_unnamed_addr addrspace(1) global i32 -1, align 4
+int fold_int3 = (int) ((private int*)0);
+
+// SPIR64: @fold_int4 ={{.*}} local_unnamed_addr addrspace(1) global i32 8, align 4
+// AMDGCN: @fold_int4 ={{.*}} local_unnamed_addr addrspace(1) global i32 7, align 4
+int fold_int4 = (int) &((private int*)0)[2];
+
+// SPIR64: @fold_int5 ={{.*}} local_unnamed_addr addrspace(1) global i32 8, align 4
+// AMDGCN: @fold_int5 ={{.*}} local_unnamed_addr addrspace(1) global i32 3, align 4
+int fold_int5 = (int) &((private StructTy1*)0)->p2;
+
+// SPIR64: @fold_int_local ={{.*}} local_unnamed_addr addrspace(1) global i32 14, align 4
+// AMDGCN: @fold_int_local = local_unnamed_addr addrspace(1) global i32 13, align 4
+int fold_int_local = (int)(local void*)(generic char*)(global int*)0 + 14;
+
+// SPIR64: @fold_int2_local ={{.*}} local_unnamed_addr addrspace(1) global i32 13, align 4
+// AMDGCN: @fold_int2_local ={{.*}} local_unnamed_addr addrspace(1) global i32 12, align 4
+int fold_int2_local = (int) ((local void*)0 + 13);
+
+// SPIR64: @fold_int3_local ={{.*}} local_unnamed_addr addrspace(1) global i32 0, align 4
+// AMDGCN: @fold_int3_local ={{.*}} local_unnamed_addr addrspace(1) global i32 -1, align 4
+int fold_int3_local = (int) ((local int*)0);
+
+// SPIR64: @fold_int4_local ={{.*}} local_unnamed_addr addrspace(1) global i32 8, align 4
+// AMDGCN: @fold_int4_local ={{.*}} local_unnamed_addr addrspace(1) global i32 7, align 4
+int fold_int4_local = (int) &((local int*)0)[2];
+
+// SPIR64: @fold_int5_local ={{.*}} local_unnamed_addr addrspace(1) global i32 8, align 4
+// AMDGCN: @fold_int5_local ={{.*}} local_unnamed_addr addrspace(1) global i32 3, align 4
+int fold_int5_local = (int) &((local StructTy1*)0)->p2;
+
+
+// Test static variable initialization.
+
+// SPIR64-NOOPT: @test_static_var_private.sp1 = internal addrspace(1) global ptr addrspacecast (ptr addrspace(4) null to ptr), align 8
+// SPIR64-NOOPT: @test_static_var_private.sp2 = internal addrspace(1) global ptr addrspacecast (ptr addrspace(4) null to ptr), align 8
+// SPIR64-NOOPT: @test_static_var_private.sp3 = internal addrspace(1) global ptr addrspacecast (ptr addrspace(4) null to ptr), align 8
+// SPIR64-NOOPT: @test_static_var_private.sp4 = internal addrspace(1) global ptr addrspacecast (ptr addrspace(4) null to ptr), align 8
+// SPIR64-NOOPT: @test_static_var_private.sp5 = internal addrspace(1) global ptr addrspacecast (ptr addrspace(4) null to ptr), align 8
+// SPIR64-NOOPT: @test_static_var_private.SS1 = internal addrspace(1) global %struct.StructTy1 zeroinitializer, align 8
+// AMDGCN-NOOPT: @test_static_var_private.sp1 = internal addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
+// AMDGCN-NOOPT: @test_static_var_private.sp2 = internal addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
+// AMDGCN-NOOPT: @test_static_var_private.sp3 = internal addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
+// AMDGCN-NOOPT: @test_static_var_private.sp4 = internal addrspace(1) global ptr addrspace(5) null, align 4
+// AMDGCN-NOOPT: @test_static_var_private.sp5 = internal addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
+// AMDGCN-NOOPT: @test_static_var_private.SS1 = internal addrspace(1) global %struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }, align 8
+// CHECK-NOOPT: @test_static_var_private.SS2 = internal addrspace(1) global %struct.StructTy2 zeroinitializer, align 8
+
+void test_static_var_private(void) {
+  static private char *sp1 = 0;
+  static private char *sp2 = NULL;
+  static private char *sp3;
+  static private char *sp4 = (private char*)((void)0, 0);
+  const int x = 0;
+  static private char *sp5 = (private char*)x;
+  static StructTy1 SS1;
+  static StructTy2 SS2;
+}
+
+// SPIR64-NOOPT: @test_static_var_local.sp1 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), align 8
+// SPIR64-NOOPT: @test_static_var_local.sp2 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), align 8
+// SPIR64-NOOPT: @test_static_var_local.sp3 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), align 8
+// SPIR64-NOOPT: @test_static_var_local.sp4 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), align 8
+// SPIR64-NOOPT: @test_static_var_local.sp5 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), align 8
+// SPIR64-NOOPT: @test_static_var_local.SS1 = internal addrspace(1) global %struct.StructTy1 zeroinitializer, align 8
+// AMDGCN-NOOPT: @test_static_var_local.sp1 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
+// AMDGCN-NOOPT: @test_static_var_local.sp2 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
+// AMDGCN-NOOPT: @test_static_var_local.sp3 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
+// AMDGCN-NOOPT: @test_static_var_local.sp4 = internal addrspace(1) global ptr addrspace(3) null, align 4
+// AMDGCN-NOOPT: @test_static_var_local.sp5 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
+// AMDGCN-NOOPT: @test_static_var_local.SS1 = internal addrspace(1) global %struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }, align 8
+// CHECK-NOOPT: @test_static_var_local.SS2 = internal addrspace(1) global %struct.StructTy2 zeroinitializer, align 8
+void test_static_var_local(void) {
+  static local char *sp1 = 0;
+  static local char *sp2 = NULL;
+  static local char *sp3;
+  static local char *sp4 = (local char*)((void)0, 0);
+  const int x = 0;
+  static local char *sp5 = (local char*)x;
+  static StructTy1 SS1;
+  static StructTy2 SS2;
+}
+
+// Test function-scope variable initialization.
+// CHECK-NOOPT-LABEL: @test_func_scope_var_private(
+// SPIR64-NOOPT: store ptr addrspacecast (ptr addrspace(4) null to ptr), ptr %sp1{{.*}}, align 8
+// SPIR64-NOOPT: store ptr addrspacecast (ptr addrspace(4) null to ptr), ptr %sp2{{.*}}, align 8
+// SPIR64-NOOPT: store ptr null, ptr %sp3{{.*}}, align 8
+// SPIR64-NOOPT: store ptr addrspacecast (ptr addrspace(4) null to ptr), ptr %sp4{{.*}}, align 8
+// SPIR64-NOOPT: call void @llvm.memset.p0.i64(ptr align 8 %SS1{{.*}}, i8 0, i64 40, i1 false)
+// SPIR64-NOOPT: call void @llvm.memcpy.p0.p2.i64(ptr align 8 %SS2{{.*}}, ptr addrspace(2) align 8 @__const.test_func_scope_var_private.SS2, i64 24, i1 false)
+// AMDGCN-NOOPT: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) %sp1{{.*}}, align 4
+// AMDGCN-NOOPT: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) %sp2{{.*}}, align 4
+// AMDGCN-NOOPT: store ptr addrspace(5) null, ptr addrspace(5) %sp3{{.*}}, align 4
+// AMDGCN-NOOPT: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) %sp4{{.*}}, align 4
+// AMDGCN-NOOPT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 %SS1{{.*}}, ptr addrspace(4) align 8 @__const.test_func_scope_var_private.SS1, i64 32, i1 false)
+// AMDGCN-NOOPT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 %SS2{{.*}}, i8 0, i64 24, i1 false)
+void test_func_scope_var_private(void) {
+  private char *sp1 = 0;
+  private char *sp2 = NULL;
+  private char *sp3 = (private char*)((void)0, 0);
+  const int x = 0;
+  private char *sp4 = (private char*)x;
+  StructTy1 SS1 = {0, 0, 0, 0, 0};
+  StructTy2 SS2 = {0, 0, 0};
+}
+
+// Test function-scope variable initialization.
+// CHECK-NOOPT-LABEL: @test_func_scope_var_local(
+// SPIR64-NOOPT: store ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), ptr %sp1{{.*}}, align 8
+// SPIR64-NOOPT: store ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), ptr %sp2{{.*}}, align 8
+// SPIR64-NOOPT: store ptr addrspace(3) null, ptr %sp3{{.*}}, align 8
+// SPIR64-NOOPT: store ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), ptr %sp4{{.*}}, align 8
+// SPIR64-NOOPT: call void @llvm.memset.p0.i64(ptr align 8 %SS1{{.*}}, i8 0, i64 40, i1 false)
+// SPIR64-NOOPT: call void @llvm.memcpy.p0.p2.i64(ptr align 8 %SS2{{.*}}, ptr addrspace(2) align 8 @__const.test_func_scope_var_local.SS2, i64 24, i1 false)
+// AMDGCN-NOOPT: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(5) %sp1{{.*}}, align 4
+// AMDGCN-NOOPT: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(5) %sp2{{.*}}, align 4
+// AMDGCN-NOOPT: store ptr addrspace(3) null, ptr addrspace(5) %sp3{{.*}}, align 4
+// AMDGCN-NOOPT: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(5) %sp4{{.*}}, align 4
+// AMDGCN-NOOPT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 %SS1{{.*}}, ptr addrspace(4) align 8 @__const.test_func_scope_var_local.SS1, i64 32, i1 false)
+// AMDGCN-NOOPT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 %SS2{{.*}}, i8 0, i64 24, i1 false)
+void test_func_scope_var_local(void) {
+  local char *sp1 = 0;
+  local char *sp2 = NULL;
+  local char *sp3 = (local char*)((void)0, 0);
+  const int x = 0;
+  local char *sp4 = (local char*)x;
+  StructTy1 SS1 = {0, 0, 0, 0, 0};
+  StructTy2 SS2 = {0, 0, 0};
+}
+
+
+// Test default initialization of pointers.
+
+// Tentative definition of global variables with non-zero initializer
+// cannot have common linkage since common linkage requires zero initialization
+// and does not have explicit section.
+
+// SPIR64: @p1 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspacecast (ptr addrspace(4) null to ptr), align 8
+// AMDGCN: @p1 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
+// AMDGCN-COMMON: @p1 = weak local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
+private char *p1;
+
+// SPIR64: @p2 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), align 8
+// AMDGCN: @p2 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
+// AMDGCN-COMMON: @p2 = weak local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
+local char *p2;
+
+// SPIR64: @p3 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(2) null, align 8
+// AMDGCN: @p3 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8
+// AMDGCN-COMMON: @p3 = common local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8
+constant char *p3;
+
+// SPIR64: @p4 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)), align 8
+// AMDGCN: @p4 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(1) null, align 8
+// AMDGCN-COMMON: @p4 = common local_unnamed_addr addrspace(1) global ptr addrspace(1) null, align 8
+global char *p4;
+
+// SPIR64: @p5 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8
+// AMDGCN: @p5 ={{.*}} local_unnamed_addr addrspace(1) global ptr null, align 8
+// AMDGCN-COMMON: @p5 = common local_unnamed_addr addrspace(1) global ptr null, align 8
+generic char *p5;
+
+// Test default initialization of structure.
+
+// SPIR64: @S1 ={{.*}} local_unnamed_addr addrspace(1) global %struct.StructTy1 zeroinitializer, align 8
+// AMDGCN: @S1 ={{.*}} local_unnamed_addr addrspace(1) global %struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }, align 8
+StructTy1 S1;
+
+// CHECK: @S2 ={{.*}} local_unnamed_addr addrspace(1) global %struct.StructTy2 zeroinitializer, align 8
+StructTy2 S2;
+
+// Test default initialization of array.
+// SPIR64: @A1 ={{.*}} local_unnamed_addr addrspace(1) global [2 x %struct.StructTy1] zeroinitializer, align 8
+// AMDGCN: @A1 ={{.*}} local_unnamed_addr addrspace(1) global [2 x %struct.StructTy1] [%struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }, %struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }], align 8
+StructTy1 A1[2];
+
+// CHECK: @A2 ={{.*}} local_unnamed_addr addrspace(1) global [2 x %struct.StructTy2] zeroinitializer, align 8
+StructTy2 A2[2];
+
+// Test comparison with 0.
+
+// CHECK-LABEL: cmp_private
+// SPIR64: icmp eq ptr %p, addrspacecast (ptr addrspace(4) null to ptr)
+// AMDGCN: icmp eq ptr addrspace(5) %p, addrspacecast (ptr null to ptr addrspace(5))
+void cmp_private(private char* p) {
+  if (p != 0)
+    *p = 0;
+}
+
+// CHECK-LABEL: cmp_local
+// SPIR64: icmp eq ptr addrspace(3) %p, addrspacecast (ptr addrspace(4) null to ptr addrspace(3))
+// AMDGCN: icmp eq ptr addrspace(3) %p, addrspacecast (ptr null to ptr addrspace(3))
+void cmp_local(local char* p) {
+  if (p != 0)
+    *p = 0;
+}
+
+// CHECK-LABEL: cmp_global
+// SPIR64: icmp eq ptr addrspace(1) %p, addrspacecast (ptr addrspace(4) null to ptr addrspace(1))
+// AMDGCN: icmp eq ptr addrspace(1) %p, null
+void cmp_global(global char* p) {
+  if (p != 0)
+    *p = 0;
+}
+
+// CHECK-LABEL: cmp_constant
+// SPIR64: icmp eq ptr addrspace(2) %p, null
+// AMDGCN: icmp eq ptr addrspace(4) %p, null
+char cmp_constant(constant char* p) {
+  if (p != 0)
+    return *p;
+  else
+    return 0;
+}
+
+// CHECK-LABEL: cmp_generic
+// SPIR64: icmp eq ptr addrspace(4) %p, null
+// AMDGCN: icmp eq ptr %p, null
+void cmp_generic(generic char* p) {
+  if (p != 0)
+    *p = 0;
+}
+
+// Test comparison with NULL.
+
+// CHECK-LABEL: cmp_NULL_private
+// SPIR64: icmp eq ptr %p, addrspacecast (ptr addrspace(4) null to ptr)
+// AMDGCN: icmp eq ptr addrspace(5) %p, addrspacecast (ptr null to ptr addrspace(5))
+void cmp_NULL_private(private char* p) {
+  if (p != NULL)
+    *p = 0;
+}
+
+// CHECK-LABEL: cmp_NULL_local
+// SPIR64: icmp eq ptr addrspace(3) %p, addrspacecast (ptr addrspace(4) null to ptr addrspace(3))
+// AMDGCN: icmp eq ptr addrspace(3) %p, addrspacecast (ptr null to ptr addrspace(3))
+void cmp_NULL_local(local char* p) {
+  if (p != NULL)
+    *p = 0;
+}
+
+// CHECK-LABEL: cmp_NULL_global
+// SPIR64: icmp eq ptr addrspace(1) %p, addrspacecast (ptr addrspace(4) null to ptr addrspace(1))
+// AMDGCN: icmp eq ptr addrspace(1) %p, null
+void cmp_NULL_global(global char* p) {
+  if (p != NULL)
+    *p = 0;
+}
+
+// CHECK-LABEL: cmp_NULL_constant
+// SPIR64: icmp eq ptr addrspace(2) %p, null
+// AMDGCN: icmp eq ptr addrspace(4) %p, null
+char cmp_NULL_constant(constant char* p) {
+  if (p != NULL)
+    return *p;
+  else
+    return 0;
+}
+
+// CHECK-LABEL: cmp_NULL_generic
+// SPIR64: icmp eq ptr addrspace(4) %p, null
+// AMDGCN: icmp eq ptr %p, null
+void cmp_NULL_generic(generic char* p) {
+  if (p != NULL)
+    *p = 0;
+}
+
+// Test storage 0 as null pointer.
+// CHECK-LABEL: test_storage_null_pointer
+// SPIR64: store ptr addrspacecast (ptr addrspace(4) null to ptr), ptr addrspace(4) %arg_private
+// SPIR64: store ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), ptr addrspace(4) %arg_local
+// SPIR64: store ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)), ptr addrspace(4) %arg_global
+// SPIR64: store ptr addrspace(2) null, ptr addrspace(4) %arg_constant
+// SPIR64: store ptr addrspace(4) null, ptr addrspace(4) %arg_generic
+// AMDGCN: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr %arg_private
+// AMDGCN: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr %arg_local
+// AMDGCN: store ptr addrspace(1) null, ptr %arg_global
+// AMDGCN: store ptr addrspace(4) null, ptr %arg_constant
+// AMDGCN: store ptr null, ptr %arg_generic
+void test_storage_null_pointer(private char** arg_private,
+                               local char** arg_local,
+                               global char** arg_global,
+                               constant char** arg_constant,
+                               generic char** arg_generic) {
+   *arg_private = 0;
+   *arg_local = 0;
+   *arg_global = 0;
+   *arg_constant = 0;
+   *arg_generic = 0;
+}
+
+// Test storage NULL as null pointer.
+// CHECK-LABEL: test_storage_null_pointer_NULL
+// SPIR64: store ptr addrspacecast (ptr addrspace(4) null to ptr), ptr addrspace(4) %arg_private
+// SPIR64: store ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), ptr addrspace(4) %arg_local
+// SPIR64: store ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)), ptr addrspace(4) %arg_global
+// SPIR64: store ptr addrspace(2) null, ptr addrspace(4) %arg_constant
+// SPIR64: store ptr addrspace(4) null, ptr addrspace(4) %arg_generic
+// AMDGCN: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr %arg_private
+// AMDGCN: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr %arg_local
+// AMDGCN: store ptr addrspace(1) null, ptr %arg_global
+// AMDGCN: store ptr addrspace(4) null, ptr %arg_constant
+// AMDGCN: store ptr null, ptr %arg_generic
+void test_storage_null_pointer_NULL(private char** arg_private,
+                                    local char** arg_local,
+                                    global char** arg_global,
+                                    constant char** arg_constant,
+                                    generic char** arg_generic) {
+   *arg_private = NULL;
+   *arg_local = NULL;
+   *arg_global = NULL;
+   *arg_constant = NULL;
+   *arg_generic = NULL;
+}
+
+// Test pass null pointer to function as argument.
+void test_pass_null_pointer_arg_calee(private char* arg_private,
+                                      local char* arg_local,
+                                      global char* arg_global,
+                                      constant char* arg_constant,
+                                      generic char* arg_generic);
+
+// CHECK-LABEL: test_pass_null_pointer_arg
+// SPIR64: call spir_func void @test_pass_null_pointer_arg_calee(ptr addrspacecast (ptr addrspace(4) null to ptr), ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)), ptr addrspace(2) null, ptr addrspace(4) null)
+// SPIR64: call spir_func void @test_pass_null_pointer_arg_calee(ptr addrspacecast (ptr addrspace(4) null to ptr), ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)), ptr addrspace(2) null, ptr addrspace(4) null)
+// AMDGCN: call void @test_pass_null_pointer_arg_calee(ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(1) null, ptr addrspace(4) null, ptr null)
+// AMDGCN: call void @test_pass_null_pointer_arg_calee(ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(1) null, ptr addrspace(4) null, ptr null)
+void test_pass_null_pointer_arg(void) {
+  test_pass_null_pointer_arg_calee(0, 0, 0, 0, 0);
+  test_pass_null_pointer_arg_calee(NULL, NULL, NULL, NULL, NULL);
+}
+
+// Test cast null pointer to size_t.
+void test_cast_null_pointer_to_sizet_calee(size_t arg_private,
+                                           size_t arg_local,
+                                           size_t arg_global,
+                                           size_t arg_constant,
+                                           size_t arg_generic);
+
+// CHECK-LABEL: test_cast_null_pointer_to_sizet
+// SPIR64: call spir_func void @test_cast_null_pointer_to_sizet_calee(i64 ptrtoint (ptr addrspacecast (ptr addrspace(4) null to ptr) to i64), i64 ptrtoint (ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)) to i64), i64 ptrtoint (ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)) to i64), i64 0, i64 0)
+// SPIR64: call spir_func void @test_cast_null_pointer_to_sizet_calee(i64 ptrtoint (ptr addrspacecast (ptr addrspace(4) null to ptr) to i64), i64 ptrtoint (ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)) to i64), i64 ptrtoint (ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)) to i64), i64 0, i64 0)
+// AMDGCN: call void @test_cast_null_pointer_to_sizet_calee(i64 ptrtoint (ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)) to i64), i64 ptrtoint (ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)) to i64), i64 0, i64 0, i64 0)
+// AMDGCN: call void @test_cast_null_pointer_to_sizet_calee(i64 ptrtoint (ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)) to i64), i64 ptrtoint (ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)) to i64), i64 0, i64 0, i64 0)
+void test_cast_null_pointer_to_sizet(void) {
+  test_cast_null_pointer_to_sizet_calee((size_t)((private char*)0),
+                                        (size_t)((local char*)0),
+                                        (size_t)((global char*)0),
+                                        (size_t)((constant char*)0),
+                                        (size_t)((generic char*)0));
+  test_cast_null_pointer_to_sizet_calee((size_t)((private char*)NULL),
+                                        (size_t)((local char*)NULL),
+                                        (size_t)((global char*)NULL),
+                                        (size_t)((constant char*)0), // NULL cannot be casted to constant pointer since it is defined as a generic pointer
+                                        (size_t)((generic char*)NULL));
+}
+
+// Test comparison between null pointers.
+#define TEST_EQ00(addr1, addr2) int test_eq00_##addr1##_##addr2(void) { return (addr1 char*)0 == (addr2 char*)0; }
+#define TEST_EQ0N(addr1, addr2) int test_eq0N_##addr1##_##addr2(void) { return (addr1 char*)0 == (addr2 char*)NULL; }
+#define TEST_EQN0(addr1, addr2) int test_eqN0_##addr1##_##addr2(void) { return (addr1 char*)NULL == (addr2 char*)0; }
+#define TEST_EQNN(addr1, addr2) int test_eqNN_##addr1##_##addr2(void) { return (addr1 char*)0 == (addr2 char*)NULL; }
+#define TEST_NE00(addr1, addr2) int test_ne00_##addr1##_##addr2(void) { return (addr1 char*)0 != (addr2 char*)0; }
+#define TEST_NE0N(addr1, addr2) int test_ne0N_##addr1##_##addr2(void) { return (addr1 char*)0 != (addr2 char*)NULL; }
+#define TEST_NEN0(addr1, addr2) int test_neN0_##addr1##_##addr2(void) { return (addr1 char*)NULL != (addr2 char*)0; }
+#define TEST_NENN(addr1, addr2) int test_neNN_##addr1##_##addr2(void) { return (addr1 char*)0 != (addr2 char*)NULL; }
+#define TEST(addr1, addr2) \
+        TEST_EQ00(addr1, addr2) \
+        TEST_EQ0N(addr1, addr2) \
+        TEST_EQN0(addr1, addr2) \
+        TEST_EQNN(addr1, addr2) \
+        TEST_NE00(addr1, addr2) \
+        TEST_NE0N(addr1, addr2) \
+        TEST_NEN0(addr1, addr2) \
+        TEST_NENN(addr1, addr2)
+
+// CHECK-LABEL: test_eq00_generic_private
+// CHECK: ret i32 1
+// CHECK-LABEL: test_eq0N_generic_private
+// CHECK: ret i32 1
+// CHECK-LABEL: test_eqN0_generic_private
+// CHECK: ret i32 1
+// CHECK-LABEL: test_eqNN_generic_private
+// CHECK: ret i32 1
+// CHECK-LABEL: test_ne00_generic_private
+// CHECK: ret i32 0
+// CHECK-LABEL: test_ne0N_generic_private
+// CHECK: ret i32 0
+// CHECK-LABEL: test_neN0_generic_private
+// CHECK: ret i32 0
+// CHECK-LABEL: test_neNN_generic_private
+// CHECK: ret i32 0
+TEST(generic, private)
+
+// CHECK-LABEL: test_eq00_generic_local
+// CHECK: ret i32 1
+// CHECK-LABEL: test_eq0N_generic_local
+// CHECK: ret i32 1
+// CHECK-LABEL: test_eqN0_generic_local
+// CHECK: ret i32 1
+// CHECK-LABEL: test_eqNN_generic_local
+// CHECK: ret i32 1
+// CHECK-LABEL: test_ne00_generic_local
+// CHECK: ret i32 0
+// CHECK-LABEL: test_ne0N_generic_local
+// CHECK: ret i32 0
+// CHECK-LABEL: test_neN0_generic_local
+// CHECK: ret i32 0
+// CHECK-LABEL: test_neNN_generic_local
+// CHECK: ret i32 0
+TEST(generic, local)
+
+// CHECK-LABEL: test_eq00_generic_global
+// CHECK: ret i32 1
+// CHECK-LABEL: test_eq0N_generic_global
+// CHECK: ret i32 1
+// CHECK-LABEL: test_eqN0_generic_global
+// CHECK: ret i32 1
+// CHECK-LABEL: test_eqNN_generic_global
+// CHECK: ret i32 1
+// CHECK-LABEL: test_ne00_generic_global
+// CHECK: ret i32 0
+// CHECK-LABEL: test_ne0N_generic_global
+// CHECK: ret i32 0
+// CHECK-LABEL: test_neN0_generic_global
+// CHECK: ret i32 0
+// CHECK-LABEL: test_neNN_generic_global
+// CHECK: ret i32 0
+TEST(generic, global)
+
+// CHECK-LABEL: test_eq00_generic_generic
+// CHECK: ret i32 1
+// CHECK-LABEL: test_eq0N_generic_generic
+// CHECK: ret i32 1
+// CHECK-LABEL: test_eqN0_generic_generic
+// CHECK: ret i32 1
+// CHECK-LABEL: test_eqNN_generic_generic
+// CHECK: ret i32 1
+// CHECK-LABEL: test_ne00_generic_generic
+// CHECK: ret i32 0
+// CHECK-LABEL: test_ne0N_generic_generic
+// CHECK: ret i32 0
+// CHECK-LABEL: test_neN0_generic_generic
+// CHECK: ret i32 0
+// CHECK-LABEL: test_neNN_generic_generic
+// CHECK: ret i32 0
+TEST(generic, generic)
+
+// CHECK-LABEL: test_eq00_constant_constant
+// CHECK: ret i32 1
+TEST_EQ00(constant, constant)
+
+// Test cast to bool.
+
+// CHECK-LABEL: cast_bool_private
+// SPIR64: icmp eq ptr %p, addrspacecast (ptr addrspace(4) null to ptr)
+// AMDGCN: icmp eq ptr addrspace(5) %p, addrspacecast (ptr null to ptr addrspace(5))
+void cast_bool_private(private char* p) {
+  if (p)
+    *p = 0;
+}
+
+// CHECK-LABEL: cast_bool_local
+// SPIR64: icmp eq ptr addrspace(3) %p, addrspacecast (ptr addrspace(4) null to ptr addrspace(3))
+// AMDGCN: icmp eq ptr addrspace(3) %p, addrspacecast (ptr null to ptr addrspace(3))
+void cast_bool_local(local char* p) {
+  if (p)
+    *p = 0;
+}
+
+// CHECK-LABEL: cast_bool_global
+// SPIR64: icmp eq ptr addrspace(1) %p, addrspacecast (ptr addrspace(4) null to ptr addrspace(1))
+// AMDGCN: icmp eq ptr addrspace(1) %p, null
+void cast_bool_global(global char* p) {
+  if (p)
+    *p = 0;
+}
+
+// CHECK-LABEL: cast_bool_constant
+// SPIR64: icmp eq ptr addrspace(2) %p, null
+// AMDGCN: icmp eq ptr addrspace(4) %p, null
+char cast_bool_constant(constant char* p) {
+  if (p)
+    return *p;
+  else
+    return 0;
+}
+
+// CHECK-LABEL: cast_bool_generic
+// SPIR64: icmp eq ptr addrspace(4) %p, null
+// AMDGCN: icmp eq ptr %p, null
+void cast_bool_generic(generic char* p) {
+  if (p)
+    *p = 0;
+}
+
+// Test initialize a struct using memset.
+// For large structures which is mostly zero, clang generats llvm.memset for
+// the zero part and store for non-zero members.
+typedef struct {
+  long a, b, c, d;
+  private char *p;
+} StructTy3;
+
+// CHECK-LABEL: test_memset_private
+// SPIR64: call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) %ptr, i8 0, i64 32, i1 false)
+// SPIR64: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr %ptr, i64 32
+// SPIR64: store ptr addrspacecast (ptr addrspace(4) null to ptr), ptr [[GEP]], align 8
+// AMDGCN: call void @llvm.memset.p5.i64(ptr addrspace(5) noundef align 8 {{.*}}, i8 0, i64 32, i1 false)
+// AMDGCN: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) %ptr, i32 32
+// AMDGCN: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) [[GEP]]
+// AMDGCN: [[GEP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) {{.*}}, i32 36
+// AMDGCN: store i32 0, ptr addrspace(5) [[GEP1]], align 4
+void test_memset_private(private StructTy3 *ptr) {
+  StructTy3 S3 = {0, 0, 0, 0, 0};
+  *ptr = S3;
+}
+
+// Test casting literal 0 to pointer.
+// A 0 literal casted to pointer should become a null pointer.
+
+// CHECK-LABEL: test_cast_0_to_local_ptr
+// SPIR64: ret ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3))
+// AMDGCN: ret ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3))
+local int* test_cast_0_to_local_ptr(void) {
+  return (local int*)0;
+}
+
+// CHECK-LABEL: test_cast_0_to_private_ptr
+// SPIR64: ptr addrspacecast (ptr addrspace(4) null to ptr)
+// AMDGCN: ret ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5))
+private int* test_cast_0_to_private_ptr(void) {
+  return (private int*)0;
+}
+
+// Test casting non-literal integer with 0 value to pointer.
+// A non-literal integer expression with 0 value is casted to a pointer with
+// zero value.
+
+// CHECK-LABEL: test_cast_int_to_ptr1_private
+// SPIR64: ret ptr null
+// AMDGCN: ret ptr addrspace(5) null
+private int* test_cast_int_to_ptr1_private(void) {
+  return (private int*)((void)0, 0);
+}
+
+// CHECK-LABEL: test_cast_int_to_ptr1_local
+// CHECK: ret ptr addrspace(3) null
+local int* test_cast_int_to_ptr1_local(void) {
+  return (local int*)((void)0, 0);
+}
+
+// CHECK-LABEL: test_cast_int_to_ptr2
+// SPIR64: ret ptr null
+// AMDGCN: ret ptr addrspace(5) null
+private int* test_cast_int_to_ptr2(void) {
+  int x = 0;
+  return (private int*)x;
+}
+
+// Test logical operations.
+// CHECK-LABEL: test_not_nullptr
+// CHECK: ret i32 1
+int test_not_nullptr(void) {
+  return !(private char*)NULL;
+}
+
+// CHECK-LABEL: test_and_nullptr
+// CHECK: ret i32 0
+int test_and_nullptr(int a) {
+  return a && ((private char*)NULL);
+}
+
+// CHECK-LABEL: test_not_private_ptr
+// SPIR64: %[[lnot:.*]] = icmp eq ptr %p, addrspacecast (ptr addrspace(4) null to ptr)
+// AMDGCN: %[[lnot:.*]] = icmp eq ptr addrspace(5) %p, addrspacecast (ptr null to ptr addrspace(5))
+// CHECK: %[[lnot_ext:.*]] = zext i1 %[[lnot]] to i32
+// CHECK: ret i32 %[[lnot_ext]]
+int test_not_private_ptr(private char* p) {
+  return !p;
+}
+
+// CHECK-LABEL: test_not_local_ptr
+// SPIR64: %[[lnot:.*]] = icmp eq ptr addrspace(3) %p, addrspacecast (ptr addrspace(4) null to ptr addrspace(3))
+// AMDGCN: %[[lnot:.*]] = icmp eq ptr addrspace(3) %p, addrspacecast (ptr null to ptr addrspace(3))
+// CHECK: %[[lnot_ext:.*]] = zext i1 %[[lnot]] to i32
+// CHECK: ret i32 %[[lnot_ext]]
+int test_not_local_ptr(local char* p) {
+  return !p;
+}
+
+
+// CHECK-LABEL: test_and_ptr
+// SPIR64: %[[tobool:.*]] = icmp ne ptr %p1, addrspacecast (ptr addrspace(4) null to ptr)
+// SPIR64: %[[tobool1:.*]] = icmp ne ptr addrspace(3) %p2, addrspacecast (ptr addrspace(4) null to ptr addrspace(3))
+// AMDGCN: %[[tobool:.*]] = icmp ne ptr addrspace(5) %p1, addrspacecast (ptr null to ptr addrspace(5))
+// AMDGCN: %[[tobool1:.*]] = icmp ne ptr addrspace(3) %p2, addrspacecast (ptr null to ptr addrspace(3))
+// CHECK: %[[res:.*]] = select i1 %[[tobool]], i1 %[[tobool1]], i1 false
+// CHECK: %[[land_ext:.*]] = zext i1 %[[res]] to i32
+// CHECK: ret i32 %[[land_ext]]
+int test_and_ptr(private char* p1, local char* p2) {
+  return p1 && p2;
+}
+
+// Test folding of null pointer in function scope.
+// CHECK-NOOPT-LABEL: test_fold_private
+// SPIR64-NOOPT:  call{{.*}} void @test_fold_callee
+// SPIR64-NOOPT:  store ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)), ptr %glob{{.*}}, align 8
+// SPIR64-NOOPT:  %{{.*}} = sub i64 %{{.*}}, ptrtoint (ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)) to i64)
+// AMDGCN-NOOPT: store ptr addrspace(1) null, ptr addrspace(5) %glob{{.*}}, align 8
+// AMDGCN-NOOPT: %{{.*}} = sub i64 %{{.*}}, 0
+// SPIR64-NOOPT:  call{{.*}} void @test_fold_callee
+// SPIR64-NOOPT:  %[[SEXT:.*]] = sext i32 ptrtoint (ptr addrspacecast (ptr addrspace(4) null to ptr) to i32) to i64
+// AMDGCN-NOOPT: %[[SEXT:.*]] = sext i32 ptrtoint (ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)) to i32) to i64
+// CHECK-NOOPT: %{{.*}} = add nsw i64 %{{.*}}, %[[SEXT]]
+// CHECK-NOOPT: %{{.*}} = sub nsw i64 %{{.*}}, 1
+void test_fold_callee(void);
+void test_fold_private(void) {
+  global int* glob = (test_fold_callee(), (global int*)(generic char*)0);
+  long x = glob - (global int*)(generic char*)0;
+  x = x + (int)(test_fold_callee(), (private int*)(generic char*)(global short*)0);
+  x = x - (int)((private int*)0 == (private int*)(generic char*)0);
+}
+
+// CHECK-NOOPT-LABEL: test_fold_local
+// CHECK-NOOPT:  call{{.*}} void @test_fold_callee
+// SPIR64-NOOPT: store ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)), ptr %glob{{.*}}, align 8
+// SPIR64-NOOPT: %{{.*}} = sub i64 %{{.*}}, ptrtoint (ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)) to i64)
+// AMDGCN-NOOPT: store ptr addrspace(1) null, ptr addrspace(5) %glob{{.*}}, align 8
+// AMDGCN-NOOPT: %{{.*}} = sub i64 %{{.*}}, 0
+// CHECK-NOOPT:  call{{.*}} void @test_fold_callee
+// SPIR64-NOOPT: %[[SEXT:.*]] = sext i32 ptrtoint (ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)) to i32) to i64
+// AMDGCN-NOOPT: %[[SEXT:.*]] = sext i32 ptrtoint (ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)) to i32) to i64
+// CHECK-NOOPT: %{{.*}} = add nsw i64 %{{.*}}, %[[SEXT]]
+// CHECK-NOOPT: %{{.*}} = sub nsw i64 %{{.*}}, 1
+void test_fold_local(void) {
+  global int* glob = (test_fold_callee(), (global int*)(generic char*)0);
+  long x = glob - (global int*)(generic char*)0;
+  x = x + (int)(test_fold_callee(), (local int*)(generic char*)(global short*)0);
+  x = x - (int)((local int*)0 == (local int*)(generic char*)0);
+}
diff --git a/clang/test/CodeGenOpenCL/preserve_vec3.cl b/clang/test/CodeGenOpenCL/preserve_vec3.cl
index e76aa81f918cb..0017169b8cf48 100644
--- a/clang/test/CodeGenOpenCL/preserve_vec3.cl
+++ b/clang/test/CodeGenOpenCL/preserve_vec3.cl
@@ -12,7 +12,7 @@ typedef float float4 __attribute__((ext_vector_type(4)));
 // CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META7:![0-9]+]] !kernel_arg_access_qual [[META8:![0-9]+]] !kernel_arg_type [[META9:![0-9]+]] !kernel_arg_base_type [[META10:![0-9]+]] !kernel_arg_type_qual [[META11:![0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT:    [[EXTRACTVEC1_I:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    [[EXTRACTVEC1_I:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> <float undef, float poison, float poison>, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 // CHECK-NEXT:    store <4 x float> [[EXTRACTVEC1_I]], ptr addrspace(1) [[B]], align 16, !tbaa [[CHAR_TBAA12:![0-9]+]]
 // CHECK-NEXT:    ret void
 //
@@ -24,7 +24,7 @@ void kernel foo(global float3 *a, global float3 *b) {
 // CHECK-SAME: ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[A:%.*]], ptr addrspace(1) noundef readonly align 16 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META7]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META13:![0-9]+]] !kernel_arg_base_type [[META14:![0-9]+]] !kernel_arg_type_qual [[META11]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[CHAR_TBAA12]]
-// CHECK-NEXT:    [[EXTRACTVEC_I:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    [[EXTRACTVEC_I:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> <float undef, float poison, float poison>, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 // CHECK-NEXT:    store <4 x float> [[EXTRACTVEC_I]], ptr addrspace(1) [[A]], align 16, !tbaa [[CHAR_TBAA12]]
 // CHECK-NEXT:    ret void
 //
@@ -60,7 +60,7 @@ void kernel float3_to_double2(global float3 *a, global double2 *b) {
 // CHECK-SAME: ptr addrspace(1) noundef writeonly align 8 captures(none) initializes((0, 8)) [[A:%.*]], ptr addrspace(1) noundef readonly align 8 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META7]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META18:![0-9]+]] !kernel_arg_type_qual [[META11]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = load <3 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[CHAR_TBAA12]]
-// CHECK-NEXT:    [[EXTRACTVEC_I:%.*]] = shufflevector <3 x i16> [[TMP0]], <3 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    [[EXTRACTVEC_I:%.*]] = shufflevector <3 x i16> [[TMP0]], <3 x i16> <i16 undef, i16 poison, i16 poison>, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 // CHECK-NEXT:    store <4 x i16> [[EXTRACTVEC_I]], ptr addrspace(1) [[A]], align 8, !tbaa [[CHAR_TBAA12]]
 // CHECK-NEXT:    ret void
 //
@@ -71,8 +71,8 @@ void kernel char8_to_short3(global short3 *a, global char8 *b) {
 // CHECK-LABEL: define dso_local spir_func void @from_char3(
 // CHECK-SAME: <3 x i8> noundef [[A:%.*]], ptr addrspace(1) noundef writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x i8> [[A]], <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
-// CHECK-NEXT:    store <4 x i8> [[EXTRACTVEC]], ptr addrspace(1) [[OUT]], align 4, !tbaa [[INT_TBAA3:![0-9]+]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <3 x i8> [[A]], <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    store <4 x i8> [[TMP0]], ptr addrspace(1) [[OUT]], align 4, !tbaa [[INT_TBAA3:![0-9]+]]
 // CHECK-NEXT:    ret void
 //
 void from_char3(char3 a, global int *out) {
@@ -82,8 +82,8 @@ void from_char3(char3 a, global int *out) {
 // CHECK-LABEL: define dso_local spir_func void @from_short3(
 // CHECK-SAME: <3 x i16> noundef [[A:%.*]], ptr addrspace(1) noundef writeonly captures(none) initializes((0, 8)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x i16> [[A]], <3 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
-// CHECK-NEXT:    store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[LONG_TBAA19:![0-9]+]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <3 x i16> [[A]], <3 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[LONG_TBAA19:![0-9]+]]
 // CHECK-NEXT:    ret void
 //
 void from_short3(short3 a, global long *out) {
@@ -94,7 +94,8 @@ void from_short3(short3 a, global long *out) {
 // CHECK-SAME: i32 noundef [[A:%.*]], ptr addrspace(1) noundef writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A]] to <4 x i8>
-// CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    [[ASTYPE:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x i8> [[ASTYPE]], <3 x i8> <i8 undef, i8 poison, i8 poison>, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 // CHECK-NEXT:    store <4 x i8> [[EXTRACTVEC]], ptr addrspace(1) [[OUT]], align 4, !tbaa [[CHAR_TBAA12]]
 // CHECK-NEXT:    ret void
 //
@@ -106,7 +107,8 @@ void scalar_to_char3(int a, global char3 *out) {
 // CHECK-SAME: i64 noundef [[A:%.*]], ptr addrspace(1) noundef writeonly captures(none) initializes((0, 8)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A]] to <4 x i16>
-// CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    [[ASTYPE:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x i16> [[ASTYPE]], <3 x i16> <i16 undef, i16 poison, i16 poison>, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 // CHECK-NEXT:    store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[CHAR_TBAA12]]
 // CHECK-NEXT:    ret void
 //
diff --git a/clang/test/DebugInfo/Generic/bit-int.c b/clang/test/DebugInfo/Generic/bit-int.c
new file mode 100644
index 0000000000000..88ecc139eee9f
--- /dev/null
+++ b/clang/test/DebugInfo/Generic/bit-int.c
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -x c++ %s -debug-info-kind=standalone -gno-column-info -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -x c   %s -debug-info-kind=standalone -gno-column-info -emit-llvm -o - | FileCheck %s
+
+unsigned _BitInt(17) a;
+_BitInt(2) b;
+
+// CHECK: !DIBasicType(name: "_BitInt(2)", size: 8, dataSize: 2, encoding: DW_ATE_signed)
+// CHECK: !DIBasicType(name: "unsigned _BitInt(17)", size: 32,  dataSize: 17, encoding: DW_ATE_unsigned)
diff --git a/clang/test/DebugInfo/Generic/macro-info.c b/clang/test/DebugInfo/Generic/macro-info.c
new file mode 100644
index 0000000000000..ec49eb5d65f9c
--- /dev/null
+++ b/clang/test/DebugInfo/Generic/macro-info.c
@@ -0,0 +1,35 @@
+// RUN: %clang_cc1 %s -debug-info-kind=standalone -emit-llvm -o - | FileCheck %s
+
+#define GLOBAL(num) global## num
+#define DECL_GLOBAL(x) int x
+#define SAME_ORDER(x, y) x; y
+#define SWAP_ORDER(x,y) y; x
+
+
+
+SAME_ORDER(
+  int
+// CHECK: DIGlobalVariable(name: "global",{{.*}} line: [[@LINE+1]]
+    GLOBAL  // <- global
+      () = 42,
+  const char* s() {
+// CHECK: DIGlobalVariable({{.*}}line: [[@LINE+1]],{{.*}} type: [[TYPEID:![0-9]+]]
+    return "1234567890";
+  }
+)
+
+SWAP_ORDER(
+  int GLOBAL(  // <- global2
+    2) = 43,
+// CHECK: DIGlobalVariable(name: "global3",{{.*}} line: [[@LINE+3]]
+// CHECK: DIGlobalVariable(name: "global2",{{.*}} line: [[@LINE-3]]
+  DECL_GLOBAL(
+    GLOBAL(  // <- global3
+      3)) = 44
+);
+
+
+DECL_GLOBAL(
+// CHECK: DIGlobalVariable(name: "global4",{{.*}} line: [[@LINE+1]]
+  GLOBAL(  // <- global4
+    4));
diff --git a/clang/include/clang/Driver/aarch64-mlr-for-calls-only.c b/clang/test/Driver/aarch64-mlr-for-calls-only.c
similarity index 100%
rename from clang/include/clang/Driver/aarch64-mlr-for-calls-only.c
rename to clang/test/Driver/aarch64-mlr-for-calls-only.c
diff --git a/clang/test/Driver/aarch64-ptrauth.c b/clang/test/Driver/aarch64-ptrauth.c
index b080a77195c8c..a67e98fdda714 100644
--- a/clang/test/Driver/aarch64-ptrauth.c
+++ b/clang/test/Driver/aarch64-ptrauth.c
@@ -4,7 +4,8 @@
 // NONE:     "-cc1"
 // NONE-NOT: "-fptrauth-
 
-// RUN: %clang -### -c --target=aarch64 \
+//// -fptauth-* driver flags on Linux are only supported with pauthtest ABI.
+// RUN: %clang -### -c --target=aarch64-linux -mabi=pauthtest \
 // RUN:   -fno-ptrauth-intrinsics -fptrauth-intrinsics \
 // RUN:   -fno-ptrauth-calls -fptrauth-calls \
 // RUN:   -fno-ptrauth-returns -fptrauth-returns \
@@ -15,9 +16,43 @@
 // RUN:   -fno-ptrauth-indirect-gotos -fptrauth-indirect-gotos \
 // RUN:   -fno-ptrauth-init-fini -fptrauth-init-fini \
 // RUN:   -fno-ptrauth-init-fini-address-discrimination -fptrauth-init-fini-address-discrimination \
+// RUN:   -fno-ptrauth-elf-got -fptrauth-elf-got \
 // RUN:   -fno-aarch64-jump-table-hardening -faarch64-jump-table-hardening \
-// RUN:   %s 2>&1 | FileCheck %s --check-prefix=ALL
-// ALL: "-cc1"{{.*}} "-fptrauth-intrinsics" "-fptrauth-calls" "-fptrauth-returns" "-fptrauth-auth-traps" "-fptrauth-vtable-pointer-address-discrimination" "-fptrauth-vtable-pointer-type-discrimination" "-fptrauth-type-info-vtable-pointer-discrimination" "-fptrauth-indirect-gotos" "-fptrauth-init-fini" "-fptrauth-init-fini-address-discrimination" "-faarch64-jump-table-hardening"
+// RUN:   %s 2>&1 | FileCheck %s --check-prefix=ALL-LINUX-PAUTHABI
+// RUN: %clang -### -c --target=aarch64-linux-pauthtest \
+// RUN:   -fno-ptrauth-intrinsics -fptrauth-intrinsics \
+// RUN:   -fno-ptrauth-calls -fptrauth-calls \
+// RUN:   -fno-ptrauth-returns -fptrauth-returns \
+// RUN:   -fno-ptrauth-auth-traps -fptrauth-auth-traps \
+// RUN:   -fno-ptrauth-vtable-pointer-address-discrimination -fptrauth-vtable-pointer-address-discrimination \
+// RUN:   -fno-ptrauth-vtable-pointer-type-discrimination -fptrauth-vtable-pointer-type-discrimination \
+// RUN:   -fno-ptrauth-type-info-vtable-pointer-discrimination -fptrauth-type-info-vtable-pointer-discrimination \
+// RUN:   -fno-ptrauth-indirect-gotos -fptrauth-indirect-gotos \
+// RUN:   -fno-ptrauth-init-fini -fptrauth-init-fini \
+// RUN:   -fno-ptrauth-init-fini-address-discrimination -fptrauth-init-fini-address-discrimination \
+// RUN:   -fno-ptrauth-elf-got -fptrauth-elf-got \
+// RUN:   -fno-aarch64-jump-table-hardening -faarch64-jump-table-hardening \
+// RUN:   %s 2>&1 | FileCheck %s --check-prefix=ALL-LINUX-PAUTHABI
+// ALL-LINUX-PAUTHABI: "-cc1"{{.*}} "-fptrauth-intrinsics" "-fptrauth-calls" "-fptrauth-returns" "-fptrauth-auth-traps" "-fptrauth-vtable-pointer-address-discrimination" "-fptrauth-vtable-pointer-type-discrimination" "-fptrauth-type-info-vtable-pointer-discrimination" "-fptrauth-indirect-gotos" "-fptrauth-init-fini" "-fptrauth-init-fini-address-discrimination" "-fptrauth-elf-got"{{.*}} "-faarch64-jump-table-hardening"
+
+// RUN: %clang -### -c --target=aarch64-linux \
+// RUN:   -fno-aarch64-jump-table-hardening -faarch64-jump-table-hardening \
+// RUN:   %s 2>&1 | FileCheck %s --check-prefix=ALL-LINUX
+// ALL-LINUX: "-cc1"{{.*}} "-faarch64-jump-table-hardening"
+
+//// Some -fptrauth-* flags are supported for ARM64 Darwin.
+// RUN: %clang -### -c --target=arm64-darwin \
+// RUN:   -fno-ptrauth-intrinsics -fptrauth-intrinsics \
+// RUN:   -fno-ptrauth-calls -fptrauth-calls \
+// RUN:   -fno-ptrauth-returns -fptrauth-returns \
+// RUN:   -fno-ptrauth-auth-traps -fptrauth-auth-traps \
+// RUN:   -fno-ptrauth-vtable-pointer-address-discrimination -fptrauth-vtable-pointer-address-discrimination \
+// RUN:   -fno-ptrauth-vtable-pointer-type-discrimination -fptrauth-vtable-pointer-type-discrimination \
+// RUN:   -fno-ptrauth-type-info-vtable-pointer-discrimination -fptrauth-type-info-vtable-pointer-discrimination \
+// RUN:   -fno-ptrauth-indirect-gotos -fptrauth-indirect-gotos \
+// RUN:   -fno-aarch64-jump-table-hardening -faarch64-jump-table-hardening \
+// RUN:   %s 2>&1 | FileCheck %s --check-prefix=ALL-DARWIN
+// ALL-DARWIN: "-cc1"{{.*}} "-fptrauth-intrinsics" "-fptrauth-calls" "-fptrauth-returns" "-fptrauth-auth-traps" "-fptrauth-vtable-pointer-address-discrimination" "-fptrauth-vtable-pointer-type-discrimination" "-fptrauth-type-info-vtable-pointer-discrimination" "-fptrauth-indirect-gotos"{{.*}} "-faarch64-jump-table-hardening"
 
 // RUN: %clang -### -c --target=aarch64-linux -mabi=pauthtest %s 2>&1 | FileCheck %s --check-prefix=PAUTHABI1
 // RUN: %clang -### -c --target=aarch64-linux-pauthtest %s 2>&1 | FileCheck %s --check-prefix=PAUTHABI1
@@ -40,7 +75,7 @@
 // RUN:   -fno-aarch64-jump-table-hardening %s 2>&1 | FileCheck %s --check-prefix=PAUTHABI2
 
 //// Non-linux OS: pauthtest ABI has no effect in terms of passing ptrauth cc1 flags.
-//// An error about unsupported ABI will be emitted later in pipeline (see ERR2 below)
+//// An error about unsupported ABI will be emitted later in pipeline (see ERR3 below)
 // RUN: %clang -### -c --target=aarch64 -mabi=pauthtest %s 2>&1 | FileCheck %s --check-prefix=PAUTHABI2
 
 // PAUTHABI2:      "-cc1"
@@ -55,10 +90,11 @@
 // PAUTHABI3-NOT:  "-fptrauth-
 // PAUTHABI3-NOT: "-faarch64-jump-table-hardening"
 
-// RUN: not %clang -### -c --target=x86_64 -fptrauth-intrinsics -fptrauth-calls -fptrauth-returns -fptrauth-auth-traps \
+//// Non-pauthtest ABI.
+// RUN: not %clang -### -c --target=aarch64-linux -fptrauth-intrinsics -fptrauth-calls -fptrauth-returns -fptrauth-auth-traps \
 // RUN:   -fptrauth-vtable-pointer-address-discrimination -fptrauth-vtable-pointer-type-discrimination \
 // RUN:   -fptrauth-type-info-vtable-pointer-discrimination -fptrauth-indirect-gotos -fptrauth-init-fini \
-// RUN:   -fptrauth-init-fini-address-discrimination -faarch64-jump-table-hardening %s 2>&1 | FileCheck %s --check-prefix=ERR1
+// RUN:   -fptrauth-init-fini-address-discrimination -fptrauth-elf-got %s 2>&1 | FileCheck %s --check-prefix=ERR1
 // ERR1:      error: unsupported option '-fptrauth-intrinsics' for target '{{.*}}'
 // ERR1-NEXT: error: unsupported option '-fptrauth-calls' for target '{{.*}}'
 // ERR1-NEXT: error: unsupported option '-fptrauth-returns' for target '{{.*}}'
@@ -69,59 +105,64 @@
 // ERR1-NEXT: error: unsupported option '-fptrauth-indirect-gotos' for target '{{.*}}'
 // ERR1-NEXT: error: unsupported option '-fptrauth-init-fini' for target '{{.*}}'
 // ERR1-NEXT: error: unsupported option '-fptrauth-init-fini-address-discrimination' for target '{{.*}}'
-// ERR1-NEXT: error: unsupported option '-faarch64-jump-table-hardening' for target '{{.*}}'
+// ERR1-NEXT: error: unsupported option '-fptrauth-elf-got' for target '{{.*}}'
 
+//// Non-AArch64.
+// RUN: not %clang -### -c --target=x86_64-linux -faarch64-jump-table-hardening %s 2>&1 | FileCheck %s --check-prefix=ERR2
+// ERR2: error: unsupported option '-faarch64-jump-table-hardening' for target '{{.*}}'
+
+//// Only support PAuth ABI for Linux as for now.
+// RUN: not %clang -c --target=aarch64 -mabi=pauthtest %s 2>&1 | FileCheck %s --check-prefix=ERR3
+// ERR3: error: unknown target ABI 'pauthtest'
 
-// RUN: not %clang -c --target=aarch64 -mabi=pauthtest %s 2>&1 | FileCheck %s --check-prefix=ERR2
 //// The ABI is not specified explicitly, and for non-Linux pauthtest environment does not correspond
 //// to pauthtest ABI (each OS target defines this behavior separately). Do not emit an error.
-// RUN:     %clang -c --target=aarch64-pauthtest       %s -o /dev/null
-// ERR2: error: unknown target ABI 'pauthtest'
+// RUN: %clang -c --target=aarch64-pauthtest %s -o /dev/null
 
 //// PAuth ABI is encoded as environment part of the triple, so don't allow to explicitly set other environments.
-// RUN: not %clang -### -c --target=aarch64-linux-gnu -mabi=pauthtest %s 2>&1 | FileCheck %s --check-prefix=ERR3
-// ERR3: error: unsupported option '-mabi=pauthtest' for target 'aarch64-unknown-linux-gnu'
+// RUN: not %clang -### -c --target=aarch64-linux-gnu -mabi=pauthtest %s 2>&1 | FileCheck %s --check-prefix=ERR4
+// ERR4: error: unsupported option '-mabi=pauthtest' for target 'aarch64-unknown-linux-gnu'
 // RUN: %clang -### -c --target=aarch64-linux-pauthtest -mabi=pauthtest %s
 
 //// The only branch protection option compatible with PAuthABI is BTI.
 // RUN: not %clang -### -c --target=aarch64-linux -mabi=pauthtest -mbranch-protection=pac-ret %s 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=ERR4_1
+// RUN:   FileCheck %s --check-prefix=ERR5_1
 // RUN: not %clang -### -c --target=aarch64-linux-pauthtest       -mbranch-protection=pac-ret %s 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=ERR4_1
+// RUN:   FileCheck %s --check-prefix=ERR5_1
 // RUN: not %clang -### -c --target=aarch64 -fptrauth-returns     -mbranch-protection=pac-ret %s 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=ERR4_2
-// ERR4_1: error: unsupported option '-mbranch-protection=pac-ret' for target 'aarch64-unknown-linux-pauthtest'
-// ERR4_2: error: the combination of '-mbranch-protection=pac-ret' and '-fptrauth-returns' is incompatible
+// RUN:   FileCheck %s --check-prefix=ERR5_2
+// ERR5_1: error: unsupported option '-mbranch-protection=pac-ret' for target 'aarch64-unknown-linux-pauthtest'
+// ERR5_2: error: the combination of '-mbranch-protection=pac-ret' and '-fptrauth-returns' is incompatible
 
 // RUN: not %clang -### -c --target=aarch64-linux -mabi=pauthtest -mbranch-protection=gcs %s 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=ERR5_1
+// RUN:   FileCheck %s --check-prefix=ERR6_1
 // RUN: not %clang -### -c --target=aarch64-linux-pauthtest       -mbranch-protection=gcs %s 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=ERR5_1
+// RUN:   FileCheck %s --check-prefix=ERR6_1
 // RUN: not %clang -### -c --target=aarch64 -fptrauth-returns     -mbranch-protection=gcs %s 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=ERR5_2
-// ERR5_1: error: unsupported option '-mbranch-protection=gcs' for target 'aarch64-unknown-linux-pauthtest'
-// ERR5_2: error: the combination of '-mbranch-protection=gcs' and '-fptrauth-returns' is incompatible
+// RUN:   FileCheck %s --check-prefix=ERR6_2
+// ERR6_1: error: unsupported option '-mbranch-protection=gcs' for target 'aarch64-unknown-linux-pauthtest'
+// ERR6_2: error: the combination of '-mbranch-protection=gcs' and '-fptrauth-returns' is incompatible
 
 // RUN: not %clang -### -c --target=aarch64-linux -mabi=pauthtest -mbranch-protection=standard %s 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=ERR6_1
+// RUN:   FileCheck %s --check-prefix=ERR7_1
 // RUN: not %clang -### -c --target=aarch64-linux-pauthtest       -mbranch-protection=standard %s 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=ERR6_1
+// RUN:   FileCheck %s --check-prefix=ERR7_1
 // RUN: not %clang -### -c --target=aarch64 -fptrauth-returns     -mbranch-protection=standard %s 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=ERR6_2
-// ERR6_1: error: unsupported option '-mbranch-protection=standard' for target 'aarch64-unknown-linux-pauthtest'
-// ERR6_2: error: the combination of '-mbranch-protection=standard' and '-fptrauth-returns' is incompatible
+// RUN:   FileCheck %s --check-prefix=ERR7_2
+// ERR7_1: error: unsupported option '-mbranch-protection=standard' for target 'aarch64-unknown-linux-pauthtest'
+// ERR7_2: error: the combination of '-mbranch-protection=standard' and '-fptrauth-returns' is incompatible
 
 // RUN: not %clang -### -c --target=aarch64-linux -mabi=pauthtest -msign-return-address=all %s 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=ERR7
+// RUN:   FileCheck %s --check-prefix=ERR8
 // RUN: not %clang -### -c --target=aarch64-linux-pauthtest       -msign-return-address=all %s 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=ERR7
-// ERR7: error: unsupported option '-msign-return-address=all' for target 'aarch64-unknown-linux-pauthtest'
+// RUN:   FileCheck %s --check-prefix=ERR8
+// ERR8: error: unsupported option '-msign-return-address=all' for target 'aarch64-unknown-linux-pauthtest'
 
 // RUN: not %clang -### -c --target=aarch64-linux -mabi=pauthtest -msign-return-address=non-leaf %s 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=ERR8
+// RUN:   FileCheck %s --check-prefix=ERR9
 // RUN: not %clang -### -c --target=aarch64-linux-pauthtest       -msign-return-address=non-leaf %s 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=ERR8
-// ERR8: error: unsupported option '-msign-return-address=non-leaf' for target 'aarch64-unknown-linux-pauthtest'
+// RUN:   FileCheck %s --check-prefix=ERR9
+// ERR9: error: unsupported option '-msign-return-address=non-leaf' for target 'aarch64-unknown-linux-pauthtest'
 
 // RUN: %clang -### -c --target=aarch64-linux -mabi=pauthtest -msign-return-address=none %s
 // RUN: %clang -### -c --target=aarch64-linux-pauthtest       -msign-return-address=none %s
diff --git a/clang/test/Driver/amdgpu-macros.cl b/clang/test/Driver/amdgpu-macros.cl
index 9fda2f3657430..6d049e7a9bc39 100644
--- a/clang/test/Driver/amdgpu-macros.cl
+++ b/clang/test/Driver/amdgpu-macros.cl
@@ -154,26 +154,10 @@
 // ARCH-GCN-DAG: #define __[[CPU]]__ 1
 // ARCH-GCN-DAG: #define __[[FAMILY]]__ 1
 // ARCH-GCN-DAG: #define __amdgcn_processor__ "[[CPU]]"
-// ARCH-GCN-DAG: #define __AMDGCN_WAVEFRONT_SIZE [[WAVEFRONT_SIZE]]
 // ARCH-GCN-DAG: #define __GCC_DESTRUCTIVE_SIZE 128
 // ARCH-GCN-DAG: #define __GCC_CONSTRUCTIVE_SIZE 128
 // UNSAFEFPATOMIC-DAG: #define __AMDGCN_UNSAFE_FP_ATOMICS__ 1
 
-// RUN: %clang -E -dM -target amdgcn -mcpu=gfx906 -mwavefrontsize64 \
-// RUN:   %s 2>&1 | FileCheck --check-prefix=WAVE64 %s
-// RUN: %clang -E -dM -target amdgcn -mcpu=gfx1010 -mwavefrontsize64 \
-// RUN:   %s 2>&1 | FileCheck --check-prefix=WAVE64 %s
-// RUN: %clang -E -dM -target amdgcn -mcpu=gfx906 -mwavefrontsize64 \
-// RUN:   -mno-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=WAVE64 %s
-// RUN: %clang -E -dM -target amdgcn -mcpu=gfx1010 -mwavefrontsize64 \
-// RUN:   -mno-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=WAVE32 %s
-// RUN: %clang -E -dM -target amdgcn -mcpu=gfx906 -mno-wavefrontsize64 \
-// RUN:   -mwavefrontsize64 %s 2>&1 | FileCheck --check-prefix=WAVE64 %s
-// RUN: %clang -E -dM -target amdgcn -mcpu=gfx1010 -mno-wavefrontsize64 \
-// RUN:   -mwavefrontsize64 %s 2>&1 | FileCheck --check-prefix=WAVE64 %s
-// WAVE64-DAG: #define __AMDGCN_WAVEFRONT_SIZE 64
-// WAVE32-DAG: #define __AMDGCN_WAVEFRONT_SIZE 32
-
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx906 \
 // RUN:   %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx906 -mcumode \
diff --git a/clang/test/Driver/dxc_enable16bittypes.hlsl b/clang/test/Driver/dxc_enable16bittypes.hlsl
new file mode 100644
index 0000000000000..4cd1d2fd402b3
--- /dev/null
+++ b/clang/test/Driver/dxc_enable16bittypes.hlsl
@@ -0,0 +1,7 @@
+// RUN: %clang_dxc -enable-16bit-types -T lib_6_7 %s -### %s 2>&1 | FileCheck %s
+
+// Make sure enable-16bit-types flag translates into '-fnative-half-type' and 'fnative-int16-type'
+// CHECK: "-fnative-half-type"
+// CHECK-SAME: "-fnative-int16-type"
+
+// expected-no-diagnostics
diff --git a/clang/test/Driver/dxc_fcgl.hlsl b/clang/test/Driver/dxc_fcgl.hlsl
index fe65124c197bc..4db7ada9622c5 100644
--- a/clang/test/Driver/dxc_fcgl.hlsl
+++ b/clang/test/Driver/dxc_fcgl.hlsl
@@ -1,9 +1,5 @@
-// RUN: not %clang_dxc -fcgl -T lib_6_7 foo.hlsl -### %s 2>&1 | FileCheck %s
-// RUN: %clang_dxc -fcgl -T lib_6_7 %s -Xclang -verify
+// RUN: %clang_dxc -fcgl -T lib_6_7 %s -### %s 2>&1 | FileCheck %s
 
 // Make sure fcgl option flag which translated into "-emit-llvm" "-disable-llvm-passes".
 // CHECK: "-emit-llvm"
 // CHECK-SAME: "-disable-llvm-passes"
-
-// Make sure fcgl option not generate any diagnostics.
-// expected-no-diagnostics
diff --git a/clang/test/Driver/fat-archive-unbundle-ext.c b/clang/test/Driver/fat-archive-unbundle-ext.c
index e797acccf02b4..d658ad05b345c 100644
--- a/clang/test/Driver/fat-archive-unbundle-ext.c
+++ b/clang/test/Driver/fat-archive-unbundle-ext.c
@@ -1,5 +1,5 @@
 // REQUIRES: x86-registered-target
-// UNSUPPORTED: target={{.*-windows.*}}, target={{.*}}-macosx{{.*}}, target={{.*-darwin.*}}, target={{.*}}-aix{{.*}}
+// UNSUPPORTED: target={{.*-windows.*}}, target={{.*}}-macosx{{.*}}, target={{.*-darwin.*}}, target={{.*}}-aix{{.*}}, target={{.*}}-zos{{.*}}
 
 // Generate dummy fat object
 // RUN: %clang -O0 --target=%itanium_abi_triple %s -c -o %t.host.o
diff --git a/clang/test/Driver/fuchsia.c b/clang/test/Driver/fuchsia.c
index d0fec18e13a20..99e5018117924 100644
--- a/clang/test/Driver/fuchsia.c
+++ b/clang/test/Driver/fuchsia.c
@@ -130,6 +130,11 @@
 // RUN:     -resource-dir=%S/Inputs/resource_dir_with_per_target_subdir \
 // RUN:     -fuse-ld=ld \
 // RUN:     | FileCheck %s -check-prefix=CHECK-SAFESTACK
+// RUN: %clang -### %s --target=x86_64-unknown-fuchsia -m32 \
+// RUN:     -fsanitize=safe-stack 2>&1 \
+// RUN:     -resource-dir=%S/Inputs/resource_dir_with_per_target_subdir \
+// RUN:     -fuse-ld=ld \
+// RUN:     | FileCheck %s -check-prefix=CHECK-SAFESTACK
 // CHECK-SAFESTACK: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-SAFESTACK: "-fsanitize=safe-stack"
 // CHECK-SAFESTACK-NOT: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}x86_64-unknown-fuchsia{{/|\\\\}}libclang_rt.safestack.a"
diff --git a/clang/test/Driver/hip-macros.hip b/clang/test/Driver/hip-macros.hip
index 516e01a6c4743..4c460d50bf39a 100644
--- a/clang/test/Driver/hip-macros.hip
+++ b/clang/test/Driver/hip-macros.hip
@@ -1,27 +1,4 @@
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang -E -dM --offload-arch=gfx906 -mwavefrontsize64 \
-// RUN:   --cuda-device-only -nogpuinc -nogpulib \
-// RUN:   %s 2>&1 | FileCheck --check-prefixes=WAVE64 %s
-// RUN: %clang -E -dM --offload-arch=gfx1010 -mwavefrontsize64 \
-// RUN:   --cuda-device-only -nogpuinc -nogpulib \
-// RUN:   %s 2>&1 | FileCheck --check-prefixes=WAVE64 %s
-// RUN: %clang -E -dM --offload-arch=gfx906 -mwavefrontsize64 \
-// RUN:   --cuda-device-only -nogpuinc -nogpulib \
-// RUN:   -mno-wavefrontsize64 %s 2>&1 | FileCheck --check-prefixes=WAVE64 %s
-// RUN: %clang -E -dM --offload-arch=gfx1010 -mwavefrontsize64 \
-// RUN:   --cuda-device-only -nogpuinc -nogpulib \
-// RUN:   -mno-wavefrontsize64 %s 2>&1 | FileCheck --check-prefixes=WAVE32 %s
-// RUN: %clang -E -dM --offload-arch=gfx906 -mno-wavefrontsize64 \
-// RUN:   --cuda-device-only -nogpuinc -nogpulib \
-// RUN:   -mwavefrontsize64 %s 2>&1 | FileCheck --check-prefixes=WAVE64 %s
-// RUN: %clang -E -dM --offload-arch=gfx1010 -mno-wavefrontsize64 \
-// RUN:   --cuda-device-only -nogpuinc -nogpulib \
-// RUN:   -mwavefrontsize64 %s 2>&1 | FileCheck --check-prefixes=WAVE64 %s
-// WAVE64-DAG: #define __AMDGCN_WAVEFRONT_SIZE__ 64
-// WAVE32-DAG: #define __AMDGCN_WAVEFRONT_SIZE__ 32
-// WAVE64-DAG: #define __AMDGCN_WAVEFRONT_SIZE 64
-// WAVE32-DAG: #define __AMDGCN_WAVEFRONT_SIZE 32
-
 // RUN: %clang -E -dM --offload-arch=gfx906 --cuda-device-only -nogpuinc -nogpulib \
 // RUN:   %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s
 // RUN: %clang -E -dM --offload-arch=gfx906 --cuda-device-only -nogpuinc -nogpulib -mcumode \
diff --git a/clang/test/Driver/hip-wavefront-size-deprecation-diagnostics.hip b/clang/test/Driver/hip-wavefront-size-deprecation-diagnostics.hip
deleted file mode 100644
index 8a60f5a150048..0000000000000
--- a/clang/test/Driver/hip-wavefront-size-deprecation-diagnostics.hip
+++ /dev/null
@@ -1,115 +0,0 @@
-// REQUIRES: amdgpu-registered-target
-// RUN: %clang -xhip --offload-arch=gfx1030 --offload-host-only -pedantic -nogpuinc -nogpulib -nobuiltininc -fsyntax-only -Xclang -verify %s
-// RUN: %clang -xhip --offload-arch=gfx1030 --offload-device-only -pedantic -nogpuinc -nogpulib -nobuiltininc -fsyntax-only -Xclang -verify %s
-
-// Test that deprecation warnings for the wavefront size macro are emitted properly.
-
-#define WRAPPED __AMDGCN_WAVEFRONT_SIZE__
-
-#define DOUBLE_WRAPPED (WRAPPED)
-
-template <bool C, class T = void> struct my_enable_if {};
-
-template <class T> struct my_enable_if<true, T> {
-  typedef T type;
-};
-
-__attribute__((host, device)) void use(int, const char*);
-
-template<int N> __attribute__((host, device)) int templatify(int x) {
-    return x + N;
-}
-
-__attribute__((device)) const int GlobalConst = __AMDGCN_WAVEFRONT_SIZE__; // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-constexpr int GlobalConstExpr = __AMDGCN_WAVEFRONT_SIZE__; // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-
-#if defined(__HIP_DEVICE_COMPILE__) && (__AMDGCN_WAVEFRONT_SIZE__ == 64) // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-int foo(void);
-#endif
-
-__attribute__((device)) int device_var = __AMDGCN_WAVEFRONT_SIZE__; // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-
-__attribute__((device))
-void device_fun() {
-    use(__AMDGCN_WAVEFRONT_SIZE, "device function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE' has been marked as deprecated}}
-    use(__AMDGCN_WAVEFRONT_SIZE__, "device function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-    use(WRAPPED, "device function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-    use(DOUBLE_WRAPPED, "device function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-    use(templatify<__AMDGCN_WAVEFRONT_SIZE__>(42), "device function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-    use(GlobalConst, "device function");
-    use(GlobalConstExpr, "device function");
-}
-
-__attribute__((global))
-void global_fun() {
-    // no warnings expected
-    use(__AMDGCN_WAVEFRONT_SIZE, "global function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE' has been marked as deprecated}}
-    use(__AMDGCN_WAVEFRONT_SIZE__, "global function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-    use(WRAPPED, "global function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-    use(DOUBLE_WRAPPED, "global function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-    use(templatify<__AMDGCN_WAVEFRONT_SIZE__>(42), "global function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-}
-
-int host_var = __AMDGCN_WAVEFRONT_SIZE__; // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-int host_var_alt = __AMDGCN_WAVEFRONT_SIZE; // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE' has been marked as deprecated}}
-int host_var_wrapped = WRAPPED; // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-int host_var_double_wrapped = DOUBLE_WRAPPED; // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-
-__attribute__((host))
-void host_fun() {
-    use(__AMDGCN_WAVEFRONT_SIZE, "host function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE' has been marked as deprecated}}
-    use(__AMDGCN_WAVEFRONT_SIZE__, "host function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-    use(WRAPPED, "host function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-    use(DOUBLE_WRAPPED, "host function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-    use(templatify<__AMDGCN_WAVEFRONT_SIZE__>(42), "host function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-    use(GlobalConst, "host function");
-    use(GlobalConstExpr, "host function");
-}
-
-__attribute((host, device))
-void host_device_fun() {
-    use(__AMDGCN_WAVEFRONT_SIZE__, "host device function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-    use(WRAPPED, "host device function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-    use(DOUBLE_WRAPPED, "host device function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-    use(templatify<__AMDGCN_WAVEFRONT_SIZE__>(42), "host device function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-}
-
-template <unsigned int OuterWarpSize = __AMDGCN_WAVEFRONT_SIZE__> // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-class FunSelector {
-public:
-    template<unsigned int FunWarpSize = OuterWarpSize>
-    __attribute__((device))
-    auto fun(void)
-        -> typename my_enable_if<(FunWarpSize <= __AMDGCN_WAVEFRONT_SIZE__), void>::type // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-    {
-        use(1, "yay!");
-    }
-
-    template<unsigned int FunWarpSize = OuterWarpSize>
-    __attribute__((device))
-    auto fun(void)
-        -> typename my_enable_if<(FunWarpSize > __AMDGCN_WAVEFRONT_SIZE__), void>::type // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-    {
-        use(0, "nay!");
-    }
-};
-
-__attribute__((device))
-void device_fun_selector_user() {
-    FunSelector<> f;
-    f.fun<>();
-    f.fun<1>();
-    f.fun<1000>();
-
-    my_enable_if<(1 <= __AMDGCN_WAVEFRONT_SIZE__), int>::type x = 42; // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-}
-
-__attribute__((device)) my_enable_if<(1 <= __AMDGCN_WAVEFRONT_SIZE__), int>::type DeviceFunTemplateRet(void) { // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-    return 42;
-}
-
-__attribute__((device)) int DeviceFunTemplateArg(my_enable_if<(1 <= __AMDGCN_WAVEFRONT_SIZE__), int>::type x) { // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}}
-    return x;
-}
-
-// expected-note@* 0+ {{macro marked 'deprecated' here}}
diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c
index 3717c449d6601..f1660b1afb518 100644
--- a/clang/test/Driver/x86-target-features.c
+++ b/clang/test/Driver/x86-target-features.c
@@ -304,13 +304,6 @@
 // AMX-COMPLEX: "-target-feature" "+amx-complex"
 // NO-AMX-COMPLEX: "-target-feature" "-amx-complex"
 
-// RUN: %clang --target=x86_64-unknown-linux-gnu -mamx-transpose %s \
-// RUN: -### -o %t.o 2>&1 | FileCheck -check-prefix=AMX-TRANSPOSE %s
-// RUN: %clang --target=x86_64-unknown-linux-gnu -mno-amx-transpose %s \
-// RUN: -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-AMX-TRANSPOSE %s
-// AMX-TRANSPOSE: "-target-feature" "+amx-transpose"
-// NO-AMX-TRANSPOSE: "-target-feature" "-amx-transpose"
-
 // RUN: %clang --target=x86_64-unknown-linux-gnu -mamx-avx512 %s \
 // RUN: -### -o %t.o 2>&1 | FileCheck -check-prefix=AMX-AVX512 %s
 // RUN: %clang --target=x86_64-unknown-linux-gnu -mno-amx-avx512 %s \
diff --git a/clang/test/Frontend/aarch64-ignore-branch-protection-attribute.c b/clang/test/Frontend/aarch64-ignore-branch-protection-attribute.c
index 32cc98dd4e037..e6605ce5c630f 100644
--- a/clang/test/Frontend/aarch64-ignore-branch-protection-attribute.c
+++ b/clang/test/Frontend/aarch64-ignore-branch-protection-attribute.c
@@ -1,7 +1,11 @@
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang -target aarch64-linux-pauthtest   %s -S -emit-llvm -o - 2>&1 | FileCheck --implicit-check-not=warning: %s
-// RUN: %clang -target aarch64 -fptrauth-returns %s -S -emit-llvm -o - 2>&1 | FileCheck --implicit-check-not=warning: %s
+// RUN:     %clang -target aarch64-linux-pauthtest   %s -S -emit-llvm -o - 2>&1 | FileCheck --implicit-check-not=warning: %s
+// RUN: not %clang -target aarch64 -fptrauth-returns %s -S -emit-llvm -o - 2>&1 | FileCheck --implicit-check-not=warning: --check-prefix=PTRAUTH-RETURNS %s
+
+// Clang fails early, no LLVM IR output produced.
+// PTRAUTH-RETURNS: clang: error: unsupported option '-fptrauth-returns' for target 'aarch64'
+// PTRAUTH-RETURNS-NOT: attributes
 
 /// Unsupported with pauthtest, warning emitted
 __attribute__((target("branch-protection=pac-ret"))) void f1() {}
diff --git a/clang/test/Frontend/diag-wrap-colors.cpp b/clang/test/Frontend/diag-wrap-colors.cpp
new file mode 100644
index 0000000000000..e3dccb1bd2dee
--- /dev/null
+++ b/clang/test/Frontend/diag-wrap-colors.cpp
@@ -0,0 +1,6 @@
+// RUN: not %clang_cc1 %s -fmessage-length=50 -fcolor-diagnostics -fno-show-source-location -o - 2>&1 | FileCheck %s
+
+struct F {
+  float a : 10;
+};
+// CHECK: bit-field 'a' has non-integral type 'float'
diff --git a/clang/test/Headers/cuda_with_openmp.cu b/clang/test/Headers/cuda_with_openmp.cu
index efde4ecdc6626..8ea0de5972ff2 100644
--- a/clang/test/Headers/cuda_with_openmp.cu
+++ b/clang/test/Headers/cuda_with_openmp.cu
@@ -2,7 +2,7 @@
 // Reported in https://bugs.llvm.org/show_bug.cgi?id=48014
 ///==========================================================================///
 
-// REQUIRES: nvptx-registered-target
+// REQUIRES: nvptx-registered-target, host-supports-cuda
 
 // RUN: %clang -x cuda -fopenmp -c %s -o - --cuda-path=%S/../Driver/Inputs/CUDA/usr/local/cuda -nocudalib -isystem %S/Inputs/include -isystem %S/../../lib/Headers -fsyntax-only
 
diff --git a/clang/test/Index/complete-preprocessor.m b/clang/test/Index/complete-preprocessor.m
index 1cc2f32b7efa6..bd90a796240c4 100644
--- a/clang/test/Index/complete-preprocessor.m
+++ b/clang/test/Index/complete-preprocessor.m
@@ -80,3 +80,8 @@
 // RUN: env CINDEXTEST_EDITING=1 CINDEXTEST_COMPLETION_CACHING=1 c-index-test -code-completion-at=%s:9:8 %s | FileCheck -check-prefix=CHECK-CC3 %s
 // RUN: env CINDEXTEST_EDITING=1 CINDEXTEST_COMPLETION_CACHING=1 c-index-test -code-completion-at=%s:11:5 %s | FileCheck -check-prefix=CHECK-CC4 %s
 // RUN: env CINDEXTEST_EDITING=1 CINDEXTEST_COMPLETION_CACHING=1 c-index-test -code-completion-at=%s:14:5 %s | FileCheck -check-prefix=CHECK-CC5 %s
+
+// Test #embed completion in C23 mode
+// RUN: c-index-test -code-completion-at=%s:4:2 %s -std=c23 | FileCheck -check-prefix=CHECK-EMBED %s
+// CHECK-EMBED: NotImplemented:{TypedText embed}{HorizontalSpace  }{Text "}{Placeholder file}{Text "} (40)
+// CHECK-EMBED: NotImplemented:{TypedText embed}{HorizontalSpace  }{Text <}{Placeholder file}{Text >} (40)
diff --git a/clang/test/Interpreter/pretty-print.c b/clang/test/Interpreter/pretty-print.c
index d0712fb152107..9a7bf752238ab 100644
--- a/clang/test/Interpreter/pretty-print.c
+++ b/clang/test/Interpreter/pretty-print.c
@@ -78,14 +78,16 @@ int * null_ptr = (int*)0; null_ptr
 union U { int I; float F; } u; u.I = 12; u.I
 // CHECK-NEXT: (int) 12
 
-// TODO: _Bool, _Complex, _Atomic, and _BitInt
-// struct S1{} s1; s1
-// TODO-CHECK-NEXT: (S1 &) @0x{{[0-9a-f]+}}
+struct S1{} s1; s1
+// CHECK-NEXT: (S1 &) @0x{{[0-9a-f]+}}
+
+struct S2 {int d;} E = {22}; E
+// CHECK-NEXT: (S2 &) @0x{{[0-9a-f]+}}
 
-// struct S2 {int d;} E = {22}; E
-// TODO-CHECK-NEXT: (struct S2 &) @0x{{[0-9a-f]+}}
-// E.d
-// TODO-CHECK-NEXT: (int) 22
+E.d
+// CHECK-NEXT: (int) 22
+
+// TODO: _Bool, _Complex, _Atomic, and _BitInt
 
 // -----------------------------------------------------------------------------
 // Tentative definition handling (C99 6.9.2)
diff --git a/clang/test/OpenMP/task_ast_print.cpp b/clang/test/OpenMP/task_ast_print.cpp
index 30fb7ab75cc87..b059f187156ee 100644
--- a/clang/test/OpenMP/task_ast_print.cpp
+++ b/clang/test/OpenMP/task_ast_print.cpp
@@ -1,8 +1,10 @@
 // RUN: %clang_cc1 -verify -Wno-vla -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -verify -Wno-vla -fopenmp -fopenmp-version=60 -DOMP60 -ast-print %s | FileCheck %s --check-prefix=CHECK60
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
 // RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify -Wno-vla %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -Wno-vla -fopenmp-simd -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -verify -Wno-vla -fopenmp-simd -fopenmp-version=60 -DOMP60 -ast-print %s | FileCheck %s --check-prefix=CHECK60
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
 // RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify -Wno-vla %s -ast-print | FileCheck %s
 // RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -ast-dump  %s | FileCheck %s --check-prefix=DUMP
@@ -101,8 +103,8 @@ T tmain(T argc, T *argv) {
   a = 2;
 #pragma omp task default(none), private(argc, b) firstprivate(argv) shared(d) if (argc > 0) final(S<T>::TS > 0) priority(argc) affinity(argc, argv[b:argc], arr[:], ([argc][sizeof(T)])argv)
   foo();
-#pragma omp taskgroup task_reduction(-: argc)
-#pragma omp task if (C) mergeable priority(C) in_reduction(-: argc)
+#pragma omp taskgroup task_reduction(+: argc)
+#pragma omp task if (C) mergeable priority(C) in_reduction(+: argc)
   foo();
   return 0;
 }
@@ -119,8 +121,8 @@ T tmain(T argc, T *argv) {
 // CHECK-NEXT: a = 2;
 // CHECK-NEXT: #pragma omp task default(none) private(argc,b) firstprivate(argv) shared(d) if(argc > 0) final(S<T>::TS > 0) priority(argc) affinity(argc,argv[b:argc],arr[:],([argc][sizeof(T)])argv)
 // CHECK-NEXT: foo()
-// CHECK-NEXT: #pragma omp taskgroup task_reduction(-: argc)
-// CHECK-NEXT: #pragma omp task if(C) mergeable priority(C) in_reduction(-: argc)
+// CHECK-NEXT: #pragma omp taskgroup task_reduction(+: argc)
+// CHECK-NEXT: #pragma omp task if(C) mergeable priority(C) in_reduction(+: argc)
 // CHECK-NEXT: foo()
 // CHECK: template<> int tmain<int, 5>(int argc, int *argv) {
 // CHECK-NEXT: int b = argc, c, d, e, f, g;
@@ -134,8 +136,8 @@ T tmain(T argc, T *argv) {
 // CHECK-NEXT: a = 2;
 // CHECK-NEXT: #pragma omp task default(none) private(argc,b) firstprivate(argv) shared(d) if(argc > 0) final(S<int>::TS > 0) priority(argc) affinity(argc,argv[b:argc],arr[:],([argc][sizeof(int)])argv)
 // CHECK-NEXT: foo()
-// CHECK-NEXT: #pragma omp taskgroup task_reduction(-: argc)
-// CHECK-NEXT: #pragma omp task if(5) mergeable priority(5) in_reduction(-: argc)
+// CHECK-NEXT: #pragma omp taskgroup task_reduction(+: argc)
+// CHECK-NEXT: #pragma omp task if(5) mergeable priority(5) in_reduction(+: argc)
 // CHECK-NEXT: foo()
 // CHECK: template<> long tmain<long, 1>(long argc, long *argv) {
 // CHECK-NEXT: long b = argc, c, d, e, f, g;
@@ -149,8 +151,8 @@ T tmain(T argc, T *argv) {
 // CHECK-NEXT: a = 2;
 // CHECK-NEXT: #pragma omp task default(none) private(argc,b) firstprivate(argv) shared(d) if(argc > 0) final(S<long>::TS > 0) priority(argc) affinity(argc,argv[b:argc],arr[:],([argc][sizeof(long)])argv)
 // CHECK-NEXT: foo()
-// CHECK-NEXT: #pragma omp taskgroup task_reduction(-: argc)
-// CHECK-NEXT: #pragma omp task if(1) mergeable priority(1) in_reduction(-: argc)
+// CHECK-NEXT: #pragma omp taskgroup task_reduction(+: argc)
+// CHECK-NEXT: #pragma omp task if(1) mergeable priority(1) in_reduction(+: argc)
 // CHECK-NEXT: foo()
 
 enum Enum {};
@@ -199,6 +201,14 @@ int main(int argc, char **argv) {
 #pragma omp task depend(inout: omp_all_memory)
   foo();
   // CHECK-NEXT: foo();
+#ifdef OMP60
+#pragma omp task threadset(omp_pool)
+#pragma omp task threadset(omp_team)
+  foo();
+#endif
+  // CHECK60: #pragma omp task threadset(omp_pool)
+  // CHECK60: #pragma omp task threadset(omp_team)
+  // CHECK60-NEXT: foo();
   return tmain<int, 5>(b, &b) + tmain<long, 1>(x, &x);
 }
 
diff --git a/clang/test/OpenMP/task_codegen.cpp b/clang/test/OpenMP/task_codegen.cpp
index c3e6d9e6b1cf7..ba8e6945de9d0 100644
--- a/clang/test/OpenMP/task_codegen.cpp
+++ b/clang/test/OpenMP/task_codegen.cpp
@@ -41,6 +41,9 @@
 // RUN: -emit-llvm -o - -DOMP51 | FileCheck %s \
 // RUN: --implicit-check-not="{{__kmpc|__tgt}}"
 
+// RUN: %clang_cc1 -verify -Wno-vla -triple x86_64-apple-darwin10 -fopenmp -fopenmp-version=60 -DOMP60 -fopenmp-enable-irbuilder -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK6
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=60 -DOMP60 -fopenmp-enable-irbuilder -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=60 -DOMP60 -fopenmp-enable-irbuilder -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK6
 
 // expected-no-diagnostics
 #ifndef HEADER
@@ -65,6 +68,7 @@ struct S {
   S(const S &s) : a(s.a) {}
   ~S() {}
 };
+
 int a;
 int main() {
   char b;
@@ -147,6 +151,7 @@ int main() {
 
 
 
+
 // s1 = S();
 
 
@@ -215,6 +220,19 @@ void test_omp_all_memory()
   }
 }
 #endif // OMP51
+
+#ifdef OMP60
+void test_threadset()
+{
+#pragma omp task threadset(omp_team)
+  {
+  }
+#pragma omp task threadset(omp_pool)
+  {
+  }
+}
+#endif // OMP60
+
 #endif
 // CHECK1-LABEL: define {{[^@]+}}@main
 // CHECK1-SAME: () #[[ATTR0:[0-9]+]] {
@@ -10243,3 +10261,18 @@ void test_omp_all_memory()
 // CHECK4-51-NEXT:    call void @__cxx_global_var_init()
 // CHECK4-51-NEXT:    ret void
 //
+// CHECK6-LABEL: define void @_Z14test_threadsetv()
+// CHECK6-NEXT:  entry:
+// CHECK6-NEXT:       [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON_23:%.*]], align 1
+// CHECK6-NEXT:       [[AGG_CAPTURED2:%.*]] = alloca [[STRUCT_ANON_25:%.*]], align 1
+// CHECK6-NEXT:       call i32 @__kmpc_global_thread_num(ptr @[[GLOB_PTR:[0-9]+]])
+// CHECK6-NEXT:       [[TMP0:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @1, i32 %omp_global_thread_num, i32 1, i64 40, i64 1, ptr @.omp_task_entry..[[ENTRY1:[0-9]+]])
+// CHECK6-NEXT:       getelementptr inbounds nuw %struct.kmp_task_t_with_privates{{.*}}, ptr %0, i32 0, i32 0
+// CHECK6-NEXT:       call i32 @__kmpc_global_thread_num(ptr @[[GLOB_PTR:[0-9]+]])
+// CHECK6-NEXT:       call i32 @__kmpc_omp_task(ptr @1, i32 %omp_global_thread_num1, ptr %0)
+// CHECK6-NEXT:       call i32 @__kmpc_global_thread_num(ptr @[[GLOB_PTR2:[0-9]+]])
+// CHECK6-NEXT:       [[TMP3:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @1, i32 %omp_global_thread_num3, i32 129, i64 40, i64 1, ptr @.omp_task_entry..[[ENTRY2:[0-9]+]])
+// CHECK6-NEXT:       getelementptr inbounds nuw %struct.kmp_task_t_with_privates{{.*}}, ptr %3, i32 0, i32 0
+// CHECK6-NEXT:       call i32 @__kmpc_global_thread_num(ptr @[[GLOB_PTR2:[0-9]+]])
+// CHECK6-NEXT:       call i32 @__kmpc_omp_task(ptr @1, i32 %omp_global_thread_num4, ptr %3)
+// CHECK6-NEXT:       ret void
diff --git a/clang/test/OpenMP/task_threadset_messages.cpp b/clang/test/OpenMP/task_threadset_messages.cpp
new file mode 100755
index 0000000000000..f553a2da17ab9
--- /dev/null
+++ b/clang/test/OpenMP/task_threadset_messages.cpp
@@ -0,0 +1,99 @@
+// RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -std=c++11 -ferror-limit 200 -o - %s
+// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -std=c++11 -ferror-limit 200 -o - %s
+// RUN: %clang_cc1 -verify=expected,omp51 -fopenmp -fopenmp-version=51 -std=c++11 -ferror-limit 200 -o - %s
+// RUN: %clang_cc1 -verify=expected -DOMP60 -fopenmp -fopenmp-version=60 -std=c++11 -ferror-limit 200 -o - %s
+
+// RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -std=c++11 -ferror-limit 200 -o - %s
+// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -std=c++11 -ferror-limit 200 -o - %s
+// RUN: %clang_cc1 -verify=expected,omp51 -fopenmp-simd -fopenmp-version=51 -std=c++11 -ferror-limit 200 -o - %s
+// RUN: %clang_cc1 -verify=expected -DOMP60 -fopenmp-simd -fopenmp-version=60 -std=c++11 -ferror-limit 200 -o - %s
+
+#ifdef OMP60
+struct ComplexStruct {
+  int data[10];
+  struct InnerStruct {
+    float value;
+  } inner;
+};
+
+// Template class with member functions using 'threadset'.
+template <typename T>
+class TemplateClass {
+public:
+  void foo() {
+    #pragma omp task threadset(omp_pool)
+    {
+      T temp;
+    }
+  }
+  void bar() {
+    #pragma omp taskloop threadset(omp_team)
+    for (int i = 0; i < 10; ++i) {}
+  }
+};
+
+// Valid uses of 'threadset' with 'omp_pool' and 'omp_team' in task directive.
+void test_task_threadset_valid() {
+  int a;
+  #pragma omp task threadset(omp_pool)
+  #pragma omp task threadset(omp_team)
+  #pragma omp task threadset(omp_pool) if(1)
+  #pragma omp task threadset(omp_team) priority(5)
+  #pragma omp task threadset(omp_pool) depend(out: a)
+  #pragma omp parallel
+  {
+    #pragma omp task threadset(omp_pool)
+    {
+      #pragma omp taskloop threadset(omp_team)
+      for (int i = 0; i < 5; ++i) {}
+    }
+  }
+
+  TemplateClass<int> obj;
+  obj.foo();
+  obj.bar();
+}
+
+// Invalid uses of 'threadset' with incorrect arguments in task directive.
+void test_task_threadset_invalid_args() {
+  #pragma omp task threadset(invalid_arg) // expected-error {{expected 'omp_pool' or 'omp_team' in OpenMP clause 'threadset'}}
+  #pragma omp task threadset(123) // expected-error {{expected 'omp_pool' or 'omp_team' in OpenMP clause 'threadset'}}
+  #pragma omp task threadset(omp_pool, omp_team) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp task threadset() // expected-error {{expected 'omp_pool' or 'omp_team' in OpenMP clause 'threadset'}}
+  {}
+}
+
+// Valid uses of 'threadset' with 'omp_pool' and 'omp_team' in taskloop directive.
+void test_taskloop_threadset_valid() {
+  #pragma omp taskloop threadset(omp_pool)
+  for (int i = 0; i < 10; ++i) {}
+  #pragma omp taskloop threadset(omp_team)
+  for (int i = 0; i < 10; ++i) {}
+  #pragma omp taskloop threadset(omp_pool) grainsize(5)
+  for (int i = 0; i < 10; ++i) {}
+  #pragma omp taskloop threadset(omp_team) num_tasks(2)
+  for (int i = 0; i < 10; ++i) {}
+}
+
+// Invalid uses of 'threadset' with incorrect arguments in taskloop directive.
+void test_taskloop_threadset_invalid_args() {
+  #pragma omp taskloop threadset(invalid_arg) // expected-error {{expected 'omp_pool' or 'omp_team' in OpenMP clause 'threadset'}}
+  for (int i = 0; i < 10; ++i) {}
+  #pragma omp taskloop threadset(123) // expected-error {{expected 'omp_pool' or 'omp_team' in OpenMP clause 'threadset'}}
+  for (int i = 0; i < 10; ++i) {}
+  #pragma omp taskloop threadset(omp_pool, omp_team) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) {}
+  #pragma omp taskloop threadset() // expected-error {{expected 'omp_pool' or 'omp_team' in OpenMP clause 'threadset'}}
+  for (int i = 0; i < 10; ++i) {}
+}
+
+#else
+void test_threadset_not_supported() {
+  #pragma omp task threadset(omp_pool) // omp45-error {{unexpected OpenMP clause 'threadset' in directive '#pragma omp task'}} omp50-error {{unexpected OpenMP clause 'threadset' in directive '#pragma omp task'}} omp51-error {{unexpected OpenMP clause 'threadset' in directive '#pragma omp task'}}
+  #pragma omp task threadset(omp_team) // omp45-error {{unexpected OpenMP clause 'threadset' in directive '#pragma omp task'}} omp50-error {{unexpected OpenMP clause 'threadset' in directive '#pragma omp task'}} omp51-error {{unexpected OpenMP clause 'threadset' in directive '#pragma omp task'}}
+  #pragma omp taskloop threadset(omp_team) // omp45-error {{unexpected OpenMP clause 'threadset' in directive '#pragma omp taskloop'}} omp50-error {{unexpected OpenMP clause 'threadset' in directive '#pragma omp taskloop'}} omp51-error {{unexpected OpenMP clause 'threadset' in directive '#pragma omp taskloop'}}
+  for (int i = 0; i < 10; ++i) {}
+  #pragma omp taskloop threadset(omp_pool) // omp45-error {{unexpected OpenMP clause 'threadset' in directive '#pragma omp taskloop'}} omp50-error {{unexpected OpenMP clause 'threadset' in directive '#pragma omp taskloop'}} omp51-error {{unexpected OpenMP clause 'threadset' in directive '#pragma omp taskloop'}}
+  for (int i = 0; i < 10; ++i) {}
+}
+#endif
diff --git a/clang/test/OpenMP/taskloop_ast_print.cpp b/clang/test/OpenMP/taskloop_ast_print.cpp
index 1b6d7240fa66c..e4bf20af5d78e 100644
--- a/clang/test/OpenMP/taskloop_ast_print.cpp
+++ b/clang/test/OpenMP/taskloop_ast_print.cpp
@@ -1,8 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=60 -DOMP60 -ast-print %s | FileCheck %s --check-prefix=CHECK60
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
 // RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=60 -DOMP60 -ast-print %s | FileCheck %s --check-prefix=CHECK60
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
 // RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
@@ -87,6 +89,20 @@ int main(int argc, char **argv) {
   // CHECK-NEXT: #pragma omp cancel taskgroup
   // CHECK-NEXT: #pragma omp cancellation point taskgroup
   // CHECK-NEXT: foo();
+#ifdef OMP60
+#pragma omp taskloop threadset(omp_team)
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskloop threadset(omp_pool)
+  for (int j = 0; j < 10; ++j) {
+    foo();
+  }
+}
+#endif
+ // CHECK60: #pragma omp taskloop threadset(omp_team)
+ // CHECK60-NEXT: for (int i = 0; i < 10; ++i) {
+ // CHECK60: #pragma omp taskloop threadset(omp_pool)
+ // CHECK60-NEXT: for (int j = 0; j < 10; ++j) {
+ // CHECK60-NEXT: foo();
   return (tmain<int, 5>(argc) + tmain<char, 1>(argv[0][0]));
 }
 
diff --git a/clang/test/OpenMP/taskloop_codegen.cpp b/clang/test/OpenMP/taskloop_codegen.cpp
index 69f8d3b160bfd..d1197607a2684 100644
--- a/clang/test/OpenMP/taskloop_codegen.cpp
+++ b/clang/test/OpenMP/taskloop_codegen.cpp
@@ -5,7 +5,12 @@
 // RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp-simd -x c++ -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+
 // SIMD-ONLY0-NOT: {{__kmpc|__tgt}}
+
+// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -fopenmp-version=60 -DOMP60 -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK6
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=60 -DOMP60 -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=60 -DOMP60 -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK6
 // expected-no-diagnostics
 #ifndef HEADER
 #define HEADER
@@ -241,4 +246,52 @@ void taskloop_with_class() {
   }
 }
 
+#ifdef OMP60
+void test_threadset()
+{
+#pragma omp taskloop threadset(omp_team)
+  for (int i = 0; i < 10; ++i) {
+  }
+#pragma omp taskloop threadset(omp_pool)
+  for (int i = 0; i < 10; ++i) {
+  }
+}
+#endif // OMP60
+// CHECK6-LABEL: define void @_Z14test_threadsetv()
+// CHECK6-NEXT:  entry:
+// CHECK6-NEXT:       [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON_14:%.*]], align 1
+// CHECK6-NEXT:       %[[TMP:.*]] = alloca i32, align 4
+// CHECK6-NEXT:       [[AGG_CAPTURED1:%.*]] = alloca [[STRUCT_ANON_16:%.*]], align 1
+// CHECK6-NEXT:       %[[TMP2:.*]] = alloca i32, align 4
+// CHECK6-NEXT:       %[[TID0:.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB_PTR:[0-9]+]])
+// CHECK6-NEXT:       call void @__kmpc_taskgroup(ptr @1, i32 %[[TID0:.*]])
+// CHECK6-NEXT:       %[[TID1:.*]] = call ptr @__kmpc_omp_task_alloc(ptr @1, i32 %[[TID0:.*]], i32 1, i64 80, i64 1, ptr @.omp_task_entry..[[ENTRY1:[0-9]+]])
+// CHECK6-NEXT:       %[[TID2:.*]] = getelementptr inbounds nuw %struct.kmp_task_t_with_privates{{.*}}, ptr %[[TID1:.*]], i32 0, i32 0
+// CHECK6-NEXT:       %[[TID3:.*]] = getelementptr inbounds nuw %struct.kmp_task_t{{.*}}, ptr %[[TID2:.*]], i32 0, i32 5
+// CHECK6-NEXT:       store i64 0, ptr %[[TID3:.*]], align 8
+// CHECK6-NEXT:       %[[TID4:.*]] = getelementptr inbounds nuw %struct.kmp_task_t{{.*}}, ptr %[[TID2:.*]], i32 0, i32 6
+// CHECK6-NEXT:       store i64 9, ptr %[[TID4:.*]], align 8
+// CHECK6-NEXT:       %[[TID5:.*]] = getelementptr inbounds nuw %struct.kmp_task_t{{.*}}, ptr %[[TID2:.*]], i32 0, i32 7
+// CHECK6-NEXT:       store i64 1, ptr %[[TID5:.*]], align 8
+// CHECK6-NEXT:       %[[TID6:.*]] = getelementptr inbounds nuw %struct.kmp_task_t{{.*}}, ptr %[[TID2:.*]], i32 0, i32 9
+// CHECK6-NEXT:       call void @llvm.memset.p0.i64(ptr align 8 %[[TID6:.*]], i8 0, i64 8, i1 false)
+// CHECK6-NEXT:       %[[TID7:.*]] = load i64, ptr %[[TID5:.*]], align 8
+// CHECK6-NEXT:       call void @__kmpc_taskloop(ptr @1, i32 %[[TID0:.*]], ptr %[[TID1:.*]], i32 1, ptr %[[TID3:.*]], ptr %4, i64 %[[TID7:.*]], i32 1, i32 0, i64 0, ptr null)
+// CHECK6-NEXT:       call void @__kmpc_end_taskgroup(ptr @1, i32 %[[TID0:.*]])
+// CHECK6-NEXT:       call void @__kmpc_taskgroup(ptr @1, i32 %[[TID0:.*]])
+// CHECK6-NEXT:       %[[TID8:.*]] = call ptr @__kmpc_omp_task_alloc(ptr @1, i32 %[[TID0:.*]], i32 129, i64 80, i64 1, ptr @.omp_task_entry..[[ENTRY1:[0-9]+]])
+// CHECK6-NEXT:       %[[TID9:.*]] = getelementptr inbounds nuw %struct.kmp_task_t_with_privates{{.*}}, ptr %[[TID8:.*]], i32 0, i32 0
+// CHECK6-NEXT:       %[[TID10:.*]] = getelementptr inbounds nuw %struct.kmp_task_t{{.*}}, ptr %[[TID9:.*]], i32 0, i32 5
+// CHECK6-NEXT:       store i64 0, ptr %[[TID10:.*]], align 8
+// CHECK6-NEXT:       %[[TID11:.*]] = getelementptr inbounds nuw %struct.kmp_task_t{{.*}}, ptr %[[TID9:.*]], i32 0, i32 6
+// CHECK6-NEXT:       store i64 9, ptr %[[TID11:.*]], align 8
+// CHECK6-NEXT:       %[[TID12:.*]] = getelementptr inbounds nuw %struct.kmp_task_t{{.*}}, ptr %[[TID9:.*]], i32 0, i32 7
+// CHECK6-NEXT:       store i64 1, ptr %[[TID12:.*]], align 8
+// CHECK6-NEXT:       %[[TID13:.*]] = getelementptr inbounds nuw %struct.kmp_task_t{{.*}}, ptr %[[TID9:.*]], i32 0, i32 9
+// CHECK6-NEXT:       call void @llvm.memset.p0.i64(ptr align 8 [[TID13:.*]], i8 0, i64 8, i1 false)
+// CHECK6-NEXT:       %[[TID14:.*]] = load i64, ptr [[TID12:.*]], align 8
+// CHECK6-NEXT:       call void @__kmpc_taskloop(ptr @1, i32 %[[TID0:.*]], ptr %[[TID8:.*]], i32 1, ptr %[[TID10:.*]], ptr %[[TID11:.*]], i64 %[[TID14:.*]], i32 1, i32 0, i64 0, ptr null)
+// CHECK6-NEXT:       call void @__kmpc_end_taskgroup(ptr @1, i32 %[[TID0:.*]])
+// CHECK6-NEXT:       ret void
+
 #endif
diff --git a/clang/test/Options/enable_16bit_types_validation_spirv.hlsl b/clang/test/Options/enable_16bit_types_validation_spirv.hlsl
index f37d00503fe57..6a507b0990df5 100644
--- a/clang/test/Options/enable_16bit_types_validation_spirv.hlsl
+++ b/clang/test/Options/enable_16bit_types_validation_spirv.hlsl
@@ -1,7 +1,9 @@
-// RUN: not %clang_cc1 -internal-isystem D:\llvm-project\build\x64-Release\lib\clang\19\include -nostdsysteminc -triple spirv-vulkan-library -x hlsl -std=hlsl2016 -fnative-half-type -emit-llvm -disable-llvm-passes  -o - %s 2>&1 | FileCheck %s --check-prefix=SPIRV
-// RUN: %clang_cc1 -internal-isystem D:\llvm-project\build\x64-Release\lib\clang\19\include -nostdsysteminc -triple spirv-vulkan-library -x hlsl -std=hlsl2021 -fnative-half-type -emit-llvm -disable-llvm-passes  -o - %s 2>&1 | FileCheck %s --check-prefix=valid
+// RUN: not %clang_cc1 -internal-isystem D:\llvm-project\build\x64-Release\lib\clang\19\include -nostdsysteminc -triple spirv-vulkan-library -x hlsl -std=hlsl2016 -fnative-half-type -emit-llvm -disable-llvm-passes  -o - %s 2>&1 | FileCheck %s --check-prefix=SPIRV-HALF
+// RUN: not %clang_cc1 -internal-isystem D:\llvm-project\build\x64-Release\lib\clang\19\include -nostdsysteminc -triple spirv-vulkan-library -x hlsl -std=hlsl2016 -fnative-int16-type -emit-llvm -disable-llvm-passes  -o - %s 2>&1 | FileCheck %s --check-prefix=SPIRV-INT
+// RUN: %clang_cc1 -internal-isystem D:\llvm-project\build\x64-Release\lib\clang\19\include -nostdsysteminc -triple spirv-vulkan-library -x hlsl -std=hlsl2021 -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes  -o - %s 2>&1 | FileCheck %s --check-prefix=valid
 
-// SPIRV: error: '-fnative-half-type' option requires target HLSL Version >= 2018, but HLSL Version is 'hlsl2016'
+// SPIRV-HALF: error: '-fnative-half-type' option requires target HLSL Version >= 2018, but HLSL Version is 'hlsl2016'
+// SPIRV-INT: error: '-fnative-int16-type' option requires target HLSL Version >= 2018, but HLSL Version is 'hlsl2016'
 
 // valid: "spirv-unknown-vulkan-library"
 // valid: define hidden spir_func void @{{.*main.*}}() #0 {
diff --git a/clang/test/Preprocessor/bpf-predefined-macros.c b/clang/test/Preprocessor/bpf-predefined-macros.c
index cd8a2ec031925..a9ae8c58c3ba7 100644
--- a/clang/test/Preprocessor/bpf-predefined-macros.c
+++ b/clang/test/Preprocessor/bpf-predefined-macros.c
@@ -70,6 +70,9 @@ int u;
 #ifdef __BPF_FEATURE_LOAD_ACQ_STORE_REL
 int v;
 #endif
+#ifdef __BPF_FEATURE_GOTOX
+int w;
+#endif
 
 // CHECK: int b;
 // CHECK: int c;
@@ -110,6 +113,7 @@ int v;
 // CPU_V4: int u;
 
 // CPU_V4: int v;
+// CPU_V4: int w;
 
 // CPU_GENERIC: int g;
 
diff --git a/clang/test/Preprocessor/predefined-arch-macros.c b/clang/test/Preprocessor/predefined-arch-macros.c
index a3c3697c3a0b9..cf2cd4a10b056 100644
--- a/clang/test/Preprocessor/predefined-arch-macros.c
+++ b/clang/test/Preprocessor/predefined-arch-macros.c
@@ -1841,7 +1841,6 @@
 // CHECK_DMR_M32: #define __AMX_MOVRS__ 1
 // CHECK_DMR_M32: #define __AMX_TF32__ 1
 // CHECK_GNR_M32: #define __AMX_TILE__ 1
-// CHECK_DMR_M32: #define __AMX_TRANSPOSE__ 1
 // CHECK_DMR_M32: #define __AVX10_2_512__ 1
 // CHECK_DMR_M32: #define __AVX10_2__ 1
 // CHECK_GNR_M32: #define __AVX2__ 1
@@ -1947,7 +1946,6 @@
 // CHECK_DMR_M64: #define __AMX_MOVRS__ 1
 // CHECK_DMR_M64: #define __AMX_TF32__ 1
 // CHECK_GNR_M64: #define __AMX_TILE__ 1
-// CHECK_DMR_M64: #define __AMX_TRANSPOSE__ 1
 // CHECK_DMR_M64: #define __AVX10_2_512__ 1
 // CHECK_DMR_M64: #define __AVX10_2__ 1
 // CHECK_GNR_M64: #define __AVX2__ 1
@@ -4418,7 +4416,6 @@
 // CHECK_AMDGCN_NONE-NOT: #define __HAS_FMAF__
 // CHECK_AMDGCN_NONE-NOT: #define __HAS_FP64__
 // CHECK_AMDGCN_NONE-NOT: #define __HAS_LDEXPF__
-// CHECK_AMDGCN_NONE-NOT: #define __AMDGCN_WAVEFRONT_SIZE__
 
 // Begin r600 tests ----------------
 
@@ -4439,7 +4436,6 @@
 // RUN: %clang -x hip -E -dM %s -o - 2>&1 --offload-host-only -nogpulib \
 // RUN:     -nogpuinc --offload-arch=gfx803 -target x86_64-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefixes=CHECK_HIP_HOST
-// CHECK_HIP_HOST: #define __AMDGCN_WAVEFRONT_SIZE__ 64
 // CHECK_HIP_HOST: #define __AMDGPU__ 1
 // CHECK_HIP_HOST: #define __AMD__ 1
 
diff --git a/clang/test/Preprocessor/predefined-macros-hlsl.hlsl b/clang/test/Preprocessor/predefined-macros-hlsl.hlsl
index 26bda6b7be167..f10c79cc9c2d4 100644
--- a/clang/test/Preprocessor/predefined-macros-hlsl.hlsl
+++ b/clang/test/Preprocessor/predefined-macros-hlsl.hlsl
@@ -7,7 +7,7 @@
 // RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.0-mesh | FileCheck -match-full-lines %s --check-prefixes=CHECK,MESH,NOHALF
 // RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.0-pixel | FileCheck -match-full-lines %s --check-prefixes=CHECK,PIXEL,NOHALF
 // RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.0-vertex | FileCheck -match-full-lines %s --check-prefixes=CHECK,VERTEX,NOHALF
-// RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.3-vertex -fnative-half-type | FileCheck -match-full-lines %s --check-prefixes=CHECK,VERTEX,HALF
+// RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.3-vertex -fnative-half-type -fnative-int16-type | FileCheck -match-full-lines %s --check-prefixes=CHECK,VERTEX,HALF
 
 // RUN: %clang_cc1 %s -E -dM -o - -triple spirv-unknown-vulkan-compute | FileCheck -match-full-lines %s --check-prefixes=CHECK,COMPUTE,NOHALF,SPIRV
 
diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c
index 5f17641878761..78f8b19459c2f 100644
--- a/clang/test/Preprocessor/x86_target_features.c
+++ b/clang/test/Preprocessor/x86_target_features.c
@@ -526,18 +526,6 @@
 
 // NO-AMX-COMPLEX-NOT: #define __AMX_COMPLEX__ 1
 
-// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mamx-transpose -x c \
-// RUN: -E -dM -o - %s | FileCheck  -check-prefix=AMX-TRANSPOSE %s
-
-// AMX-TRANSPOSE: #define __AMX_TRANSPOSE__ 1
-
-// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mno-amx-transpose -x c \
-// RUN: -E -dM -o - %s | FileCheck  -check-prefix=NO-AMX-TRANSPOSE %s
-// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mamx-transpose -mno-amx-tile \
-// RUN: -x c -E -dM -o - %s | FileCheck  -check-prefix=NO-AMX-TRANSPOSE %s
-
-// NO-AMX-TRANSPOSE-NOT: #define __AMX_TRANSPOSE__ 1
-
 // RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mamx-avx512 -x c \
 // RUN: -E -dM -o - %s | FileCheck  -check-prefix=AMX-AVX512 %s
 
diff --git a/clang/test/SemaCXX/attr-callback-broken.cpp b/clang/test/SemaCXX/attr-callback-broken.cpp
index a5469b22ba350..53b331a49251b 100644
--- a/clang/test/SemaCXX/attr-callback-broken.cpp
+++ b/clang/test/SemaCXX/attr-callback-broken.cpp
@@ -1,7 +1,12 @@
-// RUN: %clang_cc1 %s -verify -fsyntax-only
+// RUN: %clang_cc1 %s -std=c++23 -verify -fsyntax-only
 
 class C_in_class {
 #define HAS_THIS
 #include "../Sema/attr-callback-broken.c"
 #undef HAS_THIS
 };
+
+class ExplicitParameterObject {
+  __attribute__((callback(2, 0))) void explicit_this_idx(this ExplicitParameterObject* self, void (*callback)(ExplicitParameterObject*));           // expected-error {{'callback' argument at position 2 references unavailable implicit 'this'}}
+  __attribute__((callback(2, this))) void explicit_this_identifier(this ExplicitParameterObject* self, void (*callback)(ExplicitParameterObject*)); // expected-error {{'callback' argument at position 2 references unavailable implicit 'this'}}
+};
diff --git a/clang/test/SemaCXX/attr-callback.cpp b/clang/test/SemaCXX/attr-callback.cpp
index ee02f7d3d24f7..ff5a241e92f74 100644
--- a/clang/test/SemaCXX/attr-callback.cpp
+++ b/clang/test/SemaCXX/attr-callback.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -verify -fsyntax-only
+// RUN: %clang_cc1 %s -std=c++23 -verify -fsyntax-only
 
 // expected-no-diagnostics
 
@@ -6,6 +6,11 @@ class C_in_class {
 #include "../Sema/attr-callback.c"
 };
 
+class ExplicitParameterObject {
+  __attribute__((callback(2, 1))) void explicit_this_idx(this ExplicitParameterObject* self, void (*callback)(ExplicitParameterObject*));
+  __attribute__((callback(2, self))) void explicit_this_identifier(this ExplicitParameterObject* self, void (*callback)(ExplicitParameterObject*));
+};
+
 struct Base {
 
   void no_args_1(void (*callback)(void));
diff --git a/clang/test/SemaCXX/attr-format.cpp b/clang/test/SemaCXX/attr-format.cpp
index adc05fc46776c..c0aeb5d07dfe9 100644
--- a/clang/test/SemaCXX/attr-format.cpp
+++ b/clang/test/SemaCXX/attr-format.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -Wformat-nonliteral -verify %s
+// RUN: %clang_cc1 -fsyntax-only -std=c++23 -Wformat-nonliteral -verify %s
 #include <stdarg.h>
 
 int printf(const char *fmt, ...) __attribute__((format(printf, 1, 2)));
@@ -11,6 +11,10 @@ struct S {
   // the format argument is argument 2 here.
   void g(const char*, ...) __attribute__((format(printf, 2, 3)));
   const char* g2(const char*) __attribute__((format_arg(2)));
+  // From C++23 'this' can also be specified explicitly.
+  void g3(this S&, const char *, ...) __attribute__((format(printf, 2, 3)));
+  void g4(this const char* s, ...) __attribute__((format(printf, 1, 2)));
+  consteval operator const char*() const { return "%f"; } // #g4_fmt_string
 
   void h(const char*, ...) __attribute__((format(printf, 1, 4))); // \
       expected-error{{implicit this argument as the format string}}
@@ -18,10 +22,17 @@ struct S {
       expected-error{{out of bounds}}
   const char* h3(const char*) __attribute__((format_arg(1))); // \
       expected-error{{invalid for the implicit this argument}}
+  void h4(this S&, const char *, ...) __attribute__((format(printf, 1, 3))); // \
+      expected-error {{format argument not a string type}}
 
   void operator() (const char*, ...) __attribute__((format(printf, 2, 3)));
 };
 
+void s() {
+  S().g4(4); // expected-warning {{format specifies type 'double' but the argument has type 'int'}}
+             // expected-note@#g4_fmt_string {{format string is defined here}}
+}
+
 // PR5521
 struct A { void a(const char*,...) __attribute((format(printf,2,3))); };
 void b(A x) {
diff --git a/clang/test/SemaCXX/attr-lifetime-capture-by.cpp b/clang/test/SemaCXX/attr-lifetime-capture-by.cpp
index 70a5fe5a45376..8606592c6b771 100644
--- a/clang/test/SemaCXX/attr-lifetime-capture-by.cpp
+++ b/clang/test/SemaCXX/attr-lifetime-capture-by.cpp
@@ -44,4 +44,7 @@ struct T {
   {
     s.captureInt(x);
   }
+
+  void explicit_this1(this T& self, const int &x [[clang::lifetime_capture_by(self)]]);
+  void explicit_this2(this T& self, const int &x [[clang::lifetime_capture_by(this)]]); // expected-error {{argument references unavailable implicit 'this'}}
 };
diff --git a/clang/test/SemaCXX/attr-nonnull.cpp b/clang/test/SemaCXX/attr-nonnull.cpp
index 6f9119b519d09..0fba6b50cb354 100644
--- a/clang/test/SemaCXX/attr-nonnull.cpp
+++ b/clang/test/SemaCXX/attr-nonnull.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s
-// RUN: %clang_cc1 -fsyntax-only -verify %s -fexperimental-new-constant-interpreter
+// RUN: %clang_cc1 -std=c++23 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -std=c++23 -fsyntax-only -verify %s -fexperimental-new-constant-interpreter
 struct S {
   S(const char *) __attribute__((nonnull(2)));
 
@@ -11,6 +11,13 @@ struct S {
 
   void h(const char*) __attribute__((nonnull(1))); // \
       expected-error{{invalid for the implicit this argument}}
+
+  void i(this S* self, const char*) __attribute__((nonnull(1)));
+
+  void j(this S* self, const char*) __attribute__((nonnull(2)));
+
+  void k(this S* self, const char*) __attribute__((nonnull(3))); // \
+      expected-error{{'nonnull' attribute parameter 1 is out of bounds}}
 };
 
 void test() {
diff --git a/clang/test/SemaHLSL/BuiltIns/AddUint64-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/AddUint64-errors.hlsl
index b4ef0550bf88a..553db49231ae0 100644
--- a/clang/test/SemaHLSL/BuiltIns/AddUint64-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/AddUint64-errors.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
 
 uint2 test_too_few_arg() {
   return __builtin_hlsl_adduint64();
diff --git a/clang/test/SemaHLSL/BuiltIns/WaveActiveMin.hlsl b/clang/test/SemaHLSL/BuiltIns/WaveActiveMin.hlsl
new file mode 100644
index 0000000000000..3b12faf8d9978
--- /dev/null
+++ b/clang/test/SemaHLSL/BuiltIns/WaveActiveMin.hlsl
@@ -0,0 +1,29 @@
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -emit-llvm-only -disable-llvm-passes -verify
+
+int test_too_few_arg() {
+  return __builtin_hlsl_wave_active_min();
+  // expected-error@-1 {{too few arguments to function call, expected 1, have 0}}
+}
+
+float2 test_too_many_arg(float2 p0) {
+  return __builtin_hlsl_wave_active_min(p0, p0);
+  // expected-error@-1 {{too many arguments to function call, expected 1, have 2}}
+}
+
+bool test_expr_bool_type_check(bool p0) {
+  return __builtin_hlsl_wave_active_min(p0);
+  // expected-error@-1 {{invalid operand of type 'bool'}}
+}
+
+bool2 test_expr_bool_vec_type_check(bool2 p0) {
+  return __builtin_hlsl_wave_active_min(p0);
+  // expected-error@-1 {{invalid operand of type 'bool2' (aka 'vector<bool, 2>')}}
+}
+
+struct S { float f; };
+
+S test_expr_struct_type_check(S p0) {
+  return __builtin_hlsl_wave_active_min(p0);
+  // expected-error@-1 {{invalid operand of type 'S' where a scalar or vector is required}}
+}
+
diff --git a/clang/test/SemaHLSL/BuiltIns/all-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/all-errors.hlsl
index 4afd799f8539e..5e00428de0c82 100644
--- a/clang/test/SemaHLSL/BuiltIns/all-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/all-errors.hlsl
@@ -1,5 +1,5 @@
 
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
 
 bool test_too_few_arg() {
   return __builtin_hlsl_all();
diff --git a/clang/test/SemaHLSL/BuiltIns/any-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/any-errors.hlsl
index e42fd97b40219..6210c998d8e2d 100644
--- a/clang/test/SemaHLSL/BuiltIns/any-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/any-errors.hlsl
@@ -1,5 +1,5 @@
 
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
 
 bool test_too_few_arg() {
   return __builtin_hlsl_any();
diff --git a/clang/test/SemaHLSL/BuiltIns/asfloat-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/asfloat-errors.hlsl
index f5f223943b4cd..9872f39ebcfba 100644
--- a/clang/test/SemaHLSL/BuiltIns/asfloat-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/asfloat-errors.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -verify
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -verify
 
 
 float4 test_float_too_many_arg(float p0, float p1) {
diff --git a/clang/test/SemaHLSL/BuiltIns/asint-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/asint-errors.hlsl
index 815a0c35cb04c..52f2cd224a13c 100644
--- a/clang/test/SemaHLSL/BuiltIns/asint-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/asint-errors.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -verify
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -verify
 
 
 int4 test_asint_too_many_arg(float p0, float p1) {
diff --git a/clang/test/SemaHLSL/BuiltIns/asint16-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/asint16-errors.hlsl
index fee1c2eb87b11..5f3d5c9772d84 100644
--- a/clang/test/SemaHLSL/BuiltIns/asint16-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/asint16-errors.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -verify
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -fnative-int16-type -verify
 
 
 int16_t4 test_asint16_too_many_arg(uint16_t p0, uint16_t p1)
diff --git a/clang/test/SemaHLSL/BuiltIns/asuint-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/asuint-errors.hlsl
index 9d0c206a3b3ad..3bb6cc0094926 100644
--- a/clang/test/SemaHLSL/BuiltIns/asuint-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/asuint-errors.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -verify
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -verify
 
 
 uint4 test_asuint_too_many_arg(float p0, float p1) {
diff --git a/clang/test/SemaHLSL/BuiltIns/asuint16-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/asuint16-errors.hlsl
index 024fd406fe8ef..709d2067d9df2 100644
--- a/clang/test/SemaHLSL/BuiltIns/asuint16-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/asuint16-errors.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -verify
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -fnative-int16-type -verify
 
 uint16_t test_asuint16_less_argument()
 {
diff --git a/clang/test/SemaHLSL/BuiltIns/clamp-errors-16bit.hlsl b/clang/test/SemaHLSL/BuiltIns/clamp-errors-16bit.hlsl
index 7a6341659493b..40910bc9108ed 100644
--- a/clang/test/SemaHLSL/BuiltIns/clamp-errors-16bit.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/clamp-errors-16bit.hlsl
@@ -1,8 +1,8 @@
-// RUN: not %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
+// RUN: not %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
 // RUN:  -finclude-default-header -S -o - %s 2>&1 | FileCheck %s -DTEST_TYPE=half
-// RUN: not %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
+// RUN: not %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
 // RUN:  -finclude-default-header -S -o - %s 2>&1 | FileCheck %s -DTEST_TYPE=int16_t
-// RUN: not %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
+// RUN: not %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
 // RUN:  -finclude-default-header -S -o - %s 2>&1 | FileCheck %s -DTEST_TYPE=uint16_t
 
 // check we error on 16 bit type if shader model is too old
diff --git a/clang/test/SemaHLSL/BuiltIns/clamp-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/clamp-errors.hlsl
index 93e37075773f5..bbe567b6d6ac1 100644
--- a/clang/test/SemaHLSL/BuiltIns/clamp-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/clamp-errors.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected=note
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected=note
 
 float2 test_no_second_arg(float2 p0) {
   return __builtin_hlsl_elementwise_clamp(p0);
diff --git a/clang/test/SemaHLSL/BuiltIns/clip-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/clip-errors.hlsl
index 2cb401601f7eb..f47468897312c 100644
--- a/clang/test/SemaHLSL/BuiltIns/clip-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/clip-errors.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -verify
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -verify
 
 
 void test_arg_missing() {
diff --git a/clang/test/SemaHLSL/BuiltIns/countbits-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/countbits-errors.hlsl
index 5704165e1a450..8949324ec69f6 100644
--- a/clang/test/SemaHLSL/BuiltIns/countbits-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/countbits-errors.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
 
 
 double test_int_builtin(double p0) {
diff --git a/clang/test/SemaHLSL/BuiltIns/cross-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/cross-errors.hlsl
index 4f73dad79f21f..2c3e8d1560c87 100644
--- a/clang/test/SemaHLSL/BuiltIns/cross-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/cross-errors.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -disable-llvm-passes -verify
 
 void test_too_few_arg()
 {
diff --git a/clang/test/SemaHLSL/BuiltIns/distance-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/distance-errors.hlsl
index e7521c7251432..4ec1bcef2b6fc 100644
--- a/clang/test/SemaHLSL/BuiltIns/distance-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/distance-errors.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
 
 float test_no_second_arg(float2 p0) {
   return distance(p0);
diff --git a/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl
index 606194692931f..f514a04eb9f49 100644
--- a/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected=note
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected=note
 
 float test_no_second_arg(float2 p0) {
   return __builtin_hlsl_dot(p0);
diff --git a/clang/test/SemaHLSL/BuiltIns/dot2add-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/dot2add-errors.hlsl
index 5933faeae2aac..84333ba08b9b8 100644
--- a/clang/test/SemaHLSL/BuiltIns/dot2add-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/dot2add-errors.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
 
 float test_too_few_arg() {
   return dot2add();
diff --git a/clang/test/SemaHLSL/BuiltIns/exp-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/exp-errors.hlsl
index 1435232cbfbc5..f0076ac4e5881 100644
--- a/clang/test/SemaHLSL/BuiltIns/exp-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/exp-errors.hlsl
@@ -1,7 +1,7 @@
 
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected -DTEST_FUNC=__builtin_elementwise_exp
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected -DTEST_FUNC=__builtin_elementwise_exp2
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected -DTEST_FUNC=__builtin_elementwise_exp10
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected -DTEST_FUNC=__builtin_elementwise_exp
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected -DTEST_FUNC=__builtin_elementwise_exp2
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected -DTEST_FUNC=__builtin_elementwise_exp10
 float test_too_few_arg() {
   return TEST_FUNC();
   // expected-error@-1 {{too few arguments to function call, expected 1, have 0}}
diff --git a/clang/test/SemaHLSL/BuiltIns/faceforward-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/faceforward-errors.hlsl
index 469d55995f966..01261a00295b1 100644
--- a/clang/test/SemaHLSL/BuiltIns/faceforward-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/faceforward-errors.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
 
 float test_double_inputs(double p0, double p1, double p2) {
   return faceforward(p0, p1, p2);
diff --git a/clang/test/SemaHLSL/BuiltIns/firstbithigh-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/firstbithigh-errors.hlsl
index 8badaf0b99a20..f99e606fc6562 100644
--- a/clang/test/SemaHLSL/BuiltIns/firstbithigh-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/firstbithigh-errors.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
 
 int test_too_few_arg() {
   return firstbithigh();
diff --git a/clang/test/SemaHLSL/BuiltIns/firstbitlow-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/firstbitlow-errors.hlsl
index b12afe65a863e..37090796577fc 100644
--- a/clang/test/SemaHLSL/BuiltIns/firstbitlow-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/firstbitlow-errors.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
 
 int test_too_few_arg() {
   return firstbitlow();
diff --git a/clang/test/SemaHLSL/BuiltIns/fmod-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/fmod-errors.hlsl
index fc931139e523d..eceac9be8d7d1 100644
--- a/clang/test/SemaHLSL/BuiltIns/fmod-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/fmod-errors.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
 
 float test_no_second_arg(float2 p0) {
   return fmod(p0);
diff --git a/clang/test/SemaHLSL/BuiltIns/frac-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/frac-errors.hlsl
index 1e277186f22c4..cdf2b61c45207 100644
--- a/clang/test/SemaHLSL/BuiltIns/frac-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/frac-errors.hlsl
@@ -1,5 +1,5 @@
 
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
 
 float test_too_few_arg() {
   return __builtin_hlsl_elementwise_frac();
diff --git a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors.hlsl
index bf044797c3acb..e9cc0ed338e3e 100644
--- a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors.hlsl
@@ -1,25 +1,25 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_acos
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_asin
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_atan
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_ceil
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_cos
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_cosh
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_exp
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_exp2
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_exp10
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_floor
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_log
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_log2
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_log10
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_sin
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_sinh
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_sqrt
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_roundeven
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_tan
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_tanh
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_trunc
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_hlsl_elementwise_degrees
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_hlsl_elementwise_radians
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_acos
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_asin
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_atan
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_ceil
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_cos
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_cosh
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_exp
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_exp2
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_exp10
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_floor
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_log
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_log2
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_log10
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_sin
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_sinh
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_sqrt
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_roundeven
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_tan
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_tanh
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_trunc
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_hlsl_elementwise_degrees
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_hlsl_elementwise_radians
 
 double test_double_builtin(double p0) {
     return TEST_FUNC(p0);
diff --git a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl
index c264617558261..9e10e1afa9385 100644
--- a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_atan2
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_fmod
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_pow
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_atan2
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_fmod
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_pow
 
 double test_double_builtin(double p0, double p1) {
     return TEST_FUNC(p0, p1);
diff --git a/clang/test/SemaHLSL/BuiltIns/isinf-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/isinf-errors.hlsl
index 8d14df91f1409..a32bc9628a295 100644
--- a/clang/test/SemaHLSL/BuiltIns/isinf-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/isinf-errors.hlsl
@@ -1,5 +1,5 @@
 
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
 
 bool test_too_few_arg() {
   return __builtin_hlsl_elementwise_isinf();
diff --git a/clang/test/SemaHLSL/BuiltIns/isnan-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/isnan-errors.hlsl
index a6be28117af4f..625c415f91de2 100644
--- a/clang/test/SemaHLSL/BuiltIns/isnan-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/isnan-errors.hlsl
@@ -1,5 +1,5 @@
 
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
 
 bool test_too_few_arg() {
   return __builtin_hlsl_elementwise_isnan();
diff --git a/clang/test/SemaHLSL/BuiltIns/ldexp-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/ldexp-errors.hlsl
index 0bc7f7e40f5d3..fa146a5bce525 100644
--- a/clang/test/SemaHLSL/BuiltIns/ldexp-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/ldexp-errors.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
 
 float test_double_inputs(double p0, double p1) {
   return ldexp(p0, p1);
diff --git a/clang/test/SemaHLSL/BuiltIns/length-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/length-errors.hlsl
index 3aaafa37e8e82..8c5c9a4a0d22a 100644
--- a/clang/test/SemaHLSL/BuiltIns/length-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/length-errors.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
 
 void test_too_few_arg()
 {
diff --git a/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl
index 9592d8766dada..22720a4a37d02 100644
--- a/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected=note
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected=note
 
 float2 test_no_second_arg(float2 p0) {
   return __builtin_hlsl_lerp(p0);
diff --git a/clang/test/SemaHLSL/BuiltIns/mad-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/mad-errors.hlsl
index 5dec0f68d71fa..0e9dda7055f98 100644
--- a/clang/test/SemaHLSL/BuiltIns/mad-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/mad-errors.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected=note
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected=note
 
 float2 test_no_second_arg(float2 p0) {
   return __builtin_hlsl_mad(p0);
diff --git a/clang/test/SemaHLSL/BuiltIns/matrix-basic_types-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/matrix-basic_types-errors.hlsl
index 5ad1d6aefde38..6a6f14b52cb16 100644
--- a/clang/test/SemaHLSL/BuiltIns/matrix-basic_types-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/matrix-basic_types-errors.hlsl
@@ -3,7 +3,7 @@
 uint64_t5x5 mat;
 // expected-error@-1  {{unknown type name 'uint64_t5x5'}}
 
-// Note: this one only fails because -fnative-half-type is not set
+// Note: this one only fails because -fnative-half-type -fnative-int16-type is not set
 uint16_t4x4 mat2;
 // expected-error@-1  {{unknown type name 'uint16_t4x4'}}
 
diff --git a/clang/test/SemaHLSL/BuiltIns/max-errors-16bit.hlsl b/clang/test/SemaHLSL/BuiltIns/max-errors-16bit.hlsl
index 32a4bbd42e5ec..71c14efa60b0f 100644
--- a/clang/test/SemaHLSL/BuiltIns/max-errors-16bit.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/max-errors-16bit.hlsl
@@ -1,8 +1,8 @@
-// RUN: not %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
+// RUN: not %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
 // RUN:  -finclude-default-header -S -o - %s 2>&1 | FileCheck %s -DTEST_TYPE=half
-// RUN: not %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
+// RUN: not %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
 // RUN:  -finclude-default-header -S -o - %s 2>&1 | FileCheck %s -DTEST_TYPE=int16_t
-// RUN: not %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
+// RUN: not %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
 // RUN:  -finclude-default-header -S -o - %s 2>&1 | FileCheck %s -DTEST_TYPE=uint16_t
 
 // check we error on 16 bit type if shader model is too old
diff --git a/clang/test/SemaHLSL/BuiltIns/min-errors-16bit.hlsl b/clang/test/SemaHLSL/BuiltIns/min-errors-16bit.hlsl
index eb0066835689a..c2cffa18892d5 100644
--- a/clang/test/SemaHLSL/BuiltIns/min-errors-16bit.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/min-errors-16bit.hlsl
@@ -1,8 +1,8 @@
-// RUN: not %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
+// RUN: not %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
 // RUN:  -finclude-default-header -S -o - %s 2>&1 | FileCheck %s -DTEST_TYPE=half
-// RUN: not %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
+// RUN: not %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
 // RUN:  -finclude-default-header -S -o - %s 2>&1 | FileCheck %s -DTEST_TYPE=int16_t
-// RUN: not %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
+// RUN: not %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
 // RUN:  -finclude-default-header -S -o - %s 2>&1 | FileCheck %s -DTEST_TYPE=uint16_t
 
 // check we error on 16 bit type if shader model is too old
diff --git a/clang/test/SemaHLSL/BuiltIns/normalize-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/normalize-errors.hlsl
index 6ec32257a370f..377c2d5e41a73 100644
--- a/clang/test/SemaHLSL/BuiltIns/normalize-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/normalize-errors.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -disable-llvm-passes -verify
 
 void test_too_few_arg()
 {
diff --git a/clang/test/SemaHLSL/BuiltIns/radians-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/radians-errors.hlsl
index dbffce226b54e..70e5b671bb3c9 100644
--- a/clang/test/SemaHLSL/BuiltIns/radians-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/radians-errors.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
 
 float test_too_few_arg() {
   return __builtin_hlsl_elementwise_radians();
diff --git a/clang/test/SemaHLSL/BuiltIns/rcp-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/rcp-errors.hlsl
index 01876240e82d0..79076b4815a6e 100644
--- a/clang/test/SemaHLSL/BuiltIns/rcp-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/rcp-errors.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
 
 float test_too_few_arg() {
   return __builtin_hlsl_elementwise_rcp();
diff --git a/clang/test/SemaHLSL/BuiltIns/reflect-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/reflect-errors.hlsl
index 9934a3e525d38..b0ae770f49f20 100644
--- a/clang/test/SemaHLSL/BuiltIns/reflect-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/reflect-errors.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
 
 float test_no_second_arg(float2 p0) {
   return reflect(p0);
diff --git a/clang/test/SemaHLSL/BuiltIns/refract-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/refract-errors.hlsl
index 6cb3e56c20f0e..fce41a4a46d38 100644
--- a/clang/test/SemaHLSL/BuiltIns/refract-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/refract-errors.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
 
 float test_no_second_arg(float3 p0) {
   return refract(p0);
diff --git a/clang/test/SemaHLSL/BuiltIns/reversebits-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/reversebits-errors.hlsl
index 1ac275beba642..5b33b89cb8eb8 100644
--- a/clang/test/SemaHLSL/BuiltIns/reversebits-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/reversebits-errors.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
 
 
 double2 test_int_builtin(double2 p0) {
diff --git a/clang/test/SemaHLSL/BuiltIns/round-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/round-errors.hlsl
index 45f86450b37c2..54feed35379d7 100644
--- a/clang/test/SemaHLSL/BuiltIns/round-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/round-errors.hlsl
@@ -1,5 +1,5 @@
 
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
 
 float test_too_few_arg() {
   return __builtin_elementwise_round();
diff --git a/clang/test/SemaHLSL/BuiltIns/rsqrt-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/rsqrt-errors.hlsl
index 1f81c51207bc3..cedfcca35225e 100644
--- a/clang/test/SemaHLSL/BuiltIns/rsqrt-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/rsqrt-errors.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
 
 float test_too_few_arg() {
   return __builtin_hlsl_elementwise_rsqrt();
diff --git a/clang/test/SemaHLSL/BuiltIns/saturate-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/saturate-errors.hlsl
index 721b28f86f950..4054ebfb3f649 100644
--- a/clang/test/SemaHLSL/BuiltIns/saturate-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/saturate-errors.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected -Werror
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected -Werror
 
 float2 test_no_arg() {
   return saturate();
diff --git a/clang/test/SemaHLSL/BuiltIns/sign-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/sign-errors.hlsl
index b67725fc77e52..68583d10d1287 100644
--- a/clang/test/SemaHLSL/BuiltIns/sign-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/sign-errors.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
 
 bool test_too_few_arg() {
   return __builtin_hlsl_elementwise_sign();
diff --git a/clang/test/SemaHLSL/BuiltIns/smoothstep-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/smoothstep-errors.hlsl
index e5e902d6ab887..4c6bea8f02411 100644
--- a/clang/test/SemaHLSL/BuiltIns/smoothstep-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/smoothstep-errors.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
 
 float test_no_second_arg(float2 p0) {
   return smoothstep(p0);
diff --git a/clang/test/SemaHLSL/BuiltIns/splitdouble-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/splitdouble-errors.hlsl
index 312230a2d6aff..e2ef0f796c166 100644
--- a/clang/test/SemaHLSL/BuiltIns/splitdouble-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/splitdouble-errors.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -verify
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -verify
 
 void test_no_second_arg(double D) {
   __builtin_hlsl_elementwise_splitdouble(D);
diff --git a/clang/test/SemaHLSL/BuiltIns/step-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/step-errors.hlsl
index 5346f217b83aa..993450a17ebfb 100644
--- a/clang/test/SemaHLSL/BuiltIns/step-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/step-errors.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -disable-llvm-passes -verify
 
 void test_too_few_arg()
 {
diff --git a/clang/test/SemaHLSL/Operators/logical-not.hlsl b/clang/test/SemaHLSL/Operators/logical-not.hlsl
index d06ca3982be05..bd1a4be84c47f 100644
--- a/clang/test/SemaHLSL/Operators/logical-not.hlsl
+++ b/clang/test/SemaHLSL/Operators/logical-not.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -triple  dxil-pc-shadermodel6.6-library %s -fnative-half-type -ast-dump -ast-dump-filter=case | FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -triple  dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -ast-dump -ast-dump-filter=case | FileCheck %s
 
 // CHECK-LABEL: FunctionDecl {{.*}} used case1 'uint32_t2 (uint32_t2)'
 // CHECK-NEXT: ParmVarDecl {{.*}} used b 'uint32_t2':'vector<uint32_t, 2>'
diff --git a/clang/test/SemaHLSL/Types/Arithmetic/half_size.hlsl b/clang/test/SemaHLSL/Types/Arithmetic/half_size.hlsl
index 7de4674699930..22e18769a2fe4 100644
--- a/clang/test/SemaHLSL/Types/Arithmetic/half_size.hlsl
+++ b/clang/test/SemaHLSL/Types/Arithmetic/half_size.hlsl
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.4-library -verify %s
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.4-library -verify -fnative-half-type %s
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.4-library -verify -fnative-half-type -fnative-int16-type %s
 // RUN: %clang_cc1 -triple spirv-linux-vulkan-library -verify %s
-// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -verify -fnative-half-type %s
+// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -verify -fnative-half-type -fnative-int16-type %s
 
 // expected-no-diagnostics
 #ifdef __HLSL_ENABLE_16_BIT
diff --git a/clang/test/SemaHLSL/Types/short-errors.hlsl b/clang/test/SemaHLSL/Types/short-errors.hlsl
new file mode 100644
index 0000000000000..93250084e300b
--- /dev/null
+++ b/clang/test/SemaHLSL/Types/short-errors.hlsl
@@ -0,0 +1,21 @@
+// RUN: %clang_cc1 -finclude-default-header -triple spirv-pc-vulkan1.3-compute -verify %s
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.8-compute -verify %s
+
+void asArg(inout short F) { F + 1;}
+// expected-error@-1 {{unknown type name short}}
+
+export void asVarDecl() {
+  short A = 1;
+  // expected-error@-1 {{unknown type name short}}  
+  fn(A);
+}
+
+export short asReturnType() {
+// expected-error@-1 {{unknown type name short}}
+  return 1;
+}
+
+struct S {
+  short A;
+  // expected-error@-1 {{unknown type name short}}
+};
diff --git a/clang/test/SemaHLSL/Types/typedefs.hlsl b/clang/test/SemaHLSL/Types/typedefs.hlsl
index fd72b1ae8a47f..c9c8ff2fc02de 100644
--- a/clang/test/SemaHLSL/Types/typedefs.hlsl
+++ b/clang/test/SemaHLSL/Types/typedefs.hlsl
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.4-library -finclude-default-header -verify -fnative-half-type %s
-// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -finclude-default-header -verify -fnative-half-type %s
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.4-library -finclude-default-header -verify -fnative-half-type -fnative-int16-type %s
+// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -finclude-default-header -verify -fnative-half-type -fnative-int16-type %s
 
 // expected-no-diagnostics
 #define SizeCheck(Ty, SizeInBits)                                              \
diff --git a/clang/test/SemaHLSL/VectorOverloadResolution.hlsl b/clang/test/SemaHLSL/VectorOverloadResolution.hlsl
index b320abdd81182..756dcb4034e4e 100644
--- a/clang/test/SemaHLSL/VectorOverloadResolution.hlsl
+++ b/clang/test/SemaHLSL/VectorOverloadResolution.hlsl
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple dxil-unknown-shadermodel6.6-library -S -fnative-half-type -finclude-default-header -o - -ast-dump %s | FileCheck %s
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s --check-prefixes=CHECKIR
+// RUN: %clang_cc1 -triple dxil-unknown-shadermodel6.6-library -S -fnative-half-type -fnative-int16-type -finclude-default-header -o - -ast-dump %s | FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s --check-prefixes=CHECKIR
 void Fn(double2 D);
 void Fn(half2 H);
 
diff --git a/clang/test/SemaOpenCL/builtins-extended-image-param-gfx1100-err.cl b/clang/test/SemaOpenCL/builtins-extended-image-param-gfx1100-err.cl
new file mode 100644
index 0000000000000..47dbdd4e51782
--- /dev/null
+++ b/clang/test/SemaOpenCL/builtins-extended-image-param-gfx1100-err.cl
@@ -0,0 +1,227 @@
+// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1100 -target-feature +extended-image-insts -S -verify=expected -o - %s
+// REQUIRES: amdgpu-registered-target
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+typedef int int4 __attribute__((ext_vector_type(4)));
+typedef float float4 __attribute__((ext_vector_type(4)));
+typedef half half4 __attribute__((ext_vector_type(4)));
+
+float4 test_amdgcn_image_gather4_lz_2d_v4f32_f32_r(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_gather4_lz_2d_v4f32_f32(1, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_gather4_lz_2d_v4f32_f32' must be a constant integer}}
+}
+
+float4 test_amdgcn_image_gather4_lz_2d_v4f32_f32_g(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_gather4_lz_2d_v4f32_f32(2, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_gather4_lz_2d_v4f32_f32' must be a constant integer}}
+}
+
+float4 test_amdgcn_image_gather4_lz_2d_v4f32_f32_b(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_gather4_lz_2d_v4f32_f32(4, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_gather4_lz_2d_v4f32_f32' must be a constant integer}}
+}
+
+float4 test_amdgcn_image_gather4_lz_2d_v4f32_f32_a(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_gather4_lz_2d_v4f32_f32(8, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_gather4_lz_2d_v4f32_f32' must be a constant integer}}
+}
+
+float4 test_amdgcn_image_sample_lz_1d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_1d_v4f32_f32(i32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_lz_1d_v4f32_f32' must be a constant integer}}
+}
+
+float4 test_amdgcn_image_sample_l_1d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_1d_v4f32_f32(100, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_l_1d_v4f32_f32' must be a constant integer}}
+}
+
+float4 test_amdgcn_image_sample_d_1d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_d_1d_v4f32_f32(100, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_d_1d_v4f32_f32' must be a constant integer}}
+}
+
+float4 test_amdgcn_image_sample_lz_2d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_2d_v4f32_f32(100, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_lz_2d_v4f32_f32' must be a constant integer}}
+}
+
+float4 test_amdgcn_image_sample_l_2d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_2d_v4f32_f32(100, f32, f32, f32, tex, vec4i32, 0, f32, 103); //expected-error{{argument to '__builtin_amdgcn_image_sample_l_2d_v4f32_f32' must be a constant integer}}
+}
+
+float4 test_amdgcn_image_sample_d_2d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_d_2d_v4f32_f32(i32, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_d_2d_v4f32_f32' must be a constant integer}}
+}
+float4 test_amdgcn_image_sample_lz_3d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_3d_v4f32_f32(i32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_lz_3d_v4f32_f32' must be a constant integer}}
+}
+
+float4 test_amdgcn_image_sample_l_3d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_3d_v4f32_f32(1, f32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_l_3d_v4f32_f32' must be a constant integer}}
+}
+
+float4 test_amdgcn_image_sample_d_3d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_d_3d_v4f32_f32(1, f32, f32, f32, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_d_3d_v4f32_f32' must be a constant integer}}
+}
+
+float4 test_amdgcn_image_sample_lz_cube_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_cube_v4f32_f32(1, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_lz_cube_v4f32_f32' must be a constant integer}}
+}
+
+float4 test_amdgcn_image_sample_l_cube_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_cube_v4f32_f32(1, f32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_l_cube_v4f32_f32' must be a constant integer}}
+}
+
+float4 test_amdgcn_image_sample_lz_1darray_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_1darray_v4f32_f32(1, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_lz_1darray_v4f32_f32' must be a constant integer}}
+}
+
+float4 test_amdgcn_image_sample_l_1darray_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_1darray_v4f32_f32(1, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_l_1darray_v4f32_f32' must be a constant integer}}
+}
+
+float4 test_amdgcn_image_sample_d_1darray_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_d_1darray_v4f32_f32(1, f32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_d_1darray_v4f32_f32' must be a constant integer}}
+}
+
+float4 test_amdgcn_image_sample_lz_2darray_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_2darray_v4f32_f32(1, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_lz_2darray_v4f32_f32' must be a constant integer}}
+}
+
+float4 test_amdgcn_image_sample_l_2darray_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_2darray_v4f32_f32(1, f32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_l_2darray_v4f32_f32' must be a constant integer}}
+}
+
+float4 test_amdgcn_image_sample_d_2darray_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_d_2darray_v4f32_f32(1, f32, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_d_2darray_v4f32_f32' must be a constant integer}}
+}
+
+half4 test_amdgcn_image_sample_lz_1d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_1d_v4f16_f32(23, f32, tex, vec4i32, 0, i32, 11); //expected-error{{argument to '__builtin_amdgcn_image_sample_lz_1d_v4f16_f32' must be a constant integer}}
+}
+
+half4 test_amdgcn_image_sample_l_1d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_1d_v4f16_f32(i32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_l_1d_v4f16_f32' must be a constant integer}}
+}
+
+half4 test_amdgcn_image_sample_d_1d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_d_1d_v4f16_f32(i32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_d_1d_v4f16_f32' must be a constant integer}}
+}
+
+half4 test_amdgcn_image_sample_lz_2d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_2d_v4f16_f32(100, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_lz_2d_v4f16_f32' must be a constant integer}}
+}
+
+half4 test_amdgcn_image_sample_l_2d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_2d_v4f16_f32(100, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_l_2d_v4f16_f32' must be a constant integer}}
+}
+
+half4 test_amdgcn_image_sample_d_2d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_d_2d_v4f16_f32(100, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_d_2d_v4f16_f32' must be a constant integer}}
+}
+
+half4 test_amdgcn_image_sample_lz_3d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_3d_v4f16_f32(100, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_lz_3d_v4f16_f32' must be a constant integer}}
+}
+
+half4 test_amdgcn_image_sample_l_3d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_3d_v4f16_f32(100, f32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_l_3d_v4f16_f32' must be a constant integer}}
+}
+
+half4 test_amdgcn_image_sample_d_3d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_d_3d_v4f16_f32(100, f32, f32, f32, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_d_3d_v4f16_f32' must be a constant integer}}
+}
+
+half4 test_amdgcn_image_sample_lz_cube_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_cube_v4f16_f32(100, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_lz_cube_v4f16_f32' must be a constant integer}}
+}
+
+half4 test_amdgcn_image_sample_l_cube_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_cube_v4f16_f32(i32, f32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_l_cube_v4f16_f32' must be a constant integer}}
+}
+
+half4 test_amdgcn_image_sample_lz_1darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_1darray_v4f16_f32(i32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_lz_1darray_v4f16_f32' must be a constant integer}}
+}
+
+half4 test_amdgcn_image_sample_l_1darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_1darray_v4f16_f32(i32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_l_1darray_v4f16_f32' must be a constant integer}}
+}
+
+half4 test_amdgcn_image_sample_d_1darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_d_1darray_v4f16_f32(100, f32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_d_1darray_v4f16_f32' must be a constant integer}}
+}
+
+half4 test_amdgcn_image_sample_lz_2darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_2darray_v4f16_f32(100, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_lz_2darray_v4f16_f32' must be a constant integer}}
+}
+
+half4 test_amdgcn_image_sample_l_2darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_2darray_v4f16_f32(100, f32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_l_2darray_v4f16_f32' must be a constant integer}}
+}
+
+half4 test_amdgcn_image_sample_d_2darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_d_2darray_v4f16_f32(100, f32, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_d_2darray_v4f16_f32' must be a constant integer}}
+}
+
+float test_amdgcn_image_sample_lz_2d_f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_2d_f32_f32(1, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_lz_2d_f32_f32' must be a constant integer}}
+}
+
+float test_amdgcn_image_sample_l_2d_f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_2d_f32_f32(1, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_l_2d_f32_f32' must be a constant integer}}
+}
+
+float test_amdgcn_image_sample_d_2d_f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_d_2d_f32_f32(1, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_d_2d_f32_f32' must be a constant integer}}
+}
+
+float test_amdgcn_image_sample_lz_2darray_f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_2darray_f32_f32(1, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_lz_2darray_f32_f32' must be a constant integer}}
+}
+
+float test_amdgcn_image_sample_l_2darray_f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_2darray_f32_f32(1, f32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_l_2darray_f32_f32' must be a constant integer}}
+}
+
+float test_amdgcn_image_sample_d_2darray_f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_d_2darray_f32_f32(1, f32, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_d_2darray_f32_f32' must be a constant integer}}
+}
diff --git a/clang/test/SemaOpenCL/builtins-extended-image-param-gfx942-err.cl b/clang/test/SemaOpenCL/builtins-extended-image-param-gfx942-err.cl
new file mode 100644
index 0000000000000..e60f8c70dc7c4
--- /dev/null
+++ b/clang/test/SemaOpenCL/builtins-extended-image-param-gfx942-err.cl
@@ -0,0 +1,227 @@
+// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx942 -verify=GFX94 -S -o - %s
+// REQUIRES: amdgpu-registered-target
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+typedef int int4 __attribute__((ext_vector_type(4)));
+typedef float float4 __attribute__((ext_vector_type(4)));
+typedef half half4 __attribute__((ext_vector_type(4)));
+
+float4 test_amdgcn_image_gather4_lz_2d_v4f32_f32_r(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_gather4_lz_2d_v4f32_f32(1, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_gather4_lz_2d_v4f32_f32_r' needs target feature extended-image-insts}}
+}
+
+float4 test_amdgcn_image_gather4_lz_2d_v4f32_f32_g(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_gather4_lz_2d_v4f32_f32(2, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_gather4_lz_2d_v4f32_f32_g' needs target feature extended-image-insts}}
+}
+
+float4 test_amdgcn_image_gather4_lz_2d_v4f32_f32_b(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_gather4_lz_2d_v4f32_f32(4, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_gather4_lz_2d_v4f32_f32_b' needs target feature extended-image-insts}}
+}
+
+float4 test_amdgcn_image_gather4_lz_2d_v4f32_f32_a(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_gather4_lz_2d_v4f32_f32(8, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_gather4_lz_2d_v4f32_f32_a' needs target feature extended-image-insts}}
+}
+
+float4 test_amdgcn_image_sample_lz_1d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_1d_v4f32_f32(105, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_lz_1d_v4f32_f32' needs target feature extended-image-insts}}
+}
+
+float4 test_amdgcn_image_sample_l_1d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_1d_v4f32_f32(100, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_l_1d_v4f32_f32' needs target feature extended-image-insts}}
+}
+
+float4 test_amdgcn_image_sample_d_1d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_d_1d_v4f32_f32(100, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_d_1d_v4f32_f32' needs target feature extended-image-insts}}
+}
+
+float4 test_amdgcn_image_sample_lz_2d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_2d_v4f32_f32(100, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_lz_2d_v4f32_f32' needs target feature extended-image-insts}}
+}
+
+float4 test_amdgcn_image_sample_l_2d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_2d_v4f32_f32(10, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_l_2d_v4f32_f32' needs target feature extended-image-insts}}
+}
+
+float4 test_amdgcn_image_sample_d_2d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_d_2d_v4f32_f32(105, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_d_2d_v4f32_f32' needs target feature extended-image-insts}}
+}
+float4 test_amdgcn_image_sample_lz_3d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_3d_v4f32_f32(105, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_lz_3d_v4f32_f32' needs target feature extended-image-insts}}
+}
+
+float4 test_amdgcn_image_sample_l_3d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_3d_v4f32_f32(1, f32, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_l_3d_v4f32_f32' needs target feature extended-image-insts}}
+}
+
+float4 test_amdgcn_image_sample_d_3d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_d_3d_v4f32_f32(1, f32, f32, f32, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_d_3d_v4f32_f32' needs target feature extended-image-insts}}
+}
+
+float4 test_amdgcn_image_sample_lz_cube_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_cube_v4f32_f32(1, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_lz_cube_v4f32_f32' needs target feature extended-image-insts}}
+}
+
+float4 test_amdgcn_image_sample_l_cube_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_cube_v4f32_f32(1, f32, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_l_cube_v4f32_f32' needs target feature extended-image-insts}}
+}
+
+float4 test_amdgcn_image_sample_lz_1darray_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_1darray_v4f32_f32(1, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_lz_1darray_v4f32_f32' needs target feature extended-image-insts}}
+}
+
+float4 test_amdgcn_image_sample_l_1darray_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_1darray_v4f32_f32(1, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_l_1darray_v4f32_f32' needs target feature extended-image-insts}}
+}
+
+float4 test_amdgcn_image_sample_d_1darray_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_d_1darray_v4f32_f32(1, f32, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_d_1darray_v4f32_f32' needs target feature extended-image-insts}}
+}
+
+float4 test_amdgcn_image_sample_lz_2darray_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_2darray_v4f32_f32(1, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_lz_2darray_v4f32_f32' needs target feature extended-image-insts}}
+}
+
+float4 test_amdgcn_image_sample_l_2darray_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_2darray_v4f32_f32(1, f32, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_l_2darray_v4f32_f32' needs target feature extended-image-insts}}
+}
+
+float4 test_amdgcn_image_sample_d_2darray_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_d_2darray_v4f32_f32(1, f32, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_d_2darray_v4f32_f32' needs target feature extended-image-insts}}
+}
+
+half4 test_amdgcn_image_sample_lz_1d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_1d_v4f16_f32(105, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_lz_1d_v4f16_f32' needs target feature extended-image-insts}}
+}
+
+half4 test_amdgcn_image_sample_l_1d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_1d_v4f16_f32(105, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_l_1d_v4f16_f32' needs target feature extended-image-insts}}
+}
+
+half4 test_amdgcn_image_sample_d_1d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_d_1d_v4f16_f32(105, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_d_1d_v4f16_f32' needs target feature extended-image-insts}}
+}
+
+half4 test_amdgcn_image_sample_lz_2d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_2d_v4f16_f32(100, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_lz_2d_v4f16_f32' needs target feature extended-image-insts}}
+}
+
+half4 test_amdgcn_image_sample_l_2d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_2d_v4f16_f32(100, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_l_2d_v4f16_f32' needs target feature extended-image-insts}}
+}
+
+half4 test_amdgcn_image_sample_d_2d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_d_2d_v4f16_f32(100, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_d_2d_v4f16_f32' needs target feature extended-image-insts}}
+}
+
+half4 test_amdgcn_image_sample_lz_3d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_3d_v4f16_f32(100, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_lz_3d_v4f16_f32' needs target feature extended-image-insts}}
+}
+
+half4 test_amdgcn_image_sample_l_3d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_3d_v4f16_f32(100, f32, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_l_3d_v4f16_f32' needs target feature extended-image-insts}}
+}
+
+half4 test_amdgcn_image_sample_d_3d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_d_3d_v4f16_f32(100, f32, f32, f32, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_d_3d_v4f16_f32' needs target feature extended-image-insts}}
+}
+
+half4 test_amdgcn_image_sample_lz_cube_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_cube_v4f16_f32(100, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_lz_cube_v4f16_f32' needs target feature extended-image-insts}}
+}
+
+half4 test_amdgcn_image_sample_l_cube_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_cube_v4f16_f32(105, f32, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_l_cube_v4f16_f32' needs target feature extended-image-insts}}
+}
+
+half4 test_amdgcn_image_sample_lz_1darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_1darray_v4f16_f32(105, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_lz_1darray_v4f16_f32' needs target feature extended-image-insts}}
+}
+
+half4 test_amdgcn_image_sample_l_1darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_1darray_v4f16_f32(105, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_l_1darray_v4f16_f32' needs target feature extended-image-insts}}
+}
+
+half4 test_amdgcn_image_sample_d_1darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_d_1darray_v4f16_f32(100, f32, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_d_1darray_v4f16_f32' needs target feature extended-image-insts}}
+}
+
+half4 test_amdgcn_image_sample_lz_2darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_2darray_v4f16_f32(100, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_lz_2darray_v4f16_f32' needs target feature extended-image-insts}}
+}
+
+half4 test_amdgcn_image_sample_l_2darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_2darray_v4f16_f32(100, f32, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_l_2darray_v4f16_f32' needs target feature extended-image-insts}}
+}
+
+half4 test_amdgcn_image_sample_d_2darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_d_2darray_v4f16_f32(100, f32, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_d_2darray_v4f16_f32' needs target feature extended-image-insts}}
+}
+
+float test_amdgcn_image_sample_lz_2d_f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_2d_f32_f32(1, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_lz_2d_f32_f32' needs target feature extended-image-insts}}
+}
+
+float test_amdgcn_image_sample_l_2d_f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_2d_f32_f32(1, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_l_2d_f32_f32' needs target feature extended-image-insts}}
+}
+
+float test_amdgcn_image_sample_d_2d_f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_d_2d_f32_f32(1, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_d_2d_f32_f32' needs target feature extended-image-insts}}
+}
+
+float test_amdgcn_image_sample_lz_2darray_f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_lz_2darray_f32_f32(1, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_lz_2darray_f32_f32' needs target feature extended-image-insts}}
+}
+
+float test_amdgcn_image_sample_l_2darray_f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_l_2darray_f32_f32(1, f32, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_l_2darray_f32_f32' needs target feature extended-image-insts}}
+}
+
+float test_amdgcn_image_sample_d_2darray_f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
+
+  return __builtin_amdgcn_image_sample_d_2darray_f32_f32(1, f32, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_d_2darray_f32_f32' needs target feature extended-image-insts}}
+}
diff --git a/clang/tools/clang-shlib/CMakeLists.txt b/clang/tools/clang-shlib/CMakeLists.txt
index 945076e1ad810..a4d0aa5779a7e 100644
--- a/clang/tools/clang-shlib/CMakeLists.txt
+++ b/clang/tools/clang-shlib/CMakeLists.txt
@@ -41,6 +41,10 @@ if (CLANG_LINK_CLANG_DYLIB)
   set(INSTALL_WITH_TOOLCHAIN INSTALL_WITH_TOOLCHAIN)
 endif()
 
+if (HAIKU)
+  list(APPEND _DEPS network)
+endif()
+
 add_clang_library(clang-cpp
                   SHARED
                   ${INSTALL_WITH_TOOLCHAIN}
diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp
index fc27fd29da933..08776d9bcabfc 100644
--- a/clang/tools/libclang/CIndex.cpp
+++ b/clang/tools/libclang/CIndex.cpp
@@ -2406,6 +2406,8 @@ void OMPClauseEnqueue::VisitOMPCompareClause(const OMPCompareClause *) {}
 
 void OMPClauseEnqueue::VisitOMPFailClause(const OMPFailClause *) {}
 
+void OMPClauseEnqueue::VisitOMPThreadsetClause(const OMPThreadsetClause *) {}
+
 void OMPClauseEnqueue::VisitOMPAbsentClause(const OMPAbsentClause *) {}
 
 void OMPClauseEnqueue::VisitOMPHoldsClause(const OMPHoldsClause *) {}
diff --git a/clang/tools/scan-view/share/ScanView.py b/clang/tools/scan-view/share/ScanView.py
index a89bf3f24fc5a..9c110130315ad 100644
--- a/clang/tools/scan-view/share/ScanView.py
+++ b/clang/tools/scan-view/share/ScanView.py
@@ -1,40 +1,19 @@
-from __future__ import print_function
-
-try:
-    from http.server import HTTPServer, SimpleHTTPRequestHandler
-except ImportError:
-    from BaseHTTPServer import HTTPServer
-    from SimpleHTTPServer import SimpleHTTPRequestHandler
+from http.server import HTTPServer, SimpleHTTPRequestHandler
 import os
 import sys
-
-try:
-    from urlparse import urlparse
-    from urllib import unquote
-except ImportError:
-    from urllib.parse import urlparse, unquote
-
+from urllib.parse import urlparse, unquote
 import posixpath
-
-if sys.version_info.major >= 3:
-    from io import StringIO, BytesIO
-else:
-    from io import BytesIO, BytesIO as StringIO
-
+from io import StringIO, BytesIO
 import re
 import shutil
 import threading
 import time
 import socket
 import itertools
+import configparser
 
 import Reporter
 
-try:
-    import configparser
-except ImportError:
-    import ConfigParser as configparser
-
 ###
 # Various patterns matched or replaced by server.
 
diff --git a/clang/unittests/Driver/MultilibTest.cpp b/clang/unittests/Driver/MultilibTest.cpp
index ebb8611d97e1c..277fa266dea9b 100644
--- a/clang/unittests/Driver/MultilibTest.cpp
+++ b/clang/unittests/Driver/MultilibTest.cpp
@@ -144,7 +144,7 @@ TEST(MultilibTest, SetPushback) {
   ASSERT_TRUE(MS.size() == 2);
   for (MultilibSet::const_iterator I = MS.begin(), E = MS.end(); I != E; ++I) {
     ASSERT_TRUE(llvm::StringSwitch<bool>(I->gccSuffix())
-                    .Cases("/one", "/two", true)
+                    .Cases({"/one", "/two"}, true)
                     .Default(false));
   }
 }
diff --git a/clang/unittests/Format/AlignBracketsTest.cpp b/clang/unittests/Format/AlignBracketsTest.cpp
index ea8db51a4d18e..10ca5fb7da1ce 100644
--- a/clang/unittests/Format/AlignBracketsTest.cpp
+++ b/clang/unittests/Format/AlignBracketsTest.cpp
@@ -28,7 +28,7 @@ TEST_F(AlignBracketsTest, AlignsAfterOpenBracket) {
       "SomeLongVariableName->someFunction(foooooooo(aaaaaaaaaaaaaaa,\n"
       "                                             aaaaaaaaaaaaaaaaaaaaa));");
   FormatStyle Style = getLLVMStyle();
-  Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign;
+  Style.AlignAfterOpenBracket = false;
   verifyFormat("void aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa(\n"
                "    aaaaaaaaaaa aaaaaaaa, aaaaaaaaa aaaaaaa) {}",
                Style);
@@ -64,7 +64,7 @@ TEST_F(AlignBracketsTest, AlignsAfterOpenBracket) {
                Style);
   Style.ColumnLimit = 80;
 
-  Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
+  Style.BreakAfterOpenBracketFunction = true;
   Style.BinPackArguments = false;
   Style.BinPackParameters = FormatStyle::BPPS_OnePerLine;
   verifyFormat("void aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa(\n"
@@ -115,7 +115,9 @@ TEST_F(AlignBracketsTest, AlignsAfterOpenBracket) {
       "    XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXZZZZZZZZZZZZZZZZZZZZZZZZZ()));",
       Style);
 
-  Style.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent;
+  Style.BreakAfterOpenBracketFunction = true;
+  Style.BreakBeforeCloseBracketFunction = true;
+  Style.BreakBeforeCloseBracketBracedList = true;
   Style.BinPackArguments = false;
   Style.BinPackParameters = FormatStyle::BPPS_OnePerLine;
   verifyFormat("void aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa(\n"
@@ -254,7 +256,8 @@ TEST_F(AlignBracketsTest, AlignAfterOpenBracketBlockIndent) {
                "argument5));",
                Style);
 
-  Style.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent;
+  Style.BreakAfterOpenBracketFunction = true;
+  Style.BreakBeforeCloseBracketFunction = true;
 
   verifyFormat(Short, Style);
   verifyFormat(
@@ -378,7 +381,8 @@ TEST_F(AlignBracketsTest, AlignAfterOpenBracketBlockIndentIfStatement) {
                "}",
                Style);
 
-  Style.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent;
+  Style.BreakAfterOpenBracketFunction = true;
+  Style.BreakBeforeCloseBracketFunction = true;
 
   verifyFormat("if (foo()) {\n"
                "  return;\n"
@@ -440,7 +444,8 @@ TEST_F(AlignBracketsTest, AlignAfterOpenBracketBlockIndentForStatement) {
                "}",
                Style);
 
-  Style.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent;
+  Style.BreakAfterOpenBracketFunction = true;
+  Style.BreakBeforeCloseBracketFunction = true;
 
   verifyFormat("for (int i = 0; i < 5; ++i) {\n"
                "  doSomething();\n"
@@ -457,7 +462,8 @@ TEST_F(AlignBracketsTest, AlignAfterOpenBracketBlockIndentForStatement) {
 
 TEST_F(AlignBracketsTest, AlignAfterOpenBracketBlockIndentInitializers) {
   auto Style = getLLVMStyleWithColumns(60);
-  Style.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent;
+  Style.BreakAfterOpenBracketBracedList = true;
+  Style.BreakBeforeCloseBracketBracedList = true;
   // Aggregate initialization.
   verifyFormat("int LooooooooooooooooooooooooongVariable[2] = {\n"
                "    10000000, 20000000\n"
@@ -611,13 +617,13 @@ TEST_F(AlignBracketsTest, AllowAllArgumentsOnNextLineDontAlign) {
   StringRef Input = "functionCall(paramA, paramB, paramC);\n"
                     "void functionDecl(int A, int B, int C);";
   Style.AllowAllArgumentsOnNextLine = false;
-  Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign;
+  Style.AlignAfterOpenBracket = false;
   verifyFormat(StringRef("functionCall(paramA, paramB,\n"
                          "    paramC);\n"
                          "void functionDecl(int A, int B,\n"
                          "    int C);"),
                Input, Style);
-  Style.AlignAfterOpenBracket = FormatStyle::BAS_Align;
+  Style.AlignAfterOpenBracket = true;
   verifyFormat(StringRef("functionCall(paramA, paramB,\n"
                          "             paramC);\n"
                          "void functionDecl(int A, int B,\n"
@@ -625,13 +631,14 @@ TEST_F(AlignBracketsTest, AllowAllArgumentsOnNextLineDontAlign) {
                Input, Style);
   // However, BAS_AlwaysBreak and BAS_BlockIndent should take precedence over
   // AllowAllArgumentsOnNextLine.
-  Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
+  Style.BreakAfterOpenBracketFunction = true;
   verifyFormat(StringRef("functionCall(\n"
                          "    paramA, paramB, paramC);\n"
                          "void functionDecl(\n"
                          "    int A, int B, int C);"),
                Input, Style);
-  Style.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent;
+  Style.BreakAfterOpenBracketFunction = true;
+  Style.BreakBeforeCloseBracketFunction = true;
   verifyFormat("functionCall(\n"
                "    paramA, paramB, paramC\n"
                ");\n"
@@ -639,11 +646,12 @@ TEST_F(AlignBracketsTest, AllowAllArgumentsOnNextLineDontAlign) {
                "    int A, int B, int C\n"
                ");",
                Input, Style);
+  Style.BreakBeforeCloseBracketFunction = false;
 
   // When AllowAllArgumentsOnNextLine is set, we prefer breaking before the
   // first argument.
   Style.AllowAllArgumentsOnNextLine = true;
-  Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
+  Style.BreakAfterOpenBracketFunction = true;
   verifyFormat(StringRef("functionCall(\n"
                          "    paramA, paramB, paramC);\n"
                          "void functionDecl(\n"
@@ -651,13 +659,14 @@ TEST_F(AlignBracketsTest, AllowAllArgumentsOnNextLineDontAlign) {
                Input, Style);
   // It wouldn't fit on one line with aligned parameters so this setting
   // doesn't change anything for BAS_Align.
-  Style.AlignAfterOpenBracket = FormatStyle::BAS_Align;
+  Style.AlignAfterOpenBracket = true;
+  Style.BreakAfterOpenBracketFunction = false;
   verifyFormat(StringRef("functionCall(paramA, paramB,\n"
                          "             paramC);\n"
                          "void functionDecl(int A, int B,\n"
                          "                  int C);"),
                Input, Style);
-  Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign;
+  Style.BreakAfterOpenBracketFunction = true;
   verifyFormat(StringRef("functionCall(\n"
                          "    paramA, paramB, paramC);\n"
                          "void functionDecl(\n"
@@ -678,13 +687,14 @@ TEST_F(AlignBracketsTest, FormatsDeclarationBreakAlways) {
 
   // Ensure AlignAfterOpenBracket interacts correctly with BinPackParameters set
   // to BPPS_AlwaysOnePerLine.
-  BreakAlways.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
+  BreakAlways.BreakAfterOpenBracketFunction = true;
   verifyFormat(
       "void someLongFunctionName(\n"
       "    int aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,\n"
       "    int b);",
       BreakAlways);
-  BreakAlways.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent;
+  BreakAlways.BreakAfterOpenBracketFunction = true;
+  BreakAlways.BreakBeforeCloseBracketFunction = true;
   verifyFormat(
       "void someLongFunctionName(\n"
       "    int aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,\n"
@@ -734,7 +744,7 @@ TEST_F(AlignBracketsTest, FormatsDefinitionBreakAlways) {
 
   // Ensure AlignAfterOpenBracket interacts correctly with BinPackParameters set
   // to BPPS_AlwaysOnePerLine.
-  BreakAlways.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
+  BreakAlways.BreakAfterOpenBracketFunction = true;
   verifyFormat(
       "void someLongFunctionName(\n"
       "    int aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,\n"
@@ -743,7 +753,8 @@ TEST_F(AlignBracketsTest, FormatsDefinitionBreakAlways) {
       "      aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa, b);\n"
       "}",
       BreakAlways);
-  BreakAlways.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent;
+  BreakAlways.BreakAfterOpenBracketFunction = true;
+  BreakAlways.BreakBeforeCloseBracketFunction = true;
   verifyFormat(
       "void someLongFunctionName(\n"
       "    int aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,\n"
@@ -761,17 +772,17 @@ TEST_F(AlignBracketsTest, ParenthesesAndOperandAlignment) {
   verifyFormat("int a = f(aaaaaaaaaaaaaaaaaaaaaa &&\n"
                "          bbbbbbbbbbbbbbbbbbbbbb);",
                Style);
-  Style.AlignAfterOpenBracket = FormatStyle::BAS_Align;
+  Style.AlignAfterOpenBracket = true;
   Style.AlignOperands = FormatStyle::OAS_DontAlign;
   verifyFormat("int a = f(aaaaaaaaaaaaaaaaaaaaaa &&\n"
                "          bbbbbbbbbbbbbbbbbbbbbb);",
                Style);
-  Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign;
+  Style.AlignAfterOpenBracket = false;
   Style.AlignOperands = FormatStyle::OAS_Align;
   verifyFormat("int a = f(aaaaaaaaaaaaaaaaaaaaaa &&\n"
                "          bbbbbbbbbbbbbbbbbbbbbb);",
                Style);
-  Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign;
+  Style.AlignAfterOpenBracket = false;
   Style.AlignOperands = FormatStyle::OAS_DontAlign;
   verifyFormat("int a = f(aaaaaaaaaaaaaaaaaaaaaa &&\n"
                "    bbbbbbbbbbbbbbbbbbbbbb);",
@@ -781,7 +792,10 @@ TEST_F(AlignBracketsTest, ParenthesesAndOperandAlignment) {
 TEST_F(AlignBracketsTest, BlockIndentAndNamespace) {
   auto Style = getLLVMStyleWithColumns(120);
   Style.AllowShortNamespacesOnASingleLine = true;
-  Style.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent;
+  Style.BreakAfterOpenBracketFunction = true;
+  Style.BreakAfterOpenBracketBracedList = true;
+  Style.BreakBeforeCloseBracketFunction = true;
+  Style.BreakBeforeCloseBracketBracedList = true;
 
   verifyNoCrash(
       "namespace {\n"
diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp
index 6488e38badee7..43b21176962ea 100644
--- a/clang/unittests/Format/ConfigParseTest.cpp
+++ b/clang/unittests/Format/ConfigParseTest.cpp
@@ -172,6 +172,16 @@ TEST(ConfigParseTest, ParsesConfigurationBools) {
   CHECK_PARSE_BOOL(BinPackLongBracedList);
   CHECK_PARSE_BOOL(BreakAdjacentStringLiterals);
   CHECK_PARSE_BOOL(BreakAfterJavaFieldAnnotations);
+  CHECK_PARSE_BOOL(BreakAfterOpenBracketBracedList);
+  CHECK_PARSE_BOOL(BreakAfterOpenBracketFunction);
+  CHECK_PARSE_BOOL(BreakAfterOpenBracketIf);
+  CHECK_PARSE_BOOL(BreakAfterOpenBracketLoop);
+  CHECK_PARSE_BOOL(BreakAfterOpenBracketSwitch);
+  CHECK_PARSE_BOOL(BreakBeforeCloseBracketBracedList);
+  CHECK_PARSE_BOOL(BreakBeforeCloseBracketFunction);
+  CHECK_PARSE_BOOL(BreakBeforeCloseBracketIf);
+  CHECK_PARSE_BOOL(BreakBeforeCloseBracketLoop);
+  CHECK_PARSE_BOOL(BreakBeforeCloseBracketSwitch);
   CHECK_PARSE_BOOL(BreakBeforeTemplateCloser);
   CHECK_PARSE_BOOL(BreakBeforeTernaryOperators);
   CHECK_PARSE_BOOL(BreakStringLiterals);
@@ -533,20 +543,23 @@ TEST(ConfigParseTest, ParsesConfiguration) {
   CHECK_PARSE("EnumTrailingComma: Remove", EnumTrailingComma,
               FormatStyle::ETC_Remove);
 
-  Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
-  CHECK_PARSE("AlignAfterOpenBracket: Align", AlignAfterOpenBracket,
-              FormatStyle::BAS_Align);
-  CHECK_PARSE("AlignAfterOpenBracket: DontAlign", AlignAfterOpenBracket,
-              FormatStyle::BAS_DontAlign);
+  Style.AlignAfterOpenBracket = false;
+  CHECK_PARSE("AlignAfterOpenBracket: Align", AlignAfterOpenBracket, true);
+  CHECK_PARSE("AlignAfterOpenBracket: DontAlign", AlignAfterOpenBracket, false);
+  // For backward compatibility:
   CHECK_PARSE("AlignAfterOpenBracket: AlwaysBreak", AlignAfterOpenBracket,
-              FormatStyle::BAS_AlwaysBreak);
+              true);
+  CHECK_PARSE("AlignAfterOpenBracket: AlwaysBreak\n"
+              "BreakAfterOpenBracketIf: false",
+              BreakAfterOpenBracketIf, false);
+  CHECK_PARSE("BreakAfterOpenBracketLoop: true\n"
+              "AlignAfterOpenBracket: AlwaysBreak",
+              BreakAfterOpenBracketLoop, true);
+  CHECK_PARSE("AlignAfterOpenBracket: false", AlignAfterOpenBracket, false);
   CHECK_PARSE("AlignAfterOpenBracket: BlockIndent", AlignAfterOpenBracket,
-              FormatStyle::BAS_BlockIndent);
-  // For backward compatibility:
-  CHECK_PARSE("AlignAfterOpenBracket: false", AlignAfterOpenBracket,
-              FormatStyle::BAS_DontAlign);
-  CHECK_PARSE("AlignAfterOpenBracket: true", AlignAfterOpenBracket,
-              FormatStyle::BAS_Align);
+              true);
+  Style.AlignAfterOpenBracket = false;
+  CHECK_PARSE("AlignAfterOpenBracket: true", AlignAfterOpenBracket, true);
 
   Style.AlignEscapedNewlines = FormatStyle::ENAS_Left;
   CHECK_PARSE("AlignEscapedNewlines: DontAlign", AlignEscapedNewlines,
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index d45babe1b82ad..ca9e7925e5e95 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -5126,7 +5126,8 @@ TEST_F(FormatTest, DesignatedInitializers) {
 TEST_F(FormatTest, BracedInitializerIndentWidth) {
   auto Style = getLLVMStyleWithColumns(60);
   Style.BinPackArguments = true;
-  Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
+  Style.BreakAfterOpenBracketFunction = true;
+  Style.BreakAfterOpenBracketBracedList = true;
   Style.BracedInitializerIndentWidth = 6;
 
   // Non-initializing braces are unaffected by BracedInitializerIndentWidth.
@@ -5302,7 +5303,8 @@ TEST_F(FormatTest, BracedInitializerIndentWidth) {
                Style);
 
   // Aligning after open braces unaffected by BracedInitializerIndentWidth.
-  Style.AlignAfterOpenBracket = FormatStyle::BAS_Align;
+  Style.AlignAfterOpenBracket = true;
+  Style.BreakAfterOpenBracketBracedList = false;
   verifyFormat("SomeStruct s{\"xxxxxxxxxxxxx\", \"yyyyyyyyyyyyy\",\n"
                "             \"zzzzzzzzzzzzz\"};",
                Style);
@@ -7459,7 +7461,7 @@ TEST_F(FormatTest, ExpressionIndentationBreakingBeforeOperators) {
   Style.IndentWidth = 4;
   Style.TabWidth = 4;
   Style.UseTab = FormatStyle::UT_Always;
-  Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign;
+  Style.AlignAfterOpenBracket = false;
   Style.AlignOperands = FormatStyle::OAS_DontAlign;
   verifyFormat("return someVeryVeryLongConditionThatBarelyFitsOnALine\n"
                "\t&& (someOtherLongishConditionPart1\n"
@@ -7470,7 +7472,7 @@ TEST_F(FormatTest, ExpressionIndentationBreakingBeforeOperators) {
                Style);
 
   Style = getLLVMStyleWithColumns(20);
-  Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
+  Style.BreakAfterOpenBracketFunction = true;
   Style.BinPackParameters = FormatStyle::BPPS_OnePerLine;
   Style.BreakBeforeBinaryOperators = FormatStyle::BOS_NonAssignment;
   Style.ContinuationIndentWidth = 2;
@@ -7632,7 +7634,7 @@ TEST_F(FormatTest, NoOperandAlignment) {
                "        * cccccccccccccccccccccccccccccccccccc;",
                Style);
 
-  Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign;
+  Style.AlignAfterOpenBracket = false;
   verifyFormat("return (a > b\n"
                "    // comment1\n"
                "    // comment2\n"
@@ -11248,7 +11250,7 @@ TEST_F(FormatTest, BreakBeforeTemplateCloser) {
 
 TEST_F(FormatTest, WrapsTemplateParameters) {
   FormatStyle Style = getLLVMStyle();
-  Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign;
+  Style.AlignAfterOpenBracket = false;
   Style.BreakBeforeBinaryOperators = FormatStyle::BOS_None;
   verifyFormat(
       "template <typename... a> struct q {};\n"
@@ -11256,7 +11258,7 @@ TEST_F(FormatTest, WrapsTemplateParameters) {
       "    aaaaaaaaaaaaaaaaa, aaaaaaaaaaaaaaaaa, aaaaaaaaaaaaaaaaa>\n"
       "    y;",
       Style);
-  Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign;
+  Style.AlignAfterOpenBracket = false;
   Style.BreakBeforeBinaryOperators = FormatStyle::BOS_All;
   verifyFormat(
       "template <typename... a> struct r {};\n"
@@ -11264,7 +11266,7 @@ TEST_F(FormatTest, WrapsTemplateParameters) {
       "    aaaaaaaaaaaaaaaaa, aaaaaaaaaaaaaaaaa, aaaaaaaaaaaaaaaaa>\n"
       "    y;",
       Style);
-  Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
+  Style.BreakAfterOpenBracketFunction = true;
   Style.BreakBeforeBinaryOperators = FormatStyle::BOS_None;
   verifyFormat("template <typename... a> struct s {};\n"
                "extern s<\n"
@@ -11274,7 +11276,7 @@ TEST_F(FormatTest, WrapsTemplateParameters) {
                "aaaaaaaaaaaaaaaaaaaaaa>\n"
                "    y;",
                Style);
-  Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
+  Style.BreakAfterOpenBracketFunction = true;
   Style.BreakBeforeBinaryOperators = FormatStyle::BOS_All;
   verifyFormat("template <typename... a> struct t {};\n"
                "extern t<\n"
@@ -14302,7 +14304,7 @@ TEST_F(FormatTest, LayoutCxx11BraceInitializers) {
                "};",
                NoBinPacking);
 
-  NoBinPacking.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
+  NoBinPacking.BreakAfterOpenBracketBracedList = true;
   verifyFormat("static uint8 CddDp83848Reg[] = {\n"
                "    CDDDP83848_BMCR_REGISTER,\n"
                "    CDDDP83848_BMSR_REGISTER,\n"
@@ -15972,13 +15974,14 @@ TEST_F(FormatTest, BreaksStringLiteralOperands) {
   // In a function call with two operands, with AlignAfterOpenBracket enabled,
   // the first must be broken with a line break before it.
   FormatStyle Style = getLLVMStyleWithColumns(25);
-  Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
+  Style.BreakAfterOpenBracketFunction = true;
   verifyFormat("someFunction(\n"
                "    \"long long long \"\n"
                "    \"long\",\n"
                "    a);",
                "someFunction(\"long long long long\", a);", Style);
-  Style.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent;
+  Style.BreakAfterOpenBracketFunction = true;
+  Style.BreakBeforeCloseBracketFunction = true;
   verifyFormat("someFunction(\n"
                "    \"long long long \"\n"
                "    \"long\",\n"
@@ -17773,7 +17776,7 @@ TEST_F(FormatTest, ConfigurableSpacesInParens) {
 
   Spaces.ColumnLimit = 80;
   Spaces.IndentWidth = 4;
-  Spaces.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
+  Spaces.BreakAfterOpenBracketFunction = true;
   verifyFormat("void foo( ) {\n"
                "    size_t foo = (*(function))(\n"
                "        Foooo, Barrrrr, Foooo, Barrrr, FoooooooooLooooong, "
@@ -17798,7 +17801,8 @@ TEST_F(FormatTest, ConfigurableSpacesInParens) {
                "}",
                Spaces);
 
-  Spaces.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent;
+  Spaces.BreakAfterOpenBracketFunction = true;
+  Spaces.BreakBeforeCloseBracketFunction = true;
   verifyFormat("void foo( ) {\n"
                "    size_t foo = (*(function))(\n"
                "        Foooo, Barrrrr, Foooo, Barrrr, FoooooooooLooooong, "
@@ -22827,7 +22831,7 @@ TEST_F(FormatTest, ConstructorInitializerIndentWidth) {
       ": aaaaaaaaaaaaa(aaaaaaaaaaaaaa), aaaaaaaaaaaaa(aaaaaaaaaaaaaa),\n"
       "  aaaaaaaaaaaaa(aaaaaaaaaaaaaa) {}",
       Style);
-  Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
+  Style.BreakAfterOpenBracketFunction = true;
   verifyFormat(
       "SomeLongTemplateVariableName<\n"
       "    aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa, aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa>",
@@ -24082,7 +24086,7 @@ TEST_F(FormatTest, FormatsLambdas) {
                "      return aFunkyFunctionCall(qux);\n"
                "    }} {}",
                Style);
-  Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
+  Style.BreakAfterOpenBracketFunction = true;
   // FIXME: The following test should pass, but fails at the time of writing.
 #if 0
   // As long as all the non-lambda arguments fit on a single line, AlwaysBreak
diff --git a/clang/unittests/Format/FormatTestComments.cpp b/clang/unittests/Format/FormatTestComments.cpp
index 6b433bb384864..d7b2257605482 100644
--- a/clang/unittests/Format/FormatTestComments.cpp
+++ b/clang/unittests/Format/FormatTestComments.cpp
@@ -29,13 +29,13 @@ TEST_F(FormatTestComments, UnderstandsSingleLineComments) {
                "// line 2\n"
                "void f() {}");
 
-  EXPECT_EQ("// comment", format("//comment"));
-  EXPECT_EQ("// #comment", format("//#comment"));
+  verifyFormat("// comment", "//comment");
+  verifyFormat("// #comment", "//#comment");
 
-  EXPECT_EQ("// comment\n"
-            "// clang-format on",
-            format("//comment\n"
-                   "// clang-format on"));
+  verifyFormat("// comment\n"
+               "// clang-format on",
+               "//comment\n"
+               "// clang-format on");
 
   verifyFormat("void f() {\n"
                "  // Doesn't do anything\n"
@@ -84,11 +84,11 @@ TEST_F(FormatTestComments, UnderstandsSingleLineComments) {
                "#include \"a/b/c\" // comment");
   verifyFormat("#include <a>     // comment\n"
                "#include <a/b/c> // comment");
-  EXPECT_EQ("#include \"a\"     // comment\n"
-            "#include \"a/b/c\" // comment",
-            format("#include \\\n"
-                   "  \"a\" // comment\n"
-                   "#include \"a/b/c\" // comment"));
+  verifyFormat("#include \"a\"     // comment\n"
+               "#include \"a/b/c\" // comment",
+               "#include \\\n"
+               "  \"a\" // comment\n"
+               "#include \"a/b/c\" // comment");
 
   verifyFormat("enum E {\n"
                "  // comment\n"
@@ -96,63 +96,48 @@ TEST_F(FormatTestComments, UnderstandsSingleLineComments) {
                "  VAL_B\n"
                "};");
 
-  EXPECT_EQ("enum A {\n"
-            "  // line a\n"
-            "  a,\n"
-            "  b, // line b\n"
-            "\n"
-            "  // line c\n"
-            "  c\n"
-            "};",
-            format("enum A {\n"
-                   "  // line a\n"
-                   "  a,\n"
-                   "  b, // line b\n"
-                   "\n"
-                   "  // line c\n"
-                   "  c\n"
-                   "};",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("enum A {\n"
-            "  a, // line 1\n"
-            "  // line 2\n"
-            "};",
-            format("enum A {\n"
-                   "  a, // line 1\n"
-                   "  // line 2\n"
-                   "};",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("enum A {\n"
-            "  a, // line 1\n"
-            "     // line 2\n"
-            "};",
-            format("enum A {\n"
-                   "  a, // line 1\n"
-                   "   // line 2\n"
-                   "};",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("enum A {\n"
-            "  a, // line 1\n"
-            "  // line 2\n"
-            "  b\n"
-            "};",
-            format("enum A {\n"
-                   "  a, // line 1\n"
-                   "  // line 2\n"
-                   "  b\n"
-                   "};",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("enum A {\n"
-            "  a, // line 1\n"
-            "     // line 2\n"
-            "  b\n"
-            "};",
-            format("enum A {\n"
-                   "  a, // line 1\n"
-                   "   // line 2\n"
-                   "  b\n"
-                   "};",
-                   getLLVMStyleWithColumns(20)));
+  const auto Style20 = getLLVMStyleWithColumns(20);
+
+  verifyFormat("enum A {\n"
+               "  // line a\n"
+               "  a,\n"
+               "  b, // line b\n"
+               "\n"
+               "  // line c\n"
+               "  c\n"
+               "};",
+               Style20);
+  verifyNoChange("enum A {\n"
+                 "  a, // line 1\n"
+                 "  // line 2\n"
+                 "};",
+                 Style20);
+  verifyFormat("enum A {\n"
+               "  a, // line 1\n"
+               "     // line 2\n"
+               "};",
+               "enum A {\n"
+               "  a, // line 1\n"
+               "   // line 2\n"
+               "};",
+               Style20);
+  verifyNoChange("enum A {\n"
+                 "  a, // line 1\n"
+                 "  // line 2\n"
+                 "  b\n"
+                 "};",
+                 Style20);
+  verifyFormat("enum A {\n"
+               "  a, // line 1\n"
+               "     // line 2\n"
+               "  b\n"
+               "};",
+               "enum A {\n"
+               "  a, // line 1\n"
+               "   // line 2\n"
+               "  b\n"
+               "};",
+               Style20);
   verifyFormat(
       "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa =\n"
       "    bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb; // Trailing comment");
@@ -172,28 +157,28 @@ TEST_F(FormatTestComments, UnderstandsSingleLineComments) {
 
   verifyFormat("int aaaa; // aaaaa\n"
                "int aa;   // aaaaaaa",
-               getLLVMStyleWithColumns(20));
+               Style20);
 
-  EXPECT_EQ("void f() { // This does something ..\n"
-            "}\n"
-            "int a; // This is unrelated",
-            format("void f()    {     // This does something ..\n"
-                   "  }\n"
-                   "int   a;     // This is unrelated"));
-  EXPECT_EQ("class C {\n"
-            "  void f() { // This does something ..\n"
-            "  } // awesome..\n"
-            "\n"
-            "  int a; // This is unrelated\n"
-            "};",
-            format("class C{void f()    { // This does something ..\n"
-                   "      } // awesome..\n"
-                   " \n"
-                   "int a;    // This is unrelated\n"
-                   "};"));
-
-  EXPECT_EQ("int i; // single line trailing comment",
-            format("int i;\\\n// single line trailing comment"));
+  verifyFormat("void f() { // This does something ..\n"
+               "}\n"
+               "int a; // This is unrelated",
+               "void f()    {     // This does something ..\n"
+               "  }\n"
+               "int   a;     // This is unrelated");
+  verifyFormat("class C {\n"
+               "  void f() { // This does something ..\n"
+               "  } // awesome..\n"
+               "\n"
+               "  int a; // This is unrelated\n"
+               "};",
+               "class C{void f()    { // This does something ..\n"
+               "      } // awesome..\n"
+               " \n"
+               "int a;    // This is unrelated\n"
+               "};");
+
+  verifyFormat("int i; // single line trailing comment",
+               "int i;\\\n// single line trailing comment");
 
   verifyGoogleFormat("int a;  // Trailing comment.");
 
@@ -210,99 +195,99 @@ TEST_F(FormatTestComments, UnderstandsSingleLineComments) {
   verifyGoogleFormat(
       "aaaaaaaaaaaaaaaaaaaaaaaaaa(\n"
       "    aaaaaaaaaaaaaaaaaaaaaa);  // 81_cols_with_this_comment");
-  EXPECT_EQ("D(a, {\n"
-            "  // test\n"
-            "  int a;\n"
-            "});",
-            format("D(a, {\n"
-                   "// test\n"
-                   "int a;\n"
-                   "});"));
-
-  EXPECT_EQ("lineWith(); // comment\n"
-            "// at start\n"
-            "otherLine();",
-            format("lineWith();   // comment\n"
-                   "// at start\n"
-                   "otherLine();"));
-  EXPECT_EQ("lineWith(); // comment\n"
-            "/*\n"
-            " * at start */\n"
-            "otherLine();",
-            format("lineWith();   // comment\n"
-                   "/*\n"
-                   " * at start */\n"
-                   "otherLine();"));
-  EXPECT_EQ("lineWith(); // comment\n"
-            "            // at start\n"
-            "otherLine();",
-            format("lineWith();   // comment\n"
-                   " // at start\n"
-                   "otherLine();"));
-
-  EXPECT_EQ("lineWith(); // comment\n"
-            "// at start\n"
-            "otherLine(); // comment",
-            format("lineWith();   // comment\n"
-                   "// at start\n"
-                   "otherLine();   // comment"));
-  EXPECT_EQ("lineWith();\n"
-            "// at start\n"
-            "otherLine(); // comment",
-            format("lineWith();\n"
-                   " // at start\n"
-                   "otherLine();   // comment"));
-  EXPECT_EQ("// first\n"
-            "// at start\n"
-            "otherLine(); // comment",
-            format("// first\n"
-                   " // at start\n"
-                   "otherLine();   // comment"));
-  EXPECT_EQ("f();\n"
-            "// first\n"
-            "// at start\n"
-            "otherLine(); // comment",
-            format("f();\n"
-                   "// first\n"
-                   " // at start\n"
-                   "otherLine();   // comment"));
+  verifyFormat("D(a, {\n"
+               "  // test\n"
+               "  int a;\n"
+               "});",
+               "D(a, {\n"
+               "// test\n"
+               "int a;\n"
+               "});");
+
+  verifyFormat("lineWith(); // comment\n"
+               "// at start\n"
+               "otherLine();",
+               "lineWith();   // comment\n"
+               "// at start\n"
+               "otherLine();");
+  verifyFormat("lineWith(); // comment\n"
+               "/*\n"
+               " * at start */\n"
+               "otherLine();",
+               "lineWith();   // comment\n"
+               "/*\n"
+               " * at start */\n"
+               "otherLine();");
+  verifyFormat("lineWith(); // comment\n"
+               "            // at start\n"
+               "otherLine();",
+               "lineWith();   // comment\n"
+               " // at start\n"
+               "otherLine();");
+
+  verifyFormat("lineWith(); // comment\n"
+               "// at start\n"
+               "otherLine(); // comment",
+               "lineWith();   // comment\n"
+               "// at start\n"
+               "otherLine();   // comment");
+  verifyFormat("lineWith();\n"
+               "// at start\n"
+               "otherLine(); // comment",
+               "lineWith();\n"
+               " // at start\n"
+               "otherLine();   // comment");
+  verifyFormat("// first\n"
+               "// at start\n"
+               "otherLine(); // comment",
+               "// first\n"
+               " // at start\n"
+               "otherLine();   // comment");
+  verifyFormat("f();\n"
+               "// first\n"
+               "// at start\n"
+               "otherLine(); // comment",
+               "f();\n"
+               "// first\n"
+               " // at start\n"
+               "otherLine();   // comment");
   verifyFormat("f(); // comment\n"
                "// first\n"
                "// at start\n"
                "otherLine();");
-  EXPECT_EQ("f(); // comment\n"
-            "// first\n"
-            "// at start\n"
-            "otherLine();",
-            format("f();   // comment\n"
-                   "// first\n"
-                   " // at start\n"
-                   "otherLine();"));
-  EXPECT_EQ("f(); // comment\n"
-            "     // first\n"
-            "// at start\n"
-            "otherLine();",
-            format("f();   // comment\n"
-                   " // first\n"
-                   "// at start\n"
-                   "otherLine();"));
-  EXPECT_EQ("void f() {\n"
-            "  lineWith(); // comment\n"
-            "  // at start\n"
-            "}",
-            format("void              f() {\n"
-                   "  lineWith(); // comment\n"
-                   "  // at start\n"
-                   "}"));
-  EXPECT_EQ("int xy; // a\n"
-            "int z;  // b",
-            format("int xy;    // a\n"
-                   "int z;    //b"));
-  EXPECT_EQ("int xy; // a\n"
-            "int z; // bb",
-            format("int xy;    // a\n"
-                   "int z;    //bb",
-                   getLLVMStyleWithColumns(12)));
+  verifyFormat("f(); // comment\n"
+               "// first\n"
+               "// at start\n"
+               "otherLine();",
+               "f();   // comment\n"
+               "// first\n"
+               " // at start\n"
+               "otherLine();");
+  verifyFormat("f(); // comment\n"
+               "     // first\n"
+               "// at start\n"
+               "otherLine();",
+               "f();   // comment\n"
+               " // first\n"
+               "// at start\n"
+               "otherLine();");
+  verifyFormat("void f() {\n"
+               "  lineWith(); // comment\n"
+               "  // at start\n"
+               "}",
+               "void              f() {\n"
+               "  lineWith(); // comment\n"
+               "  // at start\n"
+               "}");
+  verifyFormat("int xy; // a\n"
+               "int z;  // b",
+               "int xy;    // a\n"
+               "int z;    //b");
+  verifyFormat("int xy; // a\n"
+               "int z; // bb",
+               "int xy;    // a\n"
+               "int z;    //bb",
+               getLLVMStyleWithColumns(12));
 
   verifyFormat("#define A                                                  \\\n"
                "  int i; /* iiiiiiiiiiiiiiiiiiiii */                       \\\n"
@@ -317,14 +302,14 @@ TEST_F(FormatTestComments, UnderstandsSingleLineComments) {
   verifyFormat("if ( // This is some comment\n"
                "    x + 3) {\n"
                "}");
-  EXPECT_EQ("if ( // This is some comment\n"
-            "     // spanning two lines\n"
-            "    x + 3) {\n"
-            "}",
-            format("if( // This is some comment\n"
-                   "     // spanning two lines\n"
-                   " x + 3) {\n"
-                   "}"));
+  verifyFormat("if ( // This is some comment\n"
+               "     // spanning two lines\n"
+               "    x + 3) {\n"
+               "}",
+               "if( // This is some comment\n"
+               "     // spanning two lines\n"
+               " x + 3) {\n"
+               "}");
 
   verifyNoCrash("/\\\n/");
   verifyNoCrash("/\\\n* */");
@@ -333,35 +318,35 @@ TEST_F(FormatTestComments, UnderstandsSingleLineComments) {
 }
 
 TEST_F(FormatTestComments, KeepsParameterWithTrailingCommentsOnTheirOwnLine) {
-  EXPECT_EQ("SomeFunction(a,\n"
-            "             b, // comment\n"
-            "             c);",
-            format("SomeFunction(a,\n"
-                   "          b, // comment\n"
-                   "      c);"));
-  EXPECT_EQ("SomeFunction(a, b,\n"
-            "             // comment\n"
-            "             c);",
-            format("SomeFunction(a,\n"
-                   "          b,\n"
-                   "  // comment\n"
-                   "      c);"));
-  EXPECT_EQ("SomeFunction(a, b, // comment (unclear relation)\n"
-            "             c);",
-            format("SomeFunction(a, b, // comment (unclear relation)\n"
-                   "      c);"));
-  EXPECT_EQ("SomeFunction(a, // comment\n"
-            "             b,\n"
-            "             c); // comment",
-            format("SomeFunction(a,     // comment\n"
-                   "          b,\n"
-                   "      c); // comment"));
-  EXPECT_EQ("aaaaaaaaaa(aaaa(aaaa,\n"
-            "                aaaa), //\n"
-            "           aaaa, bbbbb);",
-            format("aaaaaaaaaa(aaaa(aaaa,\n"
-                   "aaaa), //\n"
-                   "aaaa, bbbbb);"));
+  verifyFormat("SomeFunction(a,\n"
+               "             b, // comment\n"
+               "             c);",
+               "SomeFunction(a,\n"
+               "          b, // comment\n"
+               "      c);");
+  verifyFormat("SomeFunction(a, b,\n"
+               "             // comment\n"
+               "             c);",
+               "SomeFunction(a,\n"
+               "          b,\n"
+               "  // comment\n"
+               "      c);");
+  verifyFormat("SomeFunction(a, b, // comment (unclear relation)\n"
+               "             c);",
+               "SomeFunction(a, b, // comment (unclear relation)\n"
+               "      c);");
+  verifyFormat("SomeFunction(a, // comment\n"
+               "             b,\n"
+               "             c); // comment",
+               "SomeFunction(a,     // comment\n"
+               "          b,\n"
+               "      c); // comment");
+  verifyFormat("aaaaaaaaaa(aaaa(aaaa,\n"
+               "                aaaa), //\n"
+               "           aaaa, bbbbb);",
+               "aaaaaaaaaa(aaaa(aaaa,\n"
+               "aaaa), //\n"
+               "aaaa, bbbbb);");
 
   FormatStyle BreakAlways = getLLVMStyle();
   BreakAlways.BinPackParameters = FormatStyle::BPPS_AlwaysOnePerLine;
@@ -378,12 +363,12 @@ TEST_F(FormatTestComments, KeepsParameterWithTrailingCommentsOnTheirOwnLine) {
 }
 
 TEST_F(FormatTestComments, RemovesTrailingWhitespaceOfComments) {
-  EXPECT_EQ("// comment", format("// comment  "));
-  EXPECT_EQ("int aaaaaaa, bbbbbbb; // comment",
-            format("int aaaaaaa, bbbbbbb; // comment                   ",
-                   getLLVMStyleWithColumns(33)));
-  EXPECT_EQ("// comment\\\n", format("// comment\\\n  \t \v   \f   "));
-  EXPECT_EQ("// comment    \\\n", format("// comment    \\\n  \t \v   \f   "));
+  verifyFormat("// comment", "// comment  ");
+  verifyFormat("int aaaaaaa, bbbbbbb; // comment",
+               "int aaaaaaa, bbbbbbb; // comment                   ",
+               getLLVMStyleWithColumns(33));
+  verifyFormat("// comment\\\n", "// comment\\\n  \t \v   \f   ");
+  verifyFormat("// comment    \\\n", "// comment    \\\n  \t \v   \f   ");
 }
 
 TEST_F(FormatTestComments, UnderstandsBlockComments) {
@@ -393,16 +378,15 @@ TEST_F(FormatTestComments, UnderstandsBlockComments) {
                "    /*qq_=*/move(q), [this, b](bar<void(uint32_t)> b) {},\n"
                "    c);",
                getLLVMStyleWithColumns(60));
-  EXPECT_EQ("f(aaaaaaaaaaaaaaaaaaaaaaaaa, /* Trailing comment for aa... */\n"
-            "  bbbbbbbbbbbbbbbbbbbbbbbbb);",
-            format("f(aaaaaaaaaaaaaaaaaaaaaaaaa ,   \\\n"
-                   "/* Trailing comment for aa... */\n"
-                   "  bbbbbbbbbbbbbbbbbbbbbbbbb);"));
-  EXPECT_EQ(
-      "f(aaaaaaaaaaaaaaaaaaaaaaaaa,\n"
-      "  /* Leading comment for bb... */ bbbbbbbbbbbbbbbbbbbbbbbbb);",
-      format("f(aaaaaaaaaaaaaaaaaaaaaaaaa    ,   \n"
-             "/* Leading comment for bb... */   bbbbbbbbbbbbbbbbbbbbbbbbb);"));
+  verifyFormat("f(aaaaaaaaaaaaaaaaaaaaaaaaa, /* Trailing comment for aa... */\n"
+               "  bbbbbbbbbbbbbbbbbbbbbbbbb);",
+               "f(aaaaaaaaaaaaaaaaaaaaaaaaa ,   \\\n"
+               "/* Trailing comment for aa... */\n"
+               "  bbbbbbbbbbbbbbbbbbbbbbbbb);");
+  verifyFormat("f(aaaaaaaaaaaaaaaaaaaaaaaaa,\n"
+               "  /* Leading comment for bb... */ bbbbbbbbbbbbbbbbbbbbbbbbb);",
+               "f(aaaaaaaaaaaaaaaaaaaaaaaaa    ,   \n"
+               "/* Leading comment for bb... */   bbbbbbbbbbbbbbbbbbbbbbbbb);");
 
   verifyFormat(
       "void aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa(\n"
@@ -445,77 +429,74 @@ TEST_F(FormatTestComments, UnderstandsBlockComments) {
 }
 
 TEST_F(FormatTestComments, AlignsBlockComments) {
-  EXPECT_EQ("/*\n"
-            " * Really multi-line\n"
-            " * comment.\n"
-            " */\n"
-            "void f() {}",
-            format("  /*\n"
-                   "   * Really multi-line\n"
-                   "   * comment.\n"
-                   "   */\n"
-                   "  void f() {}"));
-  EXPECT_EQ("class C {\n"
-            "  /*\n"
-            "   * Another multi-line\n"
-            "   * comment.\n"
-            "   */\n"
-            "  void f() {}\n"
-            "};",
-            format("class C {\n"
-                   "/*\n"
-                   " * Another multi-line\n"
-                   " * comment.\n"
-                   " */\n"
-                   "void f() {}\n"
-                   "};"));
-  EXPECT_EQ("/*\n"
-            "  1. This is a comment with non-trivial formatting.\n"
-            "     1.1. We have to indent/outdent all lines equally\n"
-            "         1.1.1. to keep the formatting.\n"
-            " */",
-            format("  /*\n"
-                   "    1. This is a comment with non-trivial formatting.\n"
-                   "       1.1. We have to indent/outdent all lines equally\n"
-                   "           1.1.1. to keep the formatting.\n"
-                   "   */"));
-  EXPECT_EQ("/*\n"
-            "Don't try to outdent if there's not enough indentation.\n"
-            "*/",
-            format("  /*\n"
-                   " Don't try to outdent if there's not enough indentation.\n"
-                   " */"));
-
-  EXPECT_EQ("int i; /* Comment with empty...\n"
-            "        *\n"
-            "        * line. */",
-            format("int i; /* Comment with empty...\n"
-                   "        *\n"
-                   "        * line. */"));
-  EXPECT_EQ("int foobar = 0; /* comment */\n"
-            "int bar = 0;    /* multiline\n"
-            "                   comment 1 */\n"
-            "int baz = 0;    /* multiline\n"
-            "                   comment 2 */\n"
-            "int bzz = 0;    /* multiline\n"
-            "                   comment 3 */",
-            format("int foobar = 0; /* comment */\n"
-                   "int bar = 0;    /* multiline\n"
-                   "                   comment 1 */\n"
-                   "int baz = 0; /* multiline\n"
-                   "                comment 2 */\n"
-                   "int bzz = 0;         /* multiline\n"
-                   "                        comment 3 */"));
-  EXPECT_EQ("int foobar = 0; /* comment */\n"
-            "int bar = 0;    /* multiline\n"
-            "   comment */\n"
-            "int baz = 0;    /* multiline\n"
-            "comment */",
-            format("int foobar = 0; /* comment */\n"
-                   "int bar = 0; /* multiline\n"
-                   "comment */\n"
-                   "int baz = 0;        /* multiline\n"
-                   "comment */"));
+  verifyFormat("/*\n"
+               " * Really multi-line\n"
+               " * comment.\n"
+               " */\n"
+               "void f() {}",
+               "  /*\n"
+               "   * Really multi-line\n"
+               "   * comment.\n"
+               "   */\n"
+               "  void f() {}");
+  verifyFormat("class C {\n"
+               "  /*\n"
+               "   * Another multi-line\n"
+               "   * comment.\n"
+               "   */\n"
+               "  void f() {}\n"
+               "};",
+               "class C {\n"
+               "/*\n"
+               " * Another multi-line\n"
+               " * comment.\n"
+               " */\n"
+               "void f() {}\n"
+               "};");
+  verifyFormat("/*\n"
+               "  1. This is a comment with non-trivial formatting.\n"
+               "     1.1. We have to indent/outdent all lines equally\n"
+               "         1.1.1. to keep the formatting.\n"
+               " */",
+               "  /*\n"
+               "    1. This is a comment with non-trivial formatting.\n"
+               "       1.1. We have to indent/outdent all lines equally\n"
+               "           1.1.1. to keep the formatting.\n"
+               "   */");
+  verifyFormat("/*\n"
+               "Don't try to outdent if there's not enough indentation.\n"
+               "*/",
+               "  /*\n"
+               " Don't try to outdent if there's not enough indentation.\n"
+               " */");
+
+  verifyNoChange("int i; /* Comment with empty...\n"
+                 "        *\n"
+                 "        * line. */");
+  verifyFormat("int foobar = 0; /* comment */\n"
+               "int bar = 0;    /* multiline\n"
+               "                   comment 1 */\n"
+               "int baz = 0;    /* multiline\n"
+               "                   comment 2 */\n"
+               "int bzz = 0;    /* multiline\n"
+               "                   comment 3 */",
+               "int foobar = 0; /* comment */\n"
+               "int bar = 0;    /* multiline\n"
+               "                   comment 1 */\n"
+               "int baz = 0; /* multiline\n"
+               "                comment 2 */\n"
+               "int bzz = 0;         /* multiline\n"
+               "                        comment 3 */");
+  verifyFormat("int foobar = 0; /* comment */\n"
+               "int bar = 0;    /* multiline\n"
+               "   comment */\n"
+               "int baz = 0;    /* multiline\n"
+               "comment */",
+               "int foobar = 0; /* comment */\n"
+               "int bar = 0; /* multiline\n"
+               "comment */\n"
+               "int baz = 0;        /* multiline\n"
+               "comment */");
 }
 
 TEST_F(FormatTestComments, CommentReflowingCanBeTurnedOff) {
@@ -553,11 +534,9 @@ TEST_F(FormatTestComments, CommentReflowingCanApplyOnlyToIndents) {
 }
 
 TEST_F(FormatTestComments, CorrectlyHandlesLengthOfBlockComments) {
-  EXPECT_EQ("double *x; /* aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n"
-            "              aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa */",
-            format("double *x; /* aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n"
-                   "              aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa */"));
-  EXPECT_EQ(
+  verifyFormat("double *x; /* aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n"
+               "              aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa */");
+  verifyFormat(
       "void ffffffffffff(\n"
       "    int aaaaaaaa, int bbbbbbbb,\n"
       "    int cccccccccccc) { /*\n"
@@ -567,150 +546,141 @@ TEST_F(FormatTestComments, CorrectlyHandlesLengthOfBlockComments) {
       "                           bbbbbbbbbb\n"
       "                         */\n"
       "}",
-      format("void ffffffffffff(int aaaaaaaa, int bbbbbbbb, int cccccccccccc)\n"
-             "{ /*\n"
-             "     aaaaaaaaaa aaaaaaaaaaaaa\n"
-             "     bbbbbbbbbbbbbb bbbbbbbbbb\n"
-             "   */\n"
-             "}",
-             getLLVMStyleWithColumns(40)));
+      "void ffffffffffff(int aaaaaaaa, int bbbbbbbb, int cccccccccccc)\n"
+      "{ /*\n"
+      "     aaaaaaaaaa aaaaaaaaaaaaa\n"
+      "     bbbbbbbbbbbbbb bbbbbbbbbb\n"
+      "   */\n"
+      "}",
+      getLLVMStyleWithColumns(40));
 }
 
 TEST_F(FormatTestComments, DontBreakNonTrailingBlockComments) {
-  EXPECT_EQ("void ffffffffff(\n"
-            "    int aaaaa /* test */);",
-            format("void ffffffffff(int aaaaa /* test */);",
-                   getLLVMStyleWithColumns(35)));
+  verifyFormat("void ffffffffff(\n"
+               "    int aaaaa /* test */);",
+               "void ffffffffff(int aaaaa /* test */);",
+               getLLVMStyleWithColumns(35));
 }
 
 TEST_F(FormatTestComments, SplitsLongCxxComments) {
-  EXPECT_EQ("// A comment that\n"
-            "// doesn't fit on\n"
-            "// one line",
-            format("// A comment that doesn't fit on one line",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("/// A comment that\n"
-            "/// doesn't fit on\n"
-            "/// one line",
-            format("/// A comment that doesn't fit on one line",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("//! A comment that\n"
-            "//! doesn't fit on\n"
-            "//! one line",
-            format("//! A comment that doesn't fit on one line",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("// a b c d\n"
-            "// e f  g\n"
-            "// h i j k",
-            format("// a b c d e f  g h i j k", getLLVMStyleWithColumns(10)));
-  EXPECT_EQ(
-      "// a b c d\n"
-      "// e f  g\n"
-      "// h i j k",
-      format("\\\n// a b c d e f  g h i j k", getLLVMStyleWithColumns(10)));
-  EXPECT_EQ("if (true) // A comment that\n"
-            "          // doesn't fit on\n"
-            "          // one line",
-            format("if (true) // A comment that doesn't fit on one line   ",
-                   getLLVMStyleWithColumns(30)));
-  verifyNoChange("//    Don't_touch_leading_whitespace",
-                 getLLVMStyleWithColumns(20));
-  EXPECT_EQ("// Add leading\n"
-            "// whitespace",
-            format("//Add leading whitespace", getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("/// Add leading\n"
-            "/// whitespace",
-            format("///Add leading whitespace", getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("//! Add leading\n"
-            "//! whitespace",
-            format("//!Add leading whitespace", getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("// whitespace", format("//whitespace"));
-  EXPECT_EQ("// Even if it makes the line exceed the column\n"
-            "// limit",
-            format("//Even if it makes the line exceed the column limit",
-                   getLLVMStyleWithColumns(51)));
+  const auto Style10 = getLLVMStyleWithColumns(10);
+  const auto Style20 = getLLVMStyleWithColumns(20);
+  const auto Style22 = getLLVMStyleWithColumns(22);
+  const auto Style30 = getLLVMStyleWithColumns(30);
+
+  verifyFormat("// A comment that\n"
+               "// doesn't fit on\n"
+               "// one line",
+               "// A comment that doesn't fit on one line", Style20);
+  verifyFormat("/// A comment that\n"
+               "/// doesn't fit on\n"
+               "/// one line",
+               "/// A comment that doesn't fit on one line", Style20);
+  verifyFormat("//! A comment that\n"
+               "//! doesn't fit on\n"
+               "//! one line",
+               "//! A comment that doesn't fit on one line", Style20);
+  verifyFormat("// a b c d\n"
+               "// e f  g\n"
+               "// h i j k",
+               "// a b c d e f  g h i j k", Style10);
+  verifyFormat("// a b c d\n"
+               "// e f  g\n"
+               "// h i j k",
+               "\\\n// a b c d e f  g h i j k", Style10);
+  verifyFormat("if (true) // A comment that\n"
+               "          // doesn't fit on\n"
+               "          // one line",
+               "if (true) // A comment that doesn't fit on one line   ",
+               Style30);
+  verifyNoChange("//    Don't_touch_leading_whitespace", Style20);
+  verifyFormat("// Add leading\n"
+               "// whitespace",
+               "//Add leading whitespace", Style20);
+  verifyFormat("/// Add leading\n"
+               "/// whitespace",
+               "///Add leading whitespace", Style20);
+  verifyFormat("//! Add leading\n"
+               "//! whitespace",
+               "//!Add leading whitespace", Style20);
+  verifyFormat("// whitespace", "//whitespace");
+  verifyFormat("// Even if it makes the line exceed the column\n"
+               "// limit",
+               "//Even if it makes the line exceed the column limit",
+               getLLVMStyleWithColumns(51));
   verifyFormat("//--But not here");
-  EXPECT_EQ("/// line 1\n"
-            "// add leading whitespace",
-            format("/// line 1\n"
-                   "//add leading whitespace",
-                   getLLVMStyleWithColumns(30)));
-  EXPECT_EQ("/// line 1\n"
-            "/// line 2\n"
-            "//! line 3\n"
-            "//! line 4\n"
-            "//! line 5\n"
-            "// line 6\n"
-            "// line 7",
-            format("///line 1\n"
-                   "///line 2\n"
-                   "//! line 3\n"
-                   "//!line 4\n"
-                   "//!line 5\n"
-                   "// line 6\n"
-                   "//line 7",
-                   getLLVMStyleWithColumns(20)));
-
-  EXPECT_EQ("// aa bb cc dd",
-            format("// aa bb             cc dd                   ",
-                   getLLVMStyleWithColumns(15)));
-
-  EXPECT_EQ("// A comment before\n"
-            "// a macro\n"
-            "// definition\n"
-            "#define a b",
-            format("// A comment before a macro definition\n"
-                   "#define a b",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("void ffffff(\n"
-            "    int aaaaaaaaa,  // wwww\n"
-            "    int bbbbbbbbbb, // xxxxxxx\n"
-            "                    // yyyyyyyyyy\n"
-            "    int c, int d, int e) {}",
-            format("void ffffff(\n"
-                   "    int aaaaaaaaa, // wwww\n"
-                   "    int bbbbbbbbbb, // xxxxxxx yyyyyyyyyy\n"
-                   "    int c, int d, int e) {}",
-                   getLLVMStyleWithColumns(40)));
-  verifyFormat("//\t aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
-               getLLVMStyleWithColumns(20));
-  EXPECT_EQ(
-      "#define XXX // a b c d\n"
-      "            // e f g h",
-      format("#define XXX // a b c d e f g h", getLLVMStyleWithColumns(22)));
-  EXPECT_EQ(
-      "#define XXX // q w e r\n"
-      "            // t y u i",
-      format("#define XXX //q w e r t y u i", getLLVMStyleWithColumns(22)));
-  EXPECT_EQ("{\n"
-            "  //\n"
-            "  //\\\n"
-            "  // long 1 2 3 4 5\n"
-            "}",
-            format("{\n"
-                   "  //\n"
-                   "  //\\\n"
-                   "  // long 1 2 3 4 5\n"
-                   "}",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("{\n"
-            "  //\n"
-            "  //\\\n"
-            "  // long 1 2 3 4 5\n"
-            "  // 6\n"
-            "}",
-            format("{\n"
-                   "  //\n"
-                   "  //\\\n"
-                   "  // long 1 2 3 4 5 6\n"
-                   "}",
-                   getLLVMStyleWithColumns(20)));
-
-  EXPECT_EQ("//: A comment that\n"
-            "//: doesn't fit on\n"
-            "//: one line",
-            format("//: A comment that doesn't fit on one line",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("/// line 1\n"
+               "// add leading whitespace",
+               "/// line 1\n"
+               "//add leading whitespace",
+               Style30);
+  verifyFormat("/// line 1\n"
+               "/// line 2\n"
+               "//! line 3\n"
+               "//! line 4\n"
+               "//! line 5\n"
+               "// line 6\n"
+               "// line 7",
+               "///line 1\n"
+               "///line 2\n"
+               "//! line 3\n"
+               "//!line 4\n"
+               "//!line 5\n"
+               "// line 6\n"
+               "//line 7",
+               Style20);
+
+  verifyFormat("// aa bb cc dd",
+               "// aa bb             cc dd                   ",
+               getLLVMStyleWithColumns(15));
+
+  verifyFormat("// A comment before\n"
+               "// a macro\n"
+               "// definition\n"
+               "#define a b",
+               "// A comment before a macro definition\n"
+               "#define a b",
+               Style20);
+  verifyFormat("void ffffff(\n"
+               "    int aaaaaaaaa,  // wwww\n"
+               "    int bbbbbbbbbb, // xxxxxxx\n"
+               "                    // yyyyyyyyyy\n"
+               "    int c, int d, int e) {}",
+               "void ffffff(\n"
+               "    int aaaaaaaaa, // wwww\n"
+               "    int bbbbbbbbbb, // xxxxxxx yyyyyyyyyy\n"
+               "    int c, int d, int e) {}",
+               getLLVMStyleWithColumns(40));
+  verifyFormat("//\t aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", Style20);
+  verifyFormat("#define XXX // a b c d\n"
+               "            // e f g h",
+               "#define XXX // a b c d e f g h", Style22);
+  verifyFormat("#define XXX // q w e r\n"
+               "            // t y u i",
+               "#define XXX //q w e r t y u i", Style22);
+  verifyNoChange("{\n"
+                 "  //\n"
+                 "  //\\\n"
+                 "  // long 1 2 3 4 5\n"
+                 "}",
+                 Style20);
+  verifyFormat("{\n"
+               "  //\n"
+               "  //\\\n"
+               "  // long 1 2 3 4 5\n"
+               "  // 6\n"
+               "}",
+               "{\n"
+               "  //\n"
+               "  //\\\n"
+               "  // long 1 2 3 4 5 6\n"
+               "}",
+               Style20);
+
+  verifyFormat("//: A comment that\n"
+               "//: doesn't fit on\n"
+               "//: one line",
+               "//: A comment that doesn't fit on one line", Style20);
 
   verifyFormat(
       "//\t\t\t\tofMap(message.velocity, 0, 127, 0, ofGetWidth()\n"
@@ -719,34 +689,27 @@ TEST_F(FormatTestComments, SplitsLongCxxComments) {
 }
 
 TEST_F(FormatTestComments, PreservesHangingIndentInCxxComments) {
-  EXPECT_EQ("//     A comment\n"
-            "//     that doesn't\n"
-            "//     fit on one\n"
-            "//     line",
-            format("//     A comment that doesn't fit on one line",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("///     A comment\n"
-            "///     that doesn't\n"
-            "///     fit on one\n"
-            "///     line",
-            format("///     A comment that doesn't fit on one line",
-                   getLLVMStyleWithColumns(20)));
+  const auto Style20 = getLLVMStyleWithColumns(20);
+  verifyFormat("//     A comment\n"
+               "//     that doesn't\n"
+               "//     fit on one\n"
+               "//     line",
+               "//     A comment that doesn't fit on one line", Style20);
+  verifyFormat("///     A comment\n"
+               "///     that doesn't\n"
+               "///     fit on one\n"
+               "///     line",
+               "///     A comment that doesn't fit on one line", Style20);
 }
 
 TEST_F(FormatTestComments, DontSplitLineCommentsWithEscapedNewlines) {
-  EXPECT_EQ("// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n"
-            "// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n"
-            "// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
-            format("// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n"
-                   "// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n"
-                   "// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"));
-  EXPECT_EQ("int a; // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n"
-            "       // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n"
-            "       // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
-            format("int a; // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n"
-                   "       // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n"
-                   "       // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
-                   getLLVMStyleWithColumns(50)));
+  verifyNoChange("// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n"
+                 "// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n"
+                 "// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa");
+  verifyNoChange("int a; // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n"
+                 "       // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n"
+                 "       // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
+                 getLLVMStyleWithColumns(50));
   verifyFormat("double\n"
                "    a; // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n"
                "       // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n"
@@ -759,84 +722,72 @@ TEST_F(FormatTestComments, DontSplitLineCommentsWithEscapedNewlines) {
 
 TEST_F(FormatTestComments, DontIntroduceMultilineComments) {
   // Avoid introducing a multiline comment by breaking after `\`.
+  auto Style = getLLVMStyle();
   for (int ColumnLimit = 15; ColumnLimit <= 17; ++ColumnLimit) {
-    EXPECT_EQ(
-        "// aaaaaaaaaa\n"
-        "// \\ bb",
-        format("// aaaaaaaaaa \\ bb", getLLVMStyleWithColumns(ColumnLimit)));
-    EXPECT_EQ(
-        "// aaaaaaaaa\n"
-        "// \\  bb",
-        format("// aaaaaaaaa \\  bb", getLLVMStyleWithColumns(ColumnLimit)));
-    EXPECT_EQ(
-        "// aaaaaaaaa\n"
-        "// \\  \\ bb",
-        format("// aaaaaaaaa \\  \\ bb", getLLVMStyleWithColumns(ColumnLimit)));
+    Style.ColumnLimit = ColumnLimit;
+    verifyFormat("// aaaaaaaaaa\n"
+                 "// \\ bb",
+                 "// aaaaaaaaaa \\ bb", Style);
+    verifyFormat("// aaaaaaaaa\n"
+                 "// \\  bb",
+                 "// aaaaaaaaa \\  bb", Style);
+    verifyFormat("// aaaaaaaaa\n"
+                 "// \\  \\ bb",
+                 "// aaaaaaaaa \\  \\ bb", Style);
   }
 }
 
 TEST_F(FormatTestComments, DontSplitLineCommentsWithPragmas) {
   FormatStyle Pragmas = getLLVMStyleWithColumns(30);
   Pragmas.CommentPragmas = "^ IWYU pragma:";
-  EXPECT_EQ(
-      "// IWYU pragma: aaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbb",
-      format("// IWYU pragma: aaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbb", Pragmas));
-  EXPECT_EQ(
-      "/* IWYU pragma: aaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbb */",
-      format("/* IWYU pragma: aaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbb */", Pragmas));
+  verifyFormat("// IWYU pragma: aaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbb", Pragmas);
+  verifyFormat("/* IWYU pragma: aaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbb */", Pragmas);
 }
 
 TEST_F(FormatTestComments, PriorityOfCommentBreaking) {
-  EXPECT_EQ("if (xxx ==\n"
-            "        yyy && // aaaaaaaaaaaa bbbbbbbbb\n"
-            "    zzz)\n"
-            "  q();",
-            format("if (xxx == yyy && // aaaaaaaaaaaa bbbbbbbbb\n"
-                   "    zzz) q();",
-                   getLLVMStyleWithColumns(40)));
-  EXPECT_EQ("if (xxxxxxxxxx ==\n"
-            "        yyy && // aaaaaa bbbbbbbb cccc\n"
-            "    zzz)\n"
-            "  q();",
-            format("if (xxxxxxxxxx == yyy && // aaaaaa bbbbbbbb cccc\n"
-                   "    zzz) q();",
-                   getLLVMStyleWithColumns(40)));
-  EXPECT_EQ("if (xxxxxxxxxx &&\n"
-            "        yyy || // aaaaaa bbbbbbbb cccc\n"
-            "    zzz)\n"
-            "  q();",
-            format("if (xxxxxxxxxx && yyy || // aaaaaa bbbbbbbb cccc\n"
-                   "    zzz) q();",
-                   getLLVMStyleWithColumns(40)));
-  EXPECT_EQ("fffffffff(\n"
-            "    &xxx, // aaaaaaaaaaaa bbbbbbbbbbb\n"
-            "    zzz);",
-            format("fffffffff(&xxx, // aaaaaaaaaaaa bbbbbbbbbbb\n"
-                   " zzz);",
-                   getLLVMStyleWithColumns(40)));
+  const auto Style40 = getLLVMStyleWithColumns(40);
+  verifyFormat("if (xxx ==\n"
+               "        yyy && // aaaaaaaaaaaa bbbbbbbbb\n"
+               "    zzz)\n"
+               "  q();",
+               "if (xxx == yyy && // aaaaaaaaaaaa bbbbbbbbb\n"
+               "    zzz) q();",
+               Style40);
+  verifyFormat("if (xxxxxxxxxx ==\n"
+               "        yyy && // aaaaaa bbbbbbbb cccc\n"
+               "    zzz)\n"
+               "  q();",
+               "if (xxxxxxxxxx == yyy && // aaaaaa bbbbbbbb cccc\n"
+               "    zzz) q();",
+               Style40);
+  verifyFormat("if (xxxxxxxxxx &&\n"
+               "        yyy || // aaaaaa bbbbbbbb cccc\n"
+               "    zzz)\n"
+               "  q();",
+               "if (xxxxxxxxxx && yyy || // aaaaaa bbbbbbbb cccc\n"
+               "    zzz) q();",
+               Style40);
+  verifyFormat("fffffffff(\n"
+               "    &xxx, // aaaaaaaaaaaa bbbbbbbbbbb\n"
+               "    zzz);",
+               "fffffffff(&xxx, // aaaaaaaaaaaa bbbbbbbbbbb\n"
+               " zzz);",
+               Style40);
 }
 
 TEST_F(FormatTestComments, MultiLineCommentsInDefines) {
-  EXPECT_EQ("#define A(x) /* \\\n"
-            "  a comment     \\\n"
-            "  inside */     \\\n"
-            "  f();",
-            format("#define A(x) /* \\\n"
-                   "  a comment     \\\n"
-                   "  inside */     \\\n"
-                   "  f();",
-                   getLLVMStyleWithColumns(17)));
-  EXPECT_EQ("#define A(      \\\n"
-            "    x) /*       \\\n"
-            "  a comment     \\\n"
-            "  inside */     \\\n"
-            "  f();",
-            format("#define A(      \\\n"
-                   "    x) /*       \\\n"
-                   "  a comment     \\\n"
-                   "  inside */     \\\n"
-                   "  f();",
-                   getLLVMStyleWithColumns(17)));
+  const auto Style17 = getLLVMStyleWithColumns(17);
+  verifyNoChange("#define A(x) /* \\\n"
+                 "  a comment     \\\n"
+                 "  inside */     \\\n"
+                 "  f();",
+                 Style17);
+  verifyNoChange("#define A(      \\\n"
+                 "    x) /*       \\\n"
+                 "  a comment     \\\n"
+                 "  inside */     \\\n"
+                 "  f();",
+                 Style17);
 }
 
 TEST_F(FormatTestComments, LineCommentsInMacrosDoNotGetEscapedNewlines) {
@@ -859,285 +810,266 @@ TEST_F(FormatTestComments, LineCommentsInMacrosDoNotGetEscapedNewlines) {
 }
 
 TEST_F(FormatTestComments, ParsesCommentsAdjacentToPPDirectives) {
-  EXPECT_EQ("namespace {}\n// Test\n#define A",
-            format("namespace {}\n   // Test\n#define A"));
-  EXPECT_EQ("namespace {}\n/* Test */\n#define A",
-            format("namespace {}\n   /* Test */\n#define A"));
-  EXPECT_EQ("namespace {}\n/* Test */ #define A",
-            format("namespace {}\n   /* Test */    #define A"));
+  verifyFormat("namespace {}\n// Test\n#define A",
+               "namespace {}\n   // Test\n#define A");
+  verifyFormat("namespace {}\n/* Test */\n#define A",
+               "namespace {}\n   /* Test */\n#define A");
+  verifyFormat("namespace {}\n/* Test */ #define A",
+               "namespace {}\n   /* Test */    #define A");
 }
 
 TEST_F(FormatTestComments, KeepsLevelOfCommentBeforePPDirective) {
   // Keep the current level if the comment was originally not aligned with
   // the preprocessor directive.
-  EXPECT_EQ("void f() {\n"
-            "  int i;\n"
-            "  /* comment */\n"
-            "#ifdef A\n"
-            "  int j;\n"
-            "}",
-            format("void f() {\n"
-                   "  int i;\n"
-                   "  /* comment */\n"
-                   "#ifdef A\n"
-                   "  int j;\n"
-                   "}"));
-
-  EXPECT_EQ("void f() {\n"
-            "  int i;\n"
-            "  /* comment */\n"
-            "\n"
-            "#ifdef A\n"
-            "  int j;\n"
-            "}",
-            format("void f() {\n"
-                   "  int i;\n"
-                   "  /* comment */\n"
-                   "\n"
-                   "#ifdef A\n"
-                   "  int j;\n"
-                   "}"));
-
-  EXPECT_EQ("int f(int i) {\n"
-            "  if (true) {\n"
-            "    ++i;\n"
-            "  }\n"
-            "  // comment\n"
-            "#ifdef A\n"
-            "  int j;\n"
-            "#endif\n"
-            "}",
-            format("int f(int i) {\n"
-                   "  if (true) {\n"
-                   "    ++i;\n"
-                   "  }\n"
-                   "  // comment\n"
-                   "#ifdef A\n"
-                   "int j;\n"
-                   "#endif\n"
-                   "}"));
-
-  EXPECT_EQ("int f(int i) {\n"
-            "  if (true) {\n"
-            "    i++;\n"
-            "  } else {\n"
-            "    // comment in else\n"
-            "#ifdef A\n"
-            "    j++;\n"
-            "#endif\n"
-            "  }\n"
-            "}",
-            format("int f(int i) {\n"
-                   "  if (true) {\n"
-                   "    i++;\n"
-                   "  } else {\n"
-                   "  // comment in else\n"
-                   "#ifdef A\n"
-                   "    j++;\n"
-                   "#endif\n"
-                   "  }\n"
-                   "}"));
-
-  EXPECT_EQ("int f(int i) {\n"
-            "  if (true) {\n"
-            "    i++;\n"
-            "  } else {\n"
-            "    /* comment in else */\n"
-            "#ifdef A\n"
-            "    j++;\n"
-            "#endif\n"
-            "  }\n"
-            "}",
-            format("int f(int i) {\n"
-                   "  if (true) {\n"
-                   "    i++;\n"
-                   "  } else {\n"
-                   "  /* comment in else */\n"
-                   "#ifdef A\n"
-                   "    j++;\n"
-                   "#endif\n"
-                   "  }\n"
-                   "}"));
+  verifyNoChange("void f() {\n"
+                 "  int i;\n"
+                 "  /* comment */\n"
+                 "#ifdef A\n"
+                 "  int j;\n"
+                 "}");
+
+  verifyNoChange("void f() {\n"
+                 "  int i;\n"
+                 "  /* comment */\n"
+                 "\n"
+                 "#ifdef A\n"
+                 "  int j;\n"
+                 "}");
+
+  verifyFormat("int f(int i) {\n"
+               "  if (true) {\n"
+               "    ++i;\n"
+               "  }\n"
+               "  // comment\n"
+               "#ifdef A\n"
+               "  int j;\n"
+               "#endif\n"
+               "}",
+               "int f(int i) {\n"
+               "  if (true) {\n"
+               "    ++i;\n"
+               "  }\n"
+               "  // comment\n"
+               "#ifdef A\n"
+               "int j;\n"
+               "#endif\n"
+               "}");
+
+  verifyFormat("int f(int i) {\n"
+               "  if (true) {\n"
+               "    i++;\n"
+               "  } else {\n"
+               "    // comment in else\n"
+               "#ifdef A\n"
+               "    j++;\n"
+               "#endif\n"
+               "  }\n"
+               "}",
+               "int f(int i) {\n"
+               "  if (true) {\n"
+               "    i++;\n"
+               "  } else {\n"
+               "  // comment in else\n"
+               "#ifdef A\n"
+               "    j++;\n"
+               "#endif\n"
+               "  }\n"
+               "}");
+
+  verifyFormat("int f(int i) {\n"
+               "  if (true) {\n"
+               "    i++;\n"
+               "  } else {\n"
+               "    /* comment in else */\n"
+               "#ifdef A\n"
+               "    j++;\n"
+               "#endif\n"
+               "  }\n"
+               "}",
+               "int f(int i) {\n"
+               "  if (true) {\n"
+               "    i++;\n"
+               "  } else {\n"
+               "  /* comment in else */\n"
+               "#ifdef A\n"
+               "    j++;\n"
+               "#endif\n"
+               "  }\n"
+               "}");
 
   // Keep the current level if there is an empty line between the comment and
   // the preprocessor directive.
-  EXPECT_EQ("void f() {\n"
-            "  int i;\n"
-            "  /* comment */\n"
-            "\n"
-            "#ifdef A\n"
-            "  int j;\n"
-            "}",
-            format("void f() {\n"
-                   "  int i;\n"
-                   "/* comment */\n"
-                   "\n"
-                   "#ifdef A\n"
-                   "  int j;\n"
-                   "}"));
-
-  EXPECT_EQ("void f() {\n"
-            "  int i;\n"
-            "  return i;\n"
-            "}\n"
-            "// comment\n"
-            "\n"
-            "#ifdef A\n"
-            "int i;\n"
-            "#endif // A",
-            format("void f() {\n"
-                   "   int i;\n"
-                   "  return i;\n"
-                   "}\n"
-                   "// comment\n"
-                   "\n"
-                   "#ifdef A\n"
-                   "int i;\n"
-                   "#endif // A"));
-
-  EXPECT_EQ("int f(int i) {\n"
-            "  if (true) {\n"
-            "    ++i;\n"
-            "  }\n"
-            "  // comment\n"
-            "\n"
-            "#ifdef A\n"
-            "  int j;\n"
-            "#endif\n"
-            "}",
-            format("int f(int i) {\n"
-                   "   if (true) {\n"
-                   "    ++i;\n"
-                   "  }\n"
-                   "  // comment\n"
-                   "\n"
-                   "#ifdef A\n"
-                   "  int j;\n"
-                   "#endif\n"
-                   "}"));
-
-  EXPECT_EQ("int f(int i) {\n"
-            "  if (true) {\n"
-            "    i++;\n"
-            "  } else {\n"
-            "    // comment in else\n"
-            "\n"
-            "#ifdef A\n"
-            "    j++;\n"
-            "#endif\n"
-            "  }\n"
-            "}",
-            format("int f(int i) {\n"
-                   "  if (true) {\n"
-                   "    i++;\n"
-                   "  } else {\n"
-                   "// comment in else\n"
-                   "\n"
-                   "#ifdef A\n"
-                   "    j++;\n"
-                   "#endif\n"
-                   "  }\n"
-                   "}"));
-
-  EXPECT_EQ("int f(int i) {\n"
-            "  if (true) {\n"
-            "    i++;\n"
-            "  } else {\n"
-            "    /* comment in else */\n"
-            "\n"
-            "#ifdef A\n"
-            "    j++;\n"
-            "#endif\n"
-            "  }\n"
-            "}",
-            format("int f(int i) {\n"
-                   "  if (true) {\n"
-                   "    i++;\n"
-                   "  } else {\n"
-                   "/* comment in else */\n"
-                   "\n"
-                   "#ifdef A\n"
-                   "    j++;\n"
-                   "#endif\n"
-                   "  }\n"
-                   "}"));
+  verifyFormat("void f() {\n"
+               "  int i;\n"
+               "  /* comment */\n"
+               "\n"
+               "#ifdef A\n"
+               "  int j;\n"
+               "}",
+               "void f() {\n"
+               "  int i;\n"
+               "/* comment */\n"
+               "\n"
+               "#ifdef A\n"
+               "  int j;\n"
+               "}");
+
+  verifyFormat("void f() {\n"
+               "  int i;\n"
+               "  return i;\n"
+               "}\n"
+               "// comment\n"
+               "\n"
+               "#ifdef A\n"
+               "int i;\n"
+               "#endif // A",
+               "void f() {\n"
+               "   int i;\n"
+               "  return i;\n"
+               "}\n"
+               "// comment\n"
+               "\n"
+               "#ifdef A\n"
+               "int i;\n"
+               "#endif // A");
+
+  verifyFormat("int f(int i) {\n"
+               "  if (true) {\n"
+               "    ++i;\n"
+               "  }\n"
+               "  // comment\n"
+               "\n"
+               "#ifdef A\n"
+               "  int j;\n"
+               "#endif\n"
+               "}",
+               "int f(int i) {\n"
+               "   if (true) {\n"
+               "    ++i;\n"
+               "  }\n"
+               "  // comment\n"
+               "\n"
+               "#ifdef A\n"
+               "  int j;\n"
+               "#endif\n"
+               "}");
+
+  verifyFormat("int f(int i) {\n"
+               "  if (true) {\n"
+               "    i++;\n"
+               "  } else {\n"
+               "    // comment in else\n"
+               "\n"
+               "#ifdef A\n"
+               "    j++;\n"
+               "#endif\n"
+               "  }\n"
+               "}",
+               "int f(int i) {\n"
+               "  if (true) {\n"
+               "    i++;\n"
+               "  } else {\n"
+               "// comment in else\n"
+               "\n"
+               "#ifdef A\n"
+               "    j++;\n"
+               "#endif\n"
+               "  }\n"
+               "}");
+
+  verifyFormat("int f(int i) {\n"
+               "  if (true) {\n"
+               "    i++;\n"
+               "  } else {\n"
+               "    /* comment in else */\n"
+               "\n"
+               "#ifdef A\n"
+               "    j++;\n"
+               "#endif\n"
+               "  }\n"
+               "}",
+               "int f(int i) {\n"
+               "  if (true) {\n"
+               "    i++;\n"
+               "  } else {\n"
+               "/* comment in else */\n"
+               "\n"
+               "#ifdef A\n"
+               "    j++;\n"
+               "#endif\n"
+               "  }\n"
+               "}");
 
   // Align with the preprocessor directive if the comment was originally aligned
   // with the preprocessor directive and there is no newline between the comment
   // and the preprocessor directive.
-  EXPECT_EQ("void f() {\n"
-            "  int i;\n"
-            "/* comment */\n"
-            "#ifdef A\n"
-            "  int j;\n"
-            "}",
-            format("void f() {\n"
-                   "  int i;\n"
-                   "/* comment */\n"
-                   "#ifdef A\n"
-                   "  int j;\n"
-                   "}"));
-
-  EXPECT_EQ("int f(int i) {\n"
-            "  if (true) {\n"
-            "    ++i;\n"
-            "  }\n"
-            "// comment\n"
-            "#ifdef A\n"
-            "  int j;\n"
-            "#endif\n"
-            "}",
-            format("int f(int i) {\n"
-                   "   if (true) {\n"
-                   "    ++i;\n"
-                   "  }\n"
-                   "// comment\n"
-                   "#ifdef A\n"
-                   "  int j;\n"
-                   "#endif\n"
-                   "}"));
-
-  EXPECT_EQ("int f(int i) {\n"
-            "  if (true) {\n"
-            "    i++;\n"
-            "  } else {\n"
-            "// comment in else\n"
-            "#ifdef A\n"
-            "    j++;\n"
-            "#endif\n"
-            "  }\n"
-            "}",
-            format("int f(int i) {\n"
-                   "  if (true) {\n"
-                   "    i++;\n"
-                   "  } else {\n"
-                   " // comment in else\n"
-                   " #ifdef A\n"
-                   "    j++;\n"
-                   "#endif\n"
-                   "  }\n"
-                   "}"));
-
-  EXPECT_EQ("int f(int i) {\n"
-            "  if (true) {\n"
-            "    i++;\n"
-            "  } else {\n"
-            "/* comment in else */\n"
-            "#ifdef A\n"
-            "    j++;\n"
-            "#endif\n"
-            "  }\n"
-            "}",
-            format("int f(int i) {\n"
-                   "  if (true) {\n"
-                   "    i++;\n"
-                   "  } else {\n"
-                   " /* comment in else */\n"
-                   " #ifdef A\n"
-                   "    j++;\n"
-                   "#endif\n"
-                   "  }\n"
-                   "}"));
+  verifyNoChange("void f() {\n"
+                 "  int i;\n"
+                 "/* comment */\n"
+                 "#ifdef A\n"
+                 "  int j;\n"
+                 "}");
+
+  verifyFormat("int f(int i) {\n"
+               "  if (true) {\n"
+               "    ++i;\n"
+               "  }\n"
+               "// comment\n"
+               "#ifdef A\n"
+               "  int j;\n"
+               "#endif\n"
+               "}",
+               "int f(int i) {\n"
+               "   if (true) {\n"
+               "    ++i;\n"
+               "  }\n"
+               "// comment\n"
+               "#ifdef A\n"
+               "  int j;\n"
+               "#endif\n"
+               "}");
+
+  verifyFormat("int f(int i) {\n"
+               "  if (true) {\n"
+               "    i++;\n"
+               "  } else {\n"
+               "// comment in else\n"
+               "#ifdef A\n"
+               "    j++;\n"
+               "#endif\n"
+               "  }\n"
+               "}",
+               "int f(int i) {\n"
+               "  if (true) {\n"
+               "    i++;\n"
+               "  } else {\n"
+               " // comment in else\n"
+               " #ifdef A\n"
+               "    j++;\n"
+               "#endif\n"
+               "  }\n"
+               "}");
+
+  verifyFormat("int f(int i) {\n"
+               "  if (true) {\n"
+               "    i++;\n"
+               "  } else {\n"
+               "/* comment in else */\n"
+               "#ifdef A\n"
+               "    j++;\n"
+               "#endif\n"
+               "  }\n"
+               "}",
+               "int f(int i) {\n"
+               "  if (true) {\n"
+               "    i++;\n"
+               "  } else {\n"
+               " /* comment in else */\n"
+               " #ifdef A\n"
+               "    j++;\n"
+               "#endif\n"
+               "  }\n"
+               "}");
 
   constexpr StringRef Code("void func() {\n"
                            "  // clang-format off\n"
@@ -1189,245 +1121,239 @@ TEST_F(FormatTestComments, CommentsBetweenUnbracedBodyAndPPDirective) {
 }
 
 TEST_F(FormatTestComments, SplitsLongLinesInComments) {
+  const auto Style10 = getLLVMStyleWithColumns(10);
+  const auto Style15 = getLLVMStyleWithColumns(15);
+  const auto Style20 = getLLVMStyleWithColumns(20);
+
   // FIXME: Do we need to fix up the "  */" at the end?
   // It doesn't look like any of our current logic triggers this.
-  EXPECT_EQ("/* This is a long\n"
-            " * comment that\n"
-            " * doesn't fit on\n"
-            " * one line.  */",
-            format("/* "
-                   "This is a long                                         "
-                   "comment that "
-                   "doesn't                                    "
-                   "fit on one line.  */",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ(
-      "/* a b c d\n"
-      " * e f  g\n"
-      " * h i j k\n"
-      " */",
-      format("/* a b c d e f  g h i j k */", getLLVMStyleWithColumns(10)));
-  EXPECT_EQ(
-      "/* a b c d\n"
-      " * e f  g\n"
-      " * h i j k\n"
-      " */",
-      format("\\\n/* a b c d e f  g h i j k */", getLLVMStyleWithColumns(10)));
-  EXPECT_EQ("/*\n"
-            "This is a long\n"
-            "comment that doesn't\n"
-            "fit on one line.\n"
-            "*/",
-            format("/*\n"
-                   "This is a long                                         "
-                   "comment that doesn't                                    "
-                   "fit on one line.                                      \n"
-                   "*/",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("/*\n"
-            " * This is a long\n"
-            " * comment that\n"
-            " * doesn't fit on\n"
-            " * one line.\n"
-            " */",
-            format("/*      \n"
-                   " * This is a long "
-                   "   comment that     "
-                   "   doesn't fit on   "
-                   "   one line.                                            \n"
-                   " */",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("/*\n"
-            " * This_is_a_comment_with_words_that_dont_fit_on_one_line\n"
-            " * so_it_should_be_broken\n"
-            " * wherever_a_space_occurs\n"
-            " */",
-            format("/*\n"
-                   " * This_is_a_comment_with_words_that_dont_fit_on_one_line "
-                   "   so_it_should_be_broken "
-                   "   wherever_a_space_occurs                             \n"
-                   " */",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("/*\n"
-            " *    This_comment_can_not_be_broken_into_lines\n"
-            " */",
-            format("/*\n"
-                   " *    This_comment_can_not_be_broken_into_lines\n"
-                   " */",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("{\n"
-            "  /*\n"
-            "  This is another\n"
-            "  long comment that\n"
-            "  doesn't fit on one\n"
-            "  line    1234567890\n"
-            "  */\n"
-            "}",
-            format("{\n"
-                   "/*\n"
-                   "This is another     "
-                   "  long comment that "
-                   "  doesn't fit on one"
-                   "  line    1234567890\n"
-                   "*/\n"
-                   "}",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("{\n"
-            "  /*\n"
-            "   * This        i s\n"
-            "   * another comment\n"
-            "   * t hat  doesn' t\n"
-            "   * fit on one l i\n"
-            "   * n e\n"
-            "   */\n"
-            "}",
-            format("{\n"
-                   "/*\n"
-                   " * This        i s"
-                   "   another comment"
-                   "   t hat  doesn' t"
-                   "   fit on one l i"
-                   "   n e\n"
-                   " */\n"
-                   "}",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("/*\n"
-            " * This is a long\n"
-            " * comment that\n"
-            " * doesn't fit on\n"
-            " * one line\n"
-            " */",
-            format("   /*\n"
-                   "    * This is a long comment that doesn't fit on one line\n"
-                   "    */",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("{\n"
-            "  if (something) /* This is a\n"
-            "                    long\n"
-            "                    comment */\n"
-            "    ;\n"
-            "}",
-            format("{\n"
-                   "  if (something) /* This is a long comment */\n"
-                   "    ;\n"
-                   "}",
-                   getLLVMStyleWithColumns(30)));
-
-  EXPECT_EQ("/* A comment before\n"
-            " * a macro\n"
-            " * definition */\n"
-            "#define a b",
-            format("/* A comment before a macro definition */\n"
-                   "#define a b",
-                   getLLVMStyleWithColumns(20)));
-
-  EXPECT_EQ("/* some comment\n"
-            " *   a comment that\n"
-            " * we break another\n"
-            " * comment we have\n"
-            " * to break a left\n"
-            " * comment\n"
-            " */",
-            format("  /* some comment\n"
-                   "       *   a comment that we break\n"
-                   "   * another comment we have to break\n"
-                   "* a left comment\n"
-                   "   */",
-                   getLLVMStyleWithColumns(20)));
-
-  EXPECT_EQ("/**\n"
-            " * multiline block\n"
-            " * comment\n"
-            " *\n"
-            " */",
-            format("/**\n"
-                   " * multiline block comment\n"
-                   " *\n"
-                   " */",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("/* This is a long\n"
+               " * comment that\n"
+               " * doesn't fit on\n"
+               " * one line.  */",
+               "/* "
+               "This is a long                                         "
+               "comment that "
+               "doesn't                                    "
+               "fit on one line.  */",
+               Style20);
+  verifyFormat("/* a b c d\n"
+               " * e f  g\n"
+               " * h i j k\n"
+               " */",
+               "/* a b c d e f  g h i j k */", Style10);
+  verifyFormat("/* a b c d\n"
+               " * e f  g\n"
+               " * h i j k\n"
+               " */",
+               "\\\n/* a b c d e f  g h i j k */", Style10);
+  verifyFormat("/*\n"
+               "This is a long\n"
+               "comment that doesn't\n"
+               "fit on one line.\n"
+               "*/",
+               "/*\n"
+               "This is a long                                         "
+               "comment that doesn't                                    "
+               "fit on one line.                                      \n"
+               "*/",
+               Style20);
+  verifyFormat("/*\n"
+               " * This is a long\n"
+               " * comment that\n"
+               " * doesn't fit on\n"
+               " * one line.\n"
+               " */",
+               "/*      \n"
+               " * This is a long "
+               "   comment that     "
+               "   doesn't fit on   "
+               "   one line.                                            \n"
+               " */",
+               Style20);
+  verifyFormat("/*\n"
+               " * This_is_a_comment_with_words_that_dont_fit_on_one_line\n"
+               " * so_it_should_be_broken\n"
+               " * wherever_a_space_occurs\n"
+               " */",
+               "/*\n"
+               " * This_is_a_comment_with_words_that_dont_fit_on_one_line "
+               "   so_it_should_be_broken "
+               "   wherever_a_space_occurs                             \n"
+               " */",
+               Style20);
+  verifyNoChange("/*\n"
+                 " *    This_comment_can_not_be_broken_into_lines\n"
+                 " */",
+                 Style20);
+  verifyFormat("{\n"
+               "  /*\n"
+               "  This is another\n"
+               "  long comment that\n"
+               "  doesn't fit on one\n"
+               "  line    1234567890\n"
+               "  */\n"
+               "}",
+               "{\n"
+               "/*\n"
+               "This is another     "
+               "  long comment that "
+               "  doesn't fit on one"
+               "  line    1234567890\n"
+               "*/\n"
+               "}",
+               Style20);
+  verifyFormat("{\n"
+               "  /*\n"
+               "   * This        i s\n"
+               "   * another comment\n"
+               "   * t hat  doesn' t\n"
+               "   * fit on one l i\n"
+               "   * n e\n"
+               "   */\n"
+               "}",
+               "{\n"
+               "/*\n"
+               " * This        i s"
+               "   another comment"
+               "   t hat  doesn' t"
+               "   fit on one l i"
+               "   n e\n"
+               " */\n"
+               "}",
+               Style20);
+  verifyFormat("/*\n"
+               " * This is a long\n"
+               " * comment that\n"
+               " * doesn't fit on\n"
+               " * one line\n"
+               " */",
+               "   /*\n"
+               "    * This is a long comment that doesn't fit on one line\n"
+               "    */",
+               Style20);
+  verifyFormat("{\n"
+               "  if (something) /* This is a\n"
+               "                    long\n"
+               "                    comment */\n"
+               "    ;\n"
+               "}",
+               "{\n"
+               "  if (something) /* This is a long comment */\n"
+               "    ;\n"
+               "}",
+               getLLVMStyleWithColumns(30));
+
+  verifyFormat("/* A comment before\n"
+               " * a macro\n"
+               " * definition */\n"
+               "#define a b",
+               "/* A comment before a macro definition */\n"
+               "#define a b",
+               Style20);
+
+  verifyFormat("/* some comment\n"
+               " *   a comment that\n"
+               " * we break another\n"
+               " * comment we have\n"
+               " * to break a left\n"
+               " * comment\n"
+               " */",
+               "  /* some comment\n"
+               "       *   a comment that we break\n"
+               "   * another comment we have to break\n"
+               "* a left comment\n"
+               "   */",
+               Style20);
+
+  verifyFormat("/**\n"
+               " * multiline block\n"
+               " * comment\n"
+               " *\n"
+               " */",
+               "/**\n"
+               " * multiline block comment\n"
+               " *\n"
+               " */",
+               Style20);
 
   // This reproduces a crashing bug where both adaptStartOfLine and
   // getCommentSplit were trying to wrap after the "/**".
-  verifyFormat("/** multilineblockcommentwithnowrapopportunity */",
-               getLLVMStyleWithColumns(20));
+  verifyFormat("/** multilineblockcommentwithnowrapopportunity */", Style20);
 
-  EXPECT_EQ("/*\n"
-            "\n"
-            "\n"
-            "    */",
-            format("  /*       \n"
-                   "      \n"
-                   "               \n"
-                   "      */"));
-
-  EXPECT_EQ("/* a a */",
-            format("/* a a            */", getLLVMStyleWithColumns(15)));
-  EXPECT_EQ("/* a a bc  */",
-            format("/* a a            bc  */", getLLVMStyleWithColumns(15)));
-  EXPECT_EQ("/* aaa aaa\n"
-            " * aaaaa */",
-            format("/* aaa aaa aaaaa       */", getLLVMStyleWithColumns(15)));
-  EXPECT_EQ("/* aaa aaa\n"
-            " * aaaaa     */",
-            format("/* aaa aaa aaaaa     */", getLLVMStyleWithColumns(15)));
+  verifyFormat("/*\n"
+               "\n"
+               "\n"
+               "    */",
+               "  /*       \n"
+               "      \n"
+               "               \n"
+               "      */");
+
+  verifyFormat("/* a a */", "/* a a            */", Style15);
+  verifyFormat("/* a a bc  */", "/* a a            bc  */", Style15);
+  verifyFormat("/* aaa aaa\n"
+               " * aaaaa */",
+               "/* aaa aaa aaaaa       */", Style15);
+  verifyFormat("/* aaa aaa\n"
+               " * aaaaa     */",
+               "/* aaa aaa aaaaa     */", Style15);
 }
 
 TEST_F(FormatTestComments, SplitsLongLinesInCommentsInPreprocessor) {
-  EXPECT_EQ("#define X          \\\n"
-            "  /*               \\\n"
-            "   Test            \\\n"
-            "   Macro comment   \\\n"
-            "   with a long     \\\n"
-            "   line            \\\n"
-            "   */              \\\n"
-            "  A + B",
-            format("#define X \\\n"
-                   "  /*\n"
-                   "   Test\n"
-                   "   Macro comment with a long  line\n"
-                   "   */ \\\n"
-                   "  A + B",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("#define X          \\\n"
-            "  /* Macro comment \\\n"
-            "     with a long   \\\n"
-            "     line */       \\\n"
-            "  A + B",
-            format("#define X \\\n"
-                   "  /* Macro comment with a long\n"
-                   "     line */ \\\n"
-                   "  A + B",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("#define X          \\\n"
-            "  /* Macro comment \\\n"
-            "   * with a long   \\\n"
-            "   * line */       \\\n"
-            "  A + B",
-            format("#define X \\\n"
-                   "  /* Macro comment with a long  line */ \\\n"
-                   "  A + B",
-                   getLLVMStyleWithColumns(20)));
+  const auto Style20 = getLLVMStyleWithColumns(20);
+  verifyFormat("#define X          \\\n"
+               "  /*               \\\n"
+               "   Test            \\\n"
+               "   Macro comment   \\\n"
+               "   with a long     \\\n"
+               "   line            \\\n"
+               "   */              \\\n"
+               "  A + B",
+               "#define X \\\n"
+               "  /*\n"
+               "   Test\n"
+               "   Macro comment with a long  line\n"
+               "   */ \\\n"
+               "  A + B",
+               Style20);
+  verifyFormat("#define X          \\\n"
+               "  /* Macro comment \\\n"
+               "     with a long   \\\n"
+               "     line */       \\\n"
+               "  A + B",
+               "#define X \\\n"
+               "  /* Macro comment with a long\n"
+               "     line */ \\\n"
+               "  A + B",
+               Style20);
+  verifyFormat("#define X          \\\n"
+               "  /* Macro comment \\\n"
+               "   * with a long   \\\n"
+               "   * line */       \\\n"
+               "  A + B",
+               "#define X \\\n"
+               "  /* Macro comment with a long  line */ \\\n"
+               "  A + B",
+               Style20);
 }
 
 TEST_F(FormatTestComments, KeepsTrailingPPCommentsAndSectionCommentsSeparate) {
   verifyFormat("#ifdef A // line about A\n"
                "// section comment\n"
-               "#endif",
-               getLLVMStyleWithColumns(80));
+               "#endif");
+  verifyFormat("#ifdef A // line 1 about A\n"
+               "         // line 2 about A\n"
+               "// section comment\n"
+               "#endif");
   verifyFormat("#ifdef A // line 1 about A\n"
                "         // line 2 about A\n"
                "// section comment\n"
                "#endif",
-               getLLVMStyleWithColumns(80));
-  EXPECT_EQ("#ifdef A // line 1 about A\n"
-            "         // line 2 about A\n"
-            "// section comment\n"
-            "#endif",
-            format("#ifdef A // line 1 about A\n"
-                   "          // line 2 about A\n"
-                   "// section comment\n"
-                   "#endif",
-                   getLLVMStyleWithColumns(80)));
+               "#ifdef A // line 1 about A\n"
+               "          // line 2 about A\n"
+               "// section comment\n"
+               "#endif");
   verifyFormat("int f() {\n"
                "  int i;\n"
                "#ifdef A // comment about A\n"
@@ -1438,46 +1364,41 @@ TEST_F(FormatTestComments, KeepsTrailingPPCommentsAndSectionCommentsSeparate) {
                "  // section comment 3\n"
                "  i = 4;\n"
                "#endif\n"
-               "}",
-               getLLVMStyleWithColumns(80));
+               "}");
 }
 
 TEST_F(FormatTestComments, AlignsPPElseEndifComments) {
+  const auto Style20 = getLLVMStyleWithColumns(20);
   verifyFormat("#if A\n"
                "#else  // A\n"
                "int iiii;\n"
                "#endif // B",
-               getLLVMStyleWithColumns(20));
+               Style20);
   verifyFormat("#if A\n"
                "#else  // A\n"
                "int iiii; // CC\n"
                "#endif // B",
-               getLLVMStyleWithColumns(20));
-  EXPECT_EQ("#if A\n"
-            "#else  // A1\n"
-            "       // A2\n"
-            "int ii;\n"
-            "#endif // B",
-            format("#if A\n"
-                   "#else  // A1\n"
-                   "       // A2\n"
-                   "int ii;\n"
-                   "#endif // B",
-                   getLLVMStyleWithColumns(20)));
+               Style20);
+  verifyNoChange("#if A\n"
+                 "#else  // A1\n"
+                 "       // A2\n"
+                 "int ii;\n"
+                 "#endif // B",
+                 Style20);
 }
 
 TEST_F(FormatTestComments, CommentsInStaticInitializers) {
-  EXPECT_EQ(
+  verifyFormat(
       "static SomeType type = {aaaaaaaaaaaaaaaaaaaa, /* comment */\n"
       "                        aaaaaaaaaaaaaaaaaaaa /* comment */,\n"
       "                        /* comment */ aaaaaaaaaaaaaaaaaaaa,\n"
       "                        aaaaaaaaaaaaaaaaaaaa, // comment\n"
       "                        aaaaaaaaaaaaaaaaaaaa};",
-      format("static SomeType type = { aaaaaaaaaaaaaaaaaaaa  ,  /* comment */\n"
-             "                   aaaaaaaaaaaaaaaaaaaa   /* comment */ ,\n"
-             "                     /* comment */   aaaaaaaaaaaaaaaaaaaa ,\n"
-             "              aaaaaaaaaaaaaaaaaaaa ,   // comment\n"
-             "                  aaaaaaaaaaaaaaaaaaaa };"));
+      "static SomeType type = { aaaaaaaaaaaaaaaaaaaa  ,  /* comment */\n"
+      "                   aaaaaaaaaaaaaaaaaaaa   /* comment */ ,\n"
+      "                     /* comment */   aaaaaaaaaaaaaaaaaaaa ,\n"
+      "              aaaaaaaaaaaaaaaaaaaa ,   // comment\n"
+      "                  aaaaaaaaaaaaaaaaaaaa };");
   verifyFormat("static SomeType type = {aaaaaaaaaaa, // comment for aa...\n"
                "                        bbbbbbbbbbb, ccccccccccc};");
   verifyFormat("static SomeType type = {aaaaaaaaaaa,\n"
@@ -1500,32 +1421,32 @@ TEST_F(FormatTestComments, CommentsInStaticInitializers) {
                "       {// Group #3\n"
                "        g, h, i}};");
 
-  EXPECT_EQ("S s = {\n"
-            "    // Some comment\n"
-            "    a,\n"
-            "\n"
-            "    // Comment after empty line\n"
-            "    b}",
-            format("S s =    {\n"
-                   "      // Some comment\n"
-                   "  a,\n"
-                   "  \n"
-                   "     // Comment after empty line\n"
-                   "      b\n"
-                   "}"));
-  EXPECT_EQ("S s = {\n"
-            "    /* Some comment */\n"
-            "    a,\n"
-            "\n"
-            "    /* Comment after empty line */\n"
-            "    b}",
-            format("S s =    {\n"
-                   "      /* Some comment */\n"
-                   "  a,\n"
-                   "  \n"
-                   "     /* Comment after empty line */\n"
-                   "      b\n"
-                   "}"));
+  verifyFormat("S s = {\n"
+               "    // Some comment\n"
+               "    a,\n"
+               "\n"
+               "    // Comment after empty line\n"
+               "    b}",
+               "S s =    {\n"
+               "      // Some comment\n"
+               "  a,\n"
+               "  \n"
+               "     // Comment after empty line\n"
+               "      b\n"
+               "}");
+  verifyFormat("S s = {\n"
+               "    /* Some comment */\n"
+               "    a,\n"
+               "\n"
+               "    /* Comment after empty line */\n"
+               "    b}",
+               "S s =    {\n"
+               "      /* Some comment */\n"
+               "  a,\n"
+               "  \n"
+               "     /* Comment after empty line */\n"
+               "      b\n"
+               "}");
   verifyFormat("const uint8_t aaaaaaaaaaaaaaaaaaaaaa[0] = {\n"
                "    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // comment\n"
                "    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // comment\n"
@@ -1533,486 +1454,482 @@ TEST_F(FormatTestComments, CommentsInStaticInitializers) {
 }
 
 TEST_F(FormatTestComments, LineCommentsAfterRightBrace) {
-  EXPECT_EQ("if (true) { // comment about branch\n"
-            "  // comment about f\n"
-            "  f();\n"
-            "}",
-            format("if (true) { // comment about branch\n"
-                   "  // comment about f\n"
-                   "  f();\n"
-                   "}",
-                   getLLVMStyleWithColumns(80)));
-  EXPECT_EQ("if (1) { // if line 1\n"
-            "         // if line 2\n"
-            "         // if line 3\n"
-            "  // f line 1\n"
-            "  // f line 2\n"
-            "  f();\n"
-            "} else { // else line 1\n"
-            "         // else line 2\n"
-            "         // else line 3\n"
-            "  // g line 1\n"
-            "  g();\n"
-            "}",
-            format("if (1) { // if line 1\n"
-                   "          // if line 2\n"
-                   "        // if line 3\n"
-                   "  // f line 1\n"
-                   "    // f line 2\n"
-                   "  f();\n"
-                   "} else { // else line 1\n"
-                   "        // else line 2\n"
-                   "         // else line 3\n"
-                   "  // g line 1\n"
-                   "  g();\n"
-                   "}"));
-  EXPECT_EQ("do { // line 1\n"
-            "     // line 2\n"
-            "     // line 3\n"
-            "  f();\n"
-            "} while (true);",
-            format("do { // line 1\n"
-                   "     // line 2\n"
-                   "   // line 3\n"
-                   "  f();\n"
-                   "} while (true);",
-                   getLLVMStyleWithColumns(80)));
-  EXPECT_EQ("while (a < b) { // line 1\n"
-            "  // line 2\n"
-            "  // line 3\n"
-            "  f();\n"
-            "}",
-            format("while (a < b) {// line 1\n"
-                   "  // line 2\n"
-                   "  // line 3\n"
-                   "  f();\n"
-                   "}",
-                   getLLVMStyleWithColumns(80)));
+  verifyFormat("if (true) { // comment about branch\n"
+               "  // comment about f\n"
+               "  f();\n"
+               "}");
+  verifyFormat("if (1) { // if line 1\n"
+               "         // if line 2\n"
+               "         // if line 3\n"
+               "  // f line 1\n"
+               "  // f line 2\n"
+               "  f();\n"
+               "} else { // else line 1\n"
+               "         // else line 2\n"
+               "         // else line 3\n"
+               "  // g line 1\n"
+               "  g();\n"
+               "}",
+               "if (1) { // if line 1\n"
+               "          // if line 2\n"
+               "        // if line 3\n"
+               "  // f line 1\n"
+               "    // f line 2\n"
+               "  f();\n"
+               "} else { // else line 1\n"
+               "        // else line 2\n"
+               "         // else line 3\n"
+               "  // g line 1\n"
+               "  g();\n"
+               "}");
+  verifyFormat("do { // line 1\n"
+               "     // line 2\n"
+               "     // line 3\n"
+               "  f();\n"
+               "} while (true);",
+               "do { // line 1\n"
+               "     // line 2\n"
+               "   // line 3\n"
+               "  f();\n"
+               "} while (true);");
+  verifyFormat("while (a < b) { // line 1\n"
+               "  // line 2\n"
+               "  // line 3\n"
+               "  f();\n"
+               "}",
+               "while (a < b) {// line 1\n"
+               "  // line 2\n"
+               "  // line 3\n"
+               "  f();\n"
+               "}");
 }
 
 TEST_F(FormatTestComments, ReflowsComments) {
+  const auto Style20 = getLLVMStyleWithColumns(20);
+  const auto Style22 = getLLVMStyleWithColumns(22);
+
   // Break a long line and reflow with the full next line.
-  EXPECT_EQ("// long long long\n"
-            "// long long",
-            format("// long long long long\n"
-                   "// long",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("// long long long\n"
+               "// long long",
+               "// long long long long\n"
+               "// long",
+               Style20);
 
   // Keep the trailing newline while reflowing.
-  EXPECT_EQ("// long long long\n"
-            "// long long",
-            format("// long long long long\n"
-                   "// long",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("// long long long\n"
+               "// long long",
+               "// long long long long\n"
+               "// long",
+               Style20);
 
   // Break a long line and reflow with a part of the next line.
-  EXPECT_EQ("// long long long\n"
-            "// long long\n"
-            "// long_long",
-            format("// long long long long\n"
-                   "// long long_long",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("// long long long\n"
+               "// long long\n"
+               "// long_long",
+               "// long long long long\n"
+               "// long long_long",
+               Style20);
 
   // Break but do not reflow if the first word from the next line is too long.
-  EXPECT_EQ("// long long long\n"
-            "// long\n"
-            "// long_long_long",
-            format("// long long long long\n"
-                   "// long_long_long",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("// long long long\n"
+               "// long\n"
+               "// long_long_long",
+               "// long long long long\n"
+               "// long_long_long",
+               Style20);
 
   // Don't break or reflow short lines.
   verifyFormat("// long\n"
                "// long long long lo\n"
                "// long long long lo\n"
                "// long",
-               getLLVMStyleWithColumns(20));
+               Style20);
 
   // Keep prefixes and decorations while reflowing.
-  EXPECT_EQ("/// long long long\n"
-            "/// long long",
-            format("/// long long long long\n"
-                   "/// long",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("//! long long long\n"
-            "//! long long",
-            format("//! long long long long\n"
-                   "//! long",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("/* long long long\n"
-            " * long long */",
-            format("/* long long long long\n"
-                   " * long */",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("///< long long long\n"
-            "///< long long",
-            format("///< long long long long\n"
-                   "///< long",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("//!< long long long\n"
-            "//!< long long",
-            format("//!< long long long long\n"
-                   "//!< long",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("/// long long long\n"
+               "/// long long",
+               "/// long long long long\n"
+               "/// long",
+               Style20);
+  verifyFormat("//! long long long\n"
+               "//! long long",
+               "//! long long long long\n"
+               "//! long",
+               Style20);
+  verifyFormat("/* long long long\n"
+               " * long long */",
+               "/* long long long long\n"
+               " * long */",
+               Style20);
+  verifyFormat("///< long long long\n"
+               "///< long long",
+               "///< long long long long\n"
+               "///< long",
+               Style20);
+  verifyFormat("//!< long long long\n"
+               "//!< long long",
+               "//!< long long long long\n"
+               "//!< long",
+               Style20);
 
   // Don't bring leading whitespace up while reflowing.
-  EXPECT_EQ("/*  long long long\n"
-            " * long long long\n"
-            " */",
-            format("/*  long long long long\n"
-                   " *  long long\n"
-                   " */",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("/*  long long long\n"
+               " * long long long\n"
+               " */",
+               "/*  long long long long\n"
+               " *  long long\n"
+               " */",
+               Style20);
 
   // Reflow the last line of a block comment with its trailing '*/'.
-  EXPECT_EQ("/* long long long\n"
-            "   long long */",
-            format("/* long long long long\n"
-                   "   long */",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("/* long long long\n"
+               "   long long */",
+               "/* long long long long\n"
+               "   long */",
+               Style20);
 
   // Reflow two short lines; keep the postfix of the last one.
-  EXPECT_EQ("/* long long long\n"
-            " * long long long */",
-            format("/* long long long long\n"
-                   " * long\n"
-                   " * long */",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("/* long long long\n"
+               " * long long long */",
+               "/* long long long long\n"
+               " * long\n"
+               " * long */",
+               Style20);
 
   // Put the postfix of the last short reflow line on a newline if it doesn't
   // fit.
-  EXPECT_EQ("/* long long long\n"
-            " * long long longg\n"
-            " */",
-            format("/* long long long long\n"
-                   " * long\n"
-                   " * longg */",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("/* long long long\n"
+               " * long long longg\n"
+               " */",
+               "/* long long long long\n"
+               " * long\n"
+               " * longg */",
+               Style20);
 
   // Reflow lines with leading whitespace.
-  EXPECT_EQ("{\n"
-            "  /*\n"
-            "   * long long long\n"
-            "   * long long long\n"
-            "   * long long long\n"
-            "   */\n"
-            "}",
-            format("{\n"
-                   "/*\n"
-                   " * long long long long\n"
-                   " *   long\n"
-                   " * long long long long\n"
-                   " */\n"
-                   "}",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("{\n"
+               "  /*\n"
+               "   * long long long\n"
+               "   * long long long\n"
+               "   * long long long\n"
+               "   */\n"
+               "}",
+               "{\n"
+               "/*\n"
+               " * long long long long\n"
+               " *   long\n"
+               " * long long long long\n"
+               " */\n"
+               "}",
+               Style20);
 
   // Break single line block comments that are first in the line with ' *'
   // decoration.
-  EXPECT_EQ("/* long long long\n"
-            " * long */",
-            format("/* long long long long */", getLLVMStyleWithColumns(20)));
+  verifyFormat("/* long long long\n"
+               " * long */",
+               "/* long long long long */", Style20);
 
   // Break single line block comment that are not first in the line with '  '
   // decoration.
-  EXPECT_EQ("int i; /* long long\n"
-            "          long */",
-            format("int i; /* long long long */", getLLVMStyleWithColumns(20)));
+  verifyFormat("int i; /* long long\n"
+               "          long */",
+               "int i; /* long long long */", Style20);
 
   // Reflow a line that goes just over the column limit.
-  EXPECT_EQ("// long long long\n"
-            "// lon long",
-            format("// long long long lon\n"
-                   "// long",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("// long long long\n"
+               "// lon long",
+               "// long long long lon\n"
+               "// long",
+               Style20);
 
   // Stop reflowing if the next line has a different indentation than the
   // previous line.
-  EXPECT_EQ("// long long long\n"
-            "// long\n"
-            "//  long long\n"
-            "//  long",
-            format("// long long long long\n"
-                   "//  long long\n"
-                   "//  long",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("// long long long\n"
+               "// long\n"
+               "//  long long\n"
+               "//  long",
+               "// long long long long\n"
+               "//  long long\n"
+               "//  long",
+               Style20);
 
   // Reflow into the last part of a really long line that has been broken into
   // multiple lines.
-  EXPECT_EQ("// long long long\n"
-            "// long long long\n"
-            "// long long long",
-            format("// long long long long long long long long\n"
-                   "// long",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("// long long long\n"
+               "// long long long\n"
+               "// long long long",
+               "// long long long long long long long long\n"
+               "// long",
+               Style20);
 
   // Break the first line, then reflow the beginning of the second and third
   // line up.
-  EXPECT_EQ("// long long long\n"
-            "// lon1 lon2 lon2\n"
-            "// lon2 lon3 lon3",
-            format("// long long long lon1\n"
-                   "// lon2 lon2 lon2\n"
-                   "// lon3 lon3",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("// long long long\n"
+               "// lon1 lon2 lon2\n"
+               "// lon2 lon3 lon3",
+               "// long long long lon1\n"
+               "// lon2 lon2 lon2\n"
+               "// lon3 lon3",
+               Style20);
 
   // Reflow the beginning of the second line, then break the rest.
-  EXPECT_EQ("// long long long\n"
-            "// lon1 lon2 lon2\n"
-            "// lon2 lon2 lon2\n"
-            "// lon3",
-            format("// long long long lon1\n"
-                   "// lon2 lon2 lon2 lon2 lon2 lon3",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("// long long long\n"
+               "// lon1 lon2 lon2\n"
+               "// lon2 lon2 lon2\n"
+               "// lon3",
+               "// long long long lon1\n"
+               "// lon2 lon2 lon2 lon2 lon2 lon3",
+               Style20);
 
   // Shrink the first line, then reflow the second line up.
-  EXPECT_EQ("// long long long", format("// long              long\n"
-                                        "// long",
-                                        getLLVMStyleWithColumns(20)));
+  verifyFormat("// long long long",
+               "// long              long\n"
+               "// long",
+               Style20);
 
   // Don't shrink leading whitespace.
-  verifyNoChange("int i; ///           a", getLLVMStyleWithColumns(20));
+  verifyNoChange("int i; ///           a", Style20);
 
   // Shrink trailing whitespace if there is no postfix and reflow.
-  EXPECT_EQ("// long long long\n"
-            "// long long",
-            format("// long long long long    \n"
-                   "// long",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("// long long long\n"
+               "// long long",
+               "// long long long long    \n"
+               "// long",
+               Style20);
 
   // Shrink trailing whitespace to a single one if there is postfix.
-  EXPECT_EQ("/* long long long */",
-            format("/* long long long     */", getLLVMStyleWithColumns(20)));
+  verifyFormat("/* long long long */", "/* long long long     */", Style20);
 
   // Break a block comment postfix if exceeding the line limit.
-  EXPECT_EQ("/*               long\n"
-            " */",
-            format("/*               long */", getLLVMStyleWithColumns(20)));
+  verifyFormat("/*               long\n"
+               " */",
+               "/*               long */", Style20);
 
   // Reflow indented comments.
-  EXPECT_EQ("{\n"
-            "  // long long long\n"
-            "  // long long\n"
-            "  int i; /* long lon\n"
-            "            g long\n"
-            "          */\n"
-            "}",
-            format("{\n"
-                   "  // long long long long\n"
-                   "  // long\n"
-                   "  int i; /* long lon g\n"
-                   "            long */\n"
-                   "}",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("{\n"
+               "  // long long long\n"
+               "  // long long\n"
+               "  int i; /* long lon\n"
+               "            g long\n"
+               "          */\n"
+               "}",
+               "{\n"
+               "  // long long long long\n"
+               "  // long\n"
+               "  int i; /* long lon g\n"
+               "            long */\n"
+               "}",
+               Style20);
 
   // Don't realign trailing comments after reflow has happened.
-  EXPECT_EQ("// long long long\n"
-            "// long long\n"
-            "long i; // long",
-            format("// long long long long\n"
-                   "// long\n"
-                   "long i; // long",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("// long long long\n"
-            "// longng long long\n"
-            "// long lo",
-            format("// long long long longng\n"
-                   "// long long long\n"
-                   "// lo",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("// long long long\n"
+               "// long long\n"
+               "long i; // long",
+               "// long long long long\n"
+               "// long\n"
+               "long i; // long",
+               Style20);
+  verifyFormat("// long long long\n"
+               "// longng long long\n"
+               "// long lo",
+               "// long long long longng\n"
+               "// long long long\n"
+               "// lo",
+               Style20);
 
   // Reflow lines after a broken line.
-  EXPECT_EQ("int a; // Trailing\n"
-            "       // comment on\n"
-            "       // 2 or 3\n"
-            "       // lines.",
-            format("int a; // Trailing comment\n"
-                   "       // on 2\n"
-                   "       // or 3\n"
-                   "       // lines.",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("/// This long line\n"
-            "/// gets reflown.",
-            format("/// This long line gets\n"
-                   "/// reflown.",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("//! This long line\n"
-            "//! gets reflown.",
-            format(" //! This long line gets\n"
-                   " //! reflown.",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("/* This long line\n"
-            " * gets reflown.\n"
-            " */",
-            format("/* This long line gets\n"
-                   " * reflown.\n"
-                   " */",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("int a; // Trailing\n"
+               "       // comment on\n"
+               "       // 2 or 3\n"
+               "       // lines.",
+               "int a; // Trailing comment\n"
+               "       // on 2\n"
+               "       // or 3\n"
+               "       // lines.",
+               Style20);
+  verifyFormat("/// This long line\n"
+               "/// gets reflown.",
+               "/// This long line gets\n"
+               "/// reflown.",
+               Style20);
+  verifyFormat("//! This long line\n"
+               "//! gets reflown.",
+               " //! This long line gets\n"
+               " //! reflown.",
+               Style20);
+  verifyFormat("/* This long line\n"
+               " * gets reflown.\n"
+               " */",
+               "/* This long line gets\n"
+               " * reflown.\n"
+               " */",
+               Style20);
 
   // Reflow after indentation makes a line too long.
-  EXPECT_EQ("{\n"
-            "  // long long long\n"
-            "  // lo long\n"
-            "}",
-            format("{\n"
-                   "// long long long lo\n"
-                   "// long\n"
-                   "}",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("{\n"
+               "  // long long long\n"
+               "  // lo long\n"
+               "}",
+               "{\n"
+               "// long long long lo\n"
+               "// long\n"
+               "}",
+               Style20);
 
   // Break and reflow multiple lines.
-  EXPECT_EQ("/*\n"
-            " * Reflow the end of\n"
-            " * line by 11 22 33\n"
-            " * 4.\n"
-            " */",
-            format("/*\n"
-                   " * Reflow the end of line\n"
-                   " * by\n"
-                   " * 11\n"
-                   " * 22\n"
-                   " * 33\n"
-                   " * 4.\n"
-                   " */",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("/// First line gets\n"
-            "/// broken. Second\n"
-            "/// line gets\n"
-            "/// reflown and\n"
-            "/// broken. Third\n"
-            "/// gets reflown.",
-            format("/// First line gets broken.\n"
-                   "/// Second line gets reflown and broken.\n"
-                   "/// Third gets reflown.",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("int i; // first long\n"
-            "       // long snd\n"
-            "       // long.",
-            format("int i; // first long long\n"
-                   "       // snd long.",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("{\n"
-            "  // first long line\n"
-            "  // line second\n"
-            "  // long line line\n"
-            "  // third long line\n"
-            "  // line\n"
-            "}",
-            format("{\n"
-                   "  // first long line line\n"
-                   "  // second long line line\n"
-                   "  // third long line line\n"
-                   "}",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("int i; /* first line\n"
-            "        * second\n"
-            "        * line third\n"
-            "        * line\n"
-            "        */",
-            format("int i; /* first line\n"
-                   "        * second line\n"
-                   "        * third line\n"
-                   "        */",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("/*\n"
+               " * Reflow the end of\n"
+               " * line by 11 22 33\n"
+               " * 4.\n"
+               " */",
+               "/*\n"
+               " * Reflow the end of line\n"
+               " * by\n"
+               " * 11\n"
+               " * 22\n"
+               " * 33\n"
+               " * 4.\n"
+               " */",
+               Style20);
+  verifyFormat("/// First line gets\n"
+               "/// broken. Second\n"
+               "/// line gets\n"
+               "/// reflown and\n"
+               "/// broken. Third\n"
+               "/// gets reflown.",
+               "/// First line gets broken.\n"
+               "/// Second line gets reflown and broken.\n"
+               "/// Third gets reflown.",
+               Style20);
+  verifyFormat("int i; // first long\n"
+               "       // long snd\n"
+               "       // long.",
+               "int i; // first long long\n"
+               "       // snd long.",
+               Style20);
+  verifyFormat("{\n"
+               "  // first long line\n"
+               "  // line second\n"
+               "  // long line line\n"
+               "  // third long line\n"
+               "  // line\n"
+               "}",
+               "{\n"
+               "  // first long line line\n"
+               "  // second long line line\n"
+               "  // third long line line\n"
+               "}",
+               Style20);
+  verifyFormat("int i; /* first line\n"
+               "        * second\n"
+               "        * line third\n"
+               "        * line\n"
+               "        */",
+               "int i; /* first line\n"
+               "        * second line\n"
+               "        * third line\n"
+               "        */",
+               Style20);
 
   // Reflow the last two lines of a section that starts with a line having
   // different indentation.
-  EXPECT_EQ("//     long\n"
-            "// long long long\n"
-            "// long long",
-            format("//     long\n"
-                   "// long long long long\n"
-                   "// long",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("//     long\n"
+               "// long long long\n"
+               "// long long",
+               "//     long\n"
+               "// long long long long\n"
+               "// long",
+               Style20);
 
   // Keep the block comment endling '*/' while reflowing.
-  EXPECT_EQ("/* Long long long\n"
-            " * line short */",
-            format("/* Long long long line\n"
-                   " * short */",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("/* Long long long\n"
+               " * line short */",
+               "/* Long long long line\n"
+               " * short */",
+               Style20);
 
   // Don't reflow between separate blocks of comments.
-  EXPECT_EQ("/* First comment\n"
-            " * block will */\n"
-            "/* Snd\n"
-            " */",
-            format("/* First comment block\n"
-                   " * will */\n"
-                   "/* Snd\n"
-                   " */",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("/* First comment\n"
+               " * block will */\n"
+               "/* Snd\n"
+               " */",
+               "/* First comment block\n"
+               " * will */\n"
+               "/* Snd\n"
+               " */",
+               Style20);
 
   // Don't reflow across blank comment lines.
-  EXPECT_EQ("int i; // This long\n"
-            "       // line gets\n"
-            "       // broken.\n"
-            "       //\n"
-            "       // keep.",
-            format("int i; // This long line gets broken.\n"
-                   "       //  \n"
-                   "       // keep.",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("{\n"
-            "  /// long long long\n"
-            "  /// long long\n"
-            "  ///\n"
-            "  /// long\n"
-            "}",
-            format("{\n"
-                   "  /// long long long long\n"
-                   "  /// long\n"
-                   "  ///\n"
-                   "  /// long\n"
-                   "}",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("//! long long long\n"
-            "//! long\n"
-            "\n"
-            "//! long",
-            format("//! long long long long\n"
-                   "\n"
-                   "//! long",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("/* long long long\n"
-            "   long\n"
-            "\n"
-            "   long */",
-            format("/* long long long long\n"
-                   "\n"
-                   "   long */",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("/* long long long\n"
-            " * long\n"
-            " *\n"
-            " * long */",
-            format("/* long long long long\n"
-                   " *\n"
-                   " * long */",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("int i; // This long\n"
+               "       // line gets\n"
+               "       // broken.\n"
+               "       //\n"
+               "       // keep.",
+               "int i; // This long line gets broken.\n"
+               "       //  \n"
+               "       // keep.",
+               Style20);
+  verifyFormat("{\n"
+               "  /// long long long\n"
+               "  /// long long\n"
+               "  ///\n"
+               "  /// long\n"
+               "}",
+               "{\n"
+               "  /// long long long long\n"
+               "  /// long\n"
+               "  ///\n"
+               "  /// long\n"
+               "}",
+               Style20);
+  verifyFormat("//! long long long\n"
+               "//! long\n"
+               "\n"
+               "//! long",
+               "//! long long long long\n"
+               "\n"
+               "//! long",
+               Style20);
+  verifyFormat("/* long long long\n"
+               "   long\n"
+               "\n"
+               "   long */",
+               "/* long long long long\n"
+               "\n"
+               "   long */",
+               Style20);
+  verifyFormat("/* long long long\n"
+               " * long\n"
+               " *\n"
+               " * long */",
+               "/* long long long long\n"
+               " *\n"
+               " * long */",
+               Style20);
 
   // Don't reflow lines having content that is a single character.
-  EXPECT_EQ("// long long long\n"
-            "// long\n"
-            "// l",
-            format("// long long long long\n"
-                   "// l",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("// long long long\n"
+               "// long\n"
+               "// l",
+               "// long long long long\n"
+               "// l",
+               Style20);
 
   // Don't reflow lines starting with two punctuation characters.
-  EXPECT_EQ("// long long long\n"
-            "// long\n"
-            "// ... --- ...",
-            format("// long long long long\n"
-                   "// ... --- ...",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("// long long long\n"
+               "// long\n"
+               "// ... --- ...",
+               "// long long long long\n"
+               "// ... --- ...",
+               Style20);
 
   // Don't reflow lines starting with '@'.
-  EXPECT_EQ("// long long long\n"
-            "// long\n"
-            "// @param arg",
-            format("// long long long long\n"
-                   "// @param arg",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("// long long long\n"
+               "// long\n"
+               "// @param arg",
+               "// long long long long\n"
+               "// @param arg",
+               Style20);
 
   // Don't reflow lines starting with '\'.
   verifyFormat("// long long long\n"
@@ -2020,433 +1937,437 @@ TEST_F(FormatTestComments, ReflowsComments) {
                "// \\param arg",
                "// long long long long\n"
                "// \\param arg",
-               getLLVMStyleWithColumns(20));
+               Style20);
 
   // Don't reflow lines starting with 'TODO'.
-  EXPECT_EQ("// long long long\n"
-            "// long\n"
-            "// TODO: long",
-            format("// long long long long\n"
-                   "// TODO: long",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("// long long long\n"
+               "// long\n"
+               "// TODO: long",
+               "// long long long long\n"
+               "// TODO: long",
+               Style20);
 
   // Don't reflow lines starting with 'FIXME'.
-  EXPECT_EQ("// long long long\n"
-            "// long\n"
-            "// FIXME: long",
-            format("// long long long long\n"
-                   "// FIXME: long",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("// long long long\n"
+               "// long\n"
+               "// FIXME: long",
+               "// long long long long\n"
+               "// FIXME: long",
+               Style20);
 
   // Don't reflow lines starting with 'XXX'.
-  EXPECT_EQ("// long long long\n"
-            "// long\n"
-            "// XXX: long",
-            format("// long long long long\n"
-                   "// XXX: long",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("// long long long\n"
+               "// long\n"
+               "// XXX: long",
+               "// long long long long\n"
+               "// XXX: long",
+               Style20);
 
   // Don't reflow comment pragmas.
-  EXPECT_EQ("// long long long\n"
-            "// long\n"
-            "// IWYU pragma:",
-            format("// long long long long\n"
-                   "// IWYU pragma:",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("/* long long long\n"
-            " * long\n"
-            " * IWYU pragma:\n"
-            " */",
-            format("/* long long long long\n"
-                   " * IWYU pragma:\n"
-                   " */",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("// long long long\n"
+               "// long\n"
+               "// IWYU pragma:",
+               "// long long long long\n"
+               "// IWYU pragma:",
+               Style20);
+  verifyFormat("/* long long long\n"
+               " * long\n"
+               " * IWYU pragma:\n"
+               " */",
+               "/* long long long long\n"
+               " * IWYU pragma:\n"
+               " */",
+               Style20);
 
   // Reflow lines that have a non-punctuation character among their first 2
   // characters.
-  EXPECT_EQ("// long long long\n"
-            "// long 'long'",
-            format("// long long long long\n"
-                   "// 'long'",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("// long long long\n"
+               "// long 'long'",
+               "// long long long long\n"
+               "// 'long'",
+               Style20);
 
   // Don't reflow between separate blocks of comments.
-  EXPECT_EQ("/* First comment\n"
-            " * block will */\n"
-            "/* Snd\n"
-            " */",
-            format("/* First comment block\n"
-                   " * will */\n"
-                   "/* Snd\n"
-                   " */",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("/* First comment\n"
+               " * block will */\n"
+               "/* Snd\n"
+               " */",
+               "/* First comment block\n"
+               " * will */\n"
+               "/* Snd\n"
+               " */",
+               Style20);
 
   // Don't reflow lines having different indentation.
-  EXPECT_EQ("// long long long\n"
-            "// long\n"
-            "//  long",
-            format("// long long long long\n"
-                   "//  long",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("// long long long\n"
+               "// long\n"
+               "//  long",
+               "// long long long long\n"
+               "//  long",
+               Style20);
 
   // Don't reflow separate bullets in list
-  EXPECT_EQ("// - long long long\n"
-            "// long\n"
-            "// - long",
-            format("// - long long long long\n"
-                   "// - long",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("// * long long long\n"
-            "// long\n"
-            "// * long",
-            format("// * long long long long\n"
-                   "// * long",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("// + long long long\n"
-            "// long\n"
-            "// + long",
-            format("// + long long long long\n"
-                   "// + long",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("// 1. long long long\n"
-            "// long\n"
-            "// 2. long",
-            format("// 1. long long long long\n"
-                   "// 2. long",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("// -# long long long\n"
-            "// long\n"
-            "// -# long",
-            format("// -# long long long long\n"
-                   "// -# long",
-                   getLLVMStyleWithColumns(20)));
-
-  EXPECT_EQ("// - long long long\n"
-            "// long long long\n"
-            "// - long",
-            format("// - long long long long\n"
-                   "// long long\n"
-                   "// - long",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("// - long long long\n"
-            "// long long long\n"
-            "// long\n"
-            "// - long",
-            format("// - long long long long\n"
-                   "// long long long\n"
-                   "// - long",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("// - long long long\n"
+               "// long\n"
+               "// - long",
+               "// - long long long long\n"
+               "// - long",
+               Style20);
+  verifyFormat("// * long long long\n"
+               "// long\n"
+               "// * long",
+               "// * long long long long\n"
+               "// * long",
+               Style20);
+  verifyFormat("// + long long long\n"
+               "// long\n"
+               "// + long",
+               "// + long long long long\n"
+               "// + long",
+               Style20);
+  verifyFormat("// 1. long long long\n"
+               "// long\n"
+               "// 2. long",
+               "// 1. long long long long\n"
+               "// 2. long",
+               Style20);
+  verifyFormat("// -# long long long\n"
+               "// long\n"
+               "// -# long",
+               "// -# long long long long\n"
+               "// -# long",
+               Style20);
+
+  verifyFormat("// - long long long\n"
+               "// long long long\n"
+               "// - long",
+               "// - long long long long\n"
+               "// long long\n"
+               "// - long",
+               Style20);
+  verifyFormat("// - long long long\n"
+               "// long long long\n"
+               "// long\n"
+               "// - long",
+               "// - long long long long\n"
+               "// long long long\n"
+               "// - long",
+               Style20);
 
   // Large number (>2 digits) are not list items
-  EXPECT_EQ("// long long long\n"
-            "// long 1024. long.",
-            format("// long long long long\n"
-                   "// 1024. long.",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("// long long long\n"
+               "// long 1024. long.",
+               "// long long long long\n"
+               "// 1024. long.",
+               Style20);
 
   // Do not break before number, to avoid introducing a non-reflowable doxygen
   // list item.
-  EXPECT_EQ("// long long\n"
-            "// long 10. long.",
-            format("// long long long 10.\n"
-                   "// long.",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("// long long\n"
+               "// long 10. long.",
+               "// long long long 10.\n"
+               "// long.",
+               Style20);
 
   // Don't break or reflow after implicit string literals.
   verifyFormat("#include <t> // l l l\n"
                "             // l",
-               getLLVMStyleWithColumns(20));
+               Style20);
 
   // Don't break or reflow comments on import lines.
-  EXPECT_EQ("#include \"t\" /* l l l\n"
-            "                * l */",
-            format("#include \"t\" /* l l l\n"
-                   "                * l */",
-                   getLLVMStyleWithColumns(20)));
+  verifyNoChange("#include \"t\" /* l l l\n"
+                 "                * l */",
+                 Style20);
 
   // Don't reflow between different trailing comment sections.
-  EXPECT_EQ("int i; // long long\n"
-            "       // long\n"
-            "int j; // long long\n"
-            "       // long",
-            format("int i; // long long long\n"
-                   "int j; // long long long",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("int i; // long long\n"
+               "       // long\n"
+               "int j; // long long\n"
+               "       // long",
+               "int i; // long long long\n"
+               "int j; // long long long",
+               Style20);
 
   // Don't reflow if the first word on the next line is longer than the
   // available space at current line.
-  EXPECT_EQ("int i; // trigger\n"
-            "       // reflow\n"
-            "       // longsec",
-            format("int i; // trigger reflow\n"
-                   "       // longsec",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("int i; // trigger\n"
+               "       // reflow\n"
+               "       // longsec",
+               "int i; // trigger reflow\n"
+               "       // longsec",
+               Style20);
 
   // Simple case that correctly handles reflow in parameter lists.
-  EXPECT_EQ("a = f(/* looooooooong\n"
-            "       * long long\n"
-            "       */\n"
-            "      a);",
-            format("a = f(/* looooooooong long\n* long\n*/ a);",
-                   getLLVMStyleWithColumns(22)));
+  verifyFormat("a = f(/* looooooooong\n"
+               "       * long long\n"
+               "       */\n"
+               "      a);",
+               "a = f(/* looooooooong long\n* long\n*/ a);", Style22);
   // Tricky case that has fewer lines if we reflow the comment, ending up with
   // fewer lines.
-  EXPECT_EQ("a = f(/* loooooong\n"
-            "       * long long\n"
-            "       */\n"
-            "      a);",
-            format("a = f(/* loooooong long\n* long\n*/ a);",
-                   getLLVMStyleWithColumns(22)));
+  verifyFormat("a = f(/* loooooong\n"
+               "       * long long\n"
+               "       */\n"
+               "      a);",
+               "a = f(/* loooooong long\n* long\n*/ a);", Style22);
 
   // Keep empty comment lines.
-  EXPECT_EQ("/**/", format(" /**/", getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("/* */", format(" /* */", getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("/*  */", format(" /*  */", getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("//", format(" //  ", getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("///", format(" ///  ", getLLVMStyleWithColumns(20)));
+  verifyFormat("/**/", " /**/", Style20);
+  verifyFormat("/* */", " /* */", Style20);
+  verifyFormat("/*  */", " /*  */", Style20);
+  verifyFormat("//", " //  ", Style20);
+  verifyFormat("///", " ///  ", Style20);
 }
 
 TEST_F(FormatTestComments, ReflowsCommentsPrecise) {
+  auto Style = getLLVMStyleWithColumns(20);
+
   // FIXME: This assumes we do not continue compressing whitespace once we are
   // in reflow mode. Consider compressing whitespace.
 
   // Test that we stop reflowing precisely at the column limit.
   // After reflowing, "// reflows into   foo" does not fit the column limit,
   // so we compress the whitespace.
-  EXPECT_EQ("// some text that\n"
-            "// reflows into foo",
-            format("// some text that reflows\n"
-                   "// into   foo",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("// some text that\n"
+               "// reflows into foo",
+               "// some text that reflows\n"
+               "// into   foo",
+               Style);
+
+  Style.ColumnLimit = 21;
+
   // Given one more column, "// reflows into   foo" does fit the limit, so we
   // do not compress the whitespace.
-  EXPECT_EQ("// some text that\n"
-            "// reflows into   foo",
-            format("// some text that reflows\n"
-                   "// into   foo",
-                   getLLVMStyleWithColumns(21)));
+  verifyFormat("// some text that\n"
+               "// reflows into   foo",
+               "// some text that reflows\n"
+               "// into   foo",
+               Style);
 
   // Make sure that we correctly account for the space added in the reflow case
   // when making the reflowing decision.
   // First, when the next line ends precisely one column over the limit, do not
   // reflow.
-  EXPECT_EQ("// some text that\n"
-            "// reflows\n"
-            "// into1234567",
-            format("// some text that reflows\n"
-                   "// into1234567",
-                   getLLVMStyleWithColumns(21)));
+  verifyFormat("// some text that\n"
+               "// reflows\n"
+               "// into1234567",
+               "// some text that reflows\n"
+               "// into1234567",
+               Style);
+
   // Secondly, when the next line ends later, but the first word in that line
   // is precisely one column over the limit, do not reflow.
-  EXPECT_EQ("// some text that\n"
-            "// reflows\n"
-            "// into1234567 f",
-            format("// some text that reflows\n"
-                   "// into1234567 f",
-                   getLLVMStyleWithColumns(21)));
+  verifyFormat("// some text that\n"
+               "// reflows\n"
+               "// into1234567 f",
+               "// some text that reflows\n"
+               "// into1234567 f",
+               Style);
 }
 
 TEST_F(FormatTestComments, ReflowsCommentsWithExtraWhitespace) {
+  const auto Style16 = getLLVMStyleWithColumns(16);
+
   // Baseline.
-  EXPECT_EQ("// some text\n"
-            "// that re flows",
-            format("// some text that\n"
-                   "// re flows",
-                   getLLVMStyleWithColumns(16)));
-  EXPECT_EQ("// some text\n"
-            "// that re flows",
-            format("// some text that\n"
-                   "// re    flows",
-                   getLLVMStyleWithColumns(16)));
-  EXPECT_EQ("/* some text\n"
-            " * that re flows\n"
-            " */",
-            format("/* some text that\n"
-                   "*      re       flows\n"
-                   "*/",
-                   getLLVMStyleWithColumns(16)));
+  verifyFormat("// some text\n"
+               "// that re flows",
+               "// some text that\n"
+               "// re flows",
+               Style16);
+  verifyFormat("// some text\n"
+               "// that re flows",
+               "// some text that\n"
+               "// re    flows",
+               Style16);
+  verifyFormat("/* some text\n"
+               " * that re flows\n"
+               " */",
+               "/* some text that\n"
+               "*      re       flows\n"
+               "*/",
+               Style16);
   // FIXME: We do not reflow if the indent of two subsequent lines differs;
   // given that this is different behavior from block comments, do we want
   // to keep this?
-  EXPECT_EQ("// some text\n"
-            "// that\n"
-            "//     re flows",
-            format("// some text that\n"
-                   "//     re       flows",
-                   getLLVMStyleWithColumns(16)));
+  verifyFormat("// some text\n"
+               "// that\n"
+               "//     re flows",
+               "// some text that\n"
+               "//     re       flows",
+               Style16);
   // Space within parts of a line that fit.
   // FIXME: Use the earliest possible split while reflowing to compress the
   // whitespace within the line.
-  EXPECT_EQ("// some text that\n"
-            "// does re   flow\n"
-            "// more  here",
-            format("// some text that does\n"
-                   "// re   flow  more  here",
-                   getLLVMStyleWithColumns(21)));
+  verifyFormat("// some text that\n"
+               "// does re   flow\n"
+               "// more  here",
+               "// some text that does\n"
+               "// re   flow  more  here",
+               getLLVMStyleWithColumns(21));
 }
 
 TEST_F(FormatTestComments, IgnoresIf0Contents) {
-  EXPECT_EQ("#if 0\n"
-            "}{)(&*(^%%#%@! fsadj f;ldjs ,:;| <<<>>>][)(][\n"
-            "#endif\n"
-            "void f() {}",
-            format("#if 0\n"
-                   "}{)(&*(^%%#%@! fsadj f;ldjs ,:;| <<<>>>][)(][\n"
-                   "#endif\n"
-                   "void f(  ) {  }"));
-  EXPECT_EQ("#if false\n"
-            "void f(  ) {  }\n"
-            "#endif\n"
-            "void g() {}",
-            format("#if false\n"
-                   "void f(  ) {  }\n"
-                   "#endif\n"
-                   "void g(  ) {  }"));
-  EXPECT_EQ("enum E {\n"
-            "  One,\n"
-            "  Two,\n"
-            "#if 0\n"
-            "Three,\n"
-            "      Four,\n"
-            "#endif\n"
-            "  Five\n"
-            "};",
-            format("enum E {\n"
-                   "  One,Two,\n"
-                   "#if 0\n"
-                   "Three,\n"
-                   "      Four,\n"
-                   "#endif\n"
-                   "  Five};"));
-  EXPECT_EQ("enum F {\n"
-            "  One,\n"
-            "#if 1\n"
-            "  Two,\n"
-            "#if 0\n"
-            "Three,\n"
-            "      Four,\n"
-            "#endif\n"
-            "  Five\n"
-            "#endif\n"
-            "};",
-            format("enum F {\n"
-                   "One,\n"
-                   "#if 1\n"
-                   "Two,\n"
-                   "#if 0\n"
-                   "Three,\n"
-                   "      Four,\n"
-                   "#endif\n"
-                   "Five\n"
-                   "#endif\n"
-                   "};"));
-  EXPECT_EQ("enum G {\n"
-            "  One,\n"
-            "#if 0\n"
-            "Two,\n"
-            "#else\n"
-            "  Three,\n"
-            "#endif\n"
-            "  Four\n"
-            "};",
-            format("enum G {\n"
-                   "One,\n"
-                   "#if 0\n"
-                   "Two,\n"
-                   "#else\n"
-                   "Three,\n"
-                   "#endif\n"
-                   "Four\n"
-                   "};"));
-  EXPECT_EQ("enum H {\n"
-            "  One,\n"
-            "#if 0\n"
-            "#ifdef Q\n"
-            "Two,\n"
-            "#else\n"
-            "Three,\n"
-            "#endif\n"
-            "#endif\n"
-            "  Four\n"
-            "};",
-            format("enum H {\n"
-                   "One,\n"
-                   "#if 0\n"
-                   "#ifdef Q\n"
-                   "Two,\n"
-                   "#else\n"
-                   "Three,\n"
-                   "#endif\n"
-                   "#endif\n"
-                   "Four\n"
-                   "};"));
-  EXPECT_EQ("enum I {\n"
-            "  One,\n"
-            "#if /* test */ 0 || 1\n"
-            "Two,\n"
-            "Three,\n"
-            "#endif\n"
-            "  Four\n"
-            "};",
-            format("enum I {\n"
-                   "One,\n"
-                   "#if /* test */ 0 || 1\n"
-                   "Two,\n"
-                   "Three,\n"
-                   "#endif\n"
-                   "Four\n"
-                   "};"));
-  EXPECT_EQ("enum J {\n"
-            "  One,\n"
-            "#if 0\n"
-            "#if 0\n"
-            "Two,\n"
-            "#else\n"
-            "Three,\n"
-            "#endif\n"
-            "Four,\n"
-            "#endif\n"
-            "  Five\n"
-            "};",
-            format("enum J {\n"
-                   "One,\n"
-                   "#if 0\n"
-                   "#if 0\n"
-                   "Two,\n"
-                   "#else\n"
-                   "Three,\n"
-                   "#endif\n"
-                   "Four,\n"
-                   "#endif\n"
-                   "Five\n"
-                   "};"));
+  verifyFormat("#if 0\n"
+               "}{)(&*(^%%#%@! fsadj f;ldjs ,:;| <<<>>>][)(][\n"
+               "#endif\n"
+               "void f() {}",
+               "#if 0\n"
+               "}{)(&*(^%%#%@! fsadj f;ldjs ,:;| <<<>>>][)(][\n"
+               "#endif\n"
+               "void f(  ) {  }");
+  verifyFormat("#if false\n"
+               "void f(  ) {  }\n"
+               "#endif\n"
+               "void g() {}",
+               "#if false\n"
+               "void f(  ) {  }\n"
+               "#endif\n"
+               "void g(  ) {  }");
+  verifyFormat("enum E {\n"
+               "  One,\n"
+               "  Two,\n"
+               "#if 0\n"
+               "Three,\n"
+               "      Four,\n"
+               "#endif\n"
+               "  Five\n"
+               "};",
+               "enum E {\n"
+               "  One,Two,\n"
+               "#if 0\n"
+               "Three,\n"
+               "      Four,\n"
+               "#endif\n"
+               "  Five};");
+  verifyFormat("enum F {\n"
+               "  One,\n"
+               "#if 1\n"
+               "  Two,\n"
+               "#if 0\n"
+               "Three,\n"
+               "      Four,\n"
+               "#endif\n"
+               "  Five\n"
+               "#endif\n"
+               "};",
+               "enum F {\n"
+               "One,\n"
+               "#if 1\n"
+               "Two,\n"
+               "#if 0\n"
+               "Three,\n"
+               "      Four,\n"
+               "#endif\n"
+               "Five\n"
+               "#endif\n"
+               "};");
+  verifyFormat("enum G {\n"
+               "  One,\n"
+               "#if 0\n"
+               "Two,\n"
+               "#else\n"
+               "  Three,\n"
+               "#endif\n"
+               "  Four\n"
+               "};",
+               "enum G {\n"
+               "One,\n"
+               "#if 0\n"
+               "Two,\n"
+               "#else\n"
+               "Three,\n"
+               "#endif\n"
+               "Four\n"
+               "};");
+  verifyFormat("enum H {\n"
+               "  One,\n"
+               "#if 0\n"
+               "#ifdef Q\n"
+               "Two,\n"
+               "#else\n"
+               "Three,\n"
+               "#endif\n"
+               "#endif\n"
+               "  Four\n"
+               "};",
+               "enum H {\n"
+               "One,\n"
+               "#if 0\n"
+               "#ifdef Q\n"
+               "Two,\n"
+               "#else\n"
+               "Three,\n"
+               "#endif\n"
+               "#endif\n"
+               "Four\n"
+               "};");
+  verifyFormat("enum I {\n"
+               "  One,\n"
+               "#if /* test */ 0 || 1\n"
+               "Two,\n"
+               "Three,\n"
+               "#endif\n"
+               "  Four\n"
+               "};",
+               "enum I {\n"
+               "One,\n"
+               "#if /* test */ 0 || 1\n"
+               "Two,\n"
+               "Three,\n"
+               "#endif\n"
+               "Four\n"
+               "};");
+  verifyFormat("enum J {\n"
+               "  One,\n"
+               "#if 0\n"
+               "#if 0\n"
+               "Two,\n"
+               "#else\n"
+               "Three,\n"
+               "#endif\n"
+               "Four,\n"
+               "#endif\n"
+               "  Five\n"
+               "};",
+               "enum J {\n"
+               "One,\n"
+               "#if 0\n"
+               "#if 0\n"
+               "Two,\n"
+               "#else\n"
+               "Three,\n"
+               "#endif\n"
+               "Four,\n"
+               "#endif\n"
+               "Five\n"
+               "};");
 
   // Ignore stuff in SWIG-blocks.
-  EXPECT_EQ("#ifdef SWIG\n"
-            "}{)(&*(^%%#%@! fsadj f;ldjs ,:;| <<<>>>][)(][\n"
-            "#endif\n"
-            "void f() {}",
-            format("#ifdef SWIG\n"
-                   "}{)(&*(^%%#%@! fsadj f;ldjs ,:;| <<<>>>][)(][\n"
-                   "#endif\n"
-                   "void f(  ) {  }"));
-  EXPECT_EQ("#ifndef SWIG\n"
-            "void f() {}\n"
-            "#endif",
-            format("#ifndef SWIG\n"
-                   "void f(      ) {       }\n"
-                   "#endif"));
+  verifyFormat("#ifdef SWIG\n"
+               "}{)(&*(^%%#%@! fsadj f;ldjs ,:;| <<<>>>][)(][\n"
+               "#endif\n"
+               "void f() {}",
+               "#ifdef SWIG\n"
+               "}{)(&*(^%%#%@! fsadj f;ldjs ,:;| <<<>>>][)(][\n"
+               "#endif\n"
+               "void f(  ) {  }");
+  verifyFormat("#ifndef SWIG\n"
+               "void f() {}\n"
+               "#endif",
+               "#ifndef SWIG\n"
+               "void f(      ) {       }\n"
+               "#endif");
 }
 
 TEST_F(FormatTestComments, DontCrashOnBlockComments) {
-  EXPECT_EQ(
+  verifyFormat(
       "int xxxxxxxxx; /* "
       "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\n"
       "zzzzzz\n"
       "0*/",
-      format("int xxxxxxxxx;                          /* "
-             "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy zzzzzz\n"
-             "0*/"));
+      "int xxxxxxxxx;                          /* "
+      "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy zzzzzz\n"
+      "0*/");
 }
 
 TEST_F(FormatTestComments, BlockCommentsInControlLoops) {
@@ -2470,225 +2391,214 @@ TEST_F(FormatTestComments, BlockCommentsInControlLoops) {
 }
 
 TEST_F(FormatTestComments, BlockComments) {
-  EXPECT_EQ("/* */ /* */ /* */\n/* */ /* */ /* */",
-            format("/* *//* */  /* */\n/* *//* */  /* */"));
-  EXPECT_EQ("/* */ a /* */ b;", format("  /* */  a/* */  b;"));
-  EXPECT_EQ("#define A /*123*/ \\\n"
-            "  b\n"
-            "/* */\n"
-            "someCall(\n"
-            "    parameter);",
-            format("#define A /*123*/ b\n"
-                   "/* */\n"
-                   "someCall(parameter);",
-                   getLLVMStyleWithColumns(15)));
-
-  EXPECT_EQ("#define A\n"
-            "/* */ someCall(\n"
-            "    parameter);",
-            format("#define A\n"
-                   "/* */someCall(parameter);",
-                   getLLVMStyleWithColumns(15)));
+  const auto Style10 = getLLVMStyleWithColumns(10);
+  const auto Style15 = getLLVMStyleWithColumns(15);
+
+  verifyFormat("/* */ /* */ /* */\n/* */ /* */ /* */",
+               "/* *//* */  /* */\n/* *//* */  /* */");
+  verifyFormat("/* */ a /* */ b;", "  /* */  a/* */  b;");
+  verifyFormat("#define A /*123*/ \\\n"
+               "  b\n"
+               "/* */\n"
+               "someCall(\n"
+               "    parameter);",
+               "#define A /*123*/ b\n"
+               "/* */\n"
+               "someCall(parameter);",
+               Style15);
+
+  verifyFormat("#define A\n"
+               "/* */ someCall(\n"
+               "    parameter);",
+               "#define A\n"
+               "/* */someCall(parameter);",
+               Style15);
   verifyNoChange("/*\n**\n*/");
-  EXPECT_EQ("/*\n"
-            " *\n"
-            " * aaaaaa\n"
-            " * aaaaaa\n"
-            " */",
-            format("/*\n"
-                   "*\n"
-                   " * aaaaaa aaaaaa\n"
-                   "*/",
-                   getLLVMStyleWithColumns(10)));
-  EXPECT_EQ("/*\n"
-            "**\n"
-            "* aaaaaa\n"
-            "* aaaaaa\n"
-            "*/",
-            format("/*\n"
-                   "**\n"
-                   "* aaaaaa aaaaaa\n"
-                   "*/",
-                   getLLVMStyleWithColumns(10)));
-  EXPECT_EQ("int aaaaaaaaaaaaaaaaaaaaaaaaaaaa =\n"
-            "    /* line 1\n"
-            "       bbbbbbbbbbbb */\n"
-            "    bbbbbbbbbbbbbbbbbbbbbbbbbbbb;",
-            format("int aaaaaaaaaaaaaaaaaaaaaaaaaaaa =\n"
-                   "    /* line 1\n"
-                   "       bbbbbbbbbbbb */ bbbbbbbbbbbbbbbbbbbbbbbbbbbb;",
-                   getLLVMStyleWithColumns(50)));
+  verifyFormat("/*\n"
+               " *\n"
+               " * aaaaaa\n"
+               " * aaaaaa\n"
+               " */",
+               "/*\n"
+               "*\n"
+               " * aaaaaa aaaaaa\n"
+               "*/",
+               Style10);
+  verifyFormat("/*\n"
+               "**\n"
+               "* aaaaaa\n"
+               "* aaaaaa\n"
+               "*/",
+               "/*\n"
+               "**\n"
+               "* aaaaaa aaaaaa\n"
+               "*/",
+               Style10);
+  verifyFormat("int aaaaaaaaaaaaaaaaaaaaaaaaaaaa =\n"
+               "    /* line 1\n"
+               "       bbbbbbbbbbbb */\n"
+               "    bbbbbbbbbbbbbbbbbbbbbbbbbbbb;",
+               "int aaaaaaaaaaaaaaaaaaaaaaaaaaaa =\n"
+               "    /* line 1\n"
+               "       bbbbbbbbbbbb */ bbbbbbbbbbbbbbbbbbbbbbbbbbbb;",
+               getLLVMStyleWithColumns(50));
 
   FormatStyle NoBinPacking = getLLVMStyle();
   NoBinPacking.BinPackParameters = FormatStyle::BPPS_OnePerLine;
-  EXPECT_EQ("someFunction(1, /* comment 1 */\n"
-            "             2, /* comment 2 */\n"
-            "             3, /* comment 3 */\n"
-            "             aaaa,\n"
-            "             bbbb);",
-            format("someFunction (1,   /* comment 1 */\n"
-                   "                2, /* comment 2 */  \n"
-                   "               3,   /* comment 3 */\n"
-                   "aaaa, bbbb );",
-                   NoBinPacking));
+  verifyFormat("someFunction(1, /* comment 1 */\n"
+               "             2, /* comment 2 */\n"
+               "             3, /* comment 3 */\n"
+               "             aaaa,\n"
+               "             bbbb);",
+               "someFunction (1,   /* comment 1 */\n"
+               "                2, /* comment 2 */  \n"
+               "               3,   /* comment 3 */\n"
+               "aaaa, bbbb );",
+               NoBinPacking);
   verifyFormat(
       "bool aaaaaaaaaaaaa = /* comment: */ aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ||\n"
       "                     aaaaaaaaaaaaaaaaaaaaaaaaaaaa;");
-  EXPECT_EQ(
+  verifyFormat(
       "bool aaaaaaaaaaaaa = /* trailing comment */\n"
       "    aaaaaaaaaaaaaaaaaaaaaaaaaaa || aaaaaaaaaaaaaaaaaaaaaaaaa ||\n"
       "    aaaaaaaaaaaaaaaaaaaaaaaaaaaa || aaaaaaaaaaaaaaaaaaaaaaaaaa;",
-      format(
-          "bool       aaaaaaaaaaaaa =       /* trailing comment */\n"
-          "    aaaaaaaaaaaaaaaaaaaaaaaaaaa||aaaaaaaaaaaaaaaaaaaaaaaaa    ||\n"
-          "    aaaaaaaaaaaaaaaaaaaaaaaaaaaa   || aaaaaaaaaaaaaaaaaaaaaaaaaa;"));
-  EXPECT_EQ(
-      "int aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa; /* comment */\n"
-      "int bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb;   /* comment */\n"
-      "int cccccccccccccccccccccccccccccc;       /* comment */",
-      format("int aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa; /* comment */\n"
-             "int      bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb; /* comment */\n"
-             "int    cccccccccccccccccccccccccccccc;  /* comment */"));
+      "bool       aaaaaaaaaaaaa =       /* trailing comment */\n"
+      "    aaaaaaaaaaaaaaaaaaaaaaaaaaa||aaaaaaaaaaaaaaaaaaaaaaaaa    ||\n"
+      "    aaaaaaaaaaaaaaaaaaaaaaaaaaaa   || aaaaaaaaaaaaaaaaaaaaaaaaaa;");
+  verifyFormat("int aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa; /* comment */\n"
+               "int bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb;   /* comment */\n"
+               "int cccccccccccccccccccccccccccccc;       /* comment */",
+               "int aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa; /* comment */\n"
+               "int      bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb; /* comment */\n"
+               "int    cccccccccccccccccccccccccccccc;  /* comment */");
 
   verifyFormat("void f(int * /* unused */) {}");
 
-  EXPECT_EQ("/*\n"
-            " **\n"
-            " */",
-            format("/*\n"
-                   " **\n"
-                   " */"));
-  EXPECT_EQ("/*\n"
-            " *q\n"
-            " */",
-            format("/*\n"
-                   " *q\n"
-                   " */"));
-  EXPECT_EQ("/*\n"
-            " * q\n"
-            " */",
-            format("/*\n"
-                   " * q\n"
-                   " */"));
-  EXPECT_EQ("/*\n"
-            " **/",
-            format("/*\n"
-                   " **/"));
-  EXPECT_EQ("/*\n"
-            " ***/",
-            format("/*\n"
-                   " ***/"));
+  verifyNoChange("/*\n"
+                 " **\n"
+                 " */");
+  verifyNoChange("/*\n"
+                 " *q\n"
+                 " */");
+  verifyNoChange("/*\n"
+                 " * q\n"
+                 " */");
+  verifyNoChange("/*\n"
+                 " **/");
+  verifyNoChange("/*\n"
+                 " ***/");
 }
 
 TEST_F(FormatTestComments, BlockCommentsInMacros) {
-  EXPECT_EQ("#define A          \\\n"
-            "  {                \\\n"
-            "    /* one line */ \\\n"
-            "    someCall();",
-            format("#define A {        \\\n"
-                   "  /* one line */   \\\n"
-                   "  someCall();",
-                   getLLVMStyleWithColumns(20)));
-  EXPECT_EQ("#define A          \\\n"
-            "  {                \\\n"
-            "    /* previous */ \\\n"
-            "    /* one line */ \\\n"
-            "    someCall();",
-            format("#define A {        \\\n"
-                   "  /* previous */   \\\n"
-                   "  /* one line */   \\\n"
-                   "  someCall();",
-                   getLLVMStyleWithColumns(20)));
+  const auto Style20 = getLLVMStyleWithColumns(20);
+  verifyFormat("#define A          \\\n"
+               "  {                \\\n"
+               "    /* one line */ \\\n"
+               "    someCall();",
+               "#define A {        \\\n"
+               "  /* one line */   \\\n"
+               "  someCall();",
+               Style20);
+  verifyFormat("#define A          \\\n"
+               "  {                \\\n"
+               "    /* previous */ \\\n"
+               "    /* one line */ \\\n"
+               "    someCall();",
+               "#define A {        \\\n"
+               "  /* previous */   \\\n"
+               "  /* one line */   \\\n"
+               "  someCall();",
+               Style20);
 }
 
 TEST_F(FormatTestComments, BlockCommentsAtEndOfLine) {
-  EXPECT_EQ("a = {\n"
-            "    1111 /*    */\n"
-            "};",
-            format("a = {1111 /*    */\n"
-                   "};",
-                   getLLVMStyleWithColumns(15)));
-  EXPECT_EQ("a = {\n"
-            "    1111 /*      */\n"
-            "};",
-            format("a = {1111 /*      */\n"
-                   "};",
-                   getLLVMStyleWithColumns(15)));
-  EXPECT_EQ("a = {\n"
-            "    1111 /*      a\n"
-            "          */\n"
-            "};",
-            format("a = {1111 /*      a */\n"
-                   "};",
-                   getLLVMStyleWithColumns(15)));
+  const auto Style15 = getLLVMStyleWithColumns(15);
+  verifyFormat("a = {\n"
+               "    1111 /*    */\n"
+               "};",
+               "a = {1111 /*    */\n"
+               "};",
+               Style15);
+  verifyFormat("a = {\n"
+               "    1111 /*      */\n"
+               "};",
+               "a = {1111 /*      */\n"
+               "};",
+               Style15);
+  verifyFormat("a = {\n"
+               "    1111 /*      a\n"
+               "          */\n"
+               "};",
+               "a = {1111 /*      a */\n"
+               "};",
+               Style15);
 }
 
 TEST_F(FormatTestComments, BreaksAfterMultilineBlockCommentsInParamLists) {
-  EXPECT_EQ("a = f(/* long\n"
-            "         long */\n"
-            "      a);",
-            format("a = f(/* long long */ a);", getLLVMStyleWithColumns(16)));
-  EXPECT_EQ("a = f(\n"
-            "    /* long\n"
-            "       long */\n"
-            "    a);",
-            format("a = f(/* long long */ a);", getLLVMStyleWithColumns(15)));
-
-  EXPECT_EQ("a = f(/* long\n"
-            "         long\n"
-            "       */\n"
-            "      a);",
-            format("a = f(/* long\n"
-                   "         long\n"
-                   "       */a);",
-                   getLLVMStyleWithColumns(16)));
-
-  EXPECT_EQ("a = f(/* long\n"
-            "         long\n"
-            "       */\n"
-            "      a);",
-            format("a = f(/* long\n"
-                   "         long\n"
-                   "       */ a);",
-                   getLLVMStyleWithColumns(16)));
-
-  EXPECT_EQ("a = f(/* long\n"
-            "         long\n"
-            "       */\n"
-            "      (1 + 1));",
-            format("a = f(/* long\n"
-                   "         long\n"
-                   "       */ (1 + 1));",
-                   getLLVMStyleWithColumns(16)));
-
-  EXPECT_EQ(
-      "a = f(a,\n"
-      "      /* long\n"
-      "         long */\n"
-      "      b);",
-      format("a = f(a, /* long long */ b);", getLLVMStyleWithColumns(16)));
-
-  EXPECT_EQ(
-      "a = f(\n"
-      "    a,\n"
-      "    /* long\n"
-      "       long */\n"
-      "    b);",
-      format("a = f(a, /* long long */ b);", getLLVMStyleWithColumns(15)));
-
-  EXPECT_EQ("a = f(a,\n"
-            "      /* long\n"
-            "         long */\n"
-            "      (1 + 1));",
-            format("a = f(a, /* long long */ (1 + 1));",
-                   getLLVMStyleWithColumns(16)));
-  EXPECT_EQ("a = f(\n"
-            "    a,\n"
-            "    /* long\n"
-            "       long */\n"
-            "    (1 + 1));",
-            format("a = f(a, /* long long */ (1 + 1));",
-                   getLLVMStyleWithColumns(15)));
+  const auto Style15 = getLLVMStyleWithColumns(15);
+  const auto Style16 = getLLVMStyleWithColumns(16);
+
+  verifyFormat("a = f(/* long\n"
+               "         long */\n"
+               "      a);",
+               "a = f(/* long long */ a);", Style16);
+  verifyFormat("a = f(\n"
+               "    /* long\n"
+               "       long */\n"
+               "    a);",
+               "a = f(/* long long */ a);", Style15);
+
+  verifyFormat("a = f(/* long\n"
+               "         long\n"
+               "       */\n"
+               "      a);",
+               "a = f(/* long\n"
+               "         long\n"
+               "       */a);",
+               Style16);
+
+  verifyFormat("a = f(/* long\n"
+               "         long\n"
+               "       */\n"
+               "      a);",
+               "a = f(/* long\n"
+               "         long\n"
+               "       */ a);",
+               Style16);
+
+  verifyFormat("a = f(/* long\n"
+               "         long\n"
+               "       */\n"
+               "      (1 + 1));",
+               "a = f(/* long\n"
+               "         long\n"
+               "       */ (1 + 1));",
+               Style16);
+
+  verifyFormat("a = f(a,\n"
+               "      /* long\n"
+               "         long */\n"
+               "      b);",
+               "a = f(a, /* long long */ b);", Style16);
+
+  verifyFormat("a = f(\n"
+               "    a,\n"
+               "    /* long\n"
+               "       long */\n"
+               "    b);",
+               "a = f(a, /* long long */ b);", Style15);
+
+  verifyFormat("a = f(a,\n"
+               "      /* long\n"
+               "         long */\n"
+               "      (1 + 1));",
+               "a = f(a, /* long long */ (1 + 1));", Style16);
+  verifyFormat("a = f(\n"
+               "    a,\n"
+               "    /* long\n"
+               "       long */\n"
+               "    (1 + 1));",
+               "a = f(a, /* long long */ (1 + 1));", Style15);
 }
 
 TEST_F(FormatTestComments, IndentLineCommentsInStartOfBlockAtEndOfFile) {
@@ -2698,229 +2608,219 @@ TEST_F(FormatTestComments, IndentLineCommentsInStartOfBlockAtEndOfFile) {
 }
 
 TEST_F(FormatTestComments, AlignTrailingComments) {
-  EXPECT_EQ("#define MACRO(V)                       \\\n"
-            "  V(Rt2) /* one more char */           \\\n"
-            "  V(Rs)  /* than here  */              \\\n"
-            "/* comment 3 */\n",
-            format("#define MACRO(V)\\\n"
-                   "V(Rt2)  /* one more char */ \\\n"
-                   "V(Rs) /* than here  */    \\\n"
-                   "/* comment 3 */\n",
-                   getLLVMStyleWithColumns(40)));
-  EXPECT_EQ("int i = f(abc, // line 1\n"
-            "          d,   // line 2\n"
-            "               // line 3\n"
-            "          b);",
-            format("int i = f(abc, // line 1\n"
-                   "          d, // line 2\n"
-                   "             // line 3\n"
-                   "          b);",
-                   getLLVMStyleWithColumns(40)));
+  const auto Style15 = getLLVMStyleWithColumns(15);
+  const auto Style40 = getLLVMStyleWithColumns(40);
+
+  verifyFormat("#define MACRO(V)                       \\\n"
+               "  V(Rt2) /* one more char */           \\\n"
+               "  V(Rs)  /* than here  */              \\\n"
+               "/* comment 3 */\n",
+               "#define MACRO(V)\\\n"
+               "V(Rt2)  /* one more char */ \\\n"
+               "V(Rs) /* than here  */    \\\n"
+               "/* comment 3 */\n",
+               Style40);
+  verifyFormat("int i = f(abc, // line 1\n"
+               "          d,   // line 2\n"
+               "               // line 3\n"
+               "          b);",
+               "int i = f(abc, // line 1\n"
+               "          d, // line 2\n"
+               "             // line 3\n"
+               "          b);",
+               Style40);
 
   // Align newly broken trailing comments.
-  EXPECT_EQ("int ab; // line\n"
-            "int a;  // long\n"
-            "        // long",
-            format("int ab; // line\n"
-                   "int a; // long long",
-                   getLLVMStyleWithColumns(15)));
-  EXPECT_EQ("int ab; // line\n"
-            "int a;  // long\n"
-            "        // long\n"
-            "        // long",
-            format("int ab; // line\n"
-                   "int a; // long long\n"
-                   "       // long",
-                   getLLVMStyleWithColumns(15)));
-  EXPECT_EQ("int ab; // line\n"
-            "int a;  // long\n"
-            "        // long\n"
-            "pt c;   // long",
-            format("int ab; // line\n"
-                   "int a; // long long\n"
-                   "pt c; // long",
-                   getLLVMStyleWithColumns(15)));
-  EXPECT_EQ("int ab; // line\n"
-            "int a;  // long\n"
-            "        // long\n"
-            "\n"
-            "// long",
-            format("int ab; // line\n"
-                   "int a; // long long\n"
-                   "\n"
-                   "// long",
-                   getLLVMStyleWithColumns(15)));
+  verifyFormat("int ab; // line\n"
+               "int a;  // long\n"
+               "        // long",
+               "int ab; // line\n"
+               "int a; // long long",
+               Style15);
+  verifyFormat("int ab; // line\n"
+               "int a;  // long\n"
+               "        // long\n"
+               "        // long",
+               "int ab; // line\n"
+               "int a; // long long\n"
+               "       // long",
+               Style15);
+  verifyFormat("int ab; // line\n"
+               "int a;  // long\n"
+               "        // long\n"
+               "pt c;   // long",
+               "int ab; // line\n"
+               "int a; // long long\n"
+               "pt c; // long",
+               Style15);
+  verifyFormat("int ab; // line\n"
+               "int a;  // long\n"
+               "        // long\n"
+               "\n"
+               "// long",
+               "int ab; // line\n"
+               "int a; // long long\n"
+               "\n"
+               "// long",
+               Style15);
 
   // Don't align newly broken trailing comments if that would put them over the
   // column limit.
-  EXPECT_EQ("int i, j; // line 1\n"
-            "int k; // line longg\n"
-            "       // long",
-            format("int i, j; // line 1\n"
-                   "int k; // line longg long",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("int i, j; // line 1\n"
+               "int k; // line longg\n"
+               "       // long",
+               "int i, j; // line 1\n"
+               "int k; // line longg long",
+               getLLVMStyleWithColumns(20));
 
   // Always align if ColumnLimit = 0
-  EXPECT_EQ("int i, j; // line 1\n"
-            "int k;    // line longg long",
-            format("int i, j; // line 1\n"
-                   "int k; // line longg long",
-                   getLLVMStyleWithColumns(0)));
+  verifyFormat("int i, j; // line 1\n"
+               "int k;    // line longg long",
+               "int i, j; // line 1\n"
+               "int k; // line longg long",
+               getLLVMStyleWithColumns(0));
 
   // Align comment line sections aligned with the next token with the next
   // token.
-  EXPECT_EQ("class A {\n"
-            "public: // public comment\n"
-            "  // comment about a\n"
-            "  int a;\n"
-            "};",
-            format("class A {\n"
-                   "public: // public comment\n"
-                   "  // comment about a\n"
-                   "  int a;\n"
-                   "};",
-                   getLLVMStyleWithColumns(40)));
-  EXPECT_EQ("class A {\n"
-            "public: // public comment 1\n"
-            "        // public comment 2\n"
-            "  // comment 1 about a\n"
-            "  // comment 2 about a\n"
-            "  int a;\n"
-            "};",
-            format("class A {\n"
-                   "public: // public comment 1\n"
-                   "   // public comment 2\n"
-                   "  // comment 1 about a\n"
-                   "  // comment 2 about a\n"
-                   "  int a;\n"
-                   "};",
-                   getLLVMStyleWithColumns(40)));
-  EXPECT_EQ("int f(int n) { // comment line 1 on f\n"
-            "               // comment line 2 on f\n"
-            "  // comment line 1 before return\n"
-            "  // comment line 2 before return\n"
-            "  return n; // comment line 1 on return\n"
-            "            // comment line 2 on return\n"
-            "  // comment line 1 after return\n"
-            "}",
-            format("int f(int n) { // comment line 1 on f\n"
-                   "   // comment line 2 on f\n"
-                   "  // comment line 1 before return\n"
-                   "  // comment line 2 before return\n"
-                   "  return n; // comment line 1 on return\n"
-                   "   // comment line 2 on return\n"
-                   "  // comment line 1 after return\n"
-                   "}",
-                   getLLVMStyleWithColumns(40)));
-  EXPECT_EQ("int f(int n) {\n"
-            "  switch (n) { // comment line 1 on switch\n"
-            "               // comment line 2 on switch\n"
-            "  // comment line 1 before case 1\n"
-            "  // comment line 2 before case 1\n"
-            "  case 1: // comment line 1 on case 1\n"
-            "          // comment line 2 on case 1\n"
-            "    // comment line 1 before return 1\n"
-            "    // comment line 2 before return 1\n"
-            "    return 1; // comment line 1 on return 1\n"
-            "              // comment line 2 on return 1\n"
-            "  // comment line 1 before default\n"
-            "  // comment line 2 before default\n"
-            "  default: // comment line 1 on default\n"
-            "           // comment line 2 on default\n"
-            "    // comment line 1 before return 2\n"
-            "    return 2 * f(n - 1); // comment line 1 on return 2\n"
-            "                         // comment line 2 on return 2\n"
-            "    // comment line 1 after return\n"
-            "    // comment line 2 after return\n"
-            "  }\n"
-            "}",
-            format("int f(int n) {\n"
-                   "  switch (n) { // comment line 1 on switch\n"
-                   "              // comment line 2 on switch\n"
-                   "    // comment line 1 before case 1\n"
-                   "    // comment line 2 before case 1\n"
-                   "    case 1: // comment line 1 on case 1\n"
-                   "              // comment line 2 on case 1\n"
-                   "    // comment line 1 before return 1\n"
-                   "    // comment line 2 before return 1\n"
-                   "    return 1;  // comment line 1 on return 1\n"
-                   "             // comment line 2 on return 1\n"
-                   "    // comment line 1 before default\n"
-                   "    // comment line 2 before default\n"
-                   "    default:   // comment line 1 on default\n"
-                   "                // comment line 2 on default\n"
-                   "    // comment line 1 before return 2\n"
-                   "    return 2 * f(n - 1); // comment line 1 on return 2\n"
-                   "                        // comment line 2 on return 2\n"
-                   "    // comment line 1 after return\n"
-                   "     // comment line 2 after return\n"
-                   "  }\n"
-                   "}",
-                   getLLVMStyleWithColumns(80)));
+  verifyFormat("class A {\n"
+               "public: // public comment\n"
+               "  // comment about a\n"
+               "  int a;\n"
+               "};",
+               Style40);
+  verifyFormat("class A {\n"
+               "public: // public comment 1\n"
+               "        // public comment 2\n"
+               "  // comment 1 about a\n"
+               "  // comment 2 about a\n"
+               "  int a;\n"
+               "};",
+               "class A {\n"
+               "public: // public comment 1\n"
+               "   // public comment 2\n"
+               "  // comment 1 about a\n"
+               "  // comment 2 about a\n"
+               "  int a;\n"
+               "};",
+               Style40);
+  verifyFormat("int f(int n) { // comment line 1 on f\n"
+               "               // comment line 2 on f\n"
+               "  // comment line 1 before return\n"
+               "  // comment line 2 before return\n"
+               "  return n; // comment line 1 on return\n"
+               "            // comment line 2 on return\n"
+               "  // comment line 1 after return\n"
+               "}",
+               "int f(int n) { // comment line 1 on f\n"
+               "   // comment line 2 on f\n"
+               "  // comment line 1 before return\n"
+               "  // comment line 2 before return\n"
+               "  return n; // comment line 1 on return\n"
+               "   // comment line 2 on return\n"
+               "  // comment line 1 after return\n"
+               "}",
+               Style40);
+  verifyFormat("int f(int n) {\n"
+               "  switch (n) { // comment line 1 on switch\n"
+               "               // comment line 2 on switch\n"
+               "  // comment line 1 before case 1\n"
+               "  // comment line 2 before case 1\n"
+               "  case 1: // comment line 1 on case 1\n"
+               "          // comment line 2 on case 1\n"
+               "    // comment line 1 before return 1\n"
+               "    // comment line 2 before return 1\n"
+               "    return 1; // comment line 1 on return 1\n"
+               "              // comment line 2 on return 1\n"
+               "  // comment line 1 before default\n"
+               "  // comment line 2 before default\n"
+               "  default: // comment line 1 on default\n"
+               "           // comment line 2 on default\n"
+               "    // comment line 1 before return 2\n"
+               "    return 2 * f(n - 1); // comment line 1 on return 2\n"
+               "                         // comment line 2 on return 2\n"
+               "    // comment line 1 after return\n"
+               "    // comment line 2 after return\n"
+               "  }\n"
+               "}",
+               "int f(int n) {\n"
+               "  switch (n) { // comment line 1 on switch\n"
+               "              // comment line 2 on switch\n"
+               "    // comment line 1 before case 1\n"
+               "    // comment line 2 before case 1\n"
+               "    case 1: // comment line 1 on case 1\n"
+               "              // comment line 2 on case 1\n"
+               "    // comment line 1 before return 1\n"
+               "    // comment line 2 before return 1\n"
+               "    return 1;  // comment line 1 on return 1\n"
+               "             // comment line 2 on return 1\n"
+               "    // comment line 1 before default\n"
+               "    // comment line 2 before default\n"
+               "    default:   // comment line 1 on default\n"
+               "                // comment line 2 on default\n"
+               "    // comment line 1 before return 2\n"
+               "    return 2 * f(n - 1); // comment line 1 on return 2\n"
+               "                        // comment line 2 on return 2\n"
+               "    // comment line 1 after return\n"
+               "     // comment line 2 after return\n"
+               "  }\n"
+               "}");
 
   // If all the lines in a sequence of line comments are aligned with the next
   // token, the first line belongs to the previous token and the other lines
   // belong to the next token.
-  EXPECT_EQ("int a; // line about a\n"
-            "long b;",
-            format("int a; // line about a\n"
-                   "       long b;",
-                   getLLVMStyleWithColumns(80)));
-  EXPECT_EQ("int a; // line about a\n"
-            "// line about b\n"
-            "long b;",
-            format("int a; // line about a\n"
-                   "       // line about b\n"
-                   "       long b;",
-                   getLLVMStyleWithColumns(80)));
-  EXPECT_EQ("int a; // line about a\n"
-            "// line 1 about b\n"
-            "// line 2 about b\n"
-            "long b;",
-            format("int a; // line about a\n"
-                   "       // line 1 about b\n"
-                   "       // line 2 about b\n"
-                   "       long b;",
-                   getLLVMStyleWithColumns(80)));
+  verifyFormat("int a; // line about a\n"
+               "long b;",
+               "int a; // line about a\n"
+               "       long b;");
+  verifyFormat("int a; // line about a\n"
+               "// line about b\n"
+               "long b;",
+               "int a; // line about a\n"
+               "       // line about b\n"
+               "       long b;");
+  verifyFormat("int a; // line about a\n"
+               "// line 1 about b\n"
+               "// line 2 about b\n"
+               "long b;",
+               "int a; // line about a\n"
+               "       // line 1 about b\n"
+               "       // line 2 about b\n"
+               "       long b;");
 
   // Checks an edge case in preprocessor handling.
   // These comments should *not* be aligned
-  EXPECT_EQ(
-      "#if FOO\n"
-      "#else\n"
-      "long a; // Line about a\n"
-      "#endif\n"
-      "#if BAR\n"
-      "#else\n"
-      "long b_long_name; // Line about b\n"
-      "#endif",
-      format("#if FOO\n"
-             "#else\n"
-             "long a;           // Line about a\n" // Previous (bad) behavior
-             "#endif\n"
-             "#if BAR\n"
-             "#else\n"
-             "long b_long_name; // Line about b\n"
-             "#endif",
-             getLLVMStyleWithColumns(80)));
+  verifyFormat("#if FOO\n"
+               "#else\n"
+               "long a; // Line about a\n"
+               "#endif\n"
+               "#if BAR\n"
+               "#else\n"
+               "long b_long_name; // Line about b\n"
+               "#endif",
+               "#if FOO\n"
+               "#else\n"
+               "long a;           // Line about a\n" // Previous (bad) behavior
+               "#endif\n"
+               "#if BAR\n"
+               "#else\n"
+               "long b_long_name; // Line about b\n"
+               "#endif");
 
   // bug 47589
-  EXPECT_EQ(
-      "namespace m {\n\n"
-      "#define FOO_GLOBAL 0      // Global scope.\n"
-      "#define FOO_LINKLOCAL 1   // Link-local scope.\n"
-      "#define FOO_SITELOCAL 2   // Site-local scope (deprecated).\n"
-      "#define FOO_UNIQUELOCAL 3 // Unique local\n"
-      "#define FOO_NODELOCAL 4   // Loopback\n\n"
-      "} // namespace m",
-      format("namespace m {\n\n"
-             "#define FOO_GLOBAL 0   // Global scope.\n"
-             "#define FOO_LINKLOCAL 1  // Link-local scope.\n"
-             "#define FOO_SITELOCAL 2  // Site-local scope (deprecated).\n"
-             "#define FOO_UNIQUELOCAL 3 // Unique local\n"
-             "#define FOO_NODELOCAL 4  // Loopback\n\n"
-             "} // namespace m",
-             getLLVMStyleWithColumns(80)));
+  verifyFormat("namespace m {\n\n"
+               "#define FOO_GLOBAL 0      // Global scope.\n"
+               "#define FOO_LINKLOCAL 1   // Link-local scope.\n"
+               "#define FOO_SITELOCAL 2   // Site-local scope (deprecated).\n"
+               "#define FOO_UNIQUELOCAL 3 // Unique local\n"
+               "#define FOO_NODELOCAL 4   // Loopback\n\n"
+               "} // namespace m",
+               "namespace m {\n\n"
+               "#define FOO_GLOBAL 0   // Global scope.\n"
+               "#define FOO_LINKLOCAL 1  // Link-local scope.\n"
+               "#define FOO_SITELOCAL 2  // Site-local scope (deprecated).\n"
+               "#define FOO_UNIQUELOCAL 3 // Unique local\n"
+               "#define FOO_NODELOCAL 4  // Loopback\n\n"
+               "} // namespace m");
 
   // https://llvm.org/PR53441
   verifyFormat("/* */  //\n"
@@ -2980,193 +2880,178 @@ TEST_F(FormatTestComments, AlignTrailingCommentsAcrossEmptyLines) {
   Style.AlignTrailingComments.OverEmptyLines = 2;
   // Cannot use verifyFormat here
   // test::messUp removes all new lines which changes the logic
-  EXPECT_EQ("#include \"a.h\" // comment\n"
-            "\n"
-            "\n"
-            "\n"
-            "#include \"ab.h\"      // comment\n"
-            "\n"
-            "\n"
-            "#include \"abcdefg.h\" // comment",
-            format("#include \"a.h\" // comment\n"
-                   "\n"
-                   "\n"
-                   "\n"
-                   "#include \"ab.h\" // comment\n"
-                   "\n"
-                   "\n"
-                   "#include \"abcdefg.h\" // comment",
-                   Style));
+  verifyFormat("#include \"a.h\" // comment\n"
+               "\n"
+               "\n"
+               "\n"
+               "#include \"ab.h\"      // comment\n"
+               "\n"
+               "\n"
+               "#include \"abcdefg.h\" // comment",
+               "#include \"a.h\" // comment\n"
+               "\n"
+               "\n"
+               "\n"
+               "#include \"ab.h\" // comment\n"
+               "\n"
+               "\n"
+               "#include \"abcdefg.h\" // comment",
+               Style);
 
   Style.MaxEmptyLinesToKeep = 1;
   Style.AlignTrailingComments.OverEmptyLines = 1;
   // End of testing OverEmptyLines
 
   Style.ColumnLimit = 15;
-  EXPECT_EQ("int ab; // line\n"
-            "int a;  // long\n"
-            "        // long\n"
-            "\n"
-            "        // long",
-            format("int ab; // line\n"
-                   "int a; // long long\n"
-                   "\n"
-                   "// long",
-                   Style));
+  verifyFormat("int ab; // line\n"
+               "int a;  // long\n"
+               "        // long\n"
+               "\n"
+               "        // long",
+               "int ab; // line\n"
+               "int a; // long long\n"
+               "\n"
+               "// long",
+               Style);
 
   Style.ColumnLimit = 15;
-  EXPECT_EQ("int ab; // line\n"
-            "\n"
-            "int a;  // long\n"
-            "        // long",
-            format("int ab; // line\n"
-                   "\n"
-                   "int a; // long long",
-                   Style));
+  verifyFormat("int ab; // line\n"
+               "\n"
+               "int a;  // long\n"
+               "        // long",
+               "int ab; // line\n"
+               "\n"
+               "int a; // long long",
+               Style);
 
   Style.ColumnLimit = 30;
-  EXPECT_EQ("int foo = 12345; // comment\n"
-            "int bar =\n"
-            "    1234;  // This is a very\n"
-            "           // long comment\n"
-            "           // which is wrapped\n"
-            "           // arround.\n"
-            "\n"
-            "int x = 2; // Is this still\n"
-            "           // aligned?",
-            format("int foo = 12345; // comment\n"
-                   "int bar = 1234; // This is a very long comment\n"
-                   "                // which is wrapped arround.\n"
-                   "\n"
-                   "int x = 2; // Is this still aligned?",
-                   Style));
+  verifyFormat("int foo = 12345; // comment\n"
+               "int bar =\n"
+               "    1234;  // This is a very\n"
+               "           // long comment\n"
+               "           // which is wrapped\n"
+               "           // arround.\n"
+               "\n"
+               "int x = 2; // Is this still\n"
+               "           // aligned?",
+               "int foo = 12345; // comment\n"
+               "int bar = 1234; // This is a very long comment\n"
+               "                // which is wrapped arround.\n"
+               "\n"
+               "int x = 2; // Is this still aligned?",
+               Style);
 
   Style.ColumnLimit = 35;
-  EXPECT_EQ("int foo = 12345; // comment\n"
-            "int bar =\n"
-            "    1234; // This is a very long\n"
-            "          // comment which is\n"
-            "          // wrapped arround.\n"
-            "\n"
-            "int x =\n"
-            "    2; // Is this still aligned?",
-            format("int foo = 12345; // comment\n"
-                   "int bar = 1234; // This is a very long comment\n"
-                   "                // which is wrapped arround.\n"
-                   "\n"
-                   "int x = 2; // Is this still aligned?",
-                   Style));
+  verifyFormat("int foo = 12345; // comment\n"
+               "int bar =\n"
+               "    1234; // This is a very long\n"
+               "          // comment which is\n"
+               "          // wrapped arround.\n"
+               "\n"
+               "int x =\n"
+               "    2; // Is this still aligned?",
+               "int foo = 12345; // comment\n"
+               "int bar = 1234; // This is a very long comment\n"
+               "                // which is wrapped arround.\n"
+               "\n"
+               "int x = 2; // Is this still aligned?",
+               Style);
 
   Style.ColumnLimit = 40;
-  EXPECT_EQ("int foo = 12345; // comment\n"
-            "int bar =\n"
-            "    1234; // This is a very long comment\n"
-            "          // which is wrapped arround.\n"
-            "\n"
-            "int x = 2; // Is this still aligned?",
-            format("int foo = 12345; // comment\n"
-                   "int bar = 1234; // This is a very long comment\n"
-                   "                // which is wrapped arround.\n"
-                   "\n"
-                   "int x = 2; // Is this still aligned?",
-                   Style));
+  verifyFormat("int foo = 12345; // comment\n"
+               "int bar =\n"
+               "    1234; // This is a very long comment\n"
+               "          // which is wrapped arround.\n"
+               "\n"
+               "int x = 2; // Is this still aligned?",
+               "int foo = 12345; // comment\n"
+               "int bar = 1234; // This is a very long comment\n"
+               "                // which is wrapped arround.\n"
+               "\n"
+               "int x = 2; // Is this still aligned?",
+               Style);
 
   Style.ColumnLimit = 45;
-  EXPECT_EQ("int foo = 12345; // comment\n"
-            "int bar =\n"
-            "    1234;  // This is a very long comment\n"
-            "           // which is wrapped arround.\n"
-            "\n"
-            "int x = 2; // Is this still aligned?",
-            format("int foo = 12345; // comment\n"
-                   "int bar = 1234; // This is a very long comment\n"
-                   "                // which is wrapped arround.\n"
-                   "\n"
-                   "int x = 2; // Is this still aligned?",
-                   Style));
+  verifyFormat("int foo = 12345; // comment\n"
+               "int bar =\n"
+               "    1234;  // This is a very long comment\n"
+               "           // which is wrapped arround.\n"
+               "\n"
+               "int x = 2; // Is this still aligned?",
+               "int foo = 12345; // comment\n"
+               "int bar = 1234; // This is a very long comment\n"
+               "                // which is wrapped arround.\n"
+               "\n"
+               "int x = 2; // Is this still aligned?",
+               Style);
 
   Style.ColumnLimit = 80;
-  EXPECT_EQ("int a; // line about a\n"
-            "\n"
-            "// line about b\n"
-            "long b;",
-            format("int a; // line about a\n"
-                   "\n"
-                   "       // line about b\n"
-                   "       long b;",
-                   Style));
+  verifyFormat("int a; // line about a\n"
+               "\n"
+               "// line about b\n"
+               "long b;",
+               "int a; // line about a\n"
+               "\n"
+               "       // line about b\n"
+               "       long b;",
+               Style);
 
   Style.ColumnLimit = 80;
-  EXPECT_EQ("int a; // line about a\n"
-            "\n"
-            "// line 1 about b\n"
-            "// line 2 about b\n"
-            "long b;",
-            format("int a; // line about a\n"
-                   "\n"
-                   "       // line 1 about b\n"
-                   "       // line 2 about b\n"
-                   "       long b;",
-                   Style));
+  verifyFormat("int a; // line about a\n"
+               "\n"
+               "// line 1 about b\n"
+               "// line 2 about b\n"
+               "long b;",
+               "int a; // line about a\n"
+               "\n"
+               "       // line 1 about b\n"
+               "       // line 2 about b\n"
+               "       long b;",
+               Style);
 }
 
 TEST_F(FormatTestComments, AlignTrailingCommentsLeave) {
   FormatStyle Style = getLLVMStyle();
   Style.AlignTrailingComments.Kind = FormatStyle::TCAS_Leave;
 
-  EXPECT_EQ("int a;// do not touch\n"
-            "int b; // any comments\n"
-            "int c;  // comment\n"
-            "int d;   // comment",
-            format("int a;// do not touch\n"
-                   "int b; // any comments\n"
-                   "int c;  // comment\n"
-                   "int d;   // comment",
-                   Style));
-
-  EXPECT_EQ("int a;   // do not touch\n"
-            "int b;  // any comments\n"
-            "int c; // comment\n"
-            "int d;// comment",
-            format("int a;   // do not touch\n"
-                   "int b;  // any comments\n"
-                   "int c; // comment\n"
-                   "int d;// comment",
-                   Style));
-
-  EXPECT_EQ("// do not touch\n"
-            "int a;  // any comments\n"
-            "\n"
-            "   // comment\n"
-            "// comment\n"
-            "\n"
-            "// comment",
-            format("// do not touch\n"
-                   "int a;  // any comments\n"
-                   "\n"
-                   "   // comment\n"
-                   "// comment\n"
-                   "\n"
-                   "// comment",
-                   Style));
-
-  EXPECT_EQ("// do not touch\n"
-            "int a;  // any comments\n"
-            "\n"
-            "   // comment\n"
-            "// comment\n"
-            "\n"
-            "// comment",
-            format("// do not touch\n"
-                   "int a;  // any comments\n"
-                   "\n"
-                   "\n"
-                   "   // comment\n"
-                   "// comment\n"
-                   "\n"
-                   "\n"
-                   "// comment",
-                   Style));
+  verifyNoChange("int a;// do not touch\n"
+                 "int b; // any comments\n"
+                 "int c;  // comment\n"
+                 "int d;   // comment",
+                 Style);
+
+  verifyNoChange("int a;   // do not touch\n"
+                 "int b;  // any comments\n"
+                 "int c; // comment\n"
+                 "int d;// comment",
+                 Style);
+
+  verifyNoChange("// do not touch\n"
+                 "int a;  // any comments\n"
+                 "\n"
+                 "   // comment\n"
+                 "// comment\n"
+                 "\n"
+                 "// comment",
+                 Style);
+
+  verifyFormat("// do not touch\n"
+               "int a;  // any comments\n"
+               "\n"
+               "   // comment\n"
+               "// comment\n"
+               "\n"
+               "// comment",
+               "// do not touch\n"
+               "int a;  // any comments\n"
+               "\n"
+               "\n"
+               "   // comment\n"
+               "// comment\n"
+               "\n"
+               "\n"
+               "// comment",
+               Style);
 
   verifyFormat("namespace ns {\n"
                "int i;\n"
@@ -3186,36 +3071,28 @@ TEST_F(FormatTestComments, AlignTrailingCommentsLeave) {
 
   // Allow to keep 2 empty lines
   Style.MaxEmptyLinesToKeep = 2;
-  EXPECT_EQ("// do not touch\n"
-            "int a;  // any comments\n"
-            "\n"
-            "\n"
-            "   // comment\n"
-            "// comment\n"
-            "\n"
-            "// comment",
-            format("// do not touch\n"
-                   "int a;  // any comments\n"
-                   "\n"
-                   "\n"
-                   "   // comment\n"
-                   "// comment\n"
-                   "\n"
-                   "// comment",
-                   Style));
+  verifyNoChange("// do not touch\n"
+                 "int a;  // any comments\n"
+                 "\n"
+                 "\n"
+                 "   // comment\n"
+                 "// comment\n"
+                 "\n"
+                 "// comment",
+                 Style);
   Style.MaxEmptyLinesToKeep = 1;
 
   // Just format comments normally when leaving exceeds the column limit
   Style.ColumnLimit = 35;
-  EXPECT_EQ("int foo = 12345; // comment\n"
-            "int bar =\n"
-            "    1234; // This is a very long\n"
-            "          // comment which is\n"
-            "          // wrapped arround.",
-            format("int foo = 12345; // comment\n"
-                   "int bar = 1234;       // This is a very long comment\n"
-                   "          // which is wrapped arround.",
-                   Style));
+  verifyFormat("int foo = 12345; // comment\n"
+               "int bar =\n"
+               "    1234; // This is a very long\n"
+               "          // comment which is\n"
+               "          // wrapped arround.",
+               "int foo = 12345; // comment\n"
+               "int bar = 1234;       // This is a very long comment\n"
+               "          // which is wrapped arround.",
+               Style);
 
   Style = getLLVMStyle();
   Style.AlignTrailingComments.Kind = FormatStyle::TCAS_Leave;
@@ -3241,16 +3118,16 @@ TEST_F(FormatTestComments, DontAlignNamespaceComments) {
   Style.NamespaceMacros.push_back("TESTSUITE");
   Style.ShortNamespaceLines = 0;
 
-  StringRef Input = "namespace A {\n"
-                    "  TESTSUITE(B) {\n"
-                    "    namespace C {\n"
-                    "      namespace D { //\n"
-                    "      } // namespace D\n"
-                    "      std::string Foo = Bar; // Comment\n"
-                    "      std::string BazString = Baz;   // C2\n"
-                    "    }          // namespace C\n"
-                    "  }\n"
-                    "} // NaMeSpAcE A";
+  constexpr StringRef Input("namespace A {\n"
+                            "  TESTSUITE(B) {\n"
+                            "    namespace C {\n"
+                            "      namespace D { //\n"
+                            "      } // namespace D\n"
+                            "      std::string Foo = Bar; // Comment\n"
+                            "      std::string BazString = Baz;   // C2\n"
+                            "    }          // namespace C\n"
+                            "  }\n"
+                            "} // NaMeSpAcE A");
 
   EXPECT_TRUE(Style.FixNamespaceComments);
   EXPECT_EQ(Style.AlignTrailingComments.Kind, FormatStyle::TCAS_Always);
@@ -3334,21 +3211,21 @@ TEST_F(FormatTestComments, DontAlignNamespaceComments) {
 
   Style.AlignTrailingComments.Kind = FormatStyle::TCAS_Always;
   Style.FixNamespaceComments = true;
-  Input = "namespace A {\n"
-          "  int Foo;\n"
-          "  int Bar;\n"
-          "}\n"
-          "// Comment";
+  constexpr StringRef Code("namespace A {\n"
+                           "  int Foo;\n"
+                           "  int Bar;\n"
+                           "}\n"
+                           "// Comment");
 
   verifyFormat("namespace A {\n"
                "  int Foo;\n"
                "  int Bar;\n"
                "} // namespace A\n"
                "// Comment",
-               Input, Style);
+               Code, Style);
 
   Style.FixNamespaceComments = false;
-  verifyFormat(Input, Style);
+  verifyFormat(Code, Style);
 }
 
 TEST_F(FormatTestComments, DontAlignOverScope) {
@@ -3502,171 +3379,161 @@ TEST_F(FormatTestComments, DontAlignOverScope) {
 }
 
 TEST_F(FormatTestComments, AlignsBlockCommentDecorations) {
-  EXPECT_EQ("/*\n"
-            " */",
-            format("/*\n"
-                   "*/"));
-  EXPECT_EQ("/*\n"
-            " */",
-            format("/*\n"
-                   " */"));
-  EXPECT_EQ("/*\n"
-            " */",
-            format("/*\n"
-                   "  */"));
+  verifyFormat("/*\n"
+               " */",
+               "/*\n"
+               "*/");
+  verifyNoChange("/*\n"
+                 " */");
+  verifyFormat("/*\n"
+               " */",
+               "/*\n"
+               "  */");
 
   // Align a single line.
-  EXPECT_EQ("/*\n"
-            " * line */",
-            format("/*\n"
-                   "* line */"));
-  EXPECT_EQ("/*\n"
-            " * line */",
-            format("/*\n"
-                   " * line */"));
-  EXPECT_EQ("/*\n"
-            " * line */",
-            format("/*\n"
-                   "  * line */"));
-  EXPECT_EQ("/*\n"
-            " * line */",
-            format("/*\n"
-                   "   * line */"));
-  EXPECT_EQ("/**\n"
-            " * line */",
-            format("/**\n"
-                   "* line */"));
-  EXPECT_EQ("/**\n"
-            " * line */",
-            format("/**\n"
-                   " * line */"));
-  EXPECT_EQ("/**\n"
-            " * line */",
-            format("/**\n"
-                   "  * line */"));
-  EXPECT_EQ("/**\n"
-            " * line */",
-            format("/**\n"
-                   "   * line */"));
-  EXPECT_EQ("/**\n"
-            " * line */",
-            format("/**\n"
-                   "    * line */"));
+  verifyFormat("/*\n"
+               " * line */",
+               "/*\n"
+               "* line */");
+  verifyNoChange("/*\n"
+                 " * line */");
+  verifyFormat("/*\n"
+               " * line */",
+               "/*\n"
+               "  * line */");
+  verifyFormat("/*\n"
+               " * line */",
+               "/*\n"
+               "   * line */");
+  verifyFormat("/**\n"
+               " * line */",
+               "/**\n"
+               "* line */");
+  verifyNoChange("/**\n"
+                 " * line */");
+  verifyFormat("/**\n"
+               " * line */",
+               "/**\n"
+               "  * line */");
+  verifyFormat("/**\n"
+               " * line */",
+               "/**\n"
+               "   * line */");
+  verifyFormat("/**\n"
+               " * line */",
+               "/**\n"
+               "    * line */");
 
   // Align the end '*/' after a line.
-  EXPECT_EQ("/*\n"
-            " * line\n"
-            " */",
-            format("/*\n"
-                   "* line\n"
-                   "*/"));
-  EXPECT_EQ("/*\n"
-            " * line\n"
-            " */",
-            format("/*\n"
-                   "   * line\n"
-                   "  */"));
-  EXPECT_EQ("/*\n"
-            " * line\n"
-            " */",
-            format("/*\n"
-                   "  * line\n"
-                   "  */"));
+  verifyFormat("/*\n"
+               " * line\n"
+               " */",
+               "/*\n"
+               "* line\n"
+               "*/");
+  verifyFormat("/*\n"
+               " * line\n"
+               " */",
+               "/*\n"
+               "   * line\n"
+               "  */");
+  verifyFormat("/*\n"
+               " * line\n"
+               " */",
+               "/*\n"
+               "  * line\n"
+               "  */");
 
   // Align two lines.
-  EXPECT_EQ("/* line 1\n"
-            " * line 2 */",
-            format("/* line 1\n"
-                   " * line 2 */"));
-  EXPECT_EQ("/* line 1\n"
-            " * line 2 */",
-            format("/* line 1\n"
-                   "* line 2 */"));
-  EXPECT_EQ("/* line 1\n"
-            " * line 2 */",
-            format("/* line 1\n"
-                   "  * line 2 */"));
-  EXPECT_EQ("/* line 1\n"
-            " * line 2 */",
-            format("/* line 1\n"
-                   "   * line 2 */"));
-  EXPECT_EQ("/* line 1\n"
-            " * line 2 */",
-            format("/* line 1\n"
-                   "    * line 2 */"));
-  EXPECT_EQ("int i; /* line 1\n"
-            "        * line 2 */",
-            format("int i; /* line 1\n"
-                   "* line 2 */"));
-  EXPECT_EQ("int i; /* line 1\n"
-            "        * line 2 */",
-            format("int i; /* line 1\n"
-                   "        * line 2 */"));
-  EXPECT_EQ("int i; /* line 1\n"
-            "        * line 2 */",
-            format("int i; /* line 1\n"
-                   "             * line 2 */"));
+  verifyNoChange("/* line 1\n"
+                 " * line 2 */");
+  verifyFormat("/* line 1\n"
+               " * line 2 */",
+               "/* line 1\n"
+               "* line 2 */");
+  verifyFormat("/* line 1\n"
+               " * line 2 */",
+               "/* line 1\n"
+               "  * line 2 */");
+  verifyFormat("/* line 1\n"
+               " * line 2 */",
+               "/* line 1\n"
+               "   * line 2 */");
+  verifyFormat("/* line 1\n"
+               " * line 2 */",
+               "/* line 1\n"
+               "    * line 2 */");
+  verifyFormat("int i; /* line 1\n"
+               "        * line 2 */",
+               "int i; /* line 1\n"
+               "* line 2 */");
+  verifyNoChange("int i; /* line 1\n"
+                 "        * line 2 */");
+  verifyFormat("int i; /* line 1\n"
+               "        * line 2 */",
+               "int i; /* line 1\n"
+               "             * line 2 */");
 
   // Align several lines.
-  EXPECT_EQ("/* line 1\n"
-            " * line 2\n"
-            " * line 3 */",
-            format("/* line 1\n"
-                   " * line 2\n"
-                   "* line 3 */"));
-  EXPECT_EQ("/* line 1\n"
-            " * line 2\n"
-            " * line 3 */",
-            format("/* line 1\n"
-                   "  * line 2\n"
-                   "* line 3 */"));
-  EXPECT_EQ("/*\n"
-            "** line 1\n"
-            "** line 2\n"
-            "*/",
-            format("/*\n"
-                   "** line 1\n"
-                   " ** line 2\n"
-                   "*/"));
+  verifyFormat("/* line 1\n"
+               " * line 2\n"
+               " * line 3 */",
+               "/* line 1\n"
+               " * line 2\n"
+               "* line 3 */");
+  verifyFormat("/* line 1\n"
+               " * line 2\n"
+               " * line 3 */",
+               "/* line 1\n"
+               "  * line 2\n"
+               "* line 3 */");
+  verifyFormat("/*\n"
+               "** line 1\n"
+               "** line 2\n"
+               "*/",
+               "/*\n"
+               "** line 1\n"
+               " ** line 2\n"
+               "*/");
 
   // Align with different indent after the decorations.
-  EXPECT_EQ("/*\n"
-            " * line 1\n"
-            " *  line 2\n"
-            " * line 3\n"
-            " *   line 4\n"
-            " */",
-            format("/*\n"
-                   "* line 1\n"
-                   "  *  line 2\n"
-                   "   * line 3\n"
-                   "*   line 4\n"
-                   "*/"));
+  verifyFormat("/*\n"
+               " * line 1\n"
+               " *  line 2\n"
+               " * line 3\n"
+               " *   line 4\n"
+               " */",
+               "/*\n"
+               "* line 1\n"
+               "  *  line 2\n"
+               "   * line 3\n"
+               "*   line 4\n"
+               "*/");
 
   // Align empty or blank lines.
-  EXPECT_EQ("/**\n"
-            " *\n"
-            " *\n"
-            " *\n"
-            " */",
-            format("/**\n"
-                   "*  \n"
-                   " * \n"
-                   "  *\n"
-                   "*/"));
+  verifyFormat("/**\n"
+               " *\n"
+               " *\n"
+               " *\n"
+               " */",
+               "/**\n"
+               "*  \n"
+               " * \n"
+               "  *\n"
+               "*/");
 
   // Align while breaking and reflowing.
-  EXPECT_EQ("/*\n"
-            " * long long long\n"
-            " * long long\n"
-            " *\n"
-            " * long */",
-            format("/*\n"
-                   " * long long long long\n"
-                   " * long\n"
-                   "  *\n"
-                   "* long */",
-                   getLLVMStyleWithColumns(20)));
+  verifyFormat("/*\n"
+               " * long long long\n"
+               " * long long\n"
+               " *\n"
+               " * long */",
+               "/*\n"
+               " * long long long long\n"
+               " * long\n"
+               "  *\n"
+               "* long */",
+               getLLVMStyleWithColumns(20));
 }
 
 TEST_F(FormatTestComments, NoCrash_Bug34236) {
@@ -3674,110 +3541,111 @@ TEST_F(FormatTestComments, NoCrash_Bug34236) {
   // https://bugs.llvm.org/show_bug.cgi?id=34236
   // Temporarily disable formatting for readability.
   // clang-format off
-  EXPECT_EQ(
+  verifyFormat(
 "/*                                                                */ /*\n"
 "                                                                      *       a\n"
 "                                                                      * b c d*/",
-      format(
 "/*                                                                */ /*\n"
 " *       a b\n"
-" *       c     d*/",
-          getLLVMStyleWithColumns(80)));
+" *       c     d*/");
   // clang-format on
 }
 
 TEST_F(FormatTestComments, NonTrailingBlockComments) {
-  verifyFormat("const /** comment comment */ A = B;",
-               getLLVMStyleWithColumns(40));
+  const auto Style40 = getLLVMStyleWithColumns(40);
+
+  verifyFormat("const /** comment comment */ A = B;", Style40);
 
   verifyFormat("const /** comment comment comment */ A =\n"
                "    B;",
-               getLLVMStyleWithColumns(40));
-
-  EXPECT_EQ("const /** comment comment comment\n"
-            "         comment */\n"
-            "    A = B;",
-            format("const /** comment comment comment comment */\n"
-                   "    A = B;",
-                   getLLVMStyleWithColumns(40)));
+               Style40);
+
+  verifyFormat("const /** comment comment comment\n"
+               "         comment */\n"
+               "    A = B;",
+               "const /** comment comment comment comment */\n"
+               "    A = B;",
+               Style40);
 }
 
 TEST_F(FormatTestComments, PythonStyleComments) {
+  const auto ProtoStyle20 = getTextProtoStyleWithColumns(20);
+
   // Keeps a space after '#'.
-  EXPECT_EQ("# comment\n"
-            "key: value",
-            format("#comment\n"
-                   "key:value",
-                   getTextProtoStyleWithColumns(20)));
-  EXPECT_EQ("# comment\n"
-            "key: value",
-            format("# comment\n"
-                   "key:value",
-                   getTextProtoStyleWithColumns(20)));
+  verifyFormat("# comment\n"
+               "key: value",
+               "#comment\n"
+               "key:value",
+               ProtoStyle20);
+  verifyFormat("# comment\n"
+               "key: value",
+               "# comment\n"
+               "key:value",
+               ProtoStyle20);
   // Breaks long comment.
-  EXPECT_EQ("# comment comment\n"
-            "# comment\n"
-            "key: value",
-            format("# comment comment comment\n"
-                   "key:value",
-                   getTextProtoStyleWithColumns(20)));
+  verifyFormat("# comment comment\n"
+               "# comment\n"
+               "key: value",
+               "# comment comment comment\n"
+               "key:value",
+               ProtoStyle20);
   // Indents comments.
-  EXPECT_EQ("data {\n"
-            "  # comment comment\n"
-            "  # comment\n"
-            "  key: value\n"
-            "}",
-            format("data {\n"
-                   "# comment comment comment\n"
-                   "key: value}",
-                   getTextProtoStyleWithColumns(20)));
-  EXPECT_EQ("data {\n"
-            "  # comment comment\n"
-            "  # comment\n"
-            "  key: value\n"
-            "}",
-            format("data {# comment comment comment\n"
-                   "key: value}",
-                   getTextProtoStyleWithColumns(20)));
+  verifyFormat("data {\n"
+               "  # comment comment\n"
+               "  # comment\n"
+               "  key: value\n"
+               "}",
+               "data {\n"
+               "# comment comment comment\n"
+               "key: value}",
+               ProtoStyle20);
+  verifyFormat("data {\n"
+               "  # comment comment\n"
+               "  # comment\n"
+               "  key: value\n"
+               "}",
+               "data {# comment comment comment\n"
+               "key: value}",
+               ProtoStyle20);
   // Reflows long comments.
-  EXPECT_EQ("# comment comment\n"
-            "# comment comment\n"
-            "key: value",
-            format("# comment comment comment\n"
-                   "# comment\n"
-                   "key:value",
-                   getTextProtoStyleWithColumns(20)));
+  verifyFormat("# comment comment\n"
+               "# comment comment\n"
+               "key: value",
+               "# comment comment comment\n"
+               "# comment\n"
+               "key:value",
+               ProtoStyle20);
   // Breaks trailing comments.
-  EXPECT_EQ("k: val  # comment\n"
-            "        # comment\n"
-            "a: 1",
-            format("k:val#comment comment\n"
-                   "a:1",
-                   getTextProtoStyleWithColumns(20)));
-  EXPECT_EQ("id {\n"
-            "  k: val  # comment\n"
-            "          # comment\n"
-            "  # line line\n"
-            "  a: 1\n"
-            "}",
-            format("id {k:val#comment comment\n"
-                   "# line line\n"
-                   "a:1}",
-                   getTextProtoStyleWithColumns(20)));
+  verifyFormat("k: val  # comment\n"
+               "        # comment\n"
+               "a: 1",
+               "k:val#comment comment\n"
+               "a:1",
+               ProtoStyle20);
+  verifyFormat("id {\n"
+               "  k: val  # comment\n"
+               "          # comment\n"
+               "  # line line\n"
+               "  a: 1\n"
+               "}",
+               "id {k:val#comment comment\n"
+               "# line line\n"
+               "a:1}",
+               ProtoStyle20);
   // Aligns trailing comments.
-  EXPECT_EQ("k: val  # commen1\n"
-            "        # commen2\n"
-            "        # commen3\n"
-            "# commen4\n"
-            "a: 1  # commen5\n"
-            "      # commen6\n"
-            "      # commen7",
-            format("k:val#commen1 commen2\n"
-                   " #commen3\n"
-                   "# commen4\n"
-                   "a:1#commen5 commen6\n"
-                   " #commen7",
-                   getTextProtoStyleWithColumns(20)));
+  verifyFormat("k: val  # commen1\n"
+               "        # commen2\n"
+               "        # commen3\n"
+               "# commen4\n"
+               "a: 1  # commen5\n"
+               "      # commen6\n"
+               "      # commen7",
+               "k:val#commen1 commen2\n"
+               " #commen3\n"
+               "# commen4\n"
+               "a:1#commen5 commen6\n"
+               " #commen7",
+               ProtoStyle20);
 }
 
 TEST_F(FormatTestComments, BreaksBeforeTrailingUnbreakableSequence) {
@@ -3791,154 +3659,153 @@ TEST_F(FormatTestComments, BreaksBeforeTrailingUnbreakableSequence) {
 
 TEST_F(FormatTestComments, ReflowBackslashCrash) {
   // clang-format off
-  EXPECT_EQ(
+  verifyFormat(
 "// How to run:\n"
 "// bbbbb run \\\n"
 "// rrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr\n"
 "// \\ <log_file> -- --output_directory=\"<output_directory>\"",
-  format(
 "// How to run:\n"
 "// bbbbb run \\\n"
 "// rrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr \\\n"
-"// <log_file> -- --output_directory=\"<output_directory>\""));
+"// <log_file> -- --output_directory=\"<output_directory>\"");
   // clang-format on
 }
 
 TEST_F(FormatTestComments, IndentsLongJavadocAnnotatedLines) {
   FormatStyle Style = getGoogleStyle(FormatStyle::LK_Java);
   Style.ColumnLimit = 60;
+  verifyFormat("/**\n"
+               " * @param x long long long long long long long long long\n"
+               " *     long\n"
+               " */",
+               "/**\n"
+               " * @param x long long long long long long long long long long\n"
+               " */",
+               Style);
+  verifyFormat("/**\n"
+               " * @param x long long long long long long long long long\n"
+               " *     long long long long long long long long long long\n"
+               " */",
+               "/**\n"
+               " * @param x long long long long long long long long long "
+               "long long long long long long long long long long\n"
+               " */",
+               Style);
+  verifyFormat("/**\n"
+               " * @param x long long long long long long long long long\n"
+               " *     long long long long long long long long long long\n"
+               " *     long\n"
+               " */",
+               "/**\n"
+               " * @param x long long long long long long long long long "
+               "long long long long long long long long long long long\n"
+               " */",
+               Style);
+
   FormatStyle Style20 = getGoogleStyle(FormatStyle::LK_Java);
   Style20.ColumnLimit = 20;
-  EXPECT_EQ(
-      "/**\n"
-      " * @param x long long long long long long long long long\n"
-      " *     long\n"
-      " */",
-      format("/**\n"
-             " * @param x long long long long long long long long long long\n"
-             " */",
-             Style));
-  EXPECT_EQ("/**\n"
-            " * @param x long long long long long long long long long\n"
-            " *     long long long long long long long long long long\n"
-            " */",
-            format("/**\n"
-                   " * @param x long long long long long long long long long "
-                   "long long long long long long long long long long\n"
-                   " */",
-                   Style));
-  EXPECT_EQ("/**\n"
-            " * @param x long long long long long long long long long\n"
-            " *     long long long long long long long long long long\n"
-            " *     long\n"
-            " */",
-            format("/**\n"
-                   " * @param x long long long long long long long long long "
-                   "long long long long long long long long long long long\n"
-                   " */",
-                   Style));
-  EXPECT_EQ("/**\n"
-            " * Sentence that\n"
-            " * should be broken.\n"
-            " * @param short\n"
-            " * keep indentation\n"
-            " */",
-            format("/**\n"
-                   " * Sentence that should be broken.\n"
-                   " * @param short\n"
-                   " * keep indentation\n"
-                   " */",
-                   Style20));
-
-  EXPECT_EQ("/**\n"
-            " * @param l1 long1\n"
-            " *     to break\n"
-            " * @param l2 long2\n"
-            " *     to break\n"
-            " */",
-            format("/**\n"
-                   " * @param l1 long1 to break\n"
-                   " * @param l2 long2 to break\n"
-                   " */",
-                   Style20));
-
-  EXPECT_EQ("/**\n"
-            " * @param xx to\n"
-            " *     break\n"
-            " * no reflow\n"
-            " */",
-            format("/**\n"
-                   " * @param xx to break\n"
-                   " * no reflow\n"
-                   " */",
-                   Style20));
-
-  EXPECT_EQ("/**\n"
-            " * @param xx to\n"
-            " *     break yes\n"
-            " *     reflow\n"
-            " */",
-            format("/**\n"
-                   " * @param xx to break\n"
-                   " *     yes reflow\n"
-                   " */",
-                   Style20));
+
+  verifyFormat("/**\n"
+               " * Sentence that\n"
+               " * should be broken.\n"
+               " * @param short\n"
+               " * keep indentation\n"
+               " */",
+               "/**\n"
+               " * Sentence that should be broken.\n"
+               " * @param short\n"
+               " * keep indentation\n"
+               " */",
+               Style20);
+
+  verifyFormat("/**\n"
+               " * @param l1 long1\n"
+               " *     to break\n"
+               " * @param l2 long2\n"
+               " *     to break\n"
+               " */",
+               "/**\n"
+               " * @param l1 long1 to break\n"
+               " * @param l2 long2 to break\n"
+               " */",
+               Style20);
+
+  verifyFormat("/**\n"
+               " * @param xx to\n"
+               " *     break\n"
+               " * no reflow\n"
+               " */",
+               "/**\n"
+               " * @param xx to break\n"
+               " * no reflow\n"
+               " */",
+               Style20);
+
+  verifyFormat("/**\n"
+               " * @param xx to\n"
+               " *     break yes\n"
+               " *     reflow\n"
+               " */",
+               "/**\n"
+               " * @param xx to break\n"
+               " *     yes reflow\n"
+               " */",
+               Style20);
 
   FormatStyle JSStyle20 = getGoogleStyle(FormatStyle::LK_JavaScript);
   JSStyle20.ColumnLimit = 20;
-  EXPECT_EQ("/**\n"
-            " * @param l1 long1\n"
-            " *     to break\n"
-            " */",
-            format("/**\n"
-                   " * @param l1 long1 to break\n"
-                   " */",
-                   JSStyle20));
-  EXPECT_EQ("/**\n"
-            " * @param {l1 long1\n"
-            " *     to break}\n"
-            " */",
-            format("/**\n"
-                   " * @param {l1 long1 to break}\n"
-                   " */",
-                   JSStyle20));
+  verifyFormat("/**\n"
+               " * @param l1 long1\n"
+               " *     to break\n"
+               " */",
+               "/**\n"
+               " * @param l1 long1 to break\n"
+               " */",
+               JSStyle20);
+  verifyFormat("/**\n"
+               " * @param {l1 long1\n"
+               " *     to break}\n"
+               " */",
+               "/**\n"
+               " * @param {l1 long1 to break}\n"
+               " */",
+               JSStyle20);
 }
 
 TEST_F(FormatTestComments, SpaceAtLineCommentBegin) {
-  FormatStyle Style = getLLVMStyle();
-  StringRef NoTextInComment = " //       \n"
-                              "\n"
-                              "void foo() {// \n"
-                              "// \n"
-                              "}";
-
-  EXPECT_EQ("//\n"
-            "\n"
-            "void foo() { //\n"
-            "  //\n"
-            "}",
-            format(NoTextInComment, Style));
+  constexpr StringRef NoTextInComment(" //       \n"
+                                      "\n"
+                                      "void foo() {// \n"
+                                      "// \n"
+                                      "}");
+
+  verifyFormat("//\n"
+               "\n"
+               "void foo() { //\n"
+               "  //\n"
+               "}",
+               NoTextInComment);
 
+  auto Style = getLLVMStyle();
   Style.SpacesInLineCommentPrefix.Minimum = 0;
   verifyFormat("//#comment", Style);
-  EXPECT_EQ("//\n"
-            "\n"
-            "void foo() { //\n"
-            "  //\n"
-            "}",
-            format(NoTextInComment, Style));
+  verifyFormat("//\n"
+               "\n"
+               "void foo() { //\n"
+               "  //\n"
+               "}",
+               NoTextInComment, Style);
 
   Style.SpacesInLineCommentPrefix.Minimum = 5;
-  EXPECT_EQ("//     #comment", format("//#comment", Style));
-  EXPECT_EQ("//\n"
-            "\n"
-            "void foo() { //\n"
-            "  //\n"
-            "}",
-            format(NoTextInComment, Style));
+  verifyFormat("//     #comment", "//#comment", Style);
+  verifyFormat("//\n"
+               "\n"
+               "void foo() { //\n"
+               "  //\n"
+               "}",
+               NoTextInComment, Style);
 
-  Style = getLLVMStyle();
-  StringRef Code =
+  constexpr StringRef Code(
       "//Free comment without space\n"
       "\n"
       "//   Free comment with 3 spaces\n"
@@ -4008,731 +3875,529 @@ TEST_F(FormatTestComments, SpaceAtLineCommentBegin) {
       "//} will not move\n"
       "\n"
       "//vv will only move\n"
-      "//} if the line above does";
-
-  EXPECT_EQ("// Free comment without space\n"
-            "\n"
-            "//   Free comment with 3 spaces\n"
-            "\n"
-            "/// Free Doxygen without space\n"
-            "\n"
-            "///   Free Doxygen with 3 spaces\n"
-            "\n"
-            "// 🐉 A nice dragon\n"
-            "\n"
-            "//\t abccba\n"
-            "\n"
-            "//\\t deffed\n"
-            "\n"
-            "//   🐉 Another nice dragon\n"
-            "\n"
-            "//   \t Three leading spaces following tab\n"
-            "\n"
-            "//   \\t Three leading spaces following backslash\n"
-            "\n"
-            "/// A Doxygen Comment with a nested list:\n"
-            "/// - Foo\n"
-            "/// - Bar\n"
-            "///   - Baz\n"
-            "///   - End\n"
-            "///     of the inner list\n"
-            "///   .\n"
-            "/// .\n"
-            "\n"
-            "namespace Foo {\n"
-            "bool bar(bool b) {\n"
-            "  bool ret1 = true; ///< Doxygenstyle without space\n"
-            "  bool ret2 = true; ///<   Doxygenstyle with 3 spaces\n"
-            "  if (b) {\n"
-            "    // Foo\n"
-            "\n"
-            "    //   In function comment\n"
-            "    ret2 = false;\n"
-            "  } // End of if\n"
-            "\n"
-            "  //  if (ret1) {\n"
-            "  //    return ret2;\n"
-            "  //  }\n"
-            "\n"
-            "  // if (ret1) {\n"
-            "  //   return ret2;\n"
-            "  // }\n"
-            "\n"
-            "  return ret1 && ret2;\n"
-            "}\n"
-            "} // namespace Foo\n"
-            "\n"
-            "namespace Bar {\n"
-            "int foo();\n"
-            "} //  namespace Bar\n"
-            "//@Nothing added because of the non ascii char\n"
-            "\n"
-            "//@      Nothing removed because of the non ascii char\n"
-            "\n"
-            "//  Comment to move to the left\n"
-            "// But not this?\n"
-            "//  @but this\n"
-            "\n"
-            "// Comment to move to the right\n"
-            "//@ this stays\n"
-            "\n"
-            "//} will not move\n"
-            "\n"
-            "// vv will only move\n"
-            "// } if the line above does",
-            format(Code, Style));
+      "//} if the line above does");
+
+  constexpr StringRef Code2(
+      "// Free comment without space\n"
+      "\n"
+      "//   Free comment with 3 spaces\n"
+      "\n"
+      "/// Free Doxygen without space\n"
+      "\n"
+      "///   Free Doxygen with 3 spaces\n"
+      "\n"
+      "// 🐉 A nice dragon\n"
+      "\n"
+      "//\t abccba\n"
+      "\n"
+      "//\\t deffed\n"
+      "\n"
+      "//   🐉 Another nice dragon\n"
+      "\n"
+      "//   \t Three leading spaces following tab\n"
+      "\n"
+      "//   \\t Three leading spaces following backslash\n"
+      "\n"
+      "/// A Doxygen Comment with a nested list:\n"
+      "/// - Foo\n"
+      "/// - Bar\n"
+      "///   - Baz\n"
+      "///   - End\n"
+      "///     of the inner list\n"
+      "///   .\n"
+      "/// .\n"
+      "\n"
+      "namespace Foo {\n"
+      "bool bar(bool b) {\n"
+      "  bool ret1 = true; ///< Doxygenstyle without space\n"
+      "  bool ret2 = true; ///<   Doxygenstyle with 3 spaces\n"
+      "  if (b) {\n"
+      "    // Foo\n"
+      "\n"
+      "    //   In function comment\n"
+      "    ret2 = false;\n"
+      "  } // End of if\n"
+      "\n"
+      "  //  if (ret1) {\n"
+      "  //    return ret2;\n"
+      "  //  }\n"
+      "\n"
+      "  // if (ret1) {\n"
+      "  //   return ret2;\n"
+      "  // }\n"
+      "\n"
+      "  return ret1 && ret2;\n"
+      "}\n"
+      "} // namespace Foo\n"
+      "\n"
+      "namespace Bar {\n"
+      "int foo();\n"
+      "} //  namespace Bar\n"
+      "//@Nothing added because of the non ascii char\n"
+      "\n"
+      "//@      Nothing removed because of the non ascii char\n"
+      "\n"
+      "//  Comment to move to the left\n"
+      "// But not this?\n"
+      "//  @but this\n"
+      "\n"
+      "// Comment to move to the right\n"
+      "//@ this stays\n"
+      "\n"
+      "//} will not move\n"
+      "\n"
+      "// vv will only move\n"
+      "// } if the line above does");
+
+  constexpr StringRef Code3(
+      "//Free comment without space\n"
+      "\n"
+      "//Free comment with 3 spaces\n"
+      "\n"
+      "///Free Doxygen without space\n"
+      "\n"
+      "///Free Doxygen with 3 spaces\n"
+      "\n"
+      "//🐉 A nice dragon\n"
+      "\n"
+      "//\t abccba\n"
+      "\n"
+      "//\\t deffed\n"
+      "\n"
+      "//🐉 Another nice dragon\n"
+      "\n"
+      "//\t Three leading spaces following tab\n"
+      "\n"
+      "//\\t Three leading spaces following backslash\n"
+      "\n"
+      "///A Doxygen Comment with a nested list:\n"
+      "///- Foo\n"
+      "///- Bar\n"
+      "///  - Baz\n" // Here we keep the relative indentation
+      "///  - End\n"
+      "///    of the inner list\n"
+      "///  .\n"
+      "///.\n"
+      "\n"
+      "namespace Foo {\n"
+      "bool bar(bool b) {\n"
+      "  bool ret1 = true; ///<Doxygenstyle without space\n"
+      "  bool ret2 = true; ///<Doxygenstyle with 3 spaces\n"
+      "  if (b) {\n"
+      "    //Foo\n"
+      "\n"
+      "    //In function comment\n"
+      "    ret2 = false;\n"
+      "  } //End of if\n"
+      "\n"
+      "  //if (ret1) {\n"
+      "  //  return ret2;\n"
+      "  //}\n"
+      "\n"
+      "  //if (ret1) {\n"
+      "  //  return ret2;\n"
+      "  //}\n"
+      "\n"
+      "  return ret1 && ret2;\n"
+      "}\n"
+      "} //namespace Foo\n"
+      "\n"
+      "namespace Bar {\n"
+      "int foo();\n"
+      "} //namespace Bar\n"
+      "//@Nothing added because of the non ascii char\n"
+      "\n"
+      "//@      Nothing removed because of the non ascii char\n"
+      "\n"
+      "//Comment to move to the left\n"
+      "//But not this?\n"
+      "//@but this\n"
+      "\n"
+      "//Comment to move to the right\n"
+      "//@ this stays\n"
+      "\n"
+      "//} will not move\n"
+      "\n"
+      "//vv will only move\n"
+      "//} if the line above does");
+
+  constexpr StringRef Code4(
+      "//  Free comment without space\n"
+      "\n"
+      "//   Free comment with 3 spaces\n"
+      "\n"
+      "///  Free Doxygen without space\n"
+      "\n"
+      "///   Free Doxygen with 3 spaces\n"
+      "\n"
+      "//  🐉 A nice dragon\n"
+      "\n"
+      "//\t abccba\n"
+      "\n"
+      "//\\t deffed\n"
+      "\n"
+      "//   🐉 Another nice dragon\n"
+      "\n"
+      "//   \t Three leading spaces following tab\n"
+      "\n"
+      "//   \\t Three leading spaces following backslash\n"
+      "\n"
+      "///  A Doxygen Comment with a nested list:\n"
+      "///  - Foo\n"
+      "///  - Bar\n"
+      "///    - Baz\n"
+      "///    - End\n"
+      "///      of the inner list\n"
+      "///    .\n"
+      "///  .\n"
+      "\n"
+      "namespace Foo {\n"
+      "bool bar(bool b) {\n"
+      "  bool ret1 = true; ///<  Doxygenstyle without space\n"
+      "  bool ret2 = true; ///<   Doxygenstyle with 3 spaces\n"
+      "  if (b) {\n"
+      "    //  Foo\n"
+      "\n"
+      "    //   In function comment\n"
+      "    ret2 = false;\n"
+      "  } //  End of if\n"
+      "\n"
+      "  //  if (ret1) {\n"
+      "  //    return ret2;\n"
+      "  //  }\n"
+      "\n"
+      "  //  if (ret1) {\n"
+      "  //    return ret2;\n"
+      "  //  }\n"
+      "\n"
+      "  return ret1 && ret2;\n"
+      "}\n"
+      "} //  namespace Foo\n"
+      "\n"
+      "namespace Bar {\n"
+      "int foo();\n"
+      "} //  namespace Bar\n"
+      "//@Nothing added because of the non ascii char\n"
+      "\n"
+      "//@      Nothing removed because of the non ascii char\n"
+      "\n"
+      "//  Comment to move to the left\n"
+      "//  But not this?\n"
+      "//  @but this\n"
+      "\n"
+      "//  Comment to move to the right\n"
+      "//@ this stays\n"
+      "\n"
+      "//} will not move\n"
+      "\n"
+      "//  vv will only move\n"
+      "//  } if the line above does");
+
+  verifyFormat(Code2, Code);
 
+  Style = getLLVMStyle();
   Style.SpacesInLineCommentPrefix = {0, 0};
-  EXPECT_EQ("//#comment", format("//   #comment", Style));
-  EXPECT_EQ("//Free comment without space\n"
-            "\n"
-            "//Free comment with 3 spaces\n"
-            "\n"
-            "///Free Doxygen without space\n"
-            "\n"
-            "///Free Doxygen with 3 spaces\n"
-            "\n"
-            "//🐉 A nice dragon\n"
-            "\n"
-            "//\t abccba\n"
-            "\n"
-            "//\\t deffed\n"
-            "\n"
-            "//🐉 Another nice dragon\n"
-            "\n"
-            "//\t Three leading spaces following tab\n"
-            "\n"
-            "//\\t Three leading spaces following backslash\n"
-            "\n"
-            "///A Doxygen Comment with a nested list:\n"
-            "///- Foo\n"
-            "///- Bar\n"
-            "///  - Baz\n" // Here we keep the relative indentation
-            "///  - End\n"
-            "///    of the inner list\n"
-            "///  .\n"
-            "///.\n"
-            "\n"
-            "namespace Foo {\n"
-            "bool bar(bool b) {\n"
-            "  bool ret1 = true; ///<Doxygenstyle without space\n"
-            "  bool ret2 = true; ///<Doxygenstyle with 3 spaces\n"
-            "  if (b) {\n"
-            "    //Foo\n"
-            "\n"
-            "    //In function comment\n"
-            "    ret2 = false;\n"
-            "  } //End of if\n"
-            "\n"
-            "  //if (ret1) {\n"
-            "  //  return ret2;\n"
-            "  //}\n"
-            "\n"
-            "  //if (ret1) {\n"
-            "  //  return ret2;\n"
-            "  //}\n"
-            "\n"
-            "  return ret1 && ret2;\n"
-            "}\n"
-            "} //namespace Foo\n"
-            "\n"
-            "namespace Bar {\n"
-            "int foo();\n"
-            "} //namespace Bar\n"
-            "//@Nothing added because of the non ascii char\n"
-            "\n"
-            "//@      Nothing removed because of the non ascii char\n"
-            "\n"
-            "//Comment to move to the left\n"
-            "//But not this?\n"
-            "//@but this\n"
-            "\n"
-            "//Comment to move to the right\n"
-            "//@ this stays\n"
-            "\n"
-            "//} will not move\n"
-            "\n"
-            "//vv will only move\n"
-            "//} if the line above does",
-            format(Code, Style));
+  verifyFormat("//#comment", "//   #comment", Style);
+  verifyFormat(Code3, Code, Style);
 
   Style.SpacesInLineCommentPrefix = {2, -1u};
-  EXPECT_EQ("//  Free comment without space\n"
-            "\n"
-            "//   Free comment with 3 spaces\n"
-            "\n"
-            "///  Free Doxygen without space\n"
-            "\n"
-            "///   Free Doxygen with 3 spaces\n"
-            "\n"
-            "//  🐉 A nice dragon\n"
-            "\n"
-            "//\t abccba\n"
-            "\n"
-            "//\\t deffed\n"
-            "\n"
-            "//   🐉 Another nice dragon\n"
-            "\n"
-            "//   \t Three leading spaces following tab\n"
-            "\n"
-            "//   \\t Three leading spaces following backslash\n"
-            "\n"
-            "///  A Doxygen Comment with a nested list:\n"
-            "///  - Foo\n"
-            "///  - Bar\n"
-            "///    - Baz\n"
-            "///    - End\n"
-            "///      of the inner list\n"
-            "///    .\n"
-            "///  .\n"
-            "\n"
-            "namespace Foo {\n"
-            "bool bar(bool b) {\n"
-            "  bool ret1 = true; ///<  Doxygenstyle without space\n"
-            "  bool ret2 = true; ///<   Doxygenstyle with 3 spaces\n"
-            "  if (b) {\n"
-            "    //  Foo\n"
-            "\n"
-            "    //   In function comment\n"
-            "    ret2 = false;\n"
-            "  } //  End of if\n"
-            "\n"
-            "  //  if (ret1) {\n"
-            "  //    return ret2;\n"
-            "  //  }\n"
-            "\n"
-            "  //  if (ret1) {\n"
-            "  //    return ret2;\n"
-            "  //  }\n"
-            "\n"
-            "  return ret1 && ret2;\n"
-            "}\n"
-            "} //  namespace Foo\n"
-            "\n"
-            "namespace Bar {\n"
-            "int foo();\n"
-            "} //  namespace Bar\n"
-            "//@Nothing added because of the non ascii char\n"
-            "\n"
-            "//@      Nothing removed because of the non ascii char\n"
-            "\n"
-            "//  Comment to move to the left\n"
-            "//  But not this?\n"
-            "//  @but this\n"
-            "\n"
-            "//  Comment to move to the right\n"
-            "//@ this stays\n"
-            "\n"
-            "//} will not move\n"
-            "\n"
-            "//  vv will only move\n"
-            "//  } if the line above does",
-            format(Code, Style));
+  verifyFormat(Code4, Code, Style);
 
   Style = getLLVMStyleWithColumns(20);
-  StringRef WrapCode = "//Lorem ipsum dolor sit amet\n"
-                       "\n"
-                       "//  Lorem   ipsum   dolor   sit   amet\n"
-                       "\n"
-                       "void f() {//Hello World\n"
-                       "}";
-
-  EXPECT_EQ("// Lorem ipsum dolor\n"
-            "// sit amet\n"
-            "\n"
-            "//  Lorem   ipsum\n"
-            "//  dolor   sit amet\n"
-            "\n"
-            "void f() { // Hello\n"
-            "           // World\n"
-            "}",
-            format(WrapCode, Style));
+  constexpr StringRef WrapCode("//Lorem ipsum dolor sit amet\n"
+                               "\n"
+                               "//  Lorem   ipsum   dolor   sit   amet\n"
+                               "\n"
+                               "void f() {//Hello World\n"
+                               "}");
+
+  verifyFormat("// Lorem ipsum dolor\n"
+               "// sit amet\n"
+               "\n"
+               "//  Lorem   ipsum\n"
+               "//  dolor   sit amet\n"
+               "\n"
+               "void f() { // Hello\n"
+               "           // World\n"
+               "}",
+               WrapCode, Style);
 
   Style.SpacesInLineCommentPrefix = {0, 0};
-  EXPECT_EQ("//Lorem ipsum dolor\n"
-            "//sit amet\n"
-            "\n"
-            "//Lorem   ipsum\n"
-            "//dolor   sit   amet\n"
-            "\n"
-            "void f() { //Hello\n"
-            "           //World\n"
-            "}",
-            format(WrapCode, Style));
+  verifyFormat("//Lorem ipsum dolor\n"
+               "//sit amet\n"
+               "\n"
+               "//Lorem   ipsum\n"
+               "//dolor   sit   amet\n"
+               "\n"
+               "void f() { //Hello\n"
+               "           //World\n"
+               "}",
+               WrapCode, Style);
 
   Style.SpacesInLineCommentPrefix = {1, 1};
-  EXPECT_EQ("// Lorem ipsum dolor\n"
-            "// sit amet\n"
-            "\n"
-            "// Lorem   ipsum\n"
-            "// dolor   sit amet\n"
-            "\n"
-            "void f() { // Hello\n"
-            "           // World\n"
-            "}",
-            format(WrapCode, Style));
-  EXPECT_EQ("// x\n"
-            "// y",
-            format("//   x\n"
-                   "// y",
-                   Style));
-  EXPECT_EQ(
+  verifyFormat("// Lorem ipsum dolor\n"
+               "// sit amet\n"
+               "\n"
+               "// Lorem   ipsum\n"
+               "// dolor   sit amet\n"
+               "\n"
+               "void f() { // Hello\n"
+               "           // World\n"
+               "}",
+               WrapCode, Style);
+  verifyFormat("// x\n"
+               "// y",
+               "//   x\n"
+               "// y",
+               Style);
+  verifyFormat(
       "// loooooooooooooooooooooooooooooong\n"
       "// commentcomments\n"
       "// normal comments",
-      format("//            loooooooooooooooooooooooooooooong commentcomments\n"
-             "// normal comments",
-             Style));
+      "//            loooooooooooooooooooooooooooooong commentcomments\n"
+      "// normal comments",
+      Style);
 
   Style.SpacesInLineCommentPrefix = {3, 3};
-  EXPECT_EQ("//   Lorem ipsum\n"
-            "//   dolor sit amet\n"
-            "\n"
-            "//   Lorem   ipsum\n"
-            "//   dolor   sit\n"
-            "//   amet\n"
-            "\n"
-            "void f() { //   Hello\n"
-            "           //   World\n"
-            "}",
-            format(WrapCode, Style));
+  verifyFormat("//   Lorem ipsum\n"
+               "//   dolor sit amet\n"
+               "\n"
+               "//   Lorem   ipsum\n"
+               "//   dolor   sit\n"
+               "//   amet\n"
+               "\n"
+               "void f() { //   Hello\n"
+               "           //   World\n"
+               "}",
+               WrapCode, Style);
 
   Style = getLLVMStyleWithColumns(20);
-  StringRef LotsOfSpaces = "//                      This are more spaces "
-                           "than the ColumnLimit, what now?\n"
-                           "\n"
-                           "//   Comment\n"
-                           "\n"
-                           "// This is a text to split in multiple "
-                           "lines, please. Thank you very much!\n"
-                           "\n"
-                           "// A comment with\n"
-                           "//   some indentation that has to be split.\n"
-                           "// And now without";
-  EXPECT_EQ("//                      This are more spaces "
-            "than the ColumnLimit, what now?\n"
-            "\n"
-            "//   Comment\n"
-            "\n"
-            "// This is a text to\n"
-            "// split in multiple\n"
-            "// lines, please.\n"
-            "// Thank you very\n"
-            "// much!\n"
-            "\n"
-            "// A comment with\n"
-            "//   some\n"
-            "//   indentation\n"
-            "//   that has to be\n"
-            "//   split.\n"
-            "// And now without",
-            format(LotsOfSpaces, Style));
+  constexpr StringRef LotsOfSpaces(
+      "//                      This are more spaces "
+      "than the ColumnLimit, what now?\n"
+      "\n"
+      "//   Comment\n"
+      "\n"
+      "// This is a text to split in multiple "
+      "lines, please. Thank you very much!\n"
+      "\n"
+      "// A comment with\n"
+      "//   some indentation that has to be split.\n"
+      "// And now without");
+  verifyFormat("//                      This are more spaces "
+               "than the ColumnLimit, what now?\n"
+               "\n"
+               "//   Comment\n"
+               "\n"
+               "// This is a text to\n"
+               "// split in multiple\n"
+               "// lines, please.\n"
+               "// Thank you very\n"
+               "// much!\n"
+               "\n"
+               "// A comment with\n"
+               "//   some\n"
+               "//   indentation\n"
+               "//   that has to be\n"
+               "//   split.\n"
+               "// And now without",
+               LotsOfSpaces, Style);
 
   Style.SpacesInLineCommentPrefix = {0, 0};
-  EXPECT_EQ("//This are more\n"
-            "//spaces than the\n"
-            "//ColumnLimit, what\n"
-            "//now?\n"
-            "\n"
-            "//Comment\n"
-            "\n"
-            "//This is a text to\n"
-            "//split in multiple\n"
-            "//lines, please.\n"
-            "//Thank you very\n"
-            "//much!\n"
-            "\n"
-            "//A comment with\n"
-            "//  some indentation\n"
-            "//  that has to be\n"
-            "//  split.\n"
-            "//And now without",
-            format(LotsOfSpaces, Style));
+  verifyFormat("//This are more\n"
+               "//spaces than the\n"
+               "//ColumnLimit, what\n"
+               "//now?\n"
+               "\n"
+               "//Comment\n"
+               "\n"
+               "//This is a text to\n"
+               "//split in multiple\n"
+               "//lines, please.\n"
+               "//Thank you very\n"
+               "//much!\n"
+               "\n"
+               "//A comment with\n"
+               "//  some indentation\n"
+               "//  that has to be\n"
+               "//  split.\n"
+               "//And now without",
+               LotsOfSpaces, Style);
 
   Style.SpacesInLineCommentPrefix = {3, 3};
-  EXPECT_EQ("//   This are more\n"
-            "//   spaces than the\n"
-            "//   ColumnLimit,\n"
-            "//   what now?\n"
-            "\n"
-            "//   Comment\n"
-            "\n"
-            "//   This is a text\n"
-            "//   to split in\n"
-            "//   multiple lines,\n"
-            "//   please. Thank\n"
-            "//   you very much!\n"
-            "\n"
-            "//   A comment with\n"
-            "//     some\n"
-            "//     indentation\n"
-            "//     that has to\n"
-            "//     be split.\n"
-            "//   And now without",
-            format(LotsOfSpaces, Style));
+  verifyFormat("//   This are more\n"
+               "//   spaces than the\n"
+               "//   ColumnLimit,\n"
+               "//   what now?\n"
+               "\n"
+               "//   Comment\n"
+               "\n"
+               "//   This is a text\n"
+               "//   to split in\n"
+               "//   multiple lines,\n"
+               "//   please. Thank\n"
+               "//   you very much!\n"
+               "\n"
+               "//   A comment with\n"
+               "//     some\n"
+               "//     indentation\n"
+               "//     that has to\n"
+               "//     be split.\n"
+               "//   And now without",
+               LotsOfSpaces, Style);
 
   Style.SpacesInLineCommentPrefix = {30, -1u};
-  EXPECT_EQ("//                              This are more spaces than the "
-            "ColumnLimit, what now?\n"
-            "\n"
-            "//                              Comment\n"
-            "\n"
-            "//                              This is a text to split in "
-            "multiple lines, please. Thank you very much!\n"
-            "\n"
-            "//                              A comment with\n"
-            "//                                some indentation that has to be "
-            "split.\n"
-            "//                              And now without",
-            format(LotsOfSpaces, Style));
+  verifyFormat(
+      "//                              This are more spaces than the "
+      "ColumnLimit, what now?\n"
+      "\n"
+      "//                              Comment\n"
+      "\n"
+      "//                              This is a text to split in "
+      "multiple lines, please. Thank you very much!\n"
+      "\n"
+      "//                              A comment with\n"
+      "//                                some indentation that has to be "
+      "split.\n"
+      "//                              And now without",
+      LotsOfSpaces, Style);
 
   Style.SpacesInLineCommentPrefix = {2, 4};
-  EXPECT_EQ("//  A Comment to be\n"
-            "//  moved\n"
-            "//   with indent\n"
-            "\n"
-            "//  A Comment to be\n"
-            "//  moved\n"
-            "//   with indent\n"
-            "\n"
-            "//  A Comment to be\n"
-            "//  moved\n"
-            "//   with indent\n"
-            "\n"
-            "//   A Comment to be\n"
-            "//   moved\n"
-            "//    with indent\n"
-            "\n"
-            "//    A Comment to\n"
-            "//    be moved\n"
-            "//     with indent\n"
-            "\n"
-            "//    A Comment to\n"
-            "//    be moved\n"
-            "//     with indent\n"
-            "\n"
-            "//    A Comment to\n"
-            "//    be moved\n"
-            "//     with indent",
-            format("//A Comment to be moved\n"
-                   "// with indent\n"
-                   "\n"
-                   "// A Comment to be moved\n"
-                   "//  with indent\n"
-                   "\n"
-                   "//  A Comment to be moved\n"
-                   "//   with indent\n"
-                   "\n"
-                   "//   A Comment to be moved\n"
-                   "//    with indent\n"
-                   "\n"
-                   "//    A Comment to be moved\n"
-                   "//     with indent\n"
-                   "\n"
-                   "//     A Comment to be moved\n"
-                   "//      with indent\n"
-                   "\n"
-                   "//      A Comment to be moved\n"
-                   "//       with indent",
-                   Style));
+  verifyFormat("//  A Comment to be\n"
+               "//  moved\n"
+               "//   with indent\n"
+               "\n"
+               "//  A Comment to be\n"
+               "//  moved\n"
+               "//   with indent\n"
+               "\n"
+               "//  A Comment to be\n"
+               "//  moved\n"
+               "//   with indent\n"
+               "\n"
+               "//   A Comment to be\n"
+               "//   moved\n"
+               "//    with indent\n"
+               "\n"
+               "//    A Comment to\n"
+               "//    be moved\n"
+               "//     with indent\n"
+               "\n"
+               "//    A Comment to\n"
+               "//    be moved\n"
+               "//     with indent\n"
+               "\n"
+               "//    A Comment to\n"
+               "//    be moved\n"
+               "//     with indent",
+               "//A Comment to be moved\n"
+               "// with indent\n"
+               "\n"
+               "// A Comment to be moved\n"
+               "//  with indent\n"
+               "\n"
+               "//  A Comment to be moved\n"
+               "//   with indent\n"
+               "\n"
+               "//   A Comment to be moved\n"
+               "//    with indent\n"
+               "\n"
+               "//    A Comment to be moved\n"
+               "//     with indent\n"
+               "\n"
+               "//     A Comment to be moved\n"
+               "//      with indent\n"
+               "\n"
+               "//      A Comment to be moved\n"
+               "//       with indent",
+               Style);
 
   Style.ColumnLimit = 30;
-  EXPECT_EQ("int i; //  A Comment to be\n"
-            "       //  moved\n"
-            "       //   with indent\n"
-            "\n"
-            "int i; //  A Comment to be\n"
-            "       //  moved\n"
-            "       //   with indent\n"
-            "\n"
-            "int i; //  A Comment to be\n"
-            "       //  moved\n"
-            "       //   with indent\n"
-            "\n"
-            "int i; //   A Comment to be\n"
-            "       //   moved\n"
-            "       //    with indent\n"
-            "\n"
-            "int i; //    A Comment to be\n"
-            "       //    moved\n"
-            "       //     with indent\n"
-            "\n"
-            "int i; //    A Comment to be\n"
-            "       //    moved\n"
-            "       //     with indent\n"
-            "\n"
-            "int i; //    A Comment to be\n"
-            "       //    moved\n"
-            "       //     with indent",
-            format("int i;//A Comment to be moved\n"
-                   "      // with indent\n"
-                   "\n"
-                   "int i;// A Comment to be moved\n"
-                   "      //  with indent\n"
-                   "\n"
-                   "int i;//  A Comment to be moved\n"
-                   "      //   with indent\n"
-                   "\n"
-                   "int i;//   A Comment to be moved\n"
-                   "      //    with indent\n"
-                   "\n"
-                   "int i;//    A Comment to be moved\n"
-                   "      //     with indent\n"
-                   "\n"
-                   "int i;//     A Comment to be moved\n"
-                   "      //      with indent\n"
-                   "\n"
-                   "int i;//      A Comment to be moved\n"
-                   "      //       with indent",
-                   Style));
+  verifyFormat("int i; //  A Comment to be\n"
+               "       //  moved\n"
+               "       //   with indent\n"
+               "\n"
+               "int i; //  A Comment to be\n"
+               "       //  moved\n"
+               "       //   with indent\n"
+               "\n"
+               "int i; //  A Comment to be\n"
+               "       //  moved\n"
+               "       //   with indent\n"
+               "\n"
+               "int i; //   A Comment to be\n"
+               "       //   moved\n"
+               "       //    with indent\n"
+               "\n"
+               "int i; //    A Comment to be\n"
+               "       //    moved\n"
+               "       //     with indent\n"
+               "\n"
+               "int i; //    A Comment to be\n"
+               "       //    moved\n"
+               "       //     with indent\n"
+               "\n"
+               "int i; //    A Comment to be\n"
+               "       //    moved\n"
+               "       //     with indent",
+               "int i;//A Comment to be moved\n"
+               "      // with indent\n"
+               "\n"
+               "int i;// A Comment to be moved\n"
+               "      //  with indent\n"
+               "\n"
+               "int i;//  A Comment to be moved\n"
+               "      //   with indent\n"
+               "\n"
+               "int i;//   A Comment to be moved\n"
+               "      //    with indent\n"
+               "\n"
+               "int i;//    A Comment to be moved\n"
+               "      //     with indent\n"
+               "\n"
+               "int i;//     A Comment to be moved\n"
+               "      //      with indent\n"
+               "\n"
+               "int i;//      A Comment to be moved\n"
+               "      //       with indent",
+               Style);
 
   Style = getLLVMStyleWithColumns(0);
-  EXPECT_EQ("// Free comment without space\n"
-            "\n"
-            "//   Free comment with 3 spaces\n"
-            "\n"
-            "/// Free Doxygen without space\n"
-            "\n"
-            "///   Free Doxygen with 3 spaces\n"
-            "\n"
-            "// 🐉 A nice dragon\n"
-            "\n"
-            "//\t abccba\n"
-            "\n"
-            "//\\t deffed\n"
-            "\n"
-            "//   🐉 Another nice dragon\n"
-            "\n"
-            "//   \t Three leading spaces following tab\n"
-            "\n"
-            "//   \\t Three leading spaces following backslash\n"
-            "\n"
-            "/// A Doxygen Comment with a nested list:\n"
-            "/// - Foo\n"
-            "/// - Bar\n"
-            "///   - Baz\n"
-            "///   - End\n"
-            "///     of the inner list\n"
-            "///   .\n"
-            "/// .\n"
-            "\n"
-            "namespace Foo {\n"
-            "bool bar(bool b) {\n"
-            "  bool ret1 = true; ///< Doxygenstyle without space\n"
-            "  bool ret2 = true; ///<   Doxygenstyle with 3 spaces\n"
-            "  if (b) {\n"
-            "    // Foo\n"
-            "\n"
-            "    //   In function comment\n"
-            "    ret2 = false;\n"
-            "  } // End of if\n"
-            "\n"
-            "  //  if (ret1) {\n"
-            "  //    return ret2;\n"
-            "  //  }\n"
-            "\n"
-            "  // if (ret1) {\n"
-            "  //   return ret2;\n"
-            "  // }\n"
-            "\n"
-            "  return ret1 && ret2;\n"
-            "}\n"
-            "} // namespace Foo\n"
-            "\n"
-            "namespace Bar {\n"
-            "int foo();\n"
-            "} //  namespace Bar\n"
-            "//@Nothing added because of the non ascii char\n"
-            "\n"
-            "//@      Nothing removed because of the non ascii char\n"
-            "\n"
-            "//  Comment to move to the left\n"
-            "// But not this?\n"
-            "//  @but this\n"
-            "\n"
-            "// Comment to move to the right\n"
-            "//@ this stays\n"
-            "\n"
-            "//} will not move\n"
-            "\n"
-            "// vv will only move\n"
-            "// } if the line above does",
-            format(Code, Style));
+  verifyFormat(Code2, Code, Style);
 
   Style.SpacesInLineCommentPrefix = {0, 0};
-  EXPECT_EQ("//Free comment without space\n"
-            "\n"
-            "//Free comment with 3 spaces\n"
-            "\n"
-            "///Free Doxygen without space\n"
-            "\n"
-            "///Free Doxygen with 3 spaces\n"
-            "\n"
-            "//🐉 A nice dragon\n"
-            "\n"
-            "//\t abccba\n"
-            "\n"
-            "//\\t deffed\n"
-            "\n"
-            "//🐉 Another nice dragon\n"
-            "\n"
-            "//\t Three leading spaces following tab\n"
-            "\n"
-            "//\\t Three leading spaces following backslash\n"
-            "\n"
-            "///A Doxygen Comment with a nested list:\n"
-            "///- Foo\n"
-            "///- Bar\n"
-            "///  - Baz\n" // Here we keep the relative indentation
-            "///  - End\n"
-            "///    of the inner list\n"
-            "///  .\n"
-            "///.\n"
-            "\n"
-            "namespace Foo {\n"
-            "bool bar(bool b) {\n"
-            "  bool ret1 = true; ///<Doxygenstyle without space\n"
-            "  bool ret2 = true; ///<Doxygenstyle with 3 spaces\n"
-            "  if (b) {\n"
-            "    //Foo\n"
-            "\n"
-            "    //In function comment\n"
-            "    ret2 = false;\n"
-            "  } //End of if\n"
-            "\n"
-            "  //if (ret1) {\n"
-            "  //  return ret2;\n"
-            "  //}\n"
-            "\n"
-            "  //if (ret1) {\n"
-            "  //  return ret2;\n"
-            "  //}\n"
-            "\n"
-            "  return ret1 && ret2;\n"
-            "}\n"
-            "} //namespace Foo\n"
-            "\n"
-            "namespace Bar {\n"
-            "int foo();\n"
-            "} //namespace Bar\n"
-            "//@Nothing added because of the non ascii char\n"
-            "\n"
-            "//@      Nothing removed because of the non ascii char\n"
-            "\n"
-            "//Comment to move to the left\n"
-            "//But not this?\n"
-            "//@but this\n"
-            "\n"
-            "//Comment to move to the right\n"
-            "//@ this stays\n"
-            "\n"
-            "//} will not move\n"
-            "\n"
-            "//vv will only move\n"
-            "//} if the line above does",
-            format(Code, Style));
+  verifyFormat(Code3, Code, Style);
 
   Style.SpacesInLineCommentPrefix = {2, -1u};
-  EXPECT_EQ("//  Free comment without space\n"
-            "\n"
-            "//   Free comment with 3 spaces\n"
-            "\n"
-            "///  Free Doxygen without space\n"
-            "\n"
-            "///   Free Doxygen with 3 spaces\n"
-            "\n"
-            "//  🐉 A nice dragon\n"
-            "\n"
-            "//\t abccba\n"
-            "\n"
-            "//\\t deffed\n"
-            "\n"
-            "//   🐉 Another nice dragon\n"
-            "\n"
-            "//   \t Three leading spaces following tab\n"
-            "\n"
-            "//   \\t Three leading spaces following backslash\n"
-            "\n"
-            "///  A Doxygen Comment with a nested list:\n"
-            "///  - Foo\n"
-            "///  - Bar\n"
-            "///    - Baz\n"
-            "///    - End\n"
-            "///      of the inner list\n"
-            "///    .\n"
-            "///  .\n"
-            "\n"
-            "namespace Foo {\n"
-            "bool bar(bool b) {\n"
-            "  bool ret1 = true; ///<  Doxygenstyle without space\n"
-            "  bool ret2 = true; ///<   Doxygenstyle with 3 spaces\n"
-            "  if (b) {\n"
-            "    //  Foo\n"
-            "\n"
-            "    //   In function comment\n"
-            "    ret2 = false;\n"
-            "  } //  End of if\n"
-            "\n"
-            "  //  if (ret1) {\n"
-            "  //    return ret2;\n"
-            "  //  }\n"
-            "\n"
-            "  //  if (ret1) {\n"
-            "  //    return ret2;\n"
-            "  //  }\n"
-            "\n"
-            "  return ret1 && ret2;\n"
-            "}\n"
-            "} //  namespace Foo\n"
-            "\n"
-            "namespace Bar {\n"
-            "int foo();\n"
-            "} //  namespace Bar\n"
-            "//@Nothing added because of the non ascii char\n"
-            "\n"
-            "//@      Nothing removed because of the non ascii char\n"
-            "\n"
-            "//  Comment to move to the left\n"
-            "//  But not this?\n"
-            "//  @but this\n"
-            "\n"
-            "//  Comment to move to the right\n"
-            "//@ this stays\n"
-            "\n"
-            "//} will not move\n"
-            "\n"
-            "//  vv will only move\n"
-            "//  } if the line above does",
-            format(Code, Style));
+  verifyFormat(Code4, Code, Style);
 }
 
 TEST_F(FormatTestComments, SplitCommentIntroducers) {
-  EXPECT_EQ(R"(//
-/\
-/
-)",
-            format(R"(//
-/\
-/ 
-  )",
-                   getLLVMStyleWithColumns(10)));
+  verifyFormat("//\n"
+               "/\\\n"
+               "/\n",
+               "//\n"
+               "/\\\n"
+               "/ \n"
+               "  ",
+               getLLVMStyleWithColumns(10));
 }
 
 TEST_F(FormatTestComments, LineCommentsOnStartOfFunctionCall) {
-  auto Style = getLLVMStyle();
-
-  EXPECT_EQ(Style.Cpp11BracedListStyle, FormatStyle::BLS_AlignFirstComment);
   verifyFormat("Type name{// Comment\n"
-               "          value};",
-               Style);
+               "          value};");
 
+  auto Style = getLLVMStyle();
+  EXPECT_EQ(Style.Cpp11BracedListStyle, FormatStyle::BLS_AlignFirstComment);
   Style.Cpp11BracedListStyle = FormatStyle::BLS_Block;
+
   verifyFormat("Type name{ // Comment\n"
                "           value\n"
                "};",
                Style);
 
   Style.Cpp11BracedListStyle = FormatStyle::BLS_FunctionCall;
+
   verifyFormat("Type name{ // Comment\n"
                "    value};",
                Style);
diff --git a/clang/unittests/Format/FormatTestJS.cpp b/clang/unittests/Format/FormatTestJS.cpp
index 91577b9a49167..4847151c14b33 100644
--- a/clang/unittests/Format/FormatTestJS.cpp
+++ b/clang/unittests/Format/FormatTestJS.cpp
@@ -2883,7 +2883,7 @@ TEST_F(FormatTestJS, DontBreakFieldsAsGoToLabels) {
 
 TEST_F(FormatTestJS, BreakAfterOpenBracket) {
   auto Style = getGoogleStyle(FormatStyle::LK_JavaScript);
-  EXPECT_EQ(Style.AlignAfterOpenBracket, FormatStyle::BAS_AlwaysBreak);
+  EXPECT_EQ(Style.BreakAfterOpenBracketFunction, true);
   verifyFormat("ctrl.onCopy(/** @type {!WizEvent}*/ (\n"
                "    {event, targetElement: {el: () => selectedElement}}));",
                Style);
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index ca99940890984..815c79e68dac9 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -799,6 +799,30 @@ TEST_F(TokenAnnotatorTest, UnderstandsTemplateTemplateParameters) {
   EXPECT_TOKEN(Tokens[23], tok::identifier, TT_ClassHeadName);
 }
 
+TEST_F(TokenAnnotatorTest, UnderstandsCommonCppTemplates) {
+  auto Tokens =
+      annotate("static_assert(std::conditional_t<A || B, C, D>::value);");
+  ASSERT_EQ(Tokens.size(), 19u) << Tokens;
+  EXPECT_TOKEN(Tokens[5], tok::less, TT_TemplateOpener);
+  EXPECT_TOKEN(Tokens[13], tok::greater, TT_TemplateCloser);
+
+  Tokens =
+      annotate("static_assert(std::conditional<A || B, C, D>::type::value);");
+  ASSERT_EQ(Tokens.size(), 21u) << Tokens;
+  EXPECT_TOKEN(Tokens[5], tok::less, TT_TemplateOpener);
+  EXPECT_TOKEN(Tokens[13], tok::greater, TT_TemplateCloser);
+
+  Tokens = annotate("static_assert(fancy_v<A || B>);");
+  ASSERT_EQ(Tokens.size(), 11u) << Tokens;
+  EXPECT_TOKEN(Tokens[3], tok::less, TT_TemplateOpener);
+  EXPECT_TOKEN(Tokens[7], tok::greater, TT_TemplateCloser);
+
+  Tokens = annotate("static_assert(fancy<A || B>::value);");
+  ASSERT_EQ(Tokens.size(), 13u) << Tokens;
+  EXPECT_TOKEN(Tokens[3], tok::less, TT_TemplateOpener);
+  EXPECT_TOKEN(Tokens[7], tok::greater, TT_TemplateCloser);
+}
+
 TEST_F(TokenAnnotatorTest, UnderstandsWhitespaceSensitiveMacros) {
   FormatStyle Style = getLLVMStyle();
   Style.WhitespaceSensitiveMacros.push_back("FOO");
@@ -1119,6 +1143,11 @@ TEST_F(TokenAnnotatorTest, UnderstandsOverloadedOperators) {
   EXPECT_TOKEN(Tokens[8], tok::amp, TT_PointerOrReference);
   EXPECT_TOKEN(Tokens[12], tok::amp, TT_PointerOrReference);
 
+  Tokens = annotate("::foo::bar& ::foo::bar::operator=(::foo::bar& other);");
+  ASSERT_EQ(Tokens.size(), 22u) << Tokens;
+  EXPECT_TOKEN(Tokens[6], tok::identifier, TT_FunctionDeclarationName);
+  EXPECT_TOKEN(Tokens[17], tok::amp, TT_PointerOrReference);
+
   Tokens = annotate("SomeLoooooooooooooooooType::Awaitable\n"
                     "SomeLoooooooooooooooooType::operator co_await();");
   ASSERT_EQ(Tokens.size(), 11u) << Tokens;
@@ -2681,6 +2710,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsVerilogOperators) {
   // precedence.
   std::pair<prec::Level, std::string> JoinedBinary[] = {
       {prec::Comma, "->"},        {prec::Comma, "<->"},
+      {prec::Comma, "#-#"},       {prec::Comma, "#=#"},
       {prec::Assignment, "+="},   {prec::Assignment, "-="},
       {prec::Assignment, "*="},   {prec::Assignment, "/="},
       {prec::Assignment, "%="},   {prec::Assignment, "&="},
@@ -3484,6 +3514,10 @@ TEST_F(TokenAnnotatorTest, StartOfName) {
   ASSERT_EQ(Tokens.size(), 8u) << Tokens;
   EXPECT_TOKEN(Tokens[2], tok::identifier, TT_Unknown); // Not StartOfName
 
+  Tokens = annotate("int* ::foo::bar;");
+  ASSERT_EQ(Tokens.size(), 8u) << Tokens;
+  EXPECT_TOKEN(Tokens[3], tok::identifier, TT_StartOfName);
+
   auto Style = getLLVMStyle();
   Style.StatementAttributeLikeMacros.push_back("emit");
   Tokens = annotate("emit foo = 0;", Style);
diff --git a/clang/unittests/Tooling/RangeSelectorTest.cpp b/clang/unittests/Tooling/RangeSelectorTest.cpp
index adf5e74ea3192..a1fcbb023832f 100644
--- a/clang/unittests/Tooling/RangeSelectorTest.cpp
+++ b/clang/unittests/Tooling/RangeSelectorTest.cpp
@@ -527,6 +527,31 @@ TEST(RangeSelectorTest, NameOpDeclRefError) {
           AllOf(HasSubstr(Ref), HasSubstr("requires property 'identifier'")))));
 }
 
+TEST(RangeSelectorTest, NameOpDeclInMacroArg) {
+  StringRef Code = R"cc(
+  #define MACRO(name) int name;
+  MACRO(x)
+  )cc";
+  const char *ID = "id";
+  TestMatch Match = matchCode(Code, varDecl().bind(ID));
+  EXPECT_THAT_EXPECTED(select(name(ID), Match), HasValue("x"));
+}
+
+TEST(RangeSelectorTest, NameOpDeclInMacroBodyError) {
+  StringRef Code = R"cc(
+  #define MACRO int x;
+  MACRO
+  )cc";
+  const char *ID = "id";
+  TestMatch Match = matchCode(Code, varDecl().bind(ID));
+  EXPECT_THAT_EXPECTED(
+      name(ID)(Match.Result),
+      Failed<StringError>(testing::Property(
+          &StringError::getMessage,
+          AllOf(HasSubstr("range selected by name(node id="),
+                HasSubstr("' is different from decl name 'x'")))));
+}
+
 TEST(RangeSelectorTest, CallArgsOp) {
   const StringRef Code = R"cc(
     struct C {
diff --git a/clang/utils/CmpDriver b/clang/utils/CmpDriver
index 12ce7a3250f66..0732baa76d01c 100755
--- a/clang/utils/CmpDriver
+++ b/clang/utils/CmpDriver
@@ -5,6 +5,7 @@ A simple utility that compares tool invocations and exit codes issued by
 compiler drivers that support -### (e.g. gcc and clang).
 """
 
+from itertools import zip_longest
 import subprocess
 
 def splitArgs(s):
@@ -22,7 +23,7 @@ def splitArgs(s):
         elif inQuote:
             if c == '\\':
                 current += c
-                current += it.next()
+                current += next(it)
             else:
                 current += c
         elif not c.isspace():
@@ -135,77 +136,77 @@ def main():
 
     # Compare stdout.
     if infoA.stdout != infoB.stdout:
-        print '-- STDOUT DIFFERS -'
-        print 'A OUTPUT: ',infoA.stdout
-        print 'B OUTPUT: ',infoB.stdout
-        print
+        print('-- STDOUT DIFFERS -')
+        print('A OUTPUT: ',infoA.stdout)
+        print('B OUTPUT: ',infoB.stdout)
+        print()
 
         diff = ZipperDiff(infoA.stdout.split('\n'),
                           infoB.stdout.split('\n'))
         for i,(aElt,bElt) in enumerate(diff.getDiffs()):
             if aElt is None:
-                print 'A missing: %s' % bElt
+                print('A missing: %s' % bElt)
             elif bElt is None:
-                print 'B missing: %s' % aElt
+                print('B missing: %s' % aElt)
             else:
-                print 'mismatch: A: %s' % aElt
-                print '          B: %s' % bElt
+                print('mismatch: A: %s' % aElt)
+                print('          B: %s' % bElt)
 
         differ = True
 
     # Compare stderr.
     if infoA.stderr != infoB.stderr:
-        print '-- STDERR DIFFERS -'
-        print 'A STDERR: ',infoA.stderr
-        print 'B STDERR: ',infoB.stderr
-        print
+        print('-- STDERR DIFFERS -')
+        print('A STDERR: ',infoA.stderr)
+        print('B STDERR: ',infoB.stderr)
+        print()
 
         diff = ZipperDiff(infoA.stderr.split('\n'),
                           infoB.stderr.split('\n'))
         for i,(aElt,bElt) in enumerate(diff.getDiffs()):
             if aElt is None:
-                print 'A missing: %s' % bElt
+                print('A missing: %s' % bElt)
             elif bElt is None:
-                print 'B missing: %s' % aElt
+                print('B missing: %s' % aElt)
             else:
-                print 'mismatch: A: %s' % aElt
-                print '          B: %s' % bElt
+                print('mismatch: A: %s' % aElt)
+                print('          B: %s' % bElt)
 
         differ = True
 
     # Compare commands.
-    for i,(a,b) in enumerate(map(None, infoA.commands, infoB.commands)):
+    for i,(a,b) in enumerate(zip_longest(infoA.commands, infoB.commands, fillvalue=None)):
         if a is None:
-            print 'A MISSING:',' '.join(b)
+            print('A MISSING:',' '.join(b))
             differ = True
             continue
         elif b is None:
-            print 'B MISSING:',' '.join(a)
+            print('B MISSING:',' '.join(a))
             differ = True
             continue
 
         diff = DriverZipperDiff(a,b)
         diffs = list(diff.getDiffs())
         if diffs:
-            print '-- COMMAND %d DIFFERS -' % i
-            print 'A COMMAND:',' '.join(a)
-            print 'B COMMAND:',' '.join(b)
-            print
+            print('-- COMMAND %d DIFFERS -' % i)
+            print('A COMMAND:',' '.join(a))
+            print('B COMMAND:',' '.join(b))
+            print()
             for i,(aElt,bElt) in enumerate(diffs):
                 if aElt is None:
-                    print 'A missing: %s' % bElt
+                    print('A missing: %s' % bElt)
                 elif bElt is None:
-                    print 'B missing: %s' % aElt
+                    print('B missing: %s' % aElt)
                 else:
-                    print 'mismatch: A: %s' % aElt
-                    print '          B: %s' % bElt
+                    print('mismatch: A: %s' % aElt)
+                    print('          B: %s' % bElt)
             differ = True
     
     # Compare result codes.
     if infoA.exitCode != infoB.exitCode:
-        print '-- EXIT CODES DIFFER -'
-        print 'A: ',infoA.exitCode
-        print 'B: ',infoB.exitCode
+        print('-- EXIT CODES DIFFER -')
+        print('A: ',infoA.exitCode)
+        print('B: ',infoB.exitCode)
         differ = True
 
     if differ:
diff --git a/clang/utils/check_cfc/check_cfc.py b/clang/utils/check_cfc/check_cfc.py
index 8d42ec532bbb7..7658f6c27009b 100755
--- a/clang/utils/check_cfc/check_cfc.py
+++ b/clang/utils/check_cfc/check_cfc.py
@@ -56,11 +56,7 @@
 import subprocess
 import sys
 import tempfile
-
-try:
-    import configparser
-except ImportError:
-    import ConfigParser as configparser
+import configparser
 import io
 
 import obj_diff
diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html
index b7da22cf9fb22..0312c9dfc0665 100755
--- a/clang/www/cxx_dr_status.html
+++ b/clang/www/cxx_dr_status.html
@@ -81,7 +81,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td><a href="https://cplusplus.github.io/CWG/issues/6.html">6</a></td>
     <td>NAD</td>
     <td>Should the optimization that allows a class object to alias another object also allow the case of a parameter in an inline function to alias its argument?</td>
-    <td class="unknown" align="center">Unknown</td>
+    <td class="full" align="center">Yes</td>
   </tr>
   <tr id="7">
     <td><a href="https://cplusplus.github.io/CWG/issues/7.html">7</a></td>
@@ -1318,7 +1318,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td><a href="https://cplusplus.github.io/CWG/issues/212.html">212</a></td>
     <td>CD4</td>
     <td>Implicit instantiation is not described clearly enough</td>
-    <td class="unknown" align="center">Unknown</td>
+    <td class="full" align="center">Yes</td>
   </tr>
   <tr id="213">
     <td><a href="https://cplusplus.github.io/CWG/issues/213.html">213</a></td>
@@ -1438,7 +1438,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td><a href="https://cplusplus.github.io/CWG/issues/232.html">232</a></td>
     <td>NAD</td>
     <td>Is indirection through a null pointer undefined behavior?</td>
-    <td class="unknown" align="center">Unknown</td>
+    <td class="none" align="center">Duplicate of <a href="#2823">2823</a></td>
   </tr>
   <tr id="233">
     <td><a href="https://cplusplus.github.io/CWG/issues/233.html">233</a></td>
@@ -3113,11 +3113,11 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td>Default initialization of POD classes?</td>
     <td class="na" align="center">N/A</td>
   </tr>
-  <tr class="open" id="511">
+  <tr id="511">
     <td><a href="https://cplusplus.github.io/CWG/issues/511.html">511</a></td>
-    <td>open</td>
+    <td>NAD</td>
     <td>POD-structs with template assignment operators</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="512">
     <td><a href="https://cplusplus.github.io/CWG/issues/512.html">512</a></td>
@@ -10895,7 +10895,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr class="open" id="1845">
     <td><a href="https://cplusplus.github.io/CWG/issues/1845.html">1845</a></td>
-    <td>drafting</td>
+    <td>review</td>
     <td>Point of instantiation of a variable template specialization</td>
     <td align="center">Not resolved</td>
   </tr>
@@ -12081,7 +12081,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr class="open" id="2042">
     <td><a href="https://cplusplus.github.io/CWG/issues/2042.html">2042</a></td>
-    <td>drafting</td>
+    <td>review</td>
     <td>Exceptions and deallocation functions</td>
     <td align="center">Not resolved</td>
   </tr>
@@ -12335,7 +12335,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td><a href="https://cplusplus.github.io/CWG/issues/2084.html">2084</a></td>
     <td>CD4</td>
     <td>NSDMIs and deleted union default constructors</td>
-    <td class="unknown" align="center">Unknown</td>
+    <td class="full" align="center">Clang 3.1</td>
   </tr>
   <tr id="2085">
     <td><a href="https://cplusplus.github.io/CWG/issues/2085.html">2085</a></td>
@@ -12837,7 +12837,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr class="open" id="2168">
     <td><a href="https://cplusplus.github.io/CWG/issues/2168.html">2168</a></td>
-    <td>open</td>
+    <td>review</td>
     <td>Narrowing conversions and +/- infinity</td>
     <td align="center">Not resolved</td>
   </tr>
@@ -14237,11 +14237,11 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td>Constexpr virtual functions and temporary objects</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2401">
+  <tr id="2401">
     <td><a href="https://cplusplus.github.io/CWG/issues/2401.html">2401</a></td>
-    <td>drafting</td>
+    <td>C++20</td>
     <td>Array decay vs prohibition of subobject non-type arguments</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2402">
     <td><a href="https://cplusplus.github.io/CWG/issues/2402.html">2402</a></td>
@@ -15171,7 +15171,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr class="open" id="2555">
     <td><a href="https://cplusplus.github.io/CWG/issues/2555.html">2555</a></td>
-    <td>drafting</td>
+    <td>tentatively ready</td>
     <td>Ineffective redeclaration prevention for <I>using-declarator</I>s</td>
     <td align="center">Not resolved</td>
   </tr>
@@ -15311,23 +15311,23 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td>Undefined behavior for preprocessing directives in macro arguments</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2578">
+  <tr id="2578">
     <td><a href="https://cplusplus.github.io/CWG/issues/2578.html">2578</a></td>
-    <td>open</td>
+    <td>CD7</td>
     <td>Undefined behavior when creating an invalid string literal via stringizing</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2579">
+  <tr id="2579">
     <td><a href="https://cplusplus.github.io/CWG/issues/2579.html">2579</a></td>
-    <td>open</td>
+    <td>CD7</td>
     <td>Undefined behavior when token pasting does not create a preprocessing token</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2580">
+  <tr id="2580">
     <td><a href="https://cplusplus.github.io/CWG/issues/2580.html">2580</a></td>
-    <td>open</td>
+    <td>CD7</td>
     <td>Undefined behavior with <TT>#line</TT></td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2581">
     <td><a href="https://cplusplus.github.io/CWG/issues/2581.html">2581</a></td>
@@ -16790,7 +16790,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td><a href="https://cplusplus.github.io/CWG/issues/2823.html">2823</a></td>
     <td>CD7</td>
     <td>Implicit undefined behavior when dereferencing pointers</td>
-    <td class="unknown" align="center">Unknown</td>
+    <td class="none" align="center">No</td>
   </tr>
   <tr id="2824">
     <td><a href="https://cplusplus.github.io/CWG/issues/2824.html">2824</a></td>
@@ -17104,7 +17104,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr class="open" id="2875">
     <td><a href="https://cplusplus.github.io/CWG/issues/2875.html">2875</a></td>
-    <td>review</td>
+    <td>tentatively ready</td>
     <td>Missing support for round-tripping null pointer values through indirection/address operators</td>
     <td align="center">Not resolved</td>
   </tr>
@@ -17400,7 +17400,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr class="open" id="2923">
     <td><a href="https://cplusplus.github.io/CWG/issues/2923.html">2923</a></td>
-    <td>review</td>
+    <td>tentatively ready</td>
     <td>Note about infinite loops and execution steps</td>
     <td align="center">Not resolved</td>
   </tr>
@@ -17760,7 +17760,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr class="open" id="2983">
     <td><a href="https://cplusplus.github.io/CWG/issues/2983.html">2983</a></td>
-    <td>open</td>
+    <td>review</td>
     <td>Non-type template parameters are not variables</td>
     <td align="center">Not resolved</td>
   </tr>
@@ -17868,7 +17868,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr class="open" id="3001">
     <td><a href="https://cplusplus.github.io/CWG/issues/3001.html">3001</a></td>
-    <td>review</td>
+    <td>tentatively ready</td>
     <td>Inconsistent restrictions for <TT>static_cast</TT> on pointers to out-of-lifetime objects</td>
     <td align="center">Not resolved</td>
   </tr>
@@ -17932,7 +17932,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr class="open" id="3011">
     <td><a href="https://cplusplus.github.io/CWG/issues/3011.html">3011</a></td>
-    <td>open</td>
+    <td>tentatively ready</td>
     <td>Parenthesized aggregate initialization for <I>new-expression</I>s</td>
     <td align="center">Not resolved</td>
   </tr>
@@ -17992,7 +17992,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr class="open" id="3021">
     <td><a href="https://cplusplus.github.io/CWG/issues/3021.html">3021</a></td>
-    <td>open</td>
+    <td>drafting</td>
     <td>Subsumption rules for fold expanded constraints</td>
     <td align="center">Not resolved</td>
   </tr>
@@ -18058,7 +18058,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr class="open" id="3032">
     <td><a href="https://cplusplus.github.io/CWG/issues/3032.html">3032</a></td>
-    <td>open</td>
+    <td>tentatively ready</td>
     <td>Template argument disambiguation</td>
     <td align="center">Not resolved</td>
   </tr>
@@ -18184,7 +18184,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   </tr>
   <tr class="open" id="3053">
     <td><a href="https://cplusplus.github.io/CWG/issues/3053.html">3053</a></td>
-    <td>open</td>
+    <td>tentatively ready</td>
     <td>Allowing <TT>#undef likely</TT></td>
     <td align="center">Not resolved</td>
   </tr>
@@ -18265,6 +18265,210 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td>tentatively ready</td>
     <td>Declarative <I>nested-name-specifier</I> in explicit instantiation</td>
     <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="3067">
+    <td><a href="https://cplusplus.github.io/CWG/issues/3067.html">3067</a></td>
+    <td>open</td>
+    <td>Array-to-pointer conversion with object type mismatch</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="3068">
+    <td><a href="https://cplusplus.github.io/CWG/issues/3068.html">3068</a></td>
+    <td>open</td>
+    <td>Access checking in friends involving <I>qualified-id</I>s</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="3069">
+    <td><a href="https://cplusplus.github.io/CWG/issues/3069.html">3069</a></td>
+    <td>open</td>
+    <td>Reference to wrong placeholder</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="3070">
+    <td><a href="https://cplusplus.github.io/CWG/issues/3070.html">3070</a></td>
+    <td>open</td>
+    <td>Trivial assignment can skip member subobjects</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="3071">
+    <td><a href="https://cplusplus.github.io/CWG/issues/3071.html">3071</a></td>
+    <td>open</td>
+    <td>Negative <TT>tuple_size</TT> in structured bindings</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="3072">
+    <td><a href="https://cplusplus.github.io/CWG/issues/3072.html">3072</a></td>
+    <td>open</td>
+    <td>Incorrect examples for lambda SFINAE</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="3073">
+    <td><a href="https://cplusplus.github.io/CWG/issues/3073.html">3073</a></td>
+    <td>open</td>
+    <td>Dependence of <I>R</I> on <TT>T2</TT> is unclear</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="3074">
+    <td><a href="https://cplusplus.github.io/CWG/issues/3074.html">3074</a></td>
+    <td>tentatively ready</td>
+    <td>Redundant ill-formedness for module macros</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="3075">
+    <td><a href="https://cplusplus.github.io/CWG/issues/3075.html">3075</a></td>
+    <td>tentatively ready</td>
+    <td>Unclear matching of import directive</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="3076">
+    <td><a href="https://cplusplus.github.io/CWG/issues/3076.html">3076</a></td>
+    <td>tentatively ready</td>
+    <td>Remove unnecessary IFNDR for malformed <I>header-name-token</I>s</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="3077">
+    <td><a href="https://cplusplus.github.io/CWG/issues/3077.html">3077</a></td>
+    <td>tentatively ready</td>
+    <td>Undesirable formation of <TT>import</TT> directive with <I>string-literal</I></td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="3078">
+    <td><a href="https://cplusplus.github.io/CWG/issues/3078.html">3078</a></td>
+    <td>review</td>
+    <td>Different treatment of <TT>#include</TT> <I>pp-tokens</I> and <I>header-name-tokens</I></td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="3079">
+    <td><a href="https://cplusplus.github.io/CWG/issues/3079.html">3079</a></td>
+    <td>open</td>
+    <td>Allow <I>empty-declaration</I>s in anonymous unions</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="3080">
+    <td><a href="https://cplusplus.github.io/CWG/issues/3080.html">3080</a></td>
+    <td>tentatively ready</td>
+    <td>Clarify kinds of permitted template template arguments</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="3081">
+    <td><a href="https://cplusplus.github.io/CWG/issues/3081.html">3081</a></td>
+    <td>review</td>
+    <td>Require glvalue when splicing direct base class relationship</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="3082">
+    <td><a href="https://cplusplus.github.io/CWG/issues/3082.html">3082</a></td>
+    <td>tentatively ready</td>
+    <td>Allow for call-compatible function types in <TT>reinterpret_cast</TT></td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="3083">
+    <td><a href="https://cplusplus.github.io/CWG/issues/3083.html">3083</a></td>
+    <td>tentatively ready</td>
+    <td>Remove redundant restrictions on class and enum definitions</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="3084">
+    <td><a href="https://cplusplus.github.io/CWG/issues/3084.html">3084</a></td>
+    <td>tentatively ready</td>
+    <td><I>compound-statement</I>s inside <I>iteration-statement</I>s</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="3085">
+    <td><a href="https://cplusplus.github.io/CWG/issues/3085.html">3085</a></td>
+    <td>tentatively ready</td>
+    <td>Apply restriction inside for-range-declaration</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="3086">
+    <td><a href="https://cplusplus.github.io/CWG/issues/3086.html">3086</a></td>
+    <td>tentatively ready</td>
+    <td>Destringizing should consider all sorts of encoding-prefixes</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="3087">
+    <td><a href="https://cplusplus.github.io/CWG/issues/3087.html">3087</a></td>
+    <td>open</td>
+    <td>Destringizing for raw string literals</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="3088">
+    <td><a href="https://cplusplus.github.io/CWG/issues/3088.html">3088</a></td>
+    <td>open</td>
+    <td>Clarify macro treatment of identifiers with special meaning</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="3089">
+    <td><a href="https://cplusplus.github.io/CWG/issues/3089.html">3089</a></td>
+    <td>tentatively ready</td>
+    <td>const-default-constructible improperly handles std::meta::info</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="3090">
+    <td><a href="https://cplusplus.github.io/CWG/issues/3090.html">3090</a></td>
+    <td>tentatively ready</td>
+    <td>Internal linkage from header units</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="3091">
+    <td><a href="https://cplusplus.github.io/CWG/issues/3091.html">3091</a></td>
+    <td>review</td>
+    <td>Linking of translation units as sequences of tokens</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="3092">
+    <td><a href="https://cplusplus.github.io/CWG/issues/3092.html">3092</a></td>
+    <td>tentatively ready</td>
+    <td><I>base-specifier</I>s are not "declared"</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="3093">
+    <td><a href="https://cplusplus.github.io/CWG/issues/3093.html">3093</a></td>
+    <td>open</td>
+    <td>Missing integration of direct base class relationships</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="3094">
+    <td><a href="https://cplusplus.github.io/CWG/issues/3094.html">3094</a></td>
+    <td>review</td>
+    <td>Rework phases for string literal concatenation and token formation</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="3095">
+    <td><a href="https://cplusplus.github.io/CWG/issues/3095.html">3095</a></td>
+    <td>open</td>
+    <td>Type-dependent packs that are not structured binding packs</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="3096">
+    <td><a href="https://cplusplus.github.io/CWG/issues/3096.html">3096</a></td>
+    <td>open</td>
+    <td>Value-dependence of size of structured binding pack with non-dependent initializer</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="3097">
+    <td><a href="https://cplusplus.github.io/CWG/issues/3097.html">3097</a></td>
+    <td>tentatively ready</td>
+    <td>Lambda expression introduces a scope</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="3098">
+    <td><a href="https://cplusplus.github.io/CWG/issues/3098.html">3098</a></td>
+    <td>tentatively ready</td>
+    <td>Remove redundancy "names or designates"</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="3099">
+    <td><a href="https://cplusplus.github.io/CWG/issues/3099.html">3099</a></td>
+    <td>open</td>
+    <td>Instantiation of type aliases from alias templates is unspecified</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="3100">
+    <td><a href="https://cplusplus.github.io/CWG/issues/3100.html">3100</a></td>
+    <td>open</td>
+    <td>Destruction order for objects with static storage duration</td>
+    <td align="center">Not resolved</td>
   </tr></table>
 
 </div>
diff --git a/compiler-rt/lib/lsan/lsan_allocator.h b/compiler-rt/lib/lsan/lsan_allocator.h
index 556b9f56a4a4a..2d0ea0b46fe0e 100644
--- a/compiler-rt/lib/lsan/lsan_allocator.h
+++ b/compiler-rt/lib/lsan/lsan_allocator.h
@@ -93,6 +93,10 @@ using LSanSizeClassMap = DefaultSizeClassMap;
 const uptr kAllocatorSpace = 0x600000000000ULL;
 const uptr kAllocatorSize  = 0x40000000000ULL;  // 4T.
 using LSanSizeClassMap = DefaultSizeClassMap;
+#  elif SANITIZER_ANDROID && defined(__aarch64__)
+const uptr kAllocatorSpace = 0x3000000000ULL;
+const uptr kAllocatorSize = 0x2000000000ULL;
+using LSanSizeClassMap = VeryCompactSizeClassMap;
 #  else
 const uptr kAllocatorSpace = 0x500000000000ULL;
 const uptr kAllocatorSize = 0x40000000000ULL;  // 4T.
diff --git a/compiler-rt/lib/nsan/tests/NSanUnitTest.cpp b/compiler-rt/lib/nsan/tests/NSanUnitTest.cpp
index 73b59671fe07a..d121292c36682 100644
--- a/compiler-rt/lib/nsan/tests/NSanUnitTest.cpp
+++ b/compiler-rt/lib/nsan/tests/NSanUnitTest.cpp
@@ -43,8 +43,8 @@ template <typename FT, auto next> void TestFT() {
   ASSERT_EQ(GetULPDiff<FT>(-X, -Y), 3);
 
   // Values with larger differences.
-  static constexpr const __uint128_t MantissaSize =
-      __uint128_t{1} << FTInfo<FT>::kMantissaBits;
+  static constexpr const __sanitizer::u64 MantissaSize =
+      __sanitizer::u64{1} << FTInfo<FT>::kMantissaBits;
   ASSERT_EQ(GetULPDiff<FT>(1.0, next(2.0, 1.0)), MantissaSize - 1);
   ASSERT_EQ(GetULPDiff<FT>(1.0, 2.0), MantissaSize);
   ASSERT_EQ(GetULPDiff<FT>(1.0, next(2.0, 3.0)), MantissaSize + 1);
@@ -57,6 +57,11 @@ TEST(NSanTest, Double) {
   TestFT<double, static_cast<double (*)(double, double)>(nextafter)>();
 }
 
-TEST(NSanTest, Float128) { TestFT<__float128, nextafterf128>(); }
+TEST(NSanTest, Float128) {
+  // Very basic tests. FIXME: improve when we have nextafter<__float128>.
+  ASSERT_EQ(GetULPDiff<__float128>(0.0, 0.0), 0);
+  ASSERT_EQ(GetULPDiff<__float128>(-0.0, 0.0), 0);
+  ASSERT_NE(GetULPDiff<__float128>(-0.01, 0.01), kMaxULPDiff);
+}
 
 } // end namespace __nsan
diff --git a/compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp b/compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp
index 62ab0554df08e..7fa5e017d3985 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp
@@ -259,7 +259,9 @@ void InitializePlatform() {
 
   ThreadEventCallbacks callbacks = {
       .create = ThreadCreateCallback,
+      .start = nullptr,
       .terminate = ThreadTerminateCallback,
+      .destroy = nullptr,
   };
   InstallPthreadIntrospectionHook(callbacks);
 #endif
diff --git a/compiler-rt/lib/tysan/tysan.cpp b/compiler-rt/lib/tysan/tysan.cpp
index 4fa8166986d76..1c67adeba0fc5 100644
--- a/compiler-rt/lib/tysan/tysan.cpp
+++ b/compiler-rt/lib/tysan/tysan.cpp
@@ -22,6 +22,7 @@
 
 #include "tysan/tysan.h"
 
+#include <stdint.h>
 #include <string.h>
 
 using namespace __sanitizer;
@@ -254,10 +255,68 @@ static void reportError(void *Addr, int Size, tysan_type_descriptor *TD,
   }
 }
 
+ALWAYS_INLINE
+static void SetShadowType(tysan_type_descriptor *td,
+                          tysan_type_descriptor **shadowData,
+                          uint64_t AccessSize) {
+  *shadowData = td;
+  uint64_t shadowDataInt = (uint64_t)shadowData;
+
+  for (uint64_t i = 1; i < AccessSize; ++i) {
+    int64_t dataOffset = i << PtrShift();
+    int64_t *badShadowData = (int64_t *)(shadowDataInt + dataOffset);
+    int64_t badTD = int64_t(i) * -1;
+    *badShadowData = badTD;
+  }
+}
+
+ALWAYS_INLINE
+static bool GetNotAllBadTD(uint64_t ShadowDataInt, uint64_t AccessSize) {
+  bool notAllBadTD = false;
+  for (uint64_t i = 1; i < AccessSize; ++i) {
+    int64_t **unkShadowData = (int64_t **)(ShadowDataInt + (i << PtrShift()));
+    int64_t *ILdTD = *unkShadowData;
+    notAllBadTD = notAllBadTD || (ILdTD != nullptr);
+  }
+  return notAllBadTD;
+}
+
+ALWAYS_INLINE
+static bool GetNotAllUnkTD(uint64_t ShadowDataInt, uint64_t AccessSize) {
+  bool notAllBadTD = false;
+  for (uint64_t i = 1; i < AccessSize; ++i) {
+    int64_t *badShadowData = (int64_t *)(ShadowDataInt + (i << PtrShift()));
+    int64_t ILdTD = *badShadowData;
+    notAllBadTD = notAllBadTD || (ILdTD >= 0);
+  }
+  return notAllBadTD;
+}
+
 extern "C" SANITIZER_INTERFACE_ATTRIBUTE void
-__tysan_check(void *addr, int size, tysan_type_descriptor *td, int flags) {
-  GET_CALLER_PC_BP_SP;
+__tysan_instrument_mem_inst(char *dest, char *src, uint64_t size,
+                            bool needsMemMove) {
+  tysan_type_descriptor **destShadowDataPtr = shadow_for(dest);
+
+  if (!src) {
+    internal_memset((char *)destShadowDataPtr, 0, size << PtrShift());
+    return;
+  }
+
+  uint64_t srcInt = (uint64_t)src;
+  uint64_t srcShadowInt = ((srcInt & AppMask()) << PtrShift()) + ShadowAddr();
+  uint64_t *srcShadow = (uint64_t *)srcShadowInt;
 
+  if (needsMemMove) {
+    internal_memmove((char *)destShadowDataPtr, srcShadow, size << PtrShift());
+  } else {
+    internal_memcpy((char *)destShadowDataPtr, srcShadow, size << PtrShift());
+  }
+}
+
+ALWAYS_INLINE
+static void __tysan_check_internal(void *addr, int size,
+                                   tysan_type_descriptor *td, int flags,
+                                   uptr pc, uptr bp, uptr sp) {
   bool IsRead = flags & 1;
   bool IsWrite = flags & 2;
   const char *AccessStr;
@@ -300,6 +359,64 @@ __tysan_check(void *addr, int size, tysan_type_descriptor *td, int flags) {
   }
 }
 
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE void
+__tysan_check(void *addr, int size, tysan_type_descriptor *td, int flags) {
+  GET_CALLER_PC_BP_SP;
+  __tysan_check_internal(addr, size, td, flags, pc, bp, sp);
+}
+
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE void
+__tysan_instrument_with_shadow_update(void *ptr, tysan_type_descriptor *td,
+                                      bool sanitizeFunction,
+                                      uint64_t accessSize, int flags) {
+  tysan_type_descriptor **shadowData = shadow_for(ptr);
+  tysan_type_descriptor *loadedTD = *shadowData;
+  bool shadowIsNull = loadedTD == nullptr;
+
+  // TODO, sanitizeFunction is known at compile time, so maybe this is split
+  // into two different functions
+  if (sanitizeFunction) {
+
+    if (td != loadedTD) {
+
+      // We now know that the types did not match (we're on the slow path). If
+      // the type is unknown, then set it.
+      if (shadowIsNull) {
+        // We're about to set the type. Make sure that all bytes in the value
+        // are also of unknown type.
+        bool isAllUnknownTD = GetNotAllUnkTD((uint64_t)shadowData, accessSize);
+        if (isAllUnknownTD) {
+          GET_CALLER_PC_BP_SP;
+          __tysan_check_internal(ptr, accessSize, td, flags, pc, bp, sp);
+        }
+        SetShadowType(td, shadowData, accessSize);
+      } else {
+        GET_CALLER_PC_BP_SP;
+        __tysan_check_internal(ptr, accessSize, td, flags, pc, bp, sp);
+      }
+    } else {
+      // We appear to have the right type. Make sure that all other bytes in
+      // the type are still marked as interior bytes. If not, call the runtime.
+      bool isNotAllBadTD = GetNotAllBadTD((uint64_t)shadowData, accessSize);
+      if (isNotAllBadTD) {
+        GET_CALLER_PC_BP_SP;
+        __tysan_check_internal(ptr, accessSize, td, flags, pc, bp, sp);
+      }
+    }
+  } else if (shadowIsNull) {
+    SetShadowType(td, shadowData, accessSize);
+  }
+}
+
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE void
+__tysan_set_shadow_type(void *ptr, tysan_type_descriptor *td,
+                        uint64_t accessSize) {
+  // In the mode where writes always set the type, for a write (which does
+  // not also read), we just set the type.
+  tysan_type_descriptor **shadow = shadow_for(ptr);
+  SetShadowType(td, shadow, accessSize);
+}
+
 Flags __tysan::flags_data;
 
 SANITIZER_INTERFACE_ATTRIBUTE uptr __tysan_shadow_memory_address;
diff --git a/compiler-rt/lib/tysan/tysan_platform.h b/compiler-rt/lib/tysan/tysan_platform.h
index f01392885d939..19f77f0cace6b 100644
--- a/compiler-rt/lib/tysan/tysan_platform.h
+++ b/compiler-rt/lib/tysan/tysan_platform.h
@@ -21,24 +21,28 @@ struct Mapping {
   static const uptr kShadowAddr = 0x010000000000ull;
   static const uptr kAppAddr = 0x550000000000ull;
   static const uptr kAppMemMsk = ~0x780000000000ull;
+  static const uptr kPtrShift = 3;
 };
 #elif defined(__aarch64__)
 struct Mapping39 {
   static const uptr kShadowAddr = 0x0800000000ull;
   static const uptr kAppAddr = 0x5500000000ull;
   static const uptr kAppMemMsk = ~0x7800000000ull;
+  static const uptr kPtrShift = 3;
 };
 
 struct Mapping42 {
   static const uptr kShadowAddr = 0x10000000000ull;
   static const uptr kAppAddr = 0x2aa00000000ull;
   static const uptr kAppMemMsk = ~0x3c000000000ull;
+  static const uptr kPtrShift = 3;
 };
 
 struct Mapping48 {
   static const uptr kShadowAddr = 0x0002000000000ull;
   static const uptr kAppAddr = 0x0aaaa00000000ull;
   static const uptr kAppMemMsk = ~0x0fff800000000ull;
+  static const uptr kPtrShift = 3;
 };
 #define TYSAN_RUNTIME_VMA 1
 #else
@@ -49,7 +53,12 @@ struct Mapping48 {
 extern int vmaSize;
 #endif
 
-enum MappingType { MAPPING_SHADOW_ADDR, MAPPING_APP_ADDR, MAPPING_APP_MASK };
+enum MappingType {
+  MAPPING_SHADOW_ADDR,
+  MAPPING_APP_ADDR,
+  MAPPING_APP_MASK,
+  MAPPING_PTR_SHIFT
+};
 
 template <typename Mapping, int Type> uptr MappingImpl(void) {
   switch (Type) {
@@ -59,6 +68,8 @@ template <typename Mapping, int Type> uptr MappingImpl(void) {
     return Mapping::kAppAddr;
   case MAPPING_APP_MASK:
     return Mapping::kAppMemMsk;
+  case MAPPING_PTR_SHIFT:
+    return Mapping::kPtrShift;
   }
 }
 
@@ -88,6 +99,9 @@ uptr AppAddr() { return MappingArchImpl<MAPPING_APP_ADDR>(); }
 ALWAYS_INLINE
 uptr AppMask() { return MappingArchImpl<MAPPING_APP_MASK>(); }
 
+ALWAYS_INLINE
+uptr PtrShift() { return MappingArchImpl<MAPPING_PTR_SHIFT>(); }
+
 } // namespace __tysan
 
 #endif
diff --git a/compiler-rt/test/asan/TestCases/Darwin/asan-symbolize-templated-cxx.cpp b/compiler-rt/test/asan/TestCases/Darwin/asan-symbolize-templated-cxx.cpp
index 3d726a32b7eaa..5794f5dbadaec 100644
--- a/compiler-rt/test/asan/TestCases/Darwin/asan-symbolize-templated-cxx.cpp
+++ b/compiler-rt/test/asan/TestCases/Darwin/asan-symbolize-templated-cxx.cpp
@@ -1,4 +1,5 @@
 // UNSUPPORTED: ios
+// UNSUPPORTED: darwin
 // RUN: %clangxx_asan -O0 -g %s -o %t.executable
 // RUN: %env_asan_opts="symbolize=0" not %run %t.executable > %t_no_module_map.log 2>&1
 // RUN: %asan_symbolize --force-system-symbolizer < %t_no_module_map.log 2>&1 | FileCheck %s
diff --git a/compiler-rt/test/asan/TestCases/Darwin/suppressions-sandbox.cpp b/compiler-rt/test/asan/TestCases/Darwin/suppressions-sandbox.cpp
index f12e2b2ada50d..651d0c5d05b07 100644
--- a/compiler-rt/test/asan/TestCases/Darwin/suppressions-sandbox.cpp
+++ b/compiler-rt/test/asan/TestCases/Darwin/suppressions-sandbox.cpp
@@ -15,9 +15,6 @@
 // sandbox-exec isn't available on iOS
 // UNSUPPORTED: ios
 
-// Symbolizer fails to find test functions on current macOS bot version
-// XFAIL: system-darwin && target=arm{{.*}}
-
 #include <CoreFoundation/CoreFoundation.h>
 
 #if defined(SHARED_LIB)
diff --git a/compiler-rt/test/asan/TestCases/Posix/fread_fwrite.cpp b/compiler-rt/test/asan/TestCases/Posix/fread_fwrite.cpp
index c7b9280ea7d8e..c0629260418a3 100644
--- a/compiler-rt/test/asan/TestCases/Posix/fread_fwrite.cpp
+++ b/compiler-rt/test/asan/TestCases/Posix/fread_fwrite.cpp
@@ -2,9 +2,6 @@
 // RUN: not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-FWRITE
 // RUN: not %run %t 1 2>&1 | FileCheck %s --check-prefix=CHECK-FREAD
 
-// Symbolizer fails to find test functions on current macOS bot version
-// XFAIL: system-darwin && target=arm{{.*}}
-
 #include <stdio.h>
 #include <stdlib.h>
 
diff --git a/compiler-rt/test/asan/TestCases/log-path_test.cpp b/compiler-rt/test/asan/TestCases/log-path_test.cpp
index 3c5ca114cfd71..6875d57c43cc0 100644
--- a/compiler-rt/test/asan/TestCases/log-path_test.cpp
+++ b/compiler-rt/test/asan/TestCases/log-path_test.cpp
@@ -25,7 +25,8 @@
 // RUN: FileCheck %s --check-prefix=CHECK-BAD-DIR < %t.out
 
 // Too long log_path.
-// RUN: %env_asan_opts=log_path=`for((i=0;i<10000;i++)); do echo -n $i; done` \
+// RUN: %python -c "for i in range(0, 10000): print(i, end='')" > %t.long_log_path
+// RUN: %env_asan_opts=log_path=%{readfile:%t.long_log_path} \
 // RUN:   not %run %t 2> %t.out
 // RUN: FileCheck %s --check-prefix=CHECK-LONG < %t.out
 
diff --git a/compiler-rt/test/asan/TestCases/scariness_score_test.cpp b/compiler-rt/test/asan/TestCases/scariness_score_test.cpp
index 9e55e33675fde..5d229cf383648 100644
--- a/compiler-rt/test/asan/TestCases/scariness_score_test.cpp
+++ b/compiler-rt/test/asan/TestCases/scariness_score_test.cpp
@@ -6,7 +6,7 @@
 // RUN: %clangxx_asan -O0 -mllvm -asan-use-stack-safety=0 %s -o %t
 // On OSX and Windows, alloc_dealloc_mismatch=1 isn't 100% reliable, so it's
 // off by default. It's safe for these tests, though, so we turn it on.
-// RUN: export %env_asan_opts=symbolize=0:detect_stack_use_after_return=1:handle_abort=1:print_scariness=1:alloc_dealloc_mismatch=1
+// RUN: %export_asan_opts=symbolize=0:detect_stack_use_after_return=1:handle_abort=1:print_scariness=1:alloc_dealloc_mismatch=1
 // Make sure the stack is limited (may not be the default under GNU make)
 // RUN: ulimit -s 4096
 // RUN: not %run %t  1 2>&1 | FileCheck %s --check-prefix=CHECK1
@@ -41,7 +41,7 @@
 // RUN: %clangxx_asan -O0 %s -o %t -fsanitize-address-use-after-return=always -mllvm -asan-use-stack-safety=0
 // On OSX and Windows, alloc_dealloc_mismatch=1 isn't 100% reliable, so it's
 // off by default. It's safe for these tests, though, so we turn it on.
-// RUN: export %env_asan_opts=symbolize=0:handle_abort=1:print_scariness=1:alloc_dealloc_mismatch=1
+// RUN: %export_asan_opts=symbolize=0:handle_abort=1:print_scariness=1:alloc_dealloc_mismatch=1
 // Make sure the stack is limited (may not be the default under GNU make)
 // RUN: ulimit -s 4096
 // RUN: not %run %t  1 2>&1 | FileCheck %s --check-prefix=CHECK1
diff --git a/compiler-rt/test/asan/lit.cfg.py b/compiler-rt/test/asan/lit.cfg.py
index 96201e679b0a3..0194c720d003b 100644
--- a/compiler-rt/test/asan/lit.cfg.py
+++ b/compiler-rt/test/asan/lit.cfg.py
@@ -41,6 +41,9 @@ def get_required_attr(config, attr_name):
 config.substitutions.append(
     ("%env_asan_opts=", "env ASAN_OPTIONS=" + default_asan_opts_str)
 )
+config.substitutions.append(
+    ("%export_asan_opts=", "export ASAN_OPTIONS=" + default_asan_opts_str)
+)
 
 # Setup source root.
 config.test_source_root = os.path.dirname(__file__)
diff --git a/compiler-rt/test/fuzzer/coverage.test b/compiler-rt/test/fuzzer/coverage.test
index cf36784ce21da..a4af2648d61e1 100644
--- a/compiler-rt/test/fuzzer/coverage.test
+++ b/compiler-rt/test/fuzzer/coverage.test
@@ -2,6 +2,8 @@
 UNSUPPORTED: target={{.*windows.*}}
 # FIXME: CreatePCArray() emits PLT stub addresses for entry blocks, which are ignored by TracePC::PrintCoverage().
 UNSUPPORTED: target=s390x{{.*}}
+UNSUPPORTED: darwin
+
 RUN: mkdir -p %t.dir && cd %t.dir
 RUN: %cpp_compiler -mllvm -use-unknown-locations=Disable  %S/NullDerefTest.cpp -o %t.dir/NullDerefTest
 RUN: %cpp_compiler -mllvm -use-unknown-locations=Disable %S/DSO1.cpp -fPIC %ld_flags_rpath_so1 -O0 -shared -o %dynamiclib1
diff --git a/compiler-rt/test/fuzzer/exit_on_src_pos.test b/compiler-rt/test/fuzzer/exit_on_src_pos.test
index 020424e2d9fdd..ba4fb01780ce2 100644
--- a/compiler-rt/test/fuzzer/exit_on_src_pos.test
+++ b/compiler-rt/test/fuzzer/exit_on_src_pos.test
@@ -8,6 +8,7 @@
 UNSUPPORTED: target=thumb{{.*}}
 # Timeout on loongarch64 machine
 UNSUPPORTED: target=loongarch64{{.*}}
+UNSUPPORTED: darwin
 
 RUN: %cpp_compiler -O0 %S/SimpleTest.cpp -o %t-SimpleTest.exe -mllvm -use-unknown-locations=Disable
 RUN: %cpp_compiler -O0 %S/ShrinkControlFlowTest.cpp -o %t-ShrinkControlFlowTest.exe
diff --git a/compiler-rt/test/fuzzer/fuzzer-ubsan.test b/compiler-rt/test/fuzzer/fuzzer-ubsan.test
index d22339d72e261..6bc2c38636688 100644
--- a/compiler-rt/test/fuzzer/fuzzer-ubsan.test
+++ b/compiler-rt/test/fuzzer/fuzzer-ubsan.test
@@ -1,6 +1,3 @@
-// This test currently fails to compile on green.lab.llvm.org (arm)
-// XFAIL: system-darwin && target=arm{{.*}}
-
 RUN: %cpp_compiler -fsanitize=undefined -fno-sanitize-recover=all %S/SignedIntOverflowTest.cpp -o %t-SignedIntOverflowTest-Ubsan
 RUN: not %run %t-SignedIntOverflowTest-Ubsan 2>&1 | FileCheck %s
 CHECK: runtime error: signed integer overflow: 2147483647 + 1 cannot be represented in type 'int'
diff --git a/compiler-rt/test/fuzzer/reduce_inputs.test b/compiler-rt/test/fuzzer/reduce_inputs.test
index e65f572277297..d296fa42191af 100644
--- a/compiler-rt/test/fuzzer/reduce_inputs.test
+++ b/compiler-rt/test/fuzzer/reduce_inputs.test
@@ -12,5 +12,5 @@ RUN: %run %t-ShrinkControlFlowSimpleTest -runs=0 %t/C 2>&1 | FileCheck %s --chec
 COUNT: seed corpus: files: 4
 
 # a bit longer test
-RUN: %run %t-ShrinkControlFlowTest  -exit_on_item=0eb8e4ed029b774d80f2b66408203801cb982a60  -seed=42 -runs=1000000  2>&1 | FileCheck %s
+RUN: %run %t-ShrinkControlFlowTest  -exit_on_item=0eb8e4ed029b774d80f2b66408203801cb982a60  -seed=42 -runs=10000000  2>&1 | FileCheck %s
 
diff --git a/compiler-rt/test/hwasan/TestCases/Linux/fixed-shadow.c b/compiler-rt/test/hwasan/TestCases/Linux/fixed-shadow.c
index 421d233957830..fc83b213561c8 100644
--- a/compiler-rt/test/hwasan/TestCases/Linux/fixed-shadow.c
+++ b/compiler-rt/test/hwasan/TestCases/Linux/fixed-shadow.c
@@ -3,17 +3,17 @@
 // Default compiler instrumentation works with any shadow base (dynamic or fixed).
 // RUN: %clang_hwasan %s -o %t
 // RUN: %run %t
-// RUN: HWASAN_OPTIONS=fixed_shadow_base=263878495698944 %run %t 2>%t.out || (cat %t.out | FileCheck %s)
-// RUN: HWASAN_OPTIONS=fixed_shadow_base=4398046511104 %run %t
+// RUN: env HWASAN_OPTIONS=fixed_shadow_base=263878495698944 %run %t 2>%t.out || (cat %t.out | FileCheck %s)
+// RUN: env HWASAN_OPTIONS=fixed_shadow_base=4398046511104 %run %t
 //
 // If -hwasan-mapping-offset is set, then the fixed_shadow_base needs to match.
 // RUN: %clang_hwasan %s -mllvm -hwasan-mapping-offset=263878495698944 -o %t
-// RUN: HWASAN_OPTIONS=fixed_shadow_base=263878495698944 %run %t 2>%t.out || (cat %t.out | FileCheck %s)
-// RUN: HWASAN_OPTIONS=fixed_shadow_base=4398046511104 not %run %t
+// RUN: env HWASAN_OPTIONS=fixed_shadow_base=263878495698944 %run %t 2>%t.out || (cat %t.out | FileCheck %s)
+// RUN: env HWASAN_OPTIONS=fixed_shadow_base=4398046511104 not %run %t
 
 // RUN: %clang_hwasan %s -mllvm -hwasan-mapping-offset=4398046511104 -o %t
-// RUN: HWASAN_OPTIONS=fixed_shadow_base=4398046511104 %run %t
-// RUN: HWASAN_OPTIONS=fixed_shadow_base=263878495698944 not %run %t
+// RUN: env HWASAN_OPTIONS=fixed_shadow_base=4398046511104 %run %t
+// RUN: env HWASAN_OPTIONS=fixed_shadow_base=263878495698944 not %run %t
 //
 // Note: if fixed_shadow_base is not set, compiler-rt will dynamically choose a
 // shadow base, which has a tiny but non-zero probability of matching the
diff --git a/compiler-rt/test/lit.common.cfg.py b/compiler-rt/test/lit.common.cfg.py
index 8d147055293ed..9d2f02189b8bd 100644
--- a/compiler-rt/test/lit.common.cfg.py
+++ b/compiler-rt/test/lit.common.cfg.py
@@ -1066,3 +1066,5 @@ def target_page_size():
 # llvm.
 config.substitutions.append(("%crt_src", config.compiler_rt_src_root))
 config.substitutions.append(("%llvm_src", config.llvm_src_root))
+
+config.substitutions.append(("%python", '"%s"' % (sys.executable)))
diff --git a/compiler-rt/test/memprof/TestCases/log_path_test.cpp b/compiler-rt/test/memprof/TestCases/log_path_test.cpp
index 664ab79393195..683ca67122c31 100644
--- a/compiler-rt/test/memprof/TestCases/log_path_test.cpp
+++ b/compiler-rt/test/memprof/TestCases/log_path_test.cpp
@@ -18,7 +18,8 @@
 // RUN: %env_memprof_opts=print_text=true:log_path=/dev/null/INVALID not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-BAD-DIR --dump-input=always
 
 // Too long log_path.
-// RUN: %env_memprof_opts=print_text=true:log_path=`for((i=0;i<10000;i++)); do echo -n $i; done` \
+// RUN: %python -c "for i in range(0, 10000): print(i, end='')" > %t.long_log_path
+// RUN: %env_memprof_opts=print_text=true:log_path=%{readfile:%t.long_log_path} \
 // RUN:   not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-LONG --dump-input=always
 
 // Specifying the log name via the __memprof_profile_filename variable.
diff --git a/compiler-rt/test/msan/allocator_mapping.cpp b/compiler-rt/test/msan/allocator_mapping.cpp
index e7a12da489152..6eaba7e16a5be 100644
--- a/compiler-rt/test/msan/allocator_mapping.cpp
+++ b/compiler-rt/test/msan/allocator_mapping.cpp
@@ -3,7 +3,8 @@
 // mapping the heap early, in __msan_init.
 //
 // RUN: %clangxx_msan -O0 %s -o %t_1
-// RUN: %clangxx_msan -O0 -DHEAP_ADDRESS=$(%run %t_1) %s -o %t_2 && %run %t_2
+// RUN: %run %t_1 > %t.heap_address
+// RUN: %clangxx_msan -O0 -DHEAP_ADDRESS=%{readfile:%t.heap_address} %s -o %t_2 && %run %t_2
 //
 // This test only makes sense for the 64-bit allocator. The 32-bit allocator
 // does not have a fixed mapping. Exclude platforms that use the 32-bit
diff --git a/compiler-rt/test/nsan/Posix/allocator_mapping.cpp b/compiler-rt/test/nsan/Posix/allocator_mapping.cpp
index 3a3e655e259d0..a92962e16d9d2 100644
--- a/compiler-rt/test/nsan/Posix/allocator_mapping.cpp
+++ b/compiler-rt/test/nsan/Posix/allocator_mapping.cpp
@@ -2,7 +2,8 @@
 /// Test that a module constructor can not map memory over the NSan heap
 /// (without MAP_FIXED, of course).
 // RUN: %clangxx_nsan -O0 %s -o %t_1
-// RUN: %clangxx_nsan -O0 -DHEAP_ADDRESS=$(%run %t_1) %s -o %t_2 && %run %t_2
+// RUN: %run %t_1 > %t.heap_address
+// RUN: %clangxx_nsan -O0 -DHEAP_ADDRESS=%{readfile:%t.heap_address} %s -o %t_2 && %run %t_2
 
 #include <assert.h>
 #include <stdio.h>
diff --git a/compiler-rt/test/profile/Linux/instrprof-debug-info-correlate-warnings.c b/compiler-rt/test/profile/Linux/instrprof-debug-info-correlate-warnings.c
index 5069c6340b64f..25022f241a6d2 100644
--- a/compiler-rt/test/profile/Linux/instrprof-debug-info-correlate-warnings.c
+++ b/compiler-rt/test/profile/Linux/instrprof-debug-info-correlate-warnings.c
@@ -1,6 +1,6 @@
 // Disable full debug info and verify that we get warnings during merging
 
-// RUN: %clang_pgogen -o %t -gline-tables-only -mllvm --debug-info-correlate -mllvm --disable-vp=true %S/../Inputs/instrprof-debug-info-correlate-main.cpp %S/../Inputs/instrprof-debug-info-correlate-foo.cpp
+// RUN: %clang_pgogen -o %t -gline-tables-only -mllvm --profile-correlate=debug-info -mllvm --disable-vp=true %S/../Inputs/instrprof-debug-info-correlate-main.cpp %S/../Inputs/instrprof-debug-info-correlate-foo.cpp
 // RUN: env LLVM_PROFILE_FILE=%t.proflite %run %t
 // RUN: llvm-profdata merge -o %t.profdata --debug-info=%t %t.proflite --max-debug-info-correlation-warnings=2 2>&1 >/dev/null | FileCheck %s --check-prefixes=CHECK,LIMIT --implicit-check-not=warning
 // RUN: llvm-profdata merge -o %t.profdata --debug-info=%t %t.proflite --max-debug-info-correlation-warnings=0 2>&1 >/dev/null | FileCheck %s --check-prefixes=CHECK,NOLIMIT --implicit-check-not=warning
diff --git a/compiler-rt/test/profile/instrprof-hostname.c b/compiler-rt/test/profile/instrprof-hostname.c
index b77cf8df158bd..c0b3426eeaa84 100644
--- a/compiler-rt/test/profile/instrprof-hostname.c
+++ b/compiler-rt/test/profile/instrprof-hostname.c
@@ -1,7 +1,7 @@
 // RUN: %clang_profgen -o %t -O3 %s
 // RUN: env LLVM_PROFILE_FILE=%h.%t-%h.profraw_%h %run %t
-// RUN: %run uname -n > %t.n
-// RUN: llvm-profdata merge -o %t.profdata `cat %t.n`.%t-`cat %t.n`.profraw_`cat %t.n`
+// RUN: %run uname -n | tr -d '\n' > %t.n
+// RUN: llvm-profdata merge -o %t.profdata %{readfile:%t.n}.%t-%{readfile:%t.n}.profraw_%{readfile:%t.n}
 // RUN: %clang_profuse=%t.profdata -o - -S -emit-llvm %s | FileCheck %s
 // REQUIRES: shell
 
diff --git a/compiler-rt/test/tsan/Darwin/external.cpp b/compiler-rt/test/tsan/Darwin/external.cpp
index 3869c7abb7664..bf189eb1d6b5b 100644
--- a/compiler-rt/test/tsan/Darwin/external.cpp
+++ b/compiler-rt/test/tsan/Darwin/external.cpp
@@ -68,7 +68,7 @@ int main(int argc, char *argv[]) {
   // TEST2-NOT: WARNING: ThreadSanitizer
   
   // TEST3: WARNING: ThreadSanitizer: race on MyLibrary::MyObject
-  // TEST3: {{Modifying|read-only}} access of MyLibrary::MyObject at
+  // TEST3: {{Modifying|Read-only}} access of MyLibrary::MyObject at
   // TEST3: {{ObjectWrite|ObjectRead}}
   // TEST3: Previous {{modifying|read-only}} access of MyLibrary::MyObject at
   // TEST3: {{ObjectWrite|ObjectRead}}
diff --git a/compiler-rt/test/tsan/ignore_lib0.cpp b/compiler-rt/test/tsan/ignore_lib0.cpp
index cba58c6177038..9c4919022b512 100644
--- a/compiler-rt/test/tsan/ignore_lib0.cpp
+++ b/compiler-rt/test/tsan/ignore_lib0.cpp
@@ -4,11 +4,13 @@
 // RUN: %clangxx_tsan -O1 -fno-builtin %s -DLIB -fPIC -fno-sanitize=thread -shared -o %t-dir/libignore_lib0.so
 // RUN: %clangxx_tsan -O1 %s -L%t-dir -lignore_lib0 %link_libcxx_tsan -o %t
 // RUN: echo running w/o suppressions:
-// RUN: env LD_LIBRARY_PATH=%t-dir${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} %deflake %run %t | FileCheck %s --check-prefix=CHECK-NOSUPP
+// RUN: echo -n %t-dir > %t.ld_library_path
+// RUN: %python -c "if 'LD_LIBRARY_PATH' in __import__('os').environ: print(':' + __import__('os').environ['LD_LIBRARY_PATH'], end='')" >> %t.ld_library_path
+// RUN: env LD_LIBRARY_PATH=%{readfile:%t.ld_library_path} %deflake %run %t | FileCheck %s --check-prefix=CHECK-NOSUPP
 // RUN: echo running with suppressions:
-// RUN: env LD_LIBRARY_PATH=%t-dir${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} %env_tsan_opts=suppressions='%s.supp' %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-WITHSUPP
+// RUN: env LD_LIBRARY_PATH=%{readfile:%t.ld_library_path} %env_tsan_opts=suppressions='%s.supp' %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-WITHSUPP
 // RUN: echo running with generic suppression of noninstrumented code:
-// RUN: env LD_LIBRARY_PATH=%t-dir${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} %env_tsan_opts=ignore_noninstrumented_modules=1 %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-WITHSUPP
+// RUN: env LD_LIBRARY_PATH=%{readfile:%t.ld_library_path} %env_tsan_opts=ignore_noninstrumented_modules=1 %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-WITHSUPP
 
 // Tests that interceptors coming from a library specified in called_from_lib
 // suppression are ignored.
diff --git a/compiler-rt/test/tysan/basic.c b/compiler-rt/test/tysan/basic.c
index 8e66e1a721383..28b94c425757e 100644
--- a/compiler-rt/test/tysan/basic.c
+++ b/compiler-rt/test/tysan/basic.c
@@ -1,6 +1,10 @@
-// RUN: %clang_tysan -O0 %s -o %t && %run %t 10 >%t.out.0 2>&1
+// RUN: %clang_tysan -O0 -mllvm -tysan-outline-instrumentation=false %s -o %t && %run %t 10 >%t.out.0 2>&1
 // RUN: FileCheck %s < %t.out.0
-// RUN: %clang_tysan -O2 %s -o %t && %run %t 10 >%t.out 2>&1
+// RUN: %clang_tysan -O2 -mllvm -tysan-outline-instrumentation=false %s -o %t && %run %t 10 >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+// RUN: %clang_tysan -O0 -mllvm -tysan-outline-instrumentation=true %s -o %t && %run %t 10 >%t.out.0 2>&1
+// RUN: FileCheck %s < %t.out.0
+// RUN: %clang_tysan -O2 -mllvm -tysan-outline-instrumentation=true %s -o %t && %run %t 10 >%t.out 2>&1
 // RUN: FileCheck %s < %t.out
 
 #include <stdio.h>
diff --git a/compiler-rt/test/tysan/simple_verify_outlines.c b/compiler-rt/test/tysan/simple_verify_outlines.c
new file mode 100644
index 0000000000000..0d0730edb0b99
--- /dev/null
+++ b/compiler-rt/test/tysan/simple_verify_outlines.c
@@ -0,0 +1,22 @@
+// RUN: %clang_tysan -mllvm -tysan-outline-instrumentation=true -mllvm -tysan-verify-outlined-instrumentation=true %s -o %t && %run %t >%t.out.0 2>&1
+// RUN: FileCheck %s < %t.out.0
+
+#include <stdio.h>
+
+void printInt(int *i) { printf("%d\n", *i); }
+
+int main() {
+
+  float value = 5.0f;
+  printInt((int *)&value);
+
+  return 0;
+}
+
+// CHECK: ERROR: TypeSanitizer: type-aliasing-violation
+// CHECK-NEXT: READ of size 4 at {{.*}} with type int accesses an existing object of type float
+// CHECK-NEXT: {{#0 0x.* in printInt}}
+// CHECK-EMPTY:
+// CHECK-NEXT: ERROR: TypeSanitizer: type-aliasing-violation
+// CHECK-NEXT: READ of size 4 at {{.*}} with type int accesses an existing object of type float
+// CHECK-NEXT: {{#0 0x.* in printInt}}
diff --git a/compiler-rt/test/tysan/struct-offset-outline.c b/compiler-rt/test/tysan/struct-offset-outline.c
new file mode 100644
index 0000000000000..c84eb2762f669
--- /dev/null
+++ b/compiler-rt/test/tysan/struct-offset-outline.c
@@ -0,0 +1,32 @@
+// RUN: %clang_tysan -mllvm -tysan-outline-instrumentation=true -O0 %s -o %t && %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+// RUN: %clang_tysan -mllvm -tysan-outline-instrumentation=true -mllvm -tysan-verify-outlined-instrumentation=true -O0 %s -o %t && %run %t >%t.out 2>&1
+// RUN: FileCheck %s --check-prefixes='CHECK,CHECK-VERIFY' < %t.out
+
+#include <stdio.h>
+#include <stdlib.h>
+
+struct X {
+  int i;
+  int j;
+};
+
+int foo(struct X *p, struct X *q) {
+  q->j = 1;
+  p->i = 0;
+  // CHECK: ERROR: TypeSanitizer: type-aliasing-violation
+  // CHECK-NEXT: WRITE of size 4 at {{.*}} with type int (in X at offset 0) accesses an existing object of type int (in X at offset 4)
+  // CHECK-NEXT: {{#0 0x.* in foo .*struct-offset-outline.c:}}[[@LINE-3]]
+  // CHECK-VERIFY-EMPTY:
+  // CHECK-VERIFY-NEXT: ERROR: TypeSanitizer: type-aliasing-violation
+  // CHECK-VERIFY-NEXT: WRITE of size 4 at {{.*}} with type int (in X at offset 0) accesses an existing object of type int (in X at offset 4)
+  // CHECK-VERIFY-NEXT: {{#0 0x.* in foo .*struct-offset-outline.c:}}[[@LINE-7]]
+  return q->j;
+}
+
+int main() {
+  unsigned char *p = malloc(3 * sizeof(int));
+  printf("%i\n", foo((struct X *)(p + sizeof(int)), (struct X *)p));
+}
+
+// CHECK-NOT: ERROR: TypeSanitizer: type-aliasing-violation
diff --git a/compiler-rt/test/ubsan/TestCases/Misc/Posix/print_stack_trace.cpp b/compiler-rt/test/ubsan/TestCases/Misc/Posix/print_stack_trace.cpp
index 93c6bd66e127c..2eac710d98085 100644
--- a/compiler-rt/test/ubsan/TestCases/Misc/Posix/print_stack_trace.cpp
+++ b/compiler-rt/test/ubsan/TestCases/Misc/Posix/print_stack_trace.cpp
@@ -1,5 +1,5 @@
-// RUN: %clangxx -fsanitize=undefined -O0 %s -o %t && UBSAN_OPTIONS=stack_trace_format=DEFAULT:fast_unwind_on_fatal=1 %run %t 2>&1 | FileCheck %s
-// RUN: %clangxx -fsanitize=undefined -O0 %s -o %t && UBSAN_OPTIONS=stack_trace_format=DEFAULT:fast_unwind_on_fatal=0 %run %t 2>&1 | FileCheck %s
+// RUN: %clangxx -fsanitize=undefined -O0 %s -o %t && env UBSAN_OPTIONS=stack_trace_format=DEFAULT:fast_unwind_on_fatal=1 %run %t 2>&1 | FileCheck %s
+// RUN: %clangxx -fsanitize=undefined -O0 %s -o %t && env UBSAN_OPTIONS=stack_trace_format=DEFAULT:fast_unwind_on_fatal=0 %run %t 2>&1 | FileCheck %s
 
 // This test is temporarily disabled due to broken unwinding on ARM.
 // UNSUPPORTED: target={{.*-linux-.*}}
diff --git a/compiler-rt/test/xray/TestCases/Posix/fdr-single-thread.cpp b/compiler-rt/test/xray/TestCases/Posix/fdr-single-thread.cpp
index b8803aedc8851..36a4e65988f9a 100644
--- a/compiler-rt/test/xray/TestCases/Posix/fdr-single-thread.cpp
+++ b/compiler-rt/test/xray/TestCases/Posix/fdr-single-thread.cpp
@@ -1,11 +1,12 @@
 // RUN: %clangxx_xray -g -std=c++11 %s -o %t
 // RUN: rm -f fdr-logging-1thr-*
-// RUN: XRAY_OPTIONS=XRAY_OPTIONS="verbosity=1 patch_premain=true \
+// RUN: env XRAY_OPTIONS=XRAY_OPTIONS="verbosity=1 patch_premain=true \
 // RUN:   xray_fdr_log=true \
 // RUN:   xray_fdr_log_func_duration_threshold_us=0 \
 // RUN:   xray_logfile_base=fdr-logging-1thr-" %run %t 2>&1
+// RUN: ls fdr-logging-1thr-* | head -n1 | tr -d '\n' > %t.xray_input
 // RUN: %llvm_xray convert --output-format=yaml --symbolize --instr_map=%t \
-// RUN:   "`ls fdr-logging-1thr-* | head -n1`" | FileCheck %s
+// RUN:   "%{readfile:%t.xray_input}" | FileCheck %s
 // RUN: rm fdr-logging-1thr-*
 
 // UNSUPPORTED: target=arm{{.*}}
diff --git a/flang-rt/CMakeLists.txt b/flang-rt/CMakeLists.txt
index cad39d0c71016..50b8e834776fb 100644
--- a/flang-rt/CMakeLists.txt
+++ b/flang-rt/CMakeLists.txt
@@ -330,3 +330,19 @@ if (FLANG_RT_INCLUDE_TESTS)
 else ()
   add_custom_target(check-flang-rt)
 endif()
+
+###################
+# Install headers #
+###################
+
+if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
+  add_llvm_install_targets(install-flang-rt-headers COMPONENT flang-rt-headers)
+
+  install(DIRECTORY include/flang-rt/runtime
+    DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/flang-rt"
+    COMPONENT flang-rt-headers
+    FILES_MATCHING
+    PATTERN "*.h"
+    PATTERN ".git" EXCLUDE
+    PATTERN "CMakeFiles" EXCLUDE)
+endif()
diff --git a/flang/docs/Directives.md b/flang/docs/Directives.md
index 3ebb08c486228..2f16a8d579f8b 100644
--- a/flang/docs/Directives.md
+++ b/flang/docs/Directives.md
@@ -1,9 +1,9 @@
-<!--===- docs/Directives.md 
-  
+<!--===- docs/Directives.md
+
    Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
    See https://llvm.org/LICENSE.txt for license information.
    SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-  
+
 -->
 
 # Compiler directives supported by Flang
@@ -12,16 +12,18 @@ A list of non-standard directives supported by Flang
 
 * `!dir$ fixed` and `!dir$ free` select Fortran source forms.  Their effect
   persists to the end of the current source file.
-* `!dir$ ignore_tkr [[(TKRDMAC)] dummy-arg-name]...` in an interface definition
+* `!dir$ ignore_tkr [[(TKRDMACP)] dummy-arg-name]...` in an interface definition
   disables some semantic checks at call sites for the actual arguments that
-  correspond to some named dummy arguments (or all of them, by default).
-  The directive allow actual arguments that would otherwise be diagnosed
-  as incompatible in type (T), kind (K), rank (R), CUDA device (D), or
-  managed (M) status.  The letter (A) is a shorthand for all of these,
-  and is the default when no letters appear.  The letter (C) checks for
-  contiguity for example allowing an element of an assumed-shape array to be
-  passed as a dummy argument. For example, if one wanted to call a "set all
-  bytes to zero" utility that could be applied to arrays of any type or rank:
+  correspond to some named dummy arguments (or all of them, by default). The
+  directive allow actual arguments that would otherwise be diagnosed as
+  incompatible in type (T), kind (K), rank (R), CUDA device (D), or managed (M)
+  status. The letter (A) is a shorthand for (TKRDM), and is the default when no
+  letters appear. The letter (C) checks for contiguity, for example allowing an
+  element of an assumed-shape array to be passed as a dummy argument. The
+  letter (P) ignores pointer and allocatable matching, so that one can pass an
+  allocatable array to routine with pointer array argument and vice versa. For
+  example, if one wanted to call a "set all bytes to zero" utility that could
+  be applied to arrays of any type or rank:
 ```
   interface
     subroutine clear(arr,bytes)
@@ -46,27 +48,27 @@ A list of non-standard directives supported by Flang
   unroll the loop. Some compilers accept an optional `=` before the `n` when `n`
   is present in the directive. Flang does not.
 * `!dir$ unroll_and_jam [N]` control how many times a loop should be unrolled and
-  jammed. It must be placed immediately before a loop that follows. `N` is an optional 
-  integer that specifying the unrolling factor. When `N` is `0` or `1`, the loop 
+  jammed. It must be placed immediately before a loop that follows. `N` is an optional
+  integer that specifying the unrolling factor. When `N` is `0` or `1`, the loop
   should not be unrolled at all. If `N` is omitted the optimizer will
   selects the number of times to unroll the loop.
 * `!dir$ novector` disabling vectorization on the following loop.
 * `!dir$ nounroll` disabling unrolling on the following loop.
 * `!dir$ nounroll_and_jam` disabling unrolling and jamming on the following loop.
-* `!dir$ inline` instructs the compiler to attempt to inline the called routines if the 
-  directive is specified before a call statement or all call statements within the loop 
-  body if specified before a DO LOOP or all function references if specified before an 
+* `!dir$ inline` instructs the compiler to attempt to inline the called routines if the
+  directive is specified before a call statement or all call statements within the loop
+  body if specified before a DO LOOP or all function references if specified before an
   assignment statement.
-* `!dir$ forceinline` works in the same way as the `inline` directive, but it forces 
+* `!dir$ forceinline` works in the same way as the `inline` directive, but it forces
    inlining by the compiler on a function call statement.
-* `!dir$ noinline` works in the same way as the `inline` directive, but prevents 
+* `!dir$ noinline` works in the same way as the `inline` directive, but prevents
   any attempt of inlining by the compiler on a function call statement.
 
 # Directive Details
 
 ## Introduction
-Directives are commonly used in Fortran programs to specify additional actions 
-to be performed by the compiler. The directives are always specified with the 
+Directives are commonly used in Fortran programs to specify additional actions
+to be performed by the compiler. The directives are always specified with the
 `!dir$` or `cdir$` prefix.
 
 ## Loop Directives
@@ -97,7 +99,7 @@ check that that construct matches the expected construct for the directive.
 Skipping other intermediate directives allows multiple directives to appear on
 the same construct.
 
-## Lowering 
+## Lowering
 Evaluation is extended with a new field called dirs for representing directives
 associated with that Evaluation. When lowering loop directives, the associated
 Do Loop's evaluation is found and the directive is added to it. This information
@@ -109,7 +111,7 @@ about the loop. For example, the `llvm.loop.vectorize.enable` metadata informs
 the optimizer that a loop can be vectorized without considering its cost-model.
 This attribute is added to the loop condition branch.
 
-### Representation in MLIR 
+### Representation in MLIR
 The MLIR LLVM dialect models this by an attribute called LoopAnnotation
 Attribute. The attribute can be added to the latch of the loop in the cf
 dialect and is then carried through lowering to the LLVM dialect.
diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md
index 6d872094811e3..c9cc02703fbc8 100644
--- a/flang/docs/Extensions.md
+++ b/flang/docs/Extensions.md
@@ -182,6 +182,13 @@ end
   Note that internally the main program symbol name is all uppercase, unlike
   the names of all other symbols, which are usually all lowercase. This
   may make a difference in testing/debugging.
+* A `PROCEDURE()` with no interface name or type may be called as an
+  subroutine with an implicit interface, F'2023 15.4.3.6 paragraph 4 and
+  C1525 notwithstanding.
+  This is a universally portable feature, and it also applies to
+  `PROCEDURE(), POINTER, NOPASS` derived type components.
+  Such procedures may *not* be referenced as implicitly typed functions
+  without first being associated with a function pointer.
 
 ## Extensions, deletions, and legacy features supported by default
 
@@ -954,4 +961,3 @@ print *, [(j,j=1,10)]
   "&GRP A(1:)=1. 2. 3./".
   This extension is necessarily disabled when the type of the array
   has an accessible defined formatted READ subroutine.
-
diff --git a/flang/include/flang/Evaluate/common.h b/flang/include/flang/Evaluate/common.h
index 0263f15d4215e..3d220afa71718 100644
--- a/flang/include/flang/Evaluate/common.h
+++ b/flang/include/flang/Evaluate/common.h
@@ -303,10 +303,16 @@ class FoldingContext {
     return common::ScopedSet(analyzingPDTComponentKindSelector_, true);
   }
 
+  common::Restorer<std::string> SetRealFlagWarningContext(std::string str) {
+    return common::ScopedSet(realFlagWarningContext_, str);
+  }
+
   parser::CharBlock SaveTempName(std::string &&name) {
     return {*tempNames_.emplace(std::move(name)).first};
   }
 
+  void RealFlagWarnings(const RealFlags &, const char *op);
+
 private:
   parser::ContextualMessages messages_;
   const common::IntrinsicTypeDefaultKinds &defaults_;
@@ -318,8 +324,8 @@ class FoldingContext {
   std::map<parser::CharBlock, ConstantSubscript> impliedDos_;
   const common::LanguageFeatureControl &languageFeatures_;
   std::set<std::string> &tempNames_;
+  std::string realFlagWarningContext_;
 };
 
-void RealFlagWarnings(FoldingContext &, const RealFlags &, const char *op);
 } // namespace Fortran::evaluate
 #endif // FORTRAN_EVALUATE_COMMON_H_
diff --git a/flang/include/flang/Lower/OpenMP/Clauses.h b/flang/include/flang/Lower/OpenMP/Clauses.h
index 74924661d9a03..688d01704370d 100644
--- a/flang/include/flang/Lower/OpenMP/Clauses.h
+++ b/flang/include/flang/Lower/OpenMP/Clauses.h
@@ -294,6 +294,7 @@ using Permutation = tomp::clause::PermutationT<TypeTy, IdTy, ExprTy>;
 using TaskReduction = tomp::clause::TaskReductionT<TypeTy, IdTy, ExprTy>;
 using ThreadLimit = tomp::clause::ThreadLimitT<TypeTy, IdTy, ExprTy>;
 using Threads = tomp::clause::ThreadsT<TypeTy, IdTy, ExprTy>;
+using Threadset = tomp::clause::ThreadsetT<TypeTy, IdTy, ExprTy>;
 using Transparent = tomp::clause::TransparentT<TypeTy, IdTy, ExprTy>;
 using To = tomp::clause::ToT<TypeTy, IdTy, ExprTy>;
 using UnifiedAddress = tomp::clause::UnifiedAddressT<TypeTy, IdTy, ExprTy>;
diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
index c3cd119b96174..3407dd01dd504 100644
--- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
@@ -211,6 +211,8 @@ struct IntrinsicLibrary {
   mlir::Value genBarrierArrive(mlir::Type, llvm::ArrayRef<mlir::Value>);
   mlir::Value genBarrierArriveCnt(mlir::Type, llvm::ArrayRef<mlir::Value>);
   void genBarrierInit(llvm::ArrayRef<fir::ExtendedValue>);
+  mlir::Value genBarrierTryWait(mlir::Type, llvm::ArrayRef<mlir::Value>);
+  mlir::Value genBarrierTryWaitSleep(mlir::Type, llvm::ArrayRef<mlir::Value>);
   fir::ExtendedValue genBesselJn(mlir::Type,
                                  llvm::ArrayRef<fir::ExtendedValue>);
   fir::ExtendedValue genBesselYn(mlir::Type,
@@ -459,7 +461,21 @@ struct IntrinsicLibrary {
   mlir::Value genTime(mlir::Type, llvm::ArrayRef<mlir::Value>);
   void genTMABulkCommitGroup(llvm::ArrayRef<fir::ExtendedValue>);
   void genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue>);
+  void genTMABulkLoadC4(llvm::ArrayRef<fir::ExtendedValue>);
+  void genTMABulkLoadC8(llvm::ArrayRef<fir::ExtendedValue>);
+  void genTMABulkLoadI4(llvm::ArrayRef<fir::ExtendedValue>);
+  void genTMABulkLoadI8(llvm::ArrayRef<fir::ExtendedValue>);
+  void genTMABulkLoadR2(llvm::ArrayRef<fir::ExtendedValue>);
+  void genTMABulkLoadR4(llvm::ArrayRef<fir::ExtendedValue>);
+  void genTMABulkLoadR8(llvm::ArrayRef<fir::ExtendedValue>);
   void genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue>);
+  void genTMABulkStoreI4(llvm::ArrayRef<fir::ExtendedValue>);
+  void genTMABulkStoreI8(llvm::ArrayRef<fir::ExtendedValue>);
+  void genTMABulkStoreR2(llvm::ArrayRef<fir::ExtendedValue>);
+  void genTMABulkStoreR4(llvm::ArrayRef<fir::ExtendedValue>);
+  void genTMABulkStoreR8(llvm::ArrayRef<fir::ExtendedValue>);
+  void genTMABulkStoreC4(llvm::ArrayRef<fir::ExtendedValue>);
+  void genTMABulkStoreC8(llvm::ArrayRef<fir::ExtendedValue>);
   void genTMABulkWaitGroup(llvm::ArrayRef<fir::ExtendedValue>);
   mlir::Value genTrailz(mlir::Type, llvm::ArrayRef<mlir::Value>);
   fir::ExtendedValue genTransfer(mlir::Type,
diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.h b/flang/include/flang/Optimizer/Dialect/FIROps.h
index 62ef8b4b502f2..4651f2bb8038e 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROps.h
+++ b/flang/include/flang/Optimizer/Dialect/FIROps.h
@@ -20,6 +20,7 @@
 #include "mlir/Dialect/LLVMIR/LLVMAttrs.h"
 #include "mlir/Interfaces/LoopLikeInterface.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
+#include "mlir/Interfaces/ViewLikeInterface.h"
 
 namespace fir {
 
diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td
index 58a317cf5d691..bae52d63fda45 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROps.td
+++ b/flang/include/flang/Optimizer/Dialect/FIROps.td
@@ -17,6 +17,7 @@
 include "mlir/Dialect/Arith/IR/ArithBase.td"
 include "mlir/Dialect/Arith/IR/ArithOpsInterfaces.td"
 include "mlir/Dialect/LLVMIR/LLVMAttrDefs.td"
+include "mlir/Interfaces/ViewLikeInterface.td"
 include "flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.td"
 include "flang/Optimizer/Dialect/FIRDialect.td"
 include "flang/Optimizer/Dialect/FIRTypes.td"
@@ -2828,7 +2829,8 @@ def fir_VolatileCastOp : fir_SimpleOneResultOp<"volatile_cast", [Pure]> {
   let hasFolder = 1;
 }
 
-def fir_ConvertOp : fir_SimpleOneResultOp<"convert", [NoMemoryEffect]> {
+def fir_ConvertOp
+    : fir_SimpleOneResultOp<"convert", [NoMemoryEffect, ViewLikeOpInterface]> {
   let summary = "encapsulates all Fortran entity type conversions";
 
   let description = [{
@@ -2866,6 +2868,7 @@ def fir_ConvertOp : fir_SimpleOneResultOp<"convert", [NoMemoryEffect]> {
     static bool isPointerCompatible(mlir::Type ty);
     static bool canBeConverted(mlir::Type inType, mlir::Type outType);
     static bool areVectorsCompatible(mlir::Type inTy, mlir::Type outTy);
+    mlir::Value getViewSource() { return getValue(); }
   }];
   let hasCanonicalizer = 1;
 }
diff --git a/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h b/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h
new file mode 100644
index 0000000000000..7afe97aac57e8
--- /dev/null
+++ b/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h
@@ -0,0 +1,58 @@
+//===- FIROpenACCOpsInterfaces.h --------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains external operation interfaces for FIR.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FLANG_OPTIMIZER_OPENACC_FIROPENACC_OPS_INTERFACES_H_
+#define FLANG_OPTIMIZER_OPENACC_FIROPENACC_OPS_INTERFACES_H_
+
+#include "mlir/Dialect/OpenACC/OpenACC.h"
+
+namespace fir {
+class DeclareOp;
+} // namespace fir
+
+namespace hlfir {
+class DeclareOp;
+class DesignateOp;
+} // namespace hlfir
+
+namespace fir::acc {
+
+template <typename Op>
+struct PartialEntityAccessModel
+    : public mlir::acc::PartialEntityAccessOpInterface::ExternalModel<
+          PartialEntityAccessModel<Op>, Op> {
+  mlir::Value getBaseEntity(mlir::Operation *op) const;
+
+  // Default implementation - returns false (partial view)
+  bool isCompleteView(mlir::Operation *op) const { return false; }
+};
+
+// Full specializations for declare operations
+template <>
+struct PartialEntityAccessModel<fir::DeclareOp>
+    : public mlir::acc::PartialEntityAccessOpInterface::ExternalModel<
+          PartialEntityAccessModel<fir::DeclareOp>, fir::DeclareOp> {
+  mlir::Value getBaseEntity(mlir::Operation *op) const;
+  bool isCompleteView(mlir::Operation *op) const;
+};
+
+template <>
+struct PartialEntityAccessModel<hlfir::DeclareOp>
+    : public mlir::acc::PartialEntityAccessOpInterface::ExternalModel<
+          PartialEntityAccessModel<hlfir::DeclareOp>, hlfir::DeclareOp> {
+  mlir::Value getBaseEntity(mlir::Operation *op) const;
+  bool isCompleteView(mlir::Operation *op) const;
+};
+
+} // namespace fir::acc
+
+#endif // FLANG_OPTIMIZER_OPENACC_FIROPENACC_OPS_INTERFACES_H_
diff --git a/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.h b/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.h
index 4817ed933ba06..3167c554abbdd 100644
--- a/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.h
+++ b/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.h
@@ -60,6 +60,8 @@ struct OpenACCMappableModel
   getOffsetInBytes(mlir::Type type, mlir::Value var, mlir::ValueRange accBounds,
                    const mlir::DataLayout &dataLayout) const;
 
+  bool hasUnknownDimensions(mlir::Type type) const;
+
   llvm::SmallVector<mlir::Value>
   generateAccBounds(mlir::Type type, mlir::Value var,
                     mlir::OpBuilder &builder) const;
diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h
index 553cbd52cb3fd..a7398a4ef970f 100644
--- a/flang/include/flang/Parser/dump-parse-tree.h
+++ b/flang/include/flang/Parser/dump-parse-tree.h
@@ -599,7 +599,7 @@ class ParseTreeDumper {
   NODE(parser, OmpInitClause)
   NODE(OmpInitClause, Modifier)
   NODE(parser, OmpInitializerClause)
-  NODE(parser, OmpInitializerProc)
+  NODE(parser, OmpInitializerExpression)
   NODE(parser, OmpInReductionClause)
   NODE(OmpInReductionClause, Modifier)
   NODE(parser, OmpInteropPreference)
@@ -677,10 +677,16 @@ class ParseTreeDumper {
   NODE_ENUM(OmpSeverityClause, Severity)
   NODE(parser, OmpStepComplexModifier)
   NODE(parser, OmpStepSimpleModifier)
+  NODE(parser, OmpStylizedDeclaration)
+  NODE(parser, OmpStylizedExpression)
+  NODE(parser, OmpStylizedInstance)
+  NODE(OmpStylizedInstance, Instance)
   NODE(parser, OmpTaskDependenceType)
   NODE_ENUM(OmpTaskDependenceType, Value)
   NODE(parser, OmpTaskReductionClause)
   NODE(OmpTaskReductionClause, Modifier)
+  NODE(parser, OmpThreadsetClause)
+  NODE_ENUM(OmpThreadsetClause, ThreadsetPolicy)
   NODE(parser, OmpToClause)
   NODE(OmpToClause, Modifier)
   NODE(parser, OmpTraitProperty)
diff --git a/flang/include/flang/Parser/openmp-utils.h b/flang/include/flang/Parser/openmp-utils.h
index f761332c9cfd7..49db091af93a7 100644
--- a/flang/include/flang/Parser/openmp-utils.h
+++ b/flang/include/flang/Parser/openmp-utils.h
@@ -25,6 +25,13 @@
 
 namespace Fortran::parser::omp {
 
+template <typename T> constexpr auto addr_if(std::optional<T> &x) {
+  return x ? &*x : nullptr;
+}
+template <typename T> constexpr auto addr_if(const std::optional<T> &x) {
+  return x ? &*x : nullptr;
+}
+
 namespace detail {
 using D = llvm::omp::Directive;
 
@@ -133,9 +140,24 @@ template <typename T> OmpDirectiveName GetOmpDirectiveName(const T &x) {
 }
 
 const OmpObjectList *GetOmpObjectList(const OmpClause &clause);
+
+template <typename T>
+const T *GetFirstArgument(const OmpDirectiveSpecification &spec) {
+  for (const OmpArgument &arg : spec.Arguments().v) {
+    if (auto *t{std::get_if<T>(&arg.u)}) {
+      return t;
+    }
+  }
+  return nullptr;
+}
+
 const BlockConstruct *GetFortranBlockConstruct(
     const ExecutionPartConstruct &epc);
 
+const OmpCombinerExpression *GetCombinerExpr(
+    const OmpReductionSpecifier &rspec);
+const OmpInitializerExpression *GetInitializerExpr(const OmpClause &init);
+
 } // namespace Fortran::parser::omp
 
 #endif // FORTRAN_PARSER_OPENMP_UTILS_H
diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h
index 2cf6faead479d..375790af90b74 100644
--- a/flang/include/flang/Parser/parse-tree.h
+++ b/flang/include/flang/Parser/parse-tree.h
@@ -24,7 +24,9 @@
 #include "provenance.h"
 #include "flang/Common/idioms.h"
 #include "flang/Common/indirection.h"
+#include "flang/Common/reference.h"
 #include "flang/Support/Fortran.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/Frontend/OpenACC/ACC.h.inc"
 #include "llvm/Frontend/OpenMP/OMP.h"
 #include "llvm/Frontend/OpenMP/OMPConstants.h"
@@ -3510,6 +3512,8 @@ struct OmpDirectiveName {
 
 // type-name list item
 struct OmpTypeName {
+  CharBlock source;
+  mutable const semantics::DeclTypeSpec *declTypeSpec{nullptr};
   UNION_CLASS_BOILERPLATE(OmpTypeName);
   std::variant<TypeSpec, DeclarationTypeSpec> u;
 };
@@ -3538,6 +3542,39 @@ struct OmpObjectList {
   WRAPPER_CLASS_BOILERPLATE(OmpObjectList, std::list<OmpObject>);
 };
 
+struct OmpStylizedDeclaration {
+  COPY_AND_ASSIGN_BOILERPLATE(OmpStylizedDeclaration);
+  // Since "Reference" isn't handled by parse-tree-visitor, add EmptyTrait,
+  // and visit the members by hand when needed.
+  using EmptyTrait = std::true_type;
+  common::Reference<const OmpTypeName> type;
+  EntityDecl var;
+};
+
+struct OmpStylizedInstance {
+  struct Instance {
+    UNION_CLASS_BOILERPLATE(Instance);
+    std::variant<AssignmentStmt, CallStmt, common::Indirection<Expr>> u;
+  };
+  TUPLE_CLASS_BOILERPLATE(OmpStylizedInstance);
+  std::tuple<std::list<OmpStylizedDeclaration>, Instance> t;
+};
+
+class ParseState;
+
+// Ref: [5.2:76], [6.0:185]
+//
+struct OmpStylizedExpression {
+  CharBlock source;
+  // Pointer to a temporary copy of the ParseState that is used to create
+  // additional parse subtrees for the stylized expression. This is only
+  // used internally during parsing and conveys no information to the
+  // consumers of the AST.
+  const ParseState *state{nullptr};
+  WRAPPER_CLASS_BOILERPLATE(
+      OmpStylizedExpression, std::list<OmpStylizedInstance>);
+};
+
 // Ref: [4.5:201-207], [5.0:293-299], [5.1:325-331], [5.2:124]
 //
 // reduction-identifier ->
@@ -3555,9 +3592,22 @@ struct OmpReductionIdentifier {
 // combiner-expression ->                           // since 4.5
 //    assignment-statement |
 //    function-reference
-struct OmpCombinerExpression {
-  UNION_CLASS_BOILERPLATE(OmpCombinerExpression);
-  std::variant<AssignmentStmt, FunctionReference> u;
+struct OmpCombinerExpression : public OmpStylizedExpression {
+  INHERITED_WRAPPER_CLASS_BOILERPLATE(
+      OmpCombinerExpression, OmpStylizedExpression);
+  static llvm::ArrayRef<CharBlock> Variables();
+};
+
+// Ref: [4.5:222:7-8], [5.0:305:28-29], [5.1:337:20-21], [5.2:127:6-8],
+//      [6.0:242:3-5]
+//
+// initializer-expression ->                        // since 4.5
+//    OMP_PRIV = expression |
+//    subroutine-name(argument-list)
+struct OmpInitializerExpression : public OmpStylizedExpression {
+  INHERITED_WRAPPER_CLASS_BOILERPLATE(
+      OmpInitializerExpression, OmpStylizedExpression);
+  static llvm::ArrayRef<CharBlock> Variables();
 };
 
 inline namespace arguments {
@@ -4558,16 +4608,9 @@ struct OmpInReductionClause {
   std::tuple<MODIFIERS(), OmpObjectList> t;
 };
 
-// declare-reduction -> DECLARE REDUCTION (reduction-identifier : type-list
-//                                              : combiner) [initializer-clause]
-struct OmpInitializerProc {
-  TUPLE_CLASS_BOILERPLATE(OmpInitializerProc);
-  std::tuple<ProcedureDesignator, std::list<ActualArgSpec>> t;
-};
 // Initialization for declare reduction construct
 struct OmpInitializerClause {
-  UNION_CLASS_BOILERPLATE(OmpInitializerClause);
-  std::variant<OmpInitializerProc, AssignmentStmt> u;
+  WRAPPER_CLASS_BOILERPLATE(OmpInitializerClause, OmpInitializerExpression);
 };
 
 // Ref: [4.5:199-201], [5.0:288-290], [5.1:321-322], [5.2:115-117]
@@ -4782,6 +4825,14 @@ struct OmpTaskReductionClause {
   std::tuple<MODIFIERS(), OmpObjectList> t;
 };
 
+// Ref: [6.0:442]
+// threadset-clause ->
+//     THREADSET(omp_pool|omp_team)
+struct OmpThreadsetClause {
+  ENUM_CLASS(ThreadsetPolicy, Omp_Pool, Omp_Team)
+  WRAPPER_CLASS_BOILERPLATE(OmpThreadsetClause, ThreadsetPolicy);
+};
+
 // Ref: [4.5:107-109], [5.0:176-180], [5.1:205-210], [5.2:167-168]
 //
 // to-clause (in DECLARE TARGET) ->
diff --git a/flang/include/flang/Semantics/dump-expr.h b/flang/include/flang/Semantics/dump-expr.h
index 2dbd4cb60be59..5a78e13b19e5d 100644
--- a/flang/include/flang/Semantics/dump-expr.h
+++ b/flang/include/flang/Semantics/dump-expr.h
@@ -48,10 +48,11 @@ class DumpEvaluateExpr {
       // "... [with T = xyz; std::string_view = ...]"
 #ifdef __clang__
       std::string_view front("[T = ");
+      std::string_view back("]");
 #else
       std::string_view front("[with T = ");
-#endif
       std::string_view back("; std::string_view =");
+#endif
 
 #elif defined(_MSC_VER)
 #define DUMP_EXPR_SHOW_TYPE
diff --git a/flang/include/flang/Semantics/symbol.h b/flang/include/flang/Semantics/symbol.h
index 04a063957082a..cb27d544ed9f5 100644
--- a/flang/include/flang/Semantics/symbol.h
+++ b/flang/include/flang/Semantics/symbol.h
@@ -830,6 +830,8 @@ class Symbol {
       OmpUseDevicePtr, OmpUseDeviceAddr, OmpIsDevicePtr, OmpHasDeviceAddr,
       // OpenMP data-copying attribute
       OmpCopyIn, OmpCopyPrivate,
+      // OpenMP special variables
+      OmpInVar, OmpOrigVar, OmpOutVar, OmpPrivVar,
       // OpenMP miscellaneous flags
       OmpCommonBlock, OmpReduction, OmpInReduction, OmpAligned, OmpNontemporal,
       OmpAllocate, OmpDeclarativeAllocateDirective,
diff --git a/flang/include/flang/Support/Fortran.h b/flang/include/flang/Support/Fortran.h
index ea0344ecb0830..cf39781c1e8a7 100644
--- a/flang/include/flang/Support/Fortran.h
+++ b/flang/include/flang/Support/Fortran.h
@@ -86,8 +86,9 @@ ENUM_CLASS(IgnoreTKR,
     Rank, // R - don't check ranks
     Device, // D - don't check host/device residence
     Managed, // M - don't check managed storage
-    Contiguous) // C - don't check for storage sequence association with a
+    Contiguous, // C - don't check for storage sequence association with a
                 // potentially non-contiguous object
+    Pointer) // P - ignore pointer and allocatable matching
 using IgnoreTKRSet = EnumSet<IgnoreTKR, 8>;
 // IGNORE_TKR(A) = IGNORE_TKR(TKRDM)
 static constexpr IgnoreTKRSet ignoreTKRAll{IgnoreTKR::Type, IgnoreTKR::Kind,
diff --git a/flang/lib/Evaluate/common.cpp b/flang/lib/Evaluate/common.cpp
index 46c75a5c2ee44..ed6a0ef93b0db 100644
--- a/flang/lib/Evaluate/common.cpp
+++ b/flang/lib/Evaluate/common.cpp
@@ -13,24 +13,28 @@ using namespace Fortran::parser::literals;
 
 namespace Fortran::evaluate {
 
-void RealFlagWarnings(
-    FoldingContext &context, const RealFlags &flags, const char *operation) {
+void FoldingContext::RealFlagWarnings(
+    const RealFlags &flags, const char *operation) {
   static constexpr auto warning{common::UsageWarning::FoldingException};
   if (flags.test(RealFlag::Overflow)) {
-    context.Warn(warning, "overflow on %s"_warn_en_US, operation);
+    Warn(warning, "overflow on %s%s"_warn_en_US, operation,
+        realFlagWarningContext_);
   }
   if (flags.test(RealFlag::DivideByZero)) {
     if (std::strcmp(operation, "division") == 0) {
-      context.Warn(warning, "division by zero"_warn_en_US);
+      Warn(warning, "division by zero%s"_warn_en_US, realFlagWarningContext_);
     } else {
-      context.Warn(warning, "division by zero on %s"_warn_en_US, operation);
+      Warn(warning, "division by zero on %s%s"_warn_en_US, operation,
+          realFlagWarningContext_);
     }
   }
   if (flags.test(RealFlag::InvalidArgument)) {
-    context.Warn(warning, "invalid argument on %s"_warn_en_US, operation);
+    Warn(warning, "invalid argument on %s%s"_warn_en_US, operation,
+        realFlagWarningContext_);
   }
   if (flags.test(RealFlag::Underflow)) {
-    context.Warn(warning, "underflow on %s"_warn_en_US, operation);
+    Warn(warning, "underflow on %s%s"_warn_en_US, operation,
+        realFlagWarningContext_);
   }
 }
 
diff --git a/flang/lib/Evaluate/fold-implementation.h b/flang/lib/Evaluate/fold-implementation.h
index 3fdf3a6f38848..52ea627d0bbe4 100644
--- a/flang/lib/Evaluate/fold-implementation.h
+++ b/flang/lib/Evaluate/fold-implementation.h
@@ -1862,7 +1862,7 @@ Expr<TO> FoldOperation(
                 std::snprintf(buffer, sizeof buffer,
                     "INTEGER(%d) to REAL(%d) conversion", Operand::kind,
                     TO::kind);
-                RealFlagWarnings(ctx, converted.flags, buffer);
+                ctx.RealFlagWarnings(converted.flags, buffer);
               }
               return ScalarConstantToExpr(std::move(converted.value));
             } else if constexpr (FromCat == TypeCategory::Real) {
@@ -1871,7 +1871,7 @@ Expr<TO> FoldOperation(
               if (!converted.flags.empty()) {
                 std::snprintf(buffer, sizeof buffer,
                     "REAL(%d) to REAL(%d) conversion", Operand::kind, TO::kind);
-                RealFlagWarnings(ctx, converted.flags, buffer);
+                ctx.RealFlagWarnings(converted.flags, buffer);
               }
               if (ctx.targetCharacteristics().areSubnormalsFlushedToZero()) {
                 converted.value = converted.value.FlushSubnormalToZero();
@@ -2012,7 +2012,7 @@ Expr<T> FoldOperation(FoldingContext &context, Add<T> &&x) {
     } else {
       auto sum{folded->first.Add(
           folded->second, context.targetCharacteristics().roundingMode())};
-      RealFlagWarnings(context, sum.flags, "addition");
+      context.RealFlagWarnings(sum.flags, "addition");
       if (context.targetCharacteristics().areSubnormalsFlushedToZero()) {
         sum.value = sum.value.FlushSubnormalToZero();
       }
@@ -2041,7 +2041,7 @@ Expr<T> FoldOperation(FoldingContext &context, Subtract<T> &&x) {
     } else {
       auto difference{folded->first.Subtract(
           folded->second, context.targetCharacteristics().roundingMode())};
-      RealFlagWarnings(context, difference.flags, "subtraction");
+      context.RealFlagWarnings(difference.flags, "subtraction");
       if (context.targetCharacteristics().areSubnormalsFlushedToZero()) {
         difference.value = difference.value.FlushSubnormalToZero();
       }
@@ -2070,7 +2070,7 @@ Expr<T> FoldOperation(FoldingContext &context, Multiply<T> &&x) {
     } else {
       auto product{folded->first.Multiply(
           folded->second, context.targetCharacteristics().roundingMode())};
-      RealFlagWarnings(context, product.flags, "multiplication");
+      context.RealFlagWarnings(product.flags, "multiplication");
       if (context.targetCharacteristics().areSubnormalsFlushedToZero()) {
         product.value = product.value.FlushSubnormalToZero();
       }
@@ -2141,7 +2141,7 @@ Expr<T> FoldOperation(FoldingContext &context, Divide<T> &&x) {
         }
       }
       if (!isCanonicalNaNOrInf) {
-        RealFlagWarnings(context, quotient.flags, "division");
+        context.RealFlagWarnings(quotient.flags, "division");
       }
       if (context.targetCharacteristics().areSubnormalsFlushedToZero()) {
         quotient.value = quotient.value.FlushSubnormalToZero();
@@ -2201,7 +2201,7 @@ Expr<T> FoldOperation(FoldingContext &context, RealToIntPower<T> &&x) {
       [&](auto &y) -> Expr<T> {
         if (auto folded{OperandsAreConstants(x.left(), y)}) {
           auto power{evaluate::IntPower(folded->first, folded->second)};
-          RealFlagWarnings(context, power.flags, "power with INTEGER exponent");
+          context.RealFlagWarnings(power.flags, "power with INTEGER exponent");
           if (context.targetCharacteristics().areSubnormalsFlushedToZero()) {
             power.value = power.value.FlushSubnormalToZero();
           }
diff --git a/flang/lib/Evaluate/host.cpp b/flang/lib/Evaluate/host.cpp
index 25409ac3418b8..bf0249647162a 100644
--- a/flang/lib/Evaluate/host.cpp
+++ b/flang/lib/Evaluate/host.cpp
@@ -140,8 +140,8 @@ void HostFloatingPointEnvironment::CheckAndRestoreFloatingPointEnvironment(
   }
 
   if (!flags_.empty()) {
-    RealFlagWarnings(
-        context, flags_, "evaluation of intrinsic function or operation");
+    context.RealFlagWarnings(
+        flags_, "evaluation of intrinsic function or operation");
   }
   errno = 0;
   if (fesetenv(&originalFenv_) != 0) {
diff --git a/flang/lib/Evaluate/intrinsics-library.cpp b/flang/lib/Evaluate/intrinsics-library.cpp
index 9820aa3d2ea3d..d8af5246fabdd 100644
--- a/flang/lib/Evaluate/intrinsics-library.cpp
+++ b/flang/lib/Evaluate/intrinsics-library.cpp
@@ -1043,7 +1043,7 @@ std::optional<HostRuntimeWrapper> GetHostRuntimeWrapper(const std::string &name,
   if (const auto *hostFunction{
           SearchHostRuntime(name, biggerResultType, biggerArgTypes)}) {
     auto hostFolderWithChecks{AddArgumentVerifierIfAny(name, *hostFunction)};
-    return [hostFunction, resultType, hostFolderWithChecks](
+    return [hostFunction, resultType, hostFolderWithChecks, name](
                FoldingContext &context, std::vector<Expr<SomeType>> &&args) {
       auto nArgs{args.size()};
       for (size_t i{0}; i < nArgs; ++i) {
@@ -1051,6 +1051,8 @@ std::optional<HostRuntimeWrapper> GetHostRuntimeWrapper(const std::string &name,
             ConvertToType(hostFunction->argumentTypes[i], std::move(args[i]))
                 .value());
       }
+      auto restorer{context.SetRealFlagWarningContext(
+          " after folding a call to '"s + name + "'"s)};
       return Fold(context,
           ConvertToType(
               resultType, hostFolderWithChecks(context, std::move(args)))
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 6e729874eb5e6..0f4b39a07c5da 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -4876,6 +4876,10 @@ class FirConverter : public Fortran::lower::AbstractConverter {
       mlir::Value shape = builder->genShape(loc, lbounds, extents);
       rhsBox = fir::ReboxOp::create(*builder, loc, lhsBoxType, rhsBox, shape,
                                     /*slice=*/mlir::Value{});
+    } else if (fir::isClassStarType(lhsBoxType) &&
+               !fir::ConvertOp::canBeConverted(rhsBoxType, lhsBoxType)) {
+      rhsBox = fir::ReboxOp::create(*builder, loc, lhsBoxType, rhsBox,
+                                    mlir::Value{}, mlir::Value{});
     }
     return rhsBox;
   }
diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp
index d39f9dda92a28..0f60b47991004 100644
--- a/flang/lib/Lower/OpenMP/Clauses.cpp
+++ b/flang/lib/Lower/OpenMP/Clauses.cpp
@@ -1482,6 +1482,21 @@ ThreadLimit make(const parser::OmpClause::ThreadLimit &inp,
   return ThreadLimit{/*Threadlim=*/makeExpr(inp.v, semaCtx)};
 }
 
+Threadset make(const parser::OmpClause::Threadset &inp,
+               semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpThreadsetClause
+  using wrapped = parser::OmpThreadsetClause;
+
+  CLAUSET_ENUM_CONVERT( //
+      convert, wrapped::ThreadsetPolicy, Threadset::ThreadsetPolicy,
+      // clang-format off
+      MS(Omp_Pool, Omp_Pool)
+      MS(Omp_Team, Omp_Team)
+      // clang-format on
+  );
+  return Threadset{/*ThreadsetPolicy=*/convert(inp.v.v)};
+}
+
 // Threadprivate: empty
 // Threads: empty
 
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 39bac818fe5d0..15ea84565dd75 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -50,6 +50,7 @@
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMTypes.h"
 #include "mlir/Dialect/Math/IR/Math.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -358,6 +359,14 @@ static constexpr IntrinsicHandler handlers[]{
      &I::genBarrierInit,
      {{{"barrier", asAddr}, {"count", asValue}}},
      /*isElemental=*/false},
+    {"barrier_try_wait",
+     &I::genBarrierTryWait,
+     {{{"barrier", asAddr}, {"token", asValue}}},
+     /*isElemental=*/false},
+    {"barrier_try_wait_sleep",
+     &I::genBarrierTryWaitSleep,
+     {{{"barrier", asAddr}, {"token", asValue}, {"ns", asValue}}},
+     /*isElemental=*/false},
     {"bessel_jn",
      &I::genBesselJn,
      {{{"n1", asValue}, {"n2", asValue}, {"x", asValue}}},
@@ -1036,10 +1045,87 @@ static constexpr IntrinsicHandler handlers[]{
        {"dst", asAddr},
        {"nbytes", asValue}}},
      /*isElemental=*/false},
+    {"tma_bulk_ldc4",
+     &I::genTMABulkLoadC4,
+     {{{"barrier", asAddr},
+       {"src", asAddr},
+       {"dst", asAddr},
+       {"nelems", asValue}}},
+     /*isElemental=*/false},
+    {"tma_bulk_ldc8",
+     &I::genTMABulkLoadC8,
+     {{{"barrier", asAddr},
+       {"src", asAddr},
+       {"dst", asAddr},
+       {"nelems", asValue}}},
+     /*isElemental=*/false},
+    {"tma_bulk_ldi4",
+     &I::genTMABulkLoadI4,
+     {{{"barrier", asAddr},
+       {"src", asAddr},
+       {"dst", asAddr},
+       {"nelems", asValue}}},
+     /*isElemental=*/false},
+    {"tma_bulk_ldi8",
+     &I::genTMABulkLoadI8,
+     {{{"barrier", asAddr},
+       {"src", asAddr},
+       {"dst", asAddr},
+       {"nelems", asValue}}},
+     /*isElemental=*/false},
+    {"tma_bulk_ldr2",
+     &I::genTMABulkLoadR2,
+     {{{"barrier", asAddr},
+       {"src", asAddr},
+       {"dst", asAddr},
+       {"nelems", asValue}}},
+     /*isElemental=*/false},
+    {"tma_bulk_ldr4",
+     &I::genTMABulkLoadR4,
+     {{{"barrier", asAddr},
+       {"src", asAddr},
+       {"dst", asAddr},
+       {"nelems", asValue}}},
+     /*isElemental=*/false},
+    {"tma_bulk_ldr8",
+     &I::genTMABulkLoadR8,
+     {{{"barrier", asAddr},
+       {"src", asAddr},
+       {"dst", asAddr},
+       {"nelems", asValue}}},
+     /*isElemental=*/false},
     {"tma_bulk_s2g",
      &I::genTMABulkS2G,
      {{{"src", asAddr}, {"dst", asAddr}, {"nbytes", asValue}}},
      /*isElemental=*/false},
+    {"tma_bulk_store_c4",
+     &I::genTMABulkStoreC4,
+     {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}},
+     /*isElemental=*/false},
+    {"tma_bulk_store_c8",
+     &I::genTMABulkStoreC8,
+     {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}},
+     /*isElemental=*/false},
+    {"tma_bulk_store_i4",
+     &I::genTMABulkStoreI4,
+     {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}},
+     /*isElemental=*/false},
+    {"tma_bulk_store_i8",
+     &I::genTMABulkStoreI8,
+     {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}},
+     /*isElemental=*/false},
+    {"tma_bulk_store_r2",
+     &I::genTMABulkStoreR2,
+     {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}},
+     /*isElemental=*/false},
+    {"tma_bulk_store_r4",
+     &I::genTMABulkStoreR4,
+     {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}},
+     /*isElemental=*/false},
+    {"tma_bulk_store_r8",
+     &I::genTMABulkStoreR8,
+     {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}},
+     /*isElemental=*/false},
     {"tma_bulk_wait_group",
      &I::genTMABulkWaitGroup,
      {{}},
@@ -3273,8 +3359,8 @@ void IntrinsicLibrary::genBarrierInit(llvm::ArrayRef<fir::ExtendedValue> args) {
   assert(args.size() == 2);
   mlir::Value barrier = convertPtrToNVVMSpace(
       builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared);
-  mlir::NVVM::MBarrierInitSharedOp::create(builder, loc, barrier,
-                                           fir::getBase(args[1]), {});
+  mlir::NVVM::MBarrierInitOp::create(builder, loc, barrier,
+                                     fir::getBase(args[1]), {});
   auto kind = mlir::NVVM::ProxyKindAttr::get(
       builder.getContext(), mlir::NVVM::ProxyKind::async_shared);
   auto space = mlir::NVVM::SharedSpaceAttr::get(
@@ -3282,6 +3368,57 @@ void IntrinsicLibrary::genBarrierInit(llvm::ArrayRef<fir::ExtendedValue> args) {
   mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space);
 }
 
+// BARRIER_TRY_WAIT (CUDA)
+mlir::Value
+IntrinsicLibrary::genBarrierTryWait(mlir::Type resultType,
+                                    llvm::ArrayRef<mlir::Value> args) {
+  assert(args.size() == 2);
+  mlir::Value res = fir::AllocaOp::create(builder, loc, resultType);
+  mlir::Value zero = builder.createIntegerConstant(loc, resultType, 0);
+  fir::StoreOp::create(builder, loc, zero, res);
+  mlir::Value ns =
+      builder.createIntegerConstant(loc, builder.getI32Type(), 1000000);
+  mlir::Value load = fir::LoadOp::create(builder, loc, res);
+  auto whileOp = mlir::scf::WhileOp::create(
+      builder, loc, mlir::TypeRange{resultType}, mlir::ValueRange{load});
+  mlir::Block *beforeBlock = builder.createBlock(&whileOp.getBefore());
+  mlir::Value beforeArg = beforeBlock->addArgument(resultType, loc);
+  builder.setInsertionPointToStart(beforeBlock);
+  mlir::Value condition = mlir::arith::CmpIOp::create(
+      builder, loc, mlir::arith::CmpIPredicate::ne, beforeArg, zero);
+  mlir::scf::ConditionOp::create(builder, loc, condition, beforeArg);
+  mlir::Block *afterBlock = builder.createBlock(&whileOp.getAfter());
+  afterBlock->addArgument(resultType, loc);
+  builder.setInsertionPointToStart(afterBlock);
+  auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext());
+  auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]);
+  mlir::Value ret =
+      mlir::NVVM::InlinePtxOp::create(
+          builder, loc, {resultType}, {barrier, args[1], ns}, {},
+          ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%1], %2, %3; "
+          "selp.b32 %0, 1, 0, p;",
+          {})
+          .getResult(0);
+  mlir::scf::YieldOp::create(builder, loc, ret);
+  builder.setInsertionPointAfter(whileOp);
+  return whileOp.getResult(0);
+}
+
+// BARRIER_TRY_WAIT_SLEEP (CUDA)
+mlir::Value
+IntrinsicLibrary::genBarrierTryWaitSleep(mlir::Type resultType,
+                                         llvm::ArrayRef<mlir::Value> args) {
+  assert(args.size() == 3);
+  auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext());
+  auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]);
+  return mlir::NVVM::InlinePtxOp::create(
+             builder, loc, {resultType}, {barrier, args[1], args[2]}, {},
+             ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%1], %2, %3; "
+             "selp.b32 %0, 1, 0, p;",
+             {})
+      .getResult(0);
+}
+
 // BESSEL_JN
 fir::ExtendedValue
 IntrinsicLibrary::genBesselJn(mlir::Type resultType,
@@ -9218,6 +9355,95 @@ void IntrinsicLibrary::genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue> args) {
       builder, loc, dst, src, barrier, fir::getBase(args[3]), {}, {});
 }
 
+static void genTMABulkLoad(fir::FirOpBuilder &builder, mlir::Location loc,
+                           mlir::Value barrier, mlir::Value src,
+                           mlir::Value dst, mlir::Value nelem,
+                           mlir::Value eleSize) {
+  mlir::Value size = mlir::arith::MulIOp::create(builder, loc, nelem, eleSize);
+  auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext());
+  barrier = builder.createConvert(loc, llvmPtrTy, barrier);
+  dst = builder.createConvert(loc, llvmPtrTy, dst);
+  src = builder.createConvert(loc, llvmPtrTy, src);
+  mlir::NVVM::InlinePtxOp::create(
+      builder, loc, mlir::TypeRange{}, {dst, src, size, barrier}, {},
+      "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], "
+      "[%1], %2, [%3];",
+      {});
+  mlir::NVVM::InlinePtxOp::create(
+      builder, loc, mlir::TypeRange{}, {barrier, size}, {},
+      "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;", {});
+}
+
+// TMA_BULK_LOADC4
+void IntrinsicLibrary::genTMABulkLoadC4(
+    llvm::ArrayRef<fir::ExtendedValue> args) {
+  assert(args.size() == 4);
+  mlir::Value eleSize =
+      builder.createIntegerConstant(loc, builder.getI32Type(), 8);
+  genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
+                 fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
+}
+
+// TMA_BULK_LOADC8
+void IntrinsicLibrary::genTMABulkLoadC8(
+    llvm::ArrayRef<fir::ExtendedValue> args) {
+  assert(args.size() == 4);
+  mlir::Value eleSize =
+      builder.createIntegerConstant(loc, builder.getI32Type(), 16);
+  genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
+                 fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
+}
+
+// TMA_BULK_LOADI4
+void IntrinsicLibrary::genTMABulkLoadI4(
+    llvm::ArrayRef<fir::ExtendedValue> args) {
+  assert(args.size() == 4);
+  mlir::Value eleSize =
+      builder.createIntegerConstant(loc, builder.getI32Type(), 4);
+  genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
+                 fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
+}
+
+// TMA_BULK_LOADI8
+void IntrinsicLibrary::genTMABulkLoadI8(
+    llvm::ArrayRef<fir::ExtendedValue> args) {
+  assert(args.size() == 4);
+  mlir::Value eleSize =
+      builder.createIntegerConstant(loc, builder.getI32Type(), 8);
+  genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
+                 fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
+}
+
+// TMA_BULK_LOADR2
+void IntrinsicLibrary::genTMABulkLoadR2(
+    llvm::ArrayRef<fir::ExtendedValue> args) {
+  assert(args.size() == 4);
+  mlir::Value eleSize =
+      builder.createIntegerConstant(loc, builder.getI32Type(), 2);
+  genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
+                 fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
+}
+
+// TMA_BULK_LOADR4
+void IntrinsicLibrary::genTMABulkLoadR4(
+    llvm::ArrayRef<fir::ExtendedValue> args) {
+  assert(args.size() == 4);
+  mlir::Value eleSize =
+      builder.createIntegerConstant(loc, builder.getI32Type(), 4);
+  genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
+                 fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
+}
+
+// TMA_BULK_LOADR8
+void IntrinsicLibrary::genTMABulkLoadR8(
+    llvm::ArrayRef<fir::ExtendedValue> args) {
+  assert(args.size() == 4);
+  mlir::Value eleSize =
+      builder.createIntegerConstant(loc, builder.getI32Type(), 8);
+  genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
+                 fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
+}
+
 // TMA_BULK_S2G (CUDA)
 void IntrinsicLibrary::genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue> args) {
   assert(args.size() == 3);
@@ -9227,6 +9453,97 @@ void IntrinsicLibrary::genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue> args) {
                                           mlir::NVVM::NVVMMemorySpace::Global);
   mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create(
       builder, loc, dst, src, fir::getBase(args[2]), {}, {});
+
+  mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {},
+                                  "cp.async.bulk.commit_group", {});
+  mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc,
+                                             builder.getI32IntegerAttr(0), {});
+}
+
+static void genTMABulkStore(fir::FirOpBuilder &builder, mlir::Location loc,
+                            mlir::Value src, mlir::Value dst, mlir::Value count,
+                            mlir::Value eleSize) {
+  mlir::Value size = mlir::arith::MulIOp::create(builder, loc, eleSize, count);
+  src = convertPtrToNVVMSpace(builder, loc, src,
+                              mlir::NVVM::NVVMMemorySpace::Shared);
+  dst = convertPtrToNVVMSpace(builder, loc, dst,
+                              mlir::NVVM::NVVMMemorySpace::Global);
+  mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create(builder, loc, dst, src,
+                                                     size, {}, {});
+  mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {},
+                                  "cp.async.bulk.commit_group", {});
+  mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc,
+                                             builder.getI32IntegerAttr(0), {});
+}
+
+// TMA_BULK_STORE_C4 (CUDA)
+void IntrinsicLibrary::genTMABulkStoreC4(
+    llvm::ArrayRef<fir::ExtendedValue> args) {
+  assert(args.size() == 3);
+  mlir::Value eleSize =
+      builder.createIntegerConstant(loc, builder.getI32Type(), 8);
+  genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
+                  fir::getBase(args[2]), eleSize);
+}
+
+// TMA_BULK_STORE_C8 (CUDA)
+void IntrinsicLibrary::genTMABulkStoreC8(
+    llvm::ArrayRef<fir::ExtendedValue> args) {
+  assert(args.size() == 3);
+  mlir::Value eleSize =
+      builder.createIntegerConstant(loc, builder.getI32Type(), 16);
+  genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
+                  fir::getBase(args[2]), eleSize);
+}
+
+// TMA_BULK_STORE_I4 (CUDA)
+void IntrinsicLibrary::genTMABulkStoreI4(
+    llvm::ArrayRef<fir::ExtendedValue> args) {
+  assert(args.size() == 3);
+  mlir::Value eleSize =
+      builder.createIntegerConstant(loc, builder.getI32Type(), 4);
+  genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
+                  fir::getBase(args[2]), eleSize);
+}
+
+// TMA_BULK_STORE_I8 (CUDA)
+void IntrinsicLibrary::genTMABulkStoreI8(
+    llvm::ArrayRef<fir::ExtendedValue> args) {
+  assert(args.size() == 3);
+  mlir::Value eleSize =
+      builder.createIntegerConstant(loc, builder.getI32Type(), 8);
+  genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
+                  fir::getBase(args[2]), eleSize);
+}
+
+// TMA_BULK_STORE_R2 (CUDA)
+void IntrinsicLibrary::genTMABulkStoreR2(
+    llvm::ArrayRef<fir::ExtendedValue> args) {
+  assert(args.size() == 3);
+  mlir::Value eleSize =
+      builder.createIntegerConstant(loc, builder.getI32Type(), 2);
+  genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
+                  fir::getBase(args[2]), eleSize);
+}
+
+// TMA_BULK_STORE_R4 (CUDA)
+void IntrinsicLibrary::genTMABulkStoreR4(
+    llvm::ArrayRef<fir::ExtendedValue> args) {
+  assert(args.size() == 3);
+  mlir::Value eleSize =
+      builder.createIntegerConstant(loc, builder.getI32Type(), 4);
+  genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
+                  fir::getBase(args[2]), eleSize);
+}
+
+// TMA_BULK_STORE_R8 (CUDA)
+void IntrinsicLibrary::genTMABulkStoreR8(
+    llvm::ArrayRef<fir::ExtendedValue> args) {
+  assert(args.size() == 3);
+  mlir::Value eleSize =
+      builder.createIntegerConstant(loc, builder.getI32Type(), 8);
+  genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
+                  fir::getBase(args[2]), eleSize);
 }
 
 // TMA_BULK_WAIT_GROUP (CUDA)
diff --git a/flang/lib/Optimizer/Builder/TemporaryStorage.cpp b/flang/lib/Optimizer/Builder/TemporaryStorage.cpp
index 7e329e357d7b3..5db40aff91878 100644
--- a/flang/lib/Optimizer/Builder/TemporaryStorage.cpp
+++ b/flang/lib/Optimizer/Builder/TemporaryStorage.cpp
@@ -258,13 +258,9 @@ void fir::factory::AnyVariableStack::pushValue(mlir::Location loc,
                                                fir::FirOpBuilder &builder,
                                                mlir::Value variable) {
   hlfir::Entity entity{variable};
-  mlir::Type storageElementType =
-      hlfir::getFortranElementType(retValueBox.getType());
-  auto [box, maybeCleanUp] =
-      hlfir::convertToBox(loc, builder, entity, storageElementType);
+  mlir::Value box =
+      hlfir::genVariableBox(loc, builder, entity, entity.getBoxType());
   fir::runtime::genPushDescriptor(loc, builder, opaquePtr, fir::getBase(box));
-  if (maybeCleanUp)
-    (*maybeCleanUp)();
 }
 
 void fir::factory::AnyVariableStack::resetFetchPosition(
diff --git a/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp b/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp
index 0776346870c72..8ca2869993443 100644
--- a/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp
+++ b/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp
@@ -143,7 +143,8 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase<TargetRewrite> {
         llvm::SmallVector<mlir::Type> operandsTypes;
         for (auto arg : gpuLaunchFunc.getKernelOperands())
           operandsTypes.push_back(arg.getType());
-        auto fctTy = mlir::FunctionType::get(&context, operandsTypes, {});
+        auto fctTy = mlir::FunctionType::get(&context, operandsTypes,
+                                             gpuLaunchFunc.getResultTypes());
         if (!hasPortableSignature(fctTy, op))
           convertCallOp(gpuLaunchFunc, fctTy);
       } else if (auto addr = mlir::dyn_cast<fir::AddrOfOp>(op)) {
@@ -520,10 +521,14 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase<TargetRewrite> {
     llvm::SmallVector<mlir::Value, 1> newCallResults;
     // TODO propagate/update call argument and result attributes.
     if constexpr (std::is_same_v<std::decay_t<A>, mlir::gpu::LaunchFuncOp>) {
+      mlir::Value asyncToken = callOp.getAsyncToken();
       auto newCall = A::create(*rewriter, loc, callOp.getKernel(),
                                callOp.getGridSizeOperandValues(),
                                callOp.getBlockSizeOperandValues(),
-                               callOp.getDynamicSharedMemorySize(), newOpers);
+                               callOp.getDynamicSharedMemorySize(), newOpers,
+                               asyncToken ? asyncToken.getType() : nullptr,
+                               callOp.getAsyncDependencies(),
+                               /*clusterSize=*/std::nullopt);
       if (callOp.getClusterSizeX())
         newCall.getClusterSizeXMutable().assign(callOp.getClusterSizeX());
       if (callOp.getClusterSizeY())
diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp
index d0164f32d9b6a..4f97acaa88b7a 100644
--- a/flang/lib/Optimizer/Dialect/FIROps.cpp
+++ b/flang/lib/Optimizer/Dialect/FIROps.cpp
@@ -4484,7 +4484,7 @@ void fir::IfOp::getSuccessorRegions(
     llvm::SmallVectorImpl<mlir::RegionSuccessor> &regions) {
   // The `then` and the `else` region branch back to the parent operation.
   if (!point.isParent()) {
-    regions.push_back(mlir::RegionSuccessor(getResults()));
+    regions.push_back(mlir::RegionSuccessor(getOperation(), getResults()));
     return;
   }
 
@@ -4494,7 +4494,8 @@ void fir::IfOp::getSuccessorRegions(
   // Don't consider the else region if it is empty.
   mlir::Region *elseRegion = &this->getElseRegion();
   if (elseRegion->empty())
-    regions.push_back(mlir::RegionSuccessor());
+    regions.push_back(
+        mlir::RegionSuccessor(getOperation(), getOperation()->getResults()));
   else
     regions.push_back(mlir::RegionSuccessor(elseRegion));
 }
@@ -4513,7 +4514,7 @@ void fir::IfOp::getEntrySuccessorRegions(
     if (!getElseRegion().empty())
       regions.emplace_back(&getElseRegion());
     else
-      regions.emplace_back(getResults());
+      regions.emplace_back(getOperation(), getOperation()->getResults());
   }
 }
 
diff --git a/flang/lib/Optimizer/OpenACC/Support/CMakeLists.txt b/flang/lib/Optimizer/OpenACC/Support/CMakeLists.txt
index ef67ab1549537..898fb00d41dfe 100644
--- a/flang/lib/Optimizer/OpenACC/Support/CMakeLists.txt
+++ b/flang/lib/Optimizer/OpenACC/Support/CMakeLists.txt
@@ -2,6 +2,7 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
 
 add_flang_library(FIROpenACCSupport
   FIROpenACCAttributes.cpp
+  FIROpenACCOpsInterfaces.cpp
   FIROpenACCTypeInterfaces.cpp
   RegisterOpenACCExtensions.cpp
 
diff --git a/flang/lib/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.cpp b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.cpp
new file mode 100644
index 0000000000000..c1734be5185f4
--- /dev/null
+++ b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.cpp
@@ -0,0 +1,62 @@
+//===-- FIROpenACCOpsInterfaces.cpp ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of external operation interfaces for FIR.
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h"
+
+#include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/HLFIR/HLFIROps.h"
+
+namespace fir::acc {
+
+template <>
+mlir::Value PartialEntityAccessModel<fir::ArrayCoorOp>::getBaseEntity(
+    mlir::Operation *op) const {
+  return mlir::cast<fir::ArrayCoorOp>(op).getMemref();
+}
+
+template <>
+mlir::Value PartialEntityAccessModel<fir::CoordinateOp>::getBaseEntity(
+    mlir::Operation *op) const {
+  return mlir::cast<fir::CoordinateOp>(op).getRef();
+}
+
+template <>
+mlir::Value PartialEntityAccessModel<hlfir::DesignateOp>::getBaseEntity(
+    mlir::Operation *op) const {
+  return mlir::cast<hlfir::DesignateOp>(op).getMemref();
+}
+
+mlir::Value PartialEntityAccessModel<fir::DeclareOp>::getBaseEntity(
+    mlir::Operation *op) const {
+  return mlir::cast<fir::DeclareOp>(op).getStorage();
+}
+
+bool PartialEntityAccessModel<fir::DeclareOp>::isCompleteView(
+    mlir::Operation *op) const {
+  // Return false (partial view) only if storage is present
+  // Return true (complete view) if storage is absent
+  return !getBaseEntity(op);
+}
+
+mlir::Value PartialEntityAccessModel<hlfir::DeclareOp>::getBaseEntity(
+    mlir::Operation *op) const {
+  return mlir::cast<hlfir::DeclareOp>(op).getStorage();
+}
+
+bool PartialEntityAccessModel<hlfir::DeclareOp>::isCompleteView(
+    mlir::Operation *op) const {
+  // Return false (partial view) only if storage is present
+  // Return true (complete view) if storage is absent
+  return !getBaseEntity(op);
+}
+
+} // namespace fir::acc
diff --git a/flang/lib/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.cpp b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.cpp
index ed9e41c743754..ae0f5fb8197fa 100644
--- a/flang/lib/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.cpp
+++ b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.cpp
@@ -193,6 +193,28 @@ OpenACCMappableModel<fir::PointerType>::getOffsetInBytes(
     mlir::Type type, mlir::Value var, mlir::ValueRange accBounds,
     const mlir::DataLayout &dataLayout) const;
 
+template <typename Ty>
+bool OpenACCMappableModel<Ty>::hasUnknownDimensions(mlir::Type type) const {
+  assert(fir::isa_ref_type(type) && "expected FIR reference type");
+  return fir::hasDynamicSize(fir::unwrapRefType(type));
+}
+
+template bool OpenACCMappableModel<fir::ReferenceType>::hasUnknownDimensions(
+    mlir::Type type) const;
+
+template bool OpenACCMappableModel<fir::HeapType>::hasUnknownDimensions(
+    mlir::Type type) const;
+
+template bool OpenACCMappableModel<fir::PointerType>::hasUnknownDimensions(
+    mlir::Type type) const;
+
+template <>
+bool OpenACCMappableModel<fir::BaseBoxType>::hasUnknownDimensions(
+    mlir::Type type) const {
+  // Descriptor-based entities have dimensions encoded.
+  return false;
+}
+
 static llvm::SmallVector<mlir::Value>
 generateSeqTyAccBounds(fir::SequenceType seqType, mlir::Value var,
                        mlir::OpBuilder &builder) {
diff --git a/flang/lib/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.cpp b/flang/lib/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.cpp
index 717bf344e40aa..d71c40dfac03c 100644
--- a/flang/lib/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.cpp
+++ b/flang/lib/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.cpp
@@ -11,8 +11,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.h"
+
 #include "flang/Optimizer/Dialect/FIRDialect.h"
+#include "flang/Optimizer/Dialect/FIROps.h"
 #include "flang/Optimizer/Dialect/FIRType.h"
+#include "flang/Optimizer/HLFIR/HLFIRDialect.h"
+#include "flang/Optimizer/HLFIR/HLFIROps.h"
+#include "flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h"
 #include "flang/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.h"
 
 namespace fir::acc {
@@ -37,7 +42,24 @@ void registerOpenACCExtensions(mlir::DialectRegistry &registry) {
 
     fir::LLVMPointerType::attachInterface<
         OpenACCPointerLikeModel<fir::LLVMPointerType>>(*ctx);
+
+    fir::ArrayCoorOp::attachInterface<
+        PartialEntityAccessModel<fir::ArrayCoorOp>>(*ctx);
+    fir::CoordinateOp::attachInterface<
+        PartialEntityAccessModel<fir::CoordinateOp>>(*ctx);
+    fir::DeclareOp::attachInterface<PartialEntityAccessModel<fir::DeclareOp>>(
+        *ctx);
   });
+
+  // Register HLFIR operation interfaces
+  registry.addExtension(
+      +[](mlir::MLIRContext *ctx, hlfir::hlfirDialect *dialect) {
+        hlfir::DesignateOp::attachInterface<
+            PartialEntityAccessModel<hlfir::DesignateOp>>(*ctx);
+        hlfir::DeclareOp::attachInterface<
+            PartialEntityAccessModel<hlfir::DeclareOp>>(*ctx);
+      });
+
   registerAttrsExtensions(registry);
 }
 
diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp
index d1e081cfd1b41..4159d2e41b78c 100644
--- a/flang/lib/Parser/openmp-parsers.cpp
+++ b/flang/lib/Parser/openmp-parsers.cpp
@@ -275,6 +275,13 @@ struct SpecificModifierParser {
 
 // --- Iterator helpers -----------------------------------------------
 
+static EntityDecl MakeEntityDecl(ObjectName &&name) {
+  return EntityDecl(
+      /*ObjectName=*/std::move(name), std::optional<ArraySpec>{},
+      std::optional<CoarraySpec>{}, std::optional<CharLength>{},
+      std::optional<Initialization>{});
+}
+
 // [5.0:47:17-18] In an iterator-specifier, if the iterator-type is not
 // specified then the type of that iterator is default integer.
 // [5.0:49:14] The iterator-type must be an integer type.
@@ -282,11 +289,7 @@ static std::list<EntityDecl> makeEntityList(std::list<ObjectName> &&names) {
   std::list<EntityDecl> entities;
 
   for (auto iter = names.begin(), end = names.end(); iter != end; ++iter) {
-    EntityDecl entityDecl(
-        /*ObjectName=*/std::move(*iter), std::optional<ArraySpec>{},
-        std::optional<CoarraySpec>{}, std::optional<CharLength>{},
-        std::optional<Initialization>{});
-    entities.push_back(std::move(entityDecl));
+    entities.push_back(MakeEntityDecl(std::move(*iter)));
   }
   return entities;
 }
@@ -306,6 +309,217 @@ static TypeDeclarationStmt makeIterSpecDecl(std::list<ObjectName> &&names) {
       makeEntityList(std::move(names)));
 }
 
+// --- Stylized expression handling -----------------------------------
+
+// OpenMP has a concept of am "OpenMP stylized expression". Syntactially
+// it looks like a typical Fortran expression (or statement), except:
+// - the only variables allowed in it are OpenMP special variables, the
+//   exact set of these variables depends on the specific case of the
+//   stylized expression
+// - the special OpenMP variables present may assume one or more types,
+//   and the expression should be semantically valid for each type.
+//
+// The stylized expression can be thought of as a template, which will be
+// instantiated for each type provided somewhere in the context in which
+// the stylized expression appears.
+//
+// AST nodes:
+// - OmpStylizedExpression: contains the source string for the expression,
+//   plus the list of instances (OmpStylizedInstance).
+// - OmpStylizedInstance: corresponds to the instantiation of the stylized
+//   expression for a specific type. The way that the type is specified is
+//   by creating declarations (OmpStylizedDeclaration) for the special
+//   variables. Together with the AST tree corresponding to the stylized
+//   expression the instantiation has enough information for semantic
+//   analysis. Each instance has its own scope, and the special variables
+//   have their own Symbol's (local to the scope).
+// - OmpStylizedDeclaration: encapsulates the information that the visitors
+//   in resolve-names can use to "emulate" a declaration for a special
+//   variable and allow name resolution in the instantiation AST to work.
+//
+// Implementation specifics:
+// The semantic analysis stores "evaluate::Expr" in each AST node rooted
+// in parser::Expr (in the typedExpr member). The evaluate::Expr is specific
+// to a given type, and so to allow different types for a given expression,
+// for each type a separate copy of the parser::Expr subtree is created.
+// Normally, AST nodes are non-copyable (copy-ctor is deleted), so to create
+// several copies of a subtree, the same source string is parsed several
+// times. The ParseState member in OmpStylizedExpression is the parser state
+// immediately before the stylized expression.
+//
+// Initially, when OmpStylizedExpression is first created, the expression is
+// parsed as if it was an actual code, but this parsing is only done to
+// establish where the stylized expression ends (in the source). The source
+// and the initial parser state are stored in the object, and the instance
+// list is empty.
+// Once the parsing of the containing OmpDirectiveSpecification completes,
+// a post-processing "parser" (OmpStylizedInstanceCreator) executes. This
+// post-processor examines the directive specification to see if it expects
+// any stylized expressions to be contained in it, and then instantiates
+// them for each such directive.
+
+template <typename A> struct NeverParser {
+  using resultType = A;
+  std::optional<resultType> Parse(ParseState &state) const {
+    // Always fail, but without any messages.
+    return std::nullopt;
+  }
+};
+
+template <typename A> constexpr auto never() { return NeverParser<A>{}; }
+
+// Parser for optional<T> which always succeeds and returns std::nullptr.
+// It's only needed to produce "std::optional<CallStmt::Chevrons>" in
+// CallStmt.
+template <typename A, typename B = void> struct NullParser;
+template <typename B> struct NullParser<std::optional<B>> {
+  using resultType = std::optional<B>;
+  std::optional<resultType> Parse(ParseState &) const {
+    return resultType{std::nullopt};
+  }
+};
+
+template <typename A> constexpr auto null() { return NullParser<A>{}; }
+
+// OmpStylizedDeclaration and OmpStylizedInstance are helper classes, and
+// don't correspond to anything in the source. Their parsers should still
+// exist, but they should never be executed.
+TYPE_PARSER(construct<OmpStylizedDeclaration>(never<OmpStylizedDeclaration>()))
+TYPE_PARSER(construct<OmpStylizedInstance>(never<OmpStylizedInstance>()))
+
+TYPE_PARSER( //
+    construct<OmpStylizedInstance::Instance>(Parser<AssignmentStmt>{}) ||
+    construct<OmpStylizedInstance::Instance>(
+        sourced(construct<CallStmt>(Parser<ProcedureDesignator>{},
+            null<std::optional<CallStmt::Chevrons>>(),
+            parenthesized(optionalList(actualArgSpec))))) ||
+    construct<OmpStylizedInstance::Instance>(indirect(expr)))
+
+struct OmpStylizedExpressionParser {
+  using resultType = OmpStylizedExpression;
+
+  std::optional<resultType> Parse(ParseState &state) const {
+    auto *saved{new ParseState(state)};
+    auto getSource{verbatim(Parser<OmpStylizedInstance::Instance>{} >> ok)};
+    if (auto &&ok{getSource.Parse(state)}) {
+      OmpStylizedExpression result{std::list<OmpStylizedInstance>{}};
+      result.source = ok->source;
+      result.state = saved;
+      // result.v remains empty
+      return std::move(result);
+    }
+    delete saved;
+    return std::nullopt;
+  }
+};
+
+static void Instantiate(OmpStylizedExpression &ose,
+    llvm::ArrayRef<const OmpTypeName *> types, llvm::ArrayRef<CharBlock> vars) {
+  // 1. For each var in the vars list, declare it with the corresponding
+  //    type from types.
+  // 2. Run the parser to get the AST for the stylized expression.
+  // 3. Create OmpStylizedInstance and append it to the list in ose.
+  assert(types.size() == vars.size() && "List size mismatch");
+  // A ParseState object is irreversibly modified during parsing (in
+  // particular, it cannot be rewound to an earlier position in the source).
+  // Because of that we need to create a local copy for each instantiation.
+  // If rewinding was possible, we could just use the current one, and we
+  // wouldn't need to save it in the AST node.
+  ParseState state{DEREF(ose.state)};
+
+  std::list<OmpStylizedDeclaration> decls;
+  for (auto [type, var] : llvm::zip_equal(types, vars)) {
+    decls.emplace_back(OmpStylizedDeclaration{
+        common::Reference(*type), MakeEntityDecl(Name{var})});
+  }
+
+  if (auto &&instance{Parser<OmpStylizedInstance::Instance>{}.Parse(state)}) {
+    ose.v.emplace_back(
+        OmpStylizedInstance{std::move(decls), std::move(*instance)});
+  }
+}
+
+static void InstantiateForTypes(OmpStylizedExpression &ose,
+    const OmpTypeNameList &typeNames, llvm::ArrayRef<CharBlock> vars) {
+  // For each type in the type list, declare all variables in vars with
+  // that type, and complete the instantiation.
+  for (const OmpTypeName &t : typeNames.v) {
+    std::vector<const OmpTypeName *> types(vars.size(), &t);
+    Instantiate(ose, types, vars);
+  }
+}
+
+static void InstantiateDeclareReduction(OmpDirectiveSpecification &spec) {
+  // There can be arguments/clauses that don't make sense, that analysis
+  // is left until semantic checks. Tolerate any unexpected stuff.
+  auto *rspec{GetFirstArgument<OmpReductionSpecifier>(spec)};
+  if (!rspec) {
+    return;
+  }
+
+  const OmpTypeNameList *typeNames{nullptr};
+
+  if (auto *cexpr{
+          const_cast<OmpCombinerExpression *>(GetCombinerExpr(*rspec))}) {
+    typeNames = &std::get<OmpTypeNameList>(rspec->t);
+
+    InstantiateForTypes(*cexpr, *typeNames, OmpCombinerExpression::Variables());
+    delete cexpr->state;
+    cexpr->state = nullptr;
+  } else {
+    // If there are no types, there is nothing else to do.
+    return;
+  }
+
+  for (const OmpClause &clause : spec.Clauses().v) {
+    llvm::omp::Clause id{clause.Id()};
+    if (id == llvm::omp::Clause::OMPC_initializer) {
+      if (auto *iexpr{const_cast<OmpInitializerExpression *>(
+              GetInitializerExpr(clause))}) {
+        InstantiateForTypes(
+            *iexpr, *typeNames, OmpInitializerExpression::Variables());
+        delete iexpr->state;
+        iexpr->state = nullptr;
+      }
+    }
+  }
+}
+
+static void InstantiateStylizedDirective(OmpDirectiveSpecification &spec) {
+  const OmpDirectiveName &dirName{spec.DirName()};
+  if (dirName.v == llvm::omp::Directive::OMPD_declare_reduction) {
+    InstantiateDeclareReduction(spec);
+  }
+}
+
+template <typename P,
+    typename = std::enable_if_t<
+        std::is_same_v<typename P::resultType, OmpDirectiveSpecification>>>
+struct OmpStylizedInstanceCreator {
+  using resultType = OmpDirectiveSpecification;
+  constexpr OmpStylizedInstanceCreator(P p) : parser_(p) {}
+
+  std::optional<resultType> Parse(ParseState &state) const {
+    if (auto &&spec{parser_.Parse(state)}) {
+      InstantiateStylizedDirective(*spec);
+      return std::move(spec);
+    }
+    return std::nullopt;
+  }
+
+private:
+  const P parser_;
+};
+
+template <typename P>
+OmpStylizedInstanceCreator(P) -> OmpStylizedInstanceCreator<P>;
+
+// --- Parsers for types ----------------------------------------------
+
+TYPE_PARSER( //
+    sourced(construct<OmpTypeName>(Parser<DeclarationTypeSpec>{})) ||
+    sourced(construct<OmpTypeName>(Parser<TypeSpec>{})))
+
 // --- Parsers for arguments ------------------------------------------
 
 // At the moment these are only directive arguments. This is needed for
@@ -366,10 +580,6 @@ struct OmpArgumentListParser {
   }
 };
 
-TYPE_PARSER( //
-    construct<OmpTypeName>(Parser<DeclarationTypeSpec>{}) ||
-    construct<OmpTypeName>(Parser<TypeSpec>{}))
-
 // 2.15.3.6 REDUCTION (reduction-identifier: variable-name-list)
 TYPE_PARSER(construct<OmpReductionIdentifier>(Parser<DefinedOperator>{}) ||
     construct<OmpReductionIdentifier>(Parser<ProcedureDesignator>{}))
@@ -1065,7 +1275,8 @@ TYPE_PARSER(construct<OmpOtherwiseClause>(
 
 TYPE_PARSER(construct<OmpWhenClause>(
     maybe(nonemptyList(Parser<OmpWhenClause::Modifier>{}) / ":"),
-    maybe(indirect(Parser<OmpDirectiveSpecification>{}))))
+    maybe(indirect(
+        OmpStylizedInstanceCreator(Parser<OmpDirectiveSpecification>{})))))
 
 // OMP 5.2 12.6.1 grainsize([ prescriptiveness :] scalar-integer-expression)
 TYPE_PARSER(construct<OmpGrainsizeClause>(
@@ -1777,12 +1988,7 @@ TYPE_PARSER(
             Parser<OpenMPInteropConstruct>{})) /
     endOfLine)
 
-TYPE_PARSER(construct<OmpInitializerProc>(Parser<ProcedureDesignator>{},
-    parenthesized(many(maybe(","_tok) >> Parser<ActualArgSpec>{}))))
-
-TYPE_PARSER(construct<OmpInitializerClause>(
-    construct<OmpInitializerClause>(assignmentStmt) ||
-    construct<OmpInitializerClause>(Parser<OmpInitializerProc>{})))
+TYPE_PARSER(construct<OmpInitializerClause>(Parser<OmpInitializerExpression>{}))
 
 // OpenMP 5.2: 7.5.4 Declare Variant directive
 TYPE_PARSER(sourced(construct<OmpDeclareVariantDirective>(
@@ -1794,7 +2000,7 @@ TYPE_PARSER(sourced(construct<OmpDeclareVariantDirective>(
 TYPE_PARSER(sourced(construct<OpenMPDeclareReductionConstruct>(
     predicated(Parser<OmpDirectiveName>{},
         IsDirective(llvm::omp::Directive::OMPD_declare_reduction)) >=
-    Parser<OmpDirectiveSpecification>{})))
+    OmpStylizedInstanceCreator(Parser<OmpDirectiveSpecification>{}))))
 
 // 2.10.6 Declare Target Construct
 TYPE_PARSER(sourced(construct<OpenMPDeclareTargetConstruct>(
@@ -1832,8 +2038,8 @@ TYPE_PARSER(sourced(construct<OpenMPDeclareMapperConstruct>(
         IsDirective(llvm::omp::Directive::OMPD_declare_mapper)) >=
     Parser<OmpDirectiveSpecification>{})))
 
-TYPE_PARSER(construct<OmpCombinerExpression>(Parser<AssignmentStmt>{}) ||
-    construct<OmpCombinerExpression>(Parser<FunctionReference>{}))
+TYPE_PARSER(construct<OmpCombinerExpression>(OmpStylizedExpressionParser{}))
+TYPE_PARSER(construct<OmpInitializerExpression>(OmpStylizedExpressionParser{}))
 
 TYPE_PARSER(sourced(construct<OpenMPCriticalConstruct>(
     OmpBlockConstructParser{llvm::omp::Directive::OMPD_critical})))
diff --git a/flang/lib/Parser/openmp-utils.cpp b/flang/lib/Parser/openmp-utils.cpp
index 937a17f29f221..95ad3f60770f5 100644
--- a/flang/lib/Parser/openmp-utils.cpp
+++ b/flang/lib/Parser/openmp-utils.cpp
@@ -74,4 +74,16 @@ const BlockConstruct *GetFortranBlockConstruct(
   return nullptr;
 }
 
+const OmpCombinerExpression *GetCombinerExpr(
+    const OmpReductionSpecifier &rspec) {
+  return addr_if(std::get<std::optional<OmpCombinerExpression>>(rspec.t));
+}
+
+const OmpInitializerExpression *GetInitializerExpr(const OmpClause &init) {
+  if (auto *wrapped{std::get_if<OmpClause::Initializer>(&init.u)}) {
+    return &wrapped->v.v;
+  }
+  return nullptr;
+}
+
 } // namespace Fortran::parser::omp
diff --git a/flang/lib/Parser/parse-tree.cpp b/flang/lib/Parser/parse-tree.cpp
index 8cbaa399c4763..ad0016e1404f9 100644
--- a/flang/lib/Parser/parse-tree.cpp
+++ b/flang/lib/Parser/parse-tree.cpp
@@ -11,6 +11,7 @@
 #include "flang/Common/indirection.h"
 #include "flang/Parser/tools.h"
 #include "flang/Parser/user-state.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/Frontend/OpenMP/OMP.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
@@ -430,4 +431,30 @@ const OmpClauseList &OmpDirectiveSpecification::Clauses() const {
   }
   return empty;
 }
+
+static bool InitCharBlocksFromStrings(llvm::MutableArrayRef<CharBlock> blocks,
+    llvm::ArrayRef<std::string> strings) {
+  for (auto [i, n] : llvm::enumerate(strings)) {
+    blocks[i] = CharBlock(n);
+  }
+  return true;
+}
+
+// The names should have static storage duration. Keep these names
+// in a sigle place.
+llvm::ArrayRef<CharBlock> OmpCombinerExpression::Variables() {
+  static std::string names[]{"omp_in", "omp_out"};
+  static CharBlock vars[std::size(names)];
+
+  [[maybe_unused]] static bool init = InitCharBlocksFromStrings(vars, names);
+  return vars;
+}
+
+llvm::ArrayRef<CharBlock> OmpInitializerExpression::Variables() {
+  static std::string names[]{"omp_orig", "omp_priv"};
+  static CharBlock vars[std::size(names)];
+
+  [[maybe_unused]] static bool init = InitCharBlocksFromStrings(vars, names);
+  return vars;
+}
 } // namespace Fortran::parser
diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp
index 4739da0676fa9..efce8fc3d2e35 100644
--- a/flang/lib/Parser/prescan.cpp
+++ b/flang/lib/Parser/prescan.cpp
@@ -557,7 +557,7 @@ bool Prescanner::MustSkipToEndOfLine() const {
     return true; // skip over ignored columns in right margin (73:80)
   } else if (*at_ == '!' && !inCharLiteral_ &&
       (!inFixedForm_ || tabInCurrentLine_ || column_ != 6)) {
-    return !IsCompilerDirectiveSentinel(at_);
+    return !IsCompilerDirectiveSentinel(at_ + 1);
   } else {
     return false;
   }
@@ -1642,6 +1642,17 @@ Prescanner::IsFixedFormCompilerDirectiveLine(const char *start) const {
       // This is a Continuation line, not an initial directive line.
       return std::nullopt;
     }
+    ++column, ++p;
+  }
+  if (isOpenMPConditional) {
+    for (; column <= fixedFormColumnLimit_; ++column, ++p) {
+      if (IsSpaceOrTab(p)) {
+      } else if (*p == '!') {
+        return std::nullopt; // !$    ! is a comment, not a directive
+      } else {
+        break;
+      }
+    }
   }
   if (const char *ss{IsCompilerDirectiveSentinel(
           sentinel, static_cast<std::size_t>(sp - sentinel))}) {
@@ -1657,8 +1668,17 @@ Prescanner::IsFreeFormCompilerDirectiveLine(const char *start) const {
       p && *p++ == '!') {
     if (auto maybePair{IsCompilerDirectiveSentinel(p)}) {
       auto offset{static_cast<std::size_t>(p - start - 1)};
-      return {LineClassification{LineClassification::Kind::CompilerDirective,
-          offset, maybePair->first}};
+      const char *sentinel{maybePair->first};
+      if ((sentinel[0] == '$' && sentinel[1] == '\0') || sentinel[1] == '@') {
+        if (const char *comment{IsFreeFormComment(maybePair->second)}) {
+          if (*comment == '!') {
+            // Conditional line comment - treat as comment
+            return std::nullopt;
+          }
+        }
+      }
+      return {LineClassification{
+          LineClassification::Kind::CompilerDirective, offset, sentinel}};
     }
   }
   return std::nullopt;
diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp
index 20a8d2abd8ca0..9b38cfc40c5b2 100644
--- a/flang/lib/Parser/unparse.cpp
+++ b/flang/lib/Parser/unparse.cpp
@@ -2095,15 +2095,13 @@ class UnparseVisitor {
 
   // OpenMP Clauses & Directives
   void Unparse(const OmpArgumentList &x) { Walk(x.v, ", "); }
+  void Unparse(const OmpTypeNameList &x) { Walk(x.v, ", "); }
 
   void Unparse(const OmpBaseVariantNames &x) {
     Walk(std::get<0>(x.t)); // OmpObject
     Put(":");
     Walk(std::get<1>(x.t)); // OmpObject
   }
-  void Unparse(const OmpTypeNameList &x) { //
-    Walk(x.v, ",");
-  }
   void Unparse(const OmpMapperSpecifier &x) {
     const auto &mapperName{std::get<std::string>(x.t)};
     if (mapperName.find(llvm::omp::OmpDefaultMapperName) == std::string::npos) {
@@ -2202,6 +2200,15 @@ class UnparseVisitor {
     unsigned ompVersion{langOpts_.OpenMPVersion};
     Word(llvm::omp::getOpenMPDirectiveName(x.v, ompVersion));
   }
+  void Unparse(const OmpStylizedDeclaration &x) {
+    // empty
+  }
+  void Unparse(const OmpStylizedExpression &x) { //
+    Put(x.source.ToString());
+  }
+  void Unparse(const OmpStylizedInstance &x) {
+    // empty
+  }
   void Unparse(const OmpIteratorSpecifier &x) {
     Walk(std::get<TypeDeclarationStmt>(x.t));
     Put(" = ");
@@ -2511,29 +2518,11 @@ class UnparseVisitor {
   void Unparse(const OpenMPCriticalConstruct &x) {
     Unparse(static_cast<const OmpBlockConstruct &>(x));
   }
-  void Unparse(const OmpInitializerProc &x) {
-    Walk(std::get<ProcedureDesignator>(x.t));
-    Put("(");
-    Walk(std::get<std::list<ActualArgSpec>>(x.t));
-    Put(")");
-  }
-  void Unparse(const OmpInitializerClause &x) {
-    // Don't let the visitor go to the normal AssignmentStmt Unparse function,
-    // it adds an extra newline that we don't want.
-    if (const auto *assignment{std::get_if<AssignmentStmt>(&x.u)}) {
-      Walk(assignment->t, " = ");
-    } else {
-      Walk(x.u);
-    }
+  void Unparse(const OmpInitializerExpression &x) {
+    Unparse(static_cast<const OmpStylizedExpression &>(x));
   }
   void Unparse(const OmpCombinerExpression &x) {
-    // Don't let the visitor go to the normal AssignmentStmt Unparse function,
-    // it adds an extra newline that we don't want.
-    if (const auto *assignment{std::get_if<AssignmentStmt>(&x.u)}) {
-      Walk(assignment->t, " = ");
-    } else {
-      Walk(x.u);
-    }
+    Unparse(static_cast<const OmpStylizedExpression &>(x));
   }
   void Unparse(const OpenMPDeclareReductionConstruct &x) {
     BeginOpenMP();
diff --git a/flang/lib/Semantics/check-allocate.cpp b/flang/lib/Semantics/check-allocate.cpp
index e019bbdfa27f6..a411e20557456 100644
--- a/flang/lib/Semantics/check-allocate.cpp
+++ b/flang/lib/Semantics/check-allocate.cpp
@@ -26,6 +26,10 @@ struct AllocateCheckerInfo {
   std::optional<evaluate::DynamicType> sourceExprType;
   std::optional<parser::CharBlock> sourceExprLoc;
   std::optional<parser::CharBlock> typeSpecLoc;
+  std::optional<parser::CharBlock> statSource;
+  std::optional<parser::CharBlock> msgSource;
+  const SomeExpr *statVar{nullptr};
+  const SomeExpr *msgVar{nullptr};
   int sourceExprRank{0}; // only valid if gotMold || gotSource
   bool gotStat{false};
   bool gotMsg{false};
@@ -141,12 +145,15 @@ static std::optional<AllocateCheckerInfo> CheckAllocateOptions(
             [&](const parser::StatOrErrmsg &statOrErr) {
               common::visit(
                   common::visitors{
-                      [&](const parser::StatVariable &) {
+                      [&](const parser::StatVariable &var) {
                         if (info.gotStat) { // C943
                           context.Say(
                               "STAT may not be duplicated in a ALLOCATE statement"_err_en_US);
                         }
                         info.gotStat = true;
+                        info.statVar = GetExpr(context, var);
+                        info.statSource =
+                            parser::Unwrap<parser::Variable>(var)->GetSource();
                       },
                       [&](const parser::MsgVariable &var) {
                         WarnOnDeferredLengthCharacterScalar(context,
@@ -159,6 +166,9 @@ static std::optional<AllocateCheckerInfo> CheckAllocateOptions(
                               "ERRMSG may not be duplicated in a ALLOCATE statement"_err_en_US);
                         }
                         info.gotMsg = true;
+                        info.msgVar = GetExpr(context, var);
+                        info.msgSource =
+                            parser::Unwrap<parser::Variable>(var)->GetSource();
                       },
                   },
                   statOrErr.u);
@@ -460,6 +470,16 @@ static bool HaveCompatibleLengths(
   }
 }
 
+bool AreSameAllocation(const SomeExpr *root, const SomeExpr *path) {
+  if (root && path) {
+    // For now we just use equality of expressions. If we implement a more
+    // sophisticated alias analysis we should use it here.
+    return *root == *path;
+  } else {
+    return false;
+  }
+}
+
 bool AllocationCheckerHelper::RunChecks(SemanticsContext &context) {
   if (!ultimate_) {
     CHECK(context.AnyFatalError());
@@ -690,6 +710,17 @@ bool AllocationCheckerHelper::RunChecks(SemanticsContext &context) {
           "Object in ALLOCATE must have DEVICE attribute when STREAM option is specified"_err_en_US);
     }
   }
+
+  if (const SomeExpr *allocObj{GetExpr(context, allocateObject_)}) {
+    if (AreSameAllocation(allocObj, allocateInfo_.statVar)) {
+      context.Say(allocateInfo_.statSource.value_or(name_.source),
+          "STAT variable in ALLOCATE must not be the variable being allocated"_err_en_US);
+    }
+    if (AreSameAllocation(allocObj, allocateInfo_.msgVar)) {
+      context.Say(allocateInfo_.msgSource.value_or(name_.source),
+          "ERRMSG variable in ALLOCATE must not be the variable being allocated"_err_en_US);
+    }
+  }
   return RunCoarrayRelatedChecks(context);
 }
 
diff --git a/flang/lib/Semantics/check-allocate.h b/flang/lib/Semantics/check-allocate.h
index e3f7f07bca5b7..54f7380bc3fe8 100644
--- a/flang/lib/Semantics/check-allocate.h
+++ b/flang/lib/Semantics/check-allocate.h
@@ -24,5 +24,6 @@ class AllocateChecker : public virtual BaseChecker {
 private:
   SemanticsContext &context_;
 };
+bool AreSameAllocation(const SomeExpr *root, const SomeExpr *path);
 } // namespace Fortran::semantics
 #endif // FORTRAN_SEMANTICS_CHECK_ALLOCATE_H_
diff --git a/flang/lib/Semantics/check-call.cpp b/flang/lib/Semantics/check-call.cpp
index c51d40b9e5039..995deaa12dd3b 100644
--- a/flang/lib/Semantics/check-call.cpp
+++ b/flang/lib/Semantics/check-call.cpp
@@ -914,7 +914,8 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy,
             dummyName);
       }
       // INTENT(OUT) and INTENT(IN OUT) cases are caught elsewhere
-    } else {
+    } else if (!actualIsAllocatable &&
+        !dummy.ignoreTKR.test(common::IgnoreTKR::Pointer)) {
       messages.Say(
           "ALLOCATABLE %s must be associated with an ALLOCATABLE actual argument"_err_en_US,
           dummyName);
@@ -929,7 +930,8 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy,
             dummy, actual, *scope,
             /*isAssumedRank=*/dummyIsAssumedRank, actualIsPointer);
       }
-    } else if (!actualIsPointer) {
+    } else if (!actualIsPointer &&
+        !dummy.ignoreTKR.test(common::IgnoreTKR::Pointer)) {
       messages.Say(
           "Actual argument associated with POINTER %s must also be POINTER unless INTENT(IN)"_err_en_US,
           dummyName);
diff --git a/flang/lib/Semantics/check-deallocate.cpp b/flang/lib/Semantics/check-deallocate.cpp
index c1ebc5f4c0ec2..e6ce1b30a59f5 100644
--- a/flang/lib/Semantics/check-deallocate.cpp
+++ b/flang/lib/Semantics/check-deallocate.cpp
@@ -7,51 +7,87 @@
 //===----------------------------------------------------------------------===//
 
 #include "check-deallocate.h"
+#include "check-allocate.h"
 #include "definable.h"
 #include "flang/Evaluate/type.h"
 #include "flang/Parser/message.h"
 #include "flang/Parser/parse-tree.h"
 #include "flang/Semantics/expression.h"
 #include "flang/Semantics/tools.h"
+#include <optional>
 
 namespace Fortran::semantics {
 
 void DeallocateChecker::Leave(const parser::DeallocateStmt &deallocateStmt) {
+  bool gotStat{false}, gotMsg{false};
+  const SomeExpr *statVar{nullptr}, *msgVar{nullptr};
+  std::optional<parser::CharBlock> statSource;
+  std::optional<parser::CharBlock> msgSource;
+  for (const parser::StatOrErrmsg &deallocOpt :
+      std::get<std::list<parser::StatOrErrmsg>>(deallocateStmt.t)) {
+    common::visit(
+        common::visitors{
+            [&](const parser::StatVariable &var) {
+              if (gotStat) {
+                context_.Say(
+                    "STAT may not be duplicated in a DEALLOCATE statement"_err_en_US);
+              }
+              gotStat = true;
+              statVar = GetExpr(context_, var);
+              statSource = parser::Unwrap<parser::Variable>(var)->GetSource();
+            },
+            [&](const parser::MsgVariable &var) {
+              WarnOnDeferredLengthCharacterScalar(context_,
+                  GetExpr(context_, var),
+                  parser::UnwrapRef<parser::Variable>(var).GetSource(),
+                  "ERRMSG=");
+              if (gotMsg) {
+                context_.Say(
+                    "ERRMSG may not be duplicated in a DEALLOCATE statement"_err_en_US);
+              }
+              gotMsg = true;
+              msgVar = GetExpr(context_, var);
+              msgSource = parser::Unwrap<parser::Variable>(var)->GetSource();
+            },
+        },
+        deallocOpt.u);
+  }
   for (const parser::AllocateObject &allocateObject :
       std::get<std::list<parser::AllocateObject>>(deallocateStmt.t)) {
+    parser::CharBlock source;
     common::visit(
         common::visitors{
             [&](const parser::Name &name) {
               const Symbol *symbol{
                   name.symbol ? &name.symbol->GetUltimate() : nullptr};
-              ;
+              source = name.source;
               if (context_.HasError(symbol)) {
                 // already reported an error
               } else if (!IsVariableName(*symbol)) {
-                context_.Say(name.source,
+                context_.Say(source,
                     "Name in DEALLOCATE statement must be a variable name"_err_en_US);
               } else if (!IsAllocatableOrObjectPointer(symbol)) { // C936
-                context_.Say(name.source,
+                context_.Say(source,
                     "Name in DEALLOCATE statement must have the ALLOCATABLE or POINTER attribute"_err_en_US);
-              } else if (auto whyNot{WhyNotDefinable(name.source,
-                             context_.FindScope(name.source),
-                             {DefinabilityFlag::PointerDefinition,
-                                 DefinabilityFlag::AcceptAllocatable,
-                                 DefinabilityFlag::PotentialDeallocation},
-                             *symbol)}) {
+              } else if (auto whyNot{
+                             WhyNotDefinable(source, context_.FindScope(source),
+                                 {DefinabilityFlag::PointerDefinition,
+                                     DefinabilityFlag::AcceptAllocatable,
+                                     DefinabilityFlag::PotentialDeallocation},
+                                 *symbol)}) {
                 // Catch problems with non-definability of the
                 // pointer/allocatable
                 context_
-                    .Say(name.source,
+                    .Say(source,
                         "Name in DEALLOCATE statement is not definable"_err_en_US)
                     .Attach(std::move(
                         whyNot->set_severity(parser::Severity::Because)));
-              } else if (auto whyNot{WhyNotDefinable(name.source,
-                             context_.FindScope(name.source),
-                             DefinabilityFlags{}, *symbol)}) {
+              } else if (auto whyNot{
+                             WhyNotDefinable(source, context_.FindScope(source),
+                                 DefinabilityFlags{}, *symbol)}) {
                 // Catch problems with non-definability of the dynamic object
                 context_
-                    .Say(name.source,
+                    .Say(source,
                         "Object in DEALLOCATE statement is not deallocatable"_err_en_US)
                     .Attach(std::move(
                         whyNot->set_severity(parser::Severity::Because)));
@@ -62,13 +98,12 @@ void DeallocateChecker::Leave(const parser::DeallocateStmt &deallocateStmt) {
             [&](const parser::StructureComponent &structureComponent) {
               // Only perform structureComponent checks if it was successfully
               // analyzed by expression analysis.
-              auto source{structureComponent.component.source};
+              source = structureComponent.component.source;
               if (const auto *expr{GetExpr(context_, allocateObject)}) {
-                if (const Symbol *
-                        symbol{structureComponent.component.symbol
-                                ? &structureComponent.component.symbol
-                                       ->GetUltimate()
-                                : nullptr};
+                if (const Symbol *symbol{structureComponent.component.symbol
+                            ? &structureComponent.component.symbol
+                                  ->GetUltimate()
+                            : nullptr};
                     !IsAllocatableOrObjectPointer(symbol)) { // F'2023 C936
                   context_.Say(source,
                       "Component in DEALLOCATE statement must have the ALLOCATABLE or POINTER attribute"_err_en_US);
@@ -99,32 +134,16 @@ void DeallocateChecker::Leave(const parser::DeallocateStmt &deallocateStmt) {
             },
         },
         allocateObject.u);
-  }
-  bool gotStat{false}, gotMsg{false};
-  for (const parser::StatOrErrmsg &deallocOpt :
-      std::get<std::list<parser::StatOrErrmsg>>(deallocateStmt.t)) {
-    common::visit(
-        common::visitors{
-            [&](const parser::StatVariable &) {
-              if (gotStat) {
-                context_.Say(
-                    "STAT may not be duplicated in a DEALLOCATE statement"_err_en_US);
-              }
-              gotStat = true;
-            },
-            [&](const parser::MsgVariable &var) {
-              WarnOnDeferredLengthCharacterScalar(context_,
-                  GetExpr(context_, var),
-                  parser::UnwrapRef<parser::Variable>(var).GetSource(),
-                  "ERRMSG=");
-              if (gotMsg) {
-                context_.Say(
-                    "ERRMSG may not be duplicated in a DEALLOCATE statement"_err_en_US);
-              }
-              gotMsg = true;
-            },
-        },
-        deallocOpt.u);
+    if (const SomeExpr *allocObj{GetExpr(context_, allocateObject)}) {
+      if (AreSameAllocation(allocObj, statVar)) {
+        context_.Say(statSource.value_or(source),
+            "STAT variable in DEALLOCATE must not be the variable being deallocated"_err_en_US);
+      }
+      if (AreSameAllocation(allocObj, msgVar)) {
+        context_.Say(msgSource.value_or(source),
+            "ERRMSG variable in DEALLOCATE must not be the variable being deallocated"_err_en_US);
+      }
+    }
   }
 }
 
diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp
index 549ee83b70fce..de407d3b1e125 100644
--- a/flang/lib/Semantics/check-declarations.cpp
+++ b/flang/lib/Semantics/check-declarations.cpp
@@ -949,7 +949,8 @@ void CheckHelper::CheckObjectEntity(
             "!DIR$ IGNORE_TKR(R) may not apply in an ELEMENTAL procedure"_err_en_US);
       }
       if (IsPassedViaDescriptor(symbol)) {
-        if (IsAllocatableOrObjectPointer(&symbol)) {
+        if (IsAllocatableOrObjectPointer(&symbol) &&
+            !ignoreTKR.test(common::IgnoreTKR::Pointer)) {
           if (inExplicitExternalInterface) {
             Warn(common::UsageWarning::IgnoreTKRUsage,
                 "!DIR$ IGNORE_TKR should not apply to an allocatable or pointer"_warn_en_US);
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index e094458f001e3..aaaf1ec5d4626 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -3390,6 +3390,7 @@ CHECK_SIMPLE_CLAUSE(Read, OMPC_read)
 CHECK_SIMPLE_CLAUSE(Threadprivate, OMPC_threadprivate)
 CHECK_SIMPLE_CLAUSE(Groupprivate, OMPC_groupprivate)
 CHECK_SIMPLE_CLAUSE(Threads, OMPC_threads)
+CHECK_SIMPLE_CLAUSE(Threadset, OMPC_threadset)
 CHECK_SIMPLE_CLAUSE(Inbranch, OMPC_inbranch)
 CHECK_SIMPLE_CLAUSE(Link, OMPC_link)
 CHECK_SIMPLE_CLAUSE(Indirect, OMPC_indirect)
diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp
index 32aa6b1e0aa1d..c8167fd34f666 100644
--- a/flang/lib/Semantics/expression.cpp
+++ b/flang/lib/Semantics/expression.cpp
@@ -834,7 +834,7 @@ Constant<TYPE> ReadRealLiteral(
   auto valWithFlags{
       Scalar<TYPE>::Read(p, context.targetCharacteristics().roundingMode())};
   CHECK(p == source.end());
-  RealFlagWarnings(context, valWithFlags.flags, "conversion of REAL literal");
+  context.RealFlagWarnings(valWithFlags.flags, "conversion of REAL literal");
   auto value{valWithFlags.value};
   if (context.targetCharacteristics().areSubnormalsFlushedToZero()) {
     value = value.FlushSubnormalToZero();
diff --git a/flang/lib/Semantics/mod-file.cpp b/flang/lib/Semantics/mod-file.cpp
index 556259d1e5e63..b419864f73b8e 100644
--- a/flang/lib/Semantics/mod-file.cpp
+++ b/flang/lib/Semantics/mod-file.cpp
@@ -1021,6 +1021,9 @@ void ModFileWriter::PutObjectEntity(
       case common::IgnoreTKR::Contiguous:
         os << 'c';
         break;
+      case common::IgnoreTKR::Pointer:
+        os << 'p';
+        break;
       }
     });
     os << ") " << symbol.name() << '\n';
diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index 196755e2912a8..628068f9a9f68 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -26,6 +26,8 @@
 #include "flang/Semantics/symbol.h"
 #include "flang/Semantics/tools.h"
 #include "flang/Support/Flags.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/Frontend/OpenMP/OMP.h.inc"
 #include "llvm/Support/Debug.h"
 #include <list>
@@ -453,6 +455,21 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor<llvm::omp::Directive> {
     return true;
   }
 
+  bool Pre(const parser::OmpStylizedDeclaration &x) {
+    static llvm::StringMap<Symbol::Flag> map{
+        {"omp_in", Symbol::Flag::OmpInVar},
+        {"omp_orig", Symbol::Flag::OmpOrigVar},
+        {"omp_out", Symbol::Flag::OmpOutVar},
+        {"omp_priv", Symbol::Flag::OmpPrivVar},
+    };
+    if (auto &name{std::get<parser::ObjectName>(x.var.t)}; name.symbol) {
+      if (auto found{map.find(name.ToString())}; found != map.end()) {
+        ResolveOmp(name, found->second,
+            const_cast<Scope &>(DEREF(name.symbol).owner()));
+      }
+    }
+    return false;
+  }
   bool Pre(const parser::OmpMetadirectiveDirective &x) {
     PushContext(x.v.source, llvm::omp::Directive::OMPD_metadirective);
     return true;
diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp
index 93faba7873916..220f1c96b9823 100644
--- a/flang/lib/Semantics/resolve-names.cpp
+++ b/flang/lib/Semantics/resolve-names.cpp
@@ -1605,6 +1605,12 @@ class OmpVisitor : public virtual DeclarationVisitor {
     Post(static_cast<const parser::OmpDirectiveSpecification &>(x));
   }
 
+  void Post(const parser::OmpTypeName &);
+  bool Pre(const parser::OmpStylizedDeclaration &);
+  void Post(const parser::OmpStylizedDeclaration &);
+  bool Pre(const parser::OmpStylizedInstance &);
+  void Post(const parser::OmpStylizedInstance &);
+
   bool Pre(const parser::OpenMPDeclareMapperConstruct &x) {
     AddOmpSourceRange(x.source);
     return true;
@@ -1615,18 +1621,6 @@ class OmpVisitor : public virtual DeclarationVisitor {
     return true;
   }
 
-  bool Pre(const parser::OmpInitializerProc &x) {
-    auto &procDes = std::get<parser::ProcedureDesignator>(x.t);
-    auto &name = std::get<parser::Name>(procDes.u);
-    auto *symbol{FindSymbol(NonDerivedTypeScope(), name)};
-    if (!symbol) {
-      context().Say(name.source,
-          "Implicit subroutine declaration '%s' in DECLARE REDUCTION"_err_en_US,
-          name.source);
-    }
-    return true;
-  }
-
   bool Pre(const parser::OmpDeclareVariantDirective &x) {
     AddOmpSourceRange(x.source);
     return true;
@@ -1772,14 +1766,6 @@ class OmpVisitor : public virtual DeclarationVisitor {
     messageHandler().set_currStmtSource(std::nullopt);
   }
 
-  bool Pre(const parser::OmpTypeName &x) {
-    BeginDeclTypeSpec();
-    return true;
-  }
-  void Post(const parser::OmpTypeName &x) { //
-    EndDeclTypeSpec();
-  }
-
   bool Pre(const parser::OpenMPConstruct &x) {
     // Indicate that the current directive is not a declarative one.
     declaratives_.push_back(nullptr);
@@ -1835,6 +1821,30 @@ void OmpVisitor::Post(const parser::OmpBlockConstruct &x) {
   }
 }
 
+void OmpVisitor::Post(const parser::OmpTypeName &x) {
+  x.declTypeSpec = GetDeclTypeSpec();
+}
+
+bool OmpVisitor::Pre(const parser::OmpStylizedDeclaration &x) {
+  BeginDecl();
+  Walk(x.type.get());
+  Walk(x.var);
+  return true;
+}
+
+void OmpVisitor::Post(const parser::OmpStylizedDeclaration &x) { //
+  EndDecl();
+}
+
+bool OmpVisitor::Pre(const parser::OmpStylizedInstance &x) {
+  PushScope(Scope::Kind::OtherConstruct, nullptr);
+  return true;
+}
+
+void OmpVisitor::Post(const parser::OmpStylizedInstance &x) { //
+  PopScope();
+}
+
 bool OmpVisitor::Pre(const parser::OmpMapClause &x) {
   auto &mods{OmpGetModifiers(x)};
   if (auto *mapper{OmpGetUniqueModifier<parser::OmpMapper>(mods)}) {
@@ -1969,51 +1979,20 @@ void OmpVisitor::ProcessReductionSpecifier(
     }
   }
 
-  auto &typeList{std::get<parser::OmpTypeNameList>(spec.t)};
-
-  // Create a temporary variable declaration for the four variables
-  // used in the reduction specifier and initializer (omp_out, omp_in,
-  // omp_priv and omp_orig), with the type in the  typeList.
-  //
-  // In theory it would be possible to create only variables that are
-  // actually used, but that requires walking the entire parse-tree of the
-  // expressions, and finding the relevant variables [there may well be other
-  // variables involved too].
-  //
-  // This allows doing semantic analysis where the type is a derived type
-  // e.g omp_out%x = omp_out%x + omp_in%x.
-  //
-  // These need to be temporary (in their own scope). If they are created
-  // as variables in the outer scope, if there's more than one type in the
-  // typelist, duplicate symbols will be reported.
-  const parser::CharBlock ompVarNames[]{
-      {"omp_in", 6}, {"omp_out", 7}, {"omp_priv", 8}, {"omp_orig", 8}};
-
-  for (auto &t : typeList.v) {
-    PushScope(Scope::Kind::OtherConstruct, nullptr);
-    BeginDeclTypeSpec();
-    // We need to walk t.u because Walk(t) does it's own BeginDeclTypeSpec.
-    Walk(t.u);
+  reductionDetails->AddDecl(declaratives_.back());
 
-    // Only process types we can find. There will be an error later on when
-    // a type isn't found.
-    if (const DeclTypeSpec *typeSpec{GetDeclTypeSpec()}) {
-      reductionDetails->AddType(*typeSpec);
+  // Do not walk OmpTypeNameList. The types on the list will be visited
+  // during procesing of OmpCombinerExpression.
+  Walk(std::get<std::optional<parser::OmpCombinerExpression>>(spec.t));
+  Walk(clauses);
 
-      for (auto &nm : ompVarNames) {
-        ObjectEntityDetails details{};
-        details.set_type(*typeSpec);
-        MakeSymbol(nm, Attrs{}, std::move(details));
-      }
+  for (auto &type : std::get<parser::OmpTypeNameList>(spec.t).v) {
+    // The declTypeSpec can be null if there is some semantic error.
+    if (type.declTypeSpec) {
+      reductionDetails->AddType(*type.declTypeSpec);
     }
-    EndDeclTypeSpec();
-    Walk(std::get<std::optional<parser::OmpCombinerExpression>>(spec.t));
-    Walk(clauses);
-    PopScope();
   }
 
-  reductionDetails->AddDecl(declaratives_.back());
-
   if (!symbol) {
     symbol = &MakeSymbol(mangledName, Attrs{}, std::move(*reductionDetails));
   }
@@ -9456,13 +9435,18 @@ bool ResolveNamesVisitor::SetProcFlag(
     SayWithDecl(name, symbol,
         "Implicit declaration of function '%s' has a different result type than in previous declaration"_err_en_US);
     return false;
-  } else if (symbol.has<ProcEntityDetails>()) {
-    symbol.set(flag); // in case it hasn't been set yet
-    if (flag == Symbol::Flag::Function) {
-      ApplyImplicitRules(symbol);
-    }
-    if (symbol.attrs().test(Attr::INTRINSIC)) {
-      AcquireIntrinsicProcedureFlags(symbol);
+  } else if (const auto *proc{symbol.detailsIf<ProcEntityDetails>()}) {
+    if (IsPointer(symbol) && !proc->type() && !proc->procInterface()) {
+      // PROCEDURE(), POINTER -- errors will be emitted later about a lack
+      // of known characteristics if used as a function
+    } else {
+      symbol.set(flag); // in case it hasn't been set yet
+      if (flag == Symbol::Flag::Function) {
+        ApplyImplicitRules(symbol);
+      }
+      if (symbol.attrs().test(Attr::INTRINSIC)) {
+        AcquireIntrinsicProcedureFlags(symbol);
+      }
     }
   } else if (symbol.GetType() && flag == Symbol::Flag::Subroutine) {
     SayWithDecl(
@@ -10130,6 +10114,9 @@ void ResolveNamesVisitor::Post(const parser::CompilerDirective &x) {
               case 'c':
                 set.set(common::IgnoreTKR::Contiguous);
                 break;
+              case 'p':
+                set.set(common::IgnoreTKR::Pointer);
+                break;
               case 'a':
                 set = common::ignoreTKRAll;
                 break;
diff --git a/flang/lib/Support/Fortran.cpp b/flang/lib/Support/Fortran.cpp
index 3a8ebbb7d61ef..05d6e0e709e91 100644
--- a/flang/lib/Support/Fortran.cpp
+++ b/flang/lib/Support/Fortran.cpp
@@ -95,6 +95,9 @@ std::string AsFortran(IgnoreTKRSet tkr) {
   if (tkr.test(IgnoreTKR::Contiguous)) {
     result += 'C';
   }
+  if (tkr.test(IgnoreTKR::Pointer)) {
+    result += 'P';
+  }
   return result;
 }
 
diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90
index 5182950cbffea..59af58ddcd32e 100644
--- a/flang/module/cudadevice.f90
+++ b/flang/module/cudadevice.f90
@@ -1998,6 +1998,18 @@ attributes(device,host) logical function on_device() bind(c)
 
   ! TMA Operations
 
+  interface barrier_arrive
+    attributes(device) function barrier_arrive(barrier) result(token)
+      integer(8), shared :: barrier
+      integer(8) :: token
+    end function
+    attributes(device) function barrier_arrive_cnt(barrier, count) result(token)
+      integer(8), shared :: barrier
+      integer(4), value :: count
+      integer(8) :: token
+    end function
+  end interface
+
   interface 
     attributes(device) subroutine barrier_init(barrier, count)
       integer(8), shared :: barrier
@@ -2005,15 +2017,18 @@ attributes(device) subroutine barrier_init(barrier, count)
     end subroutine
   end interface
 
-  interface barrier_arrive
-    attributes(device) function barrier_arrive(barrier) result(token)
+  interface
+    attributes(device) integer function barrier_try_wait(barrier, token)
       integer(8), shared :: barrier
-      integer(8) :: token
+      integer(8), value  :: token
     end function
-    attributes(device) function barrier_arrive_cnt(barrier, count) result(token)
+  end interface
+  
+  interface
+    attributes(device) integer function barrier_try_wait_sleep(barrier, token, ns)
       integer(8), shared :: barrier
-      integer(4), value :: count
-      integer(8) :: token
+      integer(8), value  :: token
+      integer(4), value  :: ns
     end function
   end interface
 
@@ -2032,7 +2047,13 @@ attributes(device) subroutine tma_bulk_wait_group()
     end subroutine
   end interface
 
+  ! --------------------
+  ! Bulk load functions
+  ! --------------------
+
   ! Generic load, count is in bytes
+  ! -------------------------------
+
   interface
     attributes(device) subroutine tma_bulk_g2s(barrier, src, dst, nbytes)
       !dir$ ignore_tkr src, dst
@@ -2043,6 +2064,74 @@ attributes(device) subroutine tma_bulk_g2s(barrier, src, dst, nbytes)
     end subroutine
   end interface
 
+  ! Load specific types, count is in elements
+  ! -----------------------------------------
+
+  interface tma_bulk_load
+    attributes(device) subroutine tma_bulk_ldc4(barrier, src, dst, nelems)
+      !dir$ ignore_tkr (r) src, (r) dst
+      integer(8), shared :: barrier
+      complex(4), device :: src(*)
+      complex(4), shared :: dst(*)
+      integer(4), value :: nelems
+    end subroutine
+
+    attributes(device) subroutine tma_bulk_ldc8(barrier, src, dst, nelems)
+      !dir$ ignore_tkr (r) src, (r) dst
+      integer(8), shared :: barrier
+      complex(8), device :: src(*)
+      complex(8), shared :: dst(*)
+      integer(4), value :: nelems
+    end subroutine
+  
+    attributes(device) subroutine tma_bulk_ldi4(barrier, src, dst, nelems)
+      !dir$ ignore_tkr (r) src, (r) dst
+      integer(8), shared :: barrier
+      integer(4), device :: src(*)
+      integer(4), shared :: dst(*)
+      integer(4), value :: nelems
+    end subroutine
+
+    attributes(device) subroutine tma_bulk_ldi8(barrier, src, dst, nelems)
+      !dir$ ignore_tkr (r) src, (r) dst
+      integer(8), shared :: barrier
+      integer(8), device :: src(*)
+      integer(8), shared :: dst(*)
+      integer(4), value :: nelems
+    end subroutine
+
+    attributes(device) subroutine tma_bulk_ldr2(barrier, src, dst, nelems)
+      !dir$ ignore_tkr (r) src, (r) dst
+      integer(8), shared :: barrier
+      real(2), device :: src(*)
+      real(2), shared :: dst(*)
+      integer(4), value :: nelems
+    end subroutine
+
+    attributes(device) subroutine tma_bulk_ldr4(barrier, src, dst, nelems)
+      !dir$ ignore_tkr (r) src, (r) dst
+      integer(8), shared :: barrier
+      real(4), device :: src(*)
+      real(4), shared :: dst(*)
+      integer(4), value :: nelems
+    end subroutine
+
+    attributes(device) subroutine tma_bulk_ldr8(barrier, src, dst, nelems)
+      !dir$ ignore_tkr (r) src, (r) dst
+      integer(8), shared :: barrier
+      real(8), device :: src(*)
+      real(8), shared :: dst(*)
+      integer(4), value :: nelems
+    end subroutine
+  end interface
+
+  ! --------------------
+  ! Bulk Store functions
+  ! --------------------
+
+  ! Generic store, count is in bytes
+  ! --------------------------------
+
   interface
     attributes(device) subroutine tma_bulk_s2g(src, dst, nbytes)
       !dir$ ignore_tkr src, dst
@@ -2052,6 +2141,60 @@ attributes(device) subroutine tma_bulk_s2g(src, dst, nbytes)
     end subroutine
   end interface
 
+  ! Load specific types, count is in elements
+  ! -----------------------------------------
+
+  interface tma_bulk_store
+    attributes(device) subroutine tma_bulk_store_c4(src, dst, nelems)
+      !dir$ ignore_tkr (r) src, (r) dst
+      complex(4), shared :: src(*)
+      complex(4), device :: dst(*)
+      integer(4), value :: nelems
+    end subroutine
+
+    attributes(device) subroutine tma_bulk_store_c8(src, dst, nelems)
+      !dir$ ignore_tkr (r) src, (r) dst
+      complex(8), shared :: src(*)
+      complex(8), device :: dst(*)
+      integer(4), value :: nelems
+    end subroutine
+
+    attributes(device) subroutine tma_bulk_store_i4(src, dst, nelems)
+      !dir$ ignore_tkr (r) src, (r) dst
+      integer(4), shared :: src(*)
+      integer(4), device :: dst(*)
+      integer(4), value :: nelems
+    end subroutine
+
+    attributes(device) subroutine tma_bulk_store_i8(src, dst, nelems)
+      !dir$ ignore_tkr (r) src, (r) dst
+      integer(8), shared :: src(*)
+      integer(8), device :: dst(*)
+      integer(4), value :: nelems
+    end subroutine
+
+    attributes(device) subroutine tma_bulk_store_r2(src, dst, nelems)
+      !dir$ ignore_tkr (r) src, (r) dst
+      real(2), shared :: src(*)
+      real(2), device :: dst(*)
+      integer(4), value :: nelems
+    end subroutine
+
+    attributes(device) subroutine tma_bulk_store_r4(src, dst, nelems)
+      !dir$ ignore_tkr (r) src, (r) dst
+      real(4), shared :: src(*)
+      real(4), device :: dst(*)
+      integer(4), value :: nelems
+    end subroutine
+
+    attributes(device) subroutine tma_bulk_store_r8(src, dst, nelems)
+      !dir$ ignore_tkr (r) src, (r) dst
+      real(8), shared :: src(*)
+      real(8), device :: dst(*)
+      integer(4), value :: nelems
+    end subroutine
+  end interface
+
 contains
 
   attributes(device) subroutine syncthreads()
diff --git a/flang/test/Driver/flang-f-opts.f90 b/flang/test/Driver/flang-f-opts.f90
index 77bb4d7aa8a91..9ef0abaa176f0 100644
--- a/flang/test/Driver/flang-f-opts.f90
+++ b/flang/test/Driver/flang-f-opts.f90
@@ -1,5 +1,5 @@
-! Test for warnings generated when parsing driver options. You can use this file for relatively small tests and to avoid creating
-! new test files.
+! Test for errors and warnings generated when parsing driver options. You can
+! use this file for relatively small tests and to avoid creating new test files.
 
 ! RUN: %flang -### -S -O4 -ffp-contract=on %s 2>&1 | FileCheck %s
 
@@ -26,3 +26,20 @@
 ! RUN:     | FileCheck %s -check-prefix=WARN-BUILTIN-MULTIPLE
 ! WARN-BUILTIN-MULTIPLE: warning: '-fbuiltin' is not valid for Fortran
 ! WARN-BUILTIN-MULTIPLE: warning: '-fno-builtin' is not valid for Fortran
+
+! When emitting an error with a suggestion, ensure that the diagnostic message
+! uses '-Xflang' instead of '-Xclang'. This is typically emitted when an option
+! that is available for `flang -fc1` is passed to `flang`. We use -complex-range
+! since it is only available for fc1. If this option is ever exposed to `flang`,
+! a different option will have to be used in the test below.
+!
+! RUN: not %flang -### -complex-range=full %s 2>&1 \
+! RUN:     | FileCheck %s -check-prefix UNKNOWN-SUGGEST
+!
+! UNKNOWN-SUGGEST: error: unknown argument '-complex-range=full';
+! UNKNOWN-SUGGEST-SAME: did you mean '-Xflang -complex-range=full'
+!
+! RUN: not %flang -### -not-an-option %s 2>&1 \
+! RUN:     | FileCheck %s -check-prefix UNKNOWN-NO-SUGGEST
+!
+! UNKNOWN-NO-SUGGEST: error: unknown argument: '-not-an-option'{{$}}
diff --git a/flang/test/Driver/linker-options.f90 b/flang/test/Driver/linker-options.f90
new file mode 100644
index 0000000000000..07f967b4bac5d
--- /dev/null
+++ b/flang/test/Driver/linker-options.f90
@@ -0,0 +1,106 @@
+! Make sure that `-l` is "visible" to Flang's driver
+! RUN: %flang -lpgmath -### %s
+
+! Make sure that `-Wl` is "visible" to Flang's driver
+! RUN: %flang -Wl,abs -### %s
+
+! Make sure that `-fuse-ld' is "visible" to Flang's driver
+! RUN: %flang -fuse-ld= -### %s
+
+! Make sure that `-L' is "visible" to Flang's driver
+! RUN: %flang -L/ -### %s
+
+! ------------------------------------------------------------------------------
+! Check that '-pie' and '-no-pie' are "visible" to Flang's driver. Check that
+! the correct option is added to the link line.
+!
+! Last match "wins"
+! RUN: %flang -target x86_64-pc-linux-gnu -pie -no-pie -### %s 2>&1 \
+! RUN:     | FileCheck %s --check-prefix=NO-PIE
+! RUN: %flang -target x86_64-pc-linux-gnu -no-pie -pie -### %s 2>&1 \
+! RUN:     | FileCheck %s --check-prefix=PIE
+! RUN: %flang -target x86_64-pc-linux-gnu -pie -### %s 2>&1 \
+! RUN:     | FileCheck %s --check-prefix=PIE
+! RUN: %flang -target x86_64-pc-linux-gnu -no-pie -### %s 2>&1 \
+! RUN:     | FileCheck %s --check-prefix=NO-PIE
+!
+! Ensure that "-pie" is passed to the linker.
+! RUN: %flang -target i386-unknown-freebsd -pie -### %s 2>&1 \
+! RUN:     | FileCheck %s --check-prefix=PIE
+! RUN: %flang -target aarch64-pc-linux-gnu -pie -### %s 2>&1 \
+! RUN:     | FileCheck %s --check-prefix=PIE
+!
+! On Musl Linux, PIE is enabled by default, but can be disabled.
+! RUN: %flang -target x86_64-linux-musl -### %s 2>&1 \
+! RUN:   | FileCheck %s --check-prefix=PIE
+! RUN: %flang -target i686-linux-musl -### %s 2>&1 \
+! RUN:   | FileCheck %s --check-prefix=PIE
+! RUN: %flang -target armv6-linux-musleabihf %s -### 2>&1 \
+! RUN:   | FileCheck %s --check-prefix=PIE
+! RUN: %flang -target armv7-linux-musleabihf %s -### 2>&1 \
+! RUN:   | FileCheck %s --check-prefix=PIE
+! RUN: %flang --target=x86_64-linux-musl -no-pie -### 2>&1 \
+! RUN:   | FileCheck %s --check-prefix=NO-PIE
+!
+! On OpenBSD, -pie is not passed to the linker, but can be forced.
+! RUN: %flang -target amd64-pc-openbsd -### %s 2>&1 \
+! RUN:   | FileCheck %s --check-prefix=NO-PIE
+! RUN: %flang -target i386-pc-openbsd -### %s 2>&1 \
+! RUN:   | FileCheck %s --check-prefix=NO-PIE
+! RUN: %flang -target aarch64-unknown-openbsd -### %s 2>&1 \
+! RUN:   | FileCheck %s --check-prefix=NO-PIE
+! RUN: %flang -target arm-unknown-openbsd -### %s 2>&1 \
+! RUN:   | FileCheck %s --check-prefix=NO-PIE
+! RUN: %flang -target powerpc-unknown-openbsd -### %s 2>&1 \
+! RUN:   | FileCheck %s --check-prefix=NO-PIE
+! RUN: %flang -target sparc64-unknown-openbsd -### %s 2>&1 \
+! RUN:   | FileCheck %s --check-prefix=NO-PIE
+! RUN: %flang -target i386-pc-openbsd -pie -### %s 2>&1 \
+! RUN:   | FileCheck %s --check-prefix=PIE
+!
+! On FreeBSD, -pie is not passed to the linker, but can be forced.
+! RUN: %flang -target amd64-pc-freebsd -### %s 2>&1 \
+! RUN:   | FileCheck %s --check-prefix=NO-PIE
+! RUN: %flang -target i386-pc-freebsd -### %s 2>&1 \
+! RUN:   | FileCheck %s --check-prefix=NO-PIE
+! RUN: %flang -target aarch64-unknown-freebsd -### %s 2>&1 \
+! RUN:   | FileCheck %s --check-prefix=NO-PIE
+! RUN: %flang -target arm-unknown-freebsd -### %s 2>&1 \
+! RUN:   | FileCheck %s --check-prefix=NO-PIE
+! RUN: %flang -target powerpc-unknown-freebsd -### %s 2>&1 \
+! RUN:   | FileCheck %s --check-prefix=NO-PIE
+! RUN: %flang -target sparc64-unknown-freebsd -### %s 2>&1 \
+! RUN:   | FileCheck %s --check-prefix=NO-PIE
+! RUN: %flang -target i386-pc-freebsd -pie -### %s 2>&1 \
+! RUN:   | FileCheck %s --check-prefix=PIE
+!
+! On AIX, -pie is never passed to the linker.
+! RUN: %flang -target powerpc64-unknown-aix -### %s 2>&1 \
+! RUN:     | FileCheck %s --check-prefixes=NO-PIE
+! RUN: %flang -target powerpc64-unknown-aix -pie -### %s 2>&1 \
+! RUN:     | FileCheck %s --check-prefixes=NO-PIE,UNUSED
+! RUN: %flang -target powerpc64-unknown-aix -no-pie -### %s 2>&1 \
+! RUN:     | FileCheck %s --check-prefixes=NO-PIE,UNUSED
+!
+! On MinGW and Windows, -pie may be specified, but it is ignored.
+! RUN: %flang -target aarch64-pc-windows-gnu -### %s 2>&1 \
+! RUN:   | FileCheck %s --check-prefixes=NO-PIE
+! RUN: %flang -target x86_64-pc-windows-gnu -pie -### %s 2>&1 \
+! RUN:   | FileCheck %s --check-prefixes=NO-PIE,UNUSED
+! RUN: %flang -target i686-pc-windows-gnu -no-pie -### %s 2>&1 \
+! RUN:   | FileCheck %s --check-prefixes=NO-PIE,UNUSED
+! RUN: %flang -target aarch64-windows-msvc -### %s 2>&1 \
+! RUN:     | FileCheck %s --check-prefixes=NO-PIE
+! RUN: %flang -target aarch64-windows-msvc -pie -### %s 2>&1 \
+! RUN:     | FileCheck %s --check-prefixes=NO-PIE,UNUSED
+! RUN: %flang -target aarch64-windows-msvc -no-pie -### %s 2>&1 \
+! RUN:     | FileCheck %s --check-prefixes=NO-PIE,UNUSED
+!
+! PIE: "-pie"
+! NO-PIE-NOT: "-pie"
+! UNUSED: warning: argument unused during compilation: '{{(-no)?}}-pie'
+! ------------------------------------------------------------------------------
+
+program hello
+  write(*,*), "Hello world!"
+end program hello
diff --git a/flang/test/Driver/misc-flags.f90 b/flang/test/Driver/misc-flags.f90
deleted file mode 100644
index 61d763c5b64dd..0000000000000
--- a/flang/test/Driver/misc-flags.f90
+++ /dev/null
@@ -1,15 +0,0 @@
-! Make sure that `-l` is "visible" to Flang's driver
-! RUN: %flang -lpgmath -### %s
-
-! Make sure that `-Wl` is "visible" to Flang's driver
-! RUN: %flang -Wl,abs -### %s
-
-! Make sure that `-fuse-ld' is "visible" to Flang's driver
-! RUN: %flang -fuse-ld= -### %s
-
-! Make sure that `-L' is "visible" to Flang's driver
-! RUN: %flang -L/ -### %s
-
-program hello
-  write(*,*), "Hello world!"
-end program hello
diff --git a/flang/test/Evaluate/folding33.f90 b/flang/test/Evaluate/folding33.f90
new file mode 100644
index 0000000000000..fb5a23cf1f209
--- /dev/null
+++ b/flang/test/Evaluate/folding33.f90
@@ -0,0 +1,4 @@
+!RUN: %flang_fc1 -fsyntax-only %s 2>&1 | FileCheck %s
+!CHECK: warning: overflow on REAL(4) to REAL(2) conversion after folding a call to 'exp' [-Wfolding-exception]
+print *, exp((11.265625_2,1._2))
+end
diff --git a/flang/test/Fir/CUDA/cuda-target-rewrite.mlir b/flang/test/Fir/CUDA/cuda-target-rewrite.mlir
index 48fee10f3db97..5562e00085526 100644
--- a/flang/test/Fir/CUDA/cuda-target-rewrite.mlir
+++ b/flang/test/Fir/CUDA/cuda-target-rewrite.mlir
@@ -108,3 +108,23 @@ module attributes {gpu.container_module, fir.defaultkind = "a1c4d8i4l4r4", fir.k
   }
 }
 
+// -----
+
+module attributes {gpu.container_module, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+  gpu.module @testmod {
+    gpu.func @_QPtest(%arg0: complex<f32>) -> () kernel {
+      gpu.return
+    }
+  }
+  func.func @main(%arg0: complex<f32>) {
+    %0 = llvm.mlir.constant(0 : i64) : i64
+    %1 = llvm.mlir.constant(0 : i32) : i32
+    %2 = fir.alloca i64
+    %3 = cuf.stream_cast %2 : !fir.ref<i64>
+    %4 = gpu.launch_func async [%3] @testmod::@_QPtest blocks in (%0, %0, %0) threads in (%0, %0, %0) : i64 dynamic_shared_memory_size %1 args(%arg0 : complex<f32>) {cuf.proc_attr = #cuf.cuda_proc<global>}
+    return
+  }
+}
+
+// CHECK-LABEL: func.func @main
+// CHECK: %{{.*}} = gpu.launch_func async [%{{.*}}] @testmod::@_QPtest blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) : i64 dynamic_shared_memory_size %{{.*}} args(%{{.*}} : !fir.vector<2:f32>) {cuf.proc_attr = #cuf.cuda_proc<global>}
diff --git a/flang/test/Fir/OpenACC/openacc-mappable.fir b/flang/test/Fir/OpenACC/openacc-mappable.fir
index 05df35a482907..00fe2574da62a 100644
--- a/flang/test/Fir/OpenACC/openacc-mappable.fir
+++ b/flang/test/Fir/OpenACC/openacc-mappable.fir
@@ -21,11 +21,13 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<f16 = dense<16> : vector<2xi64>,
   // CHECK: Mappable: !fir.box<!fir.array<10xf32>>
   // CHECK: Type category: array
   // CHECK: Size: 40
+  // CHECK: Has unknown dimensions: false
 
   // CHECK: Visiting: %{{.*}} = acc.copyin varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {name = "arr", structured = false}
   // CHECK: Pointer-like and Mappable: !fir.ref<!fir.array<10xf32>>
   // CHECK: Type category: array
   // CHECK: Size: 40
+  // CHECK: Has unknown dimensions: false
 
   // This second test exercises argument of explicit-shape arrays in following forms:
   // `real :: arr1(nn), arr2(2:nn), arr3(10)`
@@ -62,6 +64,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<f16 = dense<16> : vector<2xi64>,
   // CHECK: Visiting: %{{.*}} = acc.copyin varPtr(%{{.*}} : !fir.ref<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> {name = "arr1", structured = false}
   // CHECK: Pointer-like and Mappable: !fir.ref<!fir.array<?xf32>>
   // CHECK: Type category: array
+  // CHECK: Has unknown dimensions: true
   // CHECK: Shape: %{{.*}} = fir.shape %[[EXTENT1:.*]] : (index) -> !fir.shape<1>
   // CHECK: Bound[0]: %{{.*}} = acc.bounds lowerbound(%[[LB1:.*]] : index) upperbound(%[[UB1:.*]] : index) extent(%{{.*}} : index) stride(%c1{{.*}} : index) startIdx(%c1{{.*}} : index)
   // CHECK: Lower bound: %[[LB1]] = arith.constant 0 : index
@@ -70,6 +73,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<f16 = dense<16> : vector<2xi64>,
   // CHECK: Visiting: %{{.*}} = acc.copyin varPtr(%{{.*}} : !fir.ref<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> {name = "arr2", structured = false}
   // CHECK: Pointer-like and Mappable: !fir.ref<!fir.array<?xf32>>
   // CHECK: Type category: array
+  // CHECK: Has unknown dimensions: true
   // CHECK: Shape: %{{.*}} = fir.shape_shift %c2{{.*}}, %[[EXTENT2:.*]] : (index, index) -> !fir.shapeshift<1>
   // CHECK: Bound[0]: %{{.*}} = acc.bounds lowerbound(%[[LB2:.*]] : index) upperbound(%[[UB2:.*]] : index) extent(%{{.*}} : index) stride(%c1{{.*}} : index) startIdx(%c2{{.*}} : index)
   // CHECK: Lower bound: %[[LB2]] = arith.constant 0 : index
@@ -80,6 +84,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<f16 = dense<16> : vector<2xi64>,
   // CHECK: Type category: array
   // CHECK: Size: 40
   // CHECK: Offset: 0
+  // CHECK: Has unknown dimensions: false
   // CHECK: Shape: %{{.*}} = fir.shape %[[EXTENT3:.*]] : (index) -> !fir.shape<1>
   // CHECK: Bound[0]: %{{.*}} = acc.bounds lowerbound(%[[LB3:.*]] : index) upperbound(%[[UB3:.*]] : index) extent(%c10{{.*}} : index) stride(%c1{{.*}} : index) startIdx(%c1{{.*}} : index)
   // CHECK: Lower bound: %[[LB3]] = arith.constant 0 : index
diff --git a/flang/test/HLFIR/order_assignments/forall-pointer-assignment-codegen.fir b/flang/test/HLFIR/order_assignments/forall-pointer-assignment-codegen.fir
index 1d198765aff9e..855b62ca0ed39 100644
--- a/flang/test/HLFIR/order_assignments/forall-pointer-assignment-codegen.fir
+++ b/flang/test/HLFIR/order_assignments/forall-pointer-assignment-codegen.fir
@@ -91,10 +91,8 @@ func.func @test_need_to_save_rhs(%n: i64, %arg1: !fir.box<!fir.array<?x!ptr_wrap
 // CHECK:             %[[VAL_21:.*]] = hlfir.designate %[[VAL_6]]#0 (%[[VAL_20]])  : (!fir.box<!fir.array<?x!fir.type<ptr_wrapper{p:!fir.box<!fir.ptr<!fir.type<t{i:i64}>>>}>>>, i64) -> !fir.ref<!fir.type<ptr_wrapper{p:!fir.box<!fir.ptr<!fir.type<t{i:i64}>>>}>>
 // CHECK:             %[[VAL_22:.*]] = hlfir.designate %[[VAL_21]]{"p"}   {fortran_attrs = #fir.var_attrs<pointer>} : (!fir.ref<!fir.type<ptr_wrapper{p:!fir.box<!fir.ptr<!fir.type<t{i:i64}>>>}>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.type<t{i:i64}>>>>
 // CHECK:             %[[VAL_23:.*]] = fir.load %[[VAL_22]] : !fir.ref<!fir.box<!fir.ptr<!fir.type<t{i:i64}>>>>
-// CHECK:             %[[VAL_24:.*]] = fir.box_addr %[[VAL_23]] : (!fir.box<!fir.ptr<!fir.type<t{i:i64}>>>) -> !fir.ptr<!fir.type<t{i:i64}>>
-// CHECK:             %[[VAL_25:.*]] = fir.embox %[[VAL_24]] : (!fir.ptr<!fir.type<t{i:i64}>>) -> !fir.box<!fir.type<t{i:i64}>>
-// CHECK:             %[[VAL_26:.*]] = fir.convert %[[VAL_25]] : (!fir.box<!fir.type<t{i:i64}>>) -> !fir.box<none>
-// CHECK:             fir.call @_FortranAPushDescriptor(%[[VAL_16]], %[[VAL_26]]) : (!fir.llvm_ptr<i8>, !fir.box<none>) -> ()
+// CHECK:             %[[VAL_24:.*]] = fir.convert %[[VAL_23]] : (!fir.box<!fir.ptr<!fir.type<t{i:i64}>>>) -> !fir.box<none>
+// CHECK:             fir.call @_FortranAPushDescriptor(%[[VAL_16]], %[[VAL_24]]) : (!fir.llvm_ptr<i8>, !fir.box<none>) -> ()
 // CHECK:           }
 // CHECK:           %[[VAL_27:.*]] = fir.convert %[[VAL_4]] : (i64) -> index
 // CHECK:           %[[VAL_28:.*]] = fir.convert %[[VAL_0]] : (i64) -> index
diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf
index 5c4c3c6d39820..09b4302446ee7 100644
--- a/flang/test/Lower/CUDA/cuda-device-proc.cuf
+++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf
@@ -431,7 +431,7 @@ end subroutine
 ! CHECK: %[[COUNT:.*]] = arith.constant 256 : i32
 ! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref<i64>) -> !llvm.ptr
 ! CHECK: %[[SHARED_PTR:.*]] = llvm.addrspacecast %[[LLVM_PTR]] : !llvm.ptr to !llvm.ptr<3>
-! CHECK: nvvm.mbarrier.init.shared %[[SHARED_PTR]], %[[COUNT]] : !llvm.ptr<3>, i32
+! CHECK: nvvm.mbarrier.init %[[SHARED_PTR]], %[[COUNT]] : !llvm.ptr<3>, i32
 ! CHECK: nvvm.fence.proxy {kind = #nvvm.proxy_kind<async.shared>, space = #nvvm.shared_space<cta>}
 
 ! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref<i64>) -> !llvm.ptr
@@ -468,7 +468,18 @@ attributes(global) subroutine test_bulk_g2s(a)
 end subroutine
 
 ! CHECK-LABEL: func.func @_QPtest_bulk_g2s
-! CHECK: nvvm.cp.async.bulk.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : <7>, <1>
+! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %4 {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_bulk_g2sEbarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[DST:.*]]:2 = hlfir.declare %16(%17) {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_bulk_g2sEtmpa"} : (!fir.ref<!fir.array<1024xf64>>, !fir.shape<1>) -> (!fir.ref<!fir.array<1024xf64>>, !fir.ref<!fir.array<1024xf64>>)
+! CHECK: %[[COUNT:.*]]:2 = hlfir.declare %19 {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_bulk_g2sEtx_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)    
+! CHECK: %[[SRC:.*]] = hlfir.designate %{{.*}} (%{{.*}})  : (!fir.box<!fir.array<?xf64>>, i64) -> !fir.ref<f64>
+! CHECK: %[[COUNT_LOAD:.*]] = fir.load %20#0 : !fir.ref<i32>
+! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr
+! CHECK: %[[BARRIER_3:.*]] = llvm.addrspacecast %[[BARRIER_PTR]] : !llvm.ptr to !llvm.ptr<3>
+! CHECK: %[[DST_PTR:.*]] = fir.convert %[[DST]]#0 : (!fir.ref<!fir.array<1024xf64>>) -> !llvm.ptr
+! CHECK: %[[DST_7:.*]] = llvm.addrspacecast %[[DST_PTR]] : !llvm.ptr to !llvm.ptr<7>
+! CHECK: %[[SRC_PTR:.*]] = fir.convert %[[SRC]] : (!fir.ref<f64>) -> !llvm.ptr
+! CHECK: %[[SRC_3:.*]] = llvm.addrspacecast %[[SRC_PTR]] : !llvm.ptr to !llvm.ptr<1>
+! CHECK: nvvm.cp.async.bulk.shared.cluster.global %[[DST_7]], %[[SRC_3]], %[[BARRIER_3]], %[[COUNT_LOAD]] : <7>, <1>
 
 attributes(global) subroutine test_bulk_s2g(a)
   real(8), device :: a(*)
@@ -479,6 +490,8 @@ end subroutine
 
 ! CHECK-LABEL: func.func @_QPtest_bulk_s2g
 ! CHECL: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3>
+! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group"
+! CHECK: nvvm.cp.async.bulk.wait_group 0
 
 attributes(device) subroutine testAtomicCasLoop(aa, n)
   integer :: a
@@ -492,3 +505,250 @@ end subroutine
 ! CHECK: %[[CASTED_CMP_XCHG_EV:.*]] = fir.convert %[[CMP_XCHG_EV]] : (i1) -> i32
 ! CHECK: %{{.*}} = arith.constant 1 : i32
 ! CHECK: %19 = arith.cmpi eq, %[[CASTED_CMP_XCHG_EV]], %{{.*}} : i32
+
+attributes(global) subroutine test_barrier_try_wait()
+  integer :: istat
+  integer(8), shared :: barrier1
+  integer(8) :: token
+  istat = barrier_try_wait(barrier1, token)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_barrier_try_wait()
+! CHECK: scf.while
+! CHECK: %{{.*}} = nvvm.inline_ptx ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}}; selp.b32 %{{.*}}, 1, 0, p;" ro(%{{.*}}, %{{.*}}, %c1000000{{.*}} : !llvm.ptr, i64, i32) -> i32
+
+attributes(global) subroutine test_barrier_try_wait_sleep()
+  integer :: istat
+  integer(8), shared :: barrier1
+  integer(8) :: token
+  integer(4) :: sleep_time
+  istat = barrier_try_wait_sleep(barrier1, token, sleep_time)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_barrier_try_wait_sleep()
+! CHECK: %{{.*}} = nvvm.inline_ptx ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}}; selp.b32 %0, 1, 0, p;" ro(%{{.*}}, %{{.*}}, %{{.*}} : !llvm.ptr, i64, i32) -> i32
+
+attributes(global) subroutine test_tma_bulk_load_c4(a, n)
+  integer(8), shared :: barrier1
+  integer, value :: n
+  complex(4), device :: r8(n)
+  complex(4), shared :: tmp(1024)
+  integer(4) :: j, elem_count
+  call tma_bulk_load(barrier1, r8(j), tmp, elem_count)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_c4
+! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_c4Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_c4Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
+! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 8 : i32
+! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
+! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr
+! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr, !llvm.ptr, i32, !llvm.ptr)
+! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32)
+
+attributes(global) subroutine test_tma_bulk_load_c8(a, n)
+  integer(8), shared :: barrier1
+  integer, value :: n
+  complex(8), device :: r8(n)
+  complex(8), shared :: tmp(1024)
+  integer(4) :: j, elem_count
+  call tma_bulk_load(barrier1, r8(j), tmp, elem_count)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_c8
+! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_c8Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_c8Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
+! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 16 : i32
+! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
+! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr
+! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr, !llvm.ptr, i32, !llvm.ptr)
+! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32)
+
+attributes(global) subroutine test_tma_bulk_load_i4(a, n)
+  integer(8), shared :: barrier1
+  integer, value :: n
+  integer(4), device :: r8(n)
+  integer(4), shared :: tmp(1024)
+  integer(4) :: j, elem_count
+  call tma_bulk_load(barrier1, r8(j), tmp, elem_count)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_i4
+! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_i4Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_i4Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
+! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 4 : i32
+! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
+! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr
+! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr, !llvm.ptr, i32, !llvm.ptr)
+! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32)
+
+attributes(global) subroutine test_tma_bulk_load_i8(a, n)
+  integer(8), shared :: barrier1
+  integer, value :: n
+  integer(8), device :: r8(n)
+  integer(8), shared :: tmp(1024)
+  integer(4) :: j, elem_count
+  call tma_bulk_load(barrier1, r8(j), tmp, elem_count)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_i8
+! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_i8Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_i8Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
+! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 8 : i32
+! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
+! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr
+! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr, !llvm.ptr, i32, !llvm.ptr)
+! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32)
+
+attributes(global) subroutine test_tma_bulk_load_r2(a, n)
+  integer(8), shared :: barrier1
+  integer, value :: n
+  real(2), device :: r8(n)
+  real(2), shared :: tmp(1024)
+  integer(4) :: j, elem_count
+  call tma_bulk_load(barrier1, r8(j), tmp, elem_count)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_r2
+! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_r2Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_r2Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
+! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 2 : i32
+! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
+! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr
+! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr, !llvm.ptr, i32, !llvm.ptr)
+! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32)
+
+attributes(global) subroutine test_tma_bulk_load_r4(a, n)
+  integer(8), shared :: barrier1
+  integer, value :: n
+  real(4), device :: r8(n)
+  real(4), shared :: tmp(1024)
+  integer(4) :: j, elem_count
+  call tma_bulk_load(barrier1, r8(j), tmp, elem_count)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_r4
+! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_r4Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_r4Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
+! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 4 : i32
+! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
+! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr
+! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr, !llvm.ptr, i32, !llvm.ptr)
+! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32)
+
+attributes(global) subroutine test_tma_bulk_load_r8(a, n)
+  integer(8), shared :: barrier1
+  integer, value :: n
+  real(8), device :: r8(n)
+  real(8), shared :: tmp(1024)
+  integer(4) :: j, elem_count
+  call tma_bulk_load(barrier1, r8(j), tmp, elem_count)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_r8
+! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_r8Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_r8Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
+! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 8 : i32
+! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
+! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr
+! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr, !llvm.ptr, i32, !llvm.ptr)
+! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32)
+
+attributes(global) subroutine test_tma_bulk_store_c4(c, n)
+  integer, value :: n
+  complex(4), device :: c(n)
+  complex(4), shared :: tmpa(1024)
+  integer(4) :: j, elem_count
+  call tma_bulk_store(tmpa, c(j), elem_count)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_c4
+! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3>
+! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group"
+! CHECK: nvvm.cp.async.bulk.wait_group 0
+
+attributes(global) subroutine test_tma_bulk_store_c8(c, n)
+  integer, value :: n
+  complex(8), device :: c(n)
+  complex(8), shared :: tmpa(1024)
+  integer(4) :: j, elem_count
+  call tma_bulk_store(tmpa, c(j), elem_count)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_c8
+! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3>
+! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group"
+! CHECK: nvvm.cp.async.bulk.wait_group 0
+
+attributes(global) subroutine test_tma_bulk_store_i4(c, n)
+  integer, value :: n
+  integer(4), device :: c(n)
+  integer(4), shared :: tmpa(1024)
+  integer(4) :: j, elem_count
+  call tma_bulk_store(tmpa, c(j), elem_count)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_i4
+! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3>
+! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group"
+! CHECK: nvvm.cp.async.bulk.wait_group 0
+
+attributes(global) subroutine test_tma_bulk_store_i8(c, n)
+  integer, value :: n
+  integer(8), device :: c(n)
+  integer(8), shared :: tmpa(1024)
+  integer(4) :: j, elem_count
+  call tma_bulk_store(tmpa, c(j), elem_count)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_i8
+! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3>
+! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group"
+! CHECK: nvvm.cp.async.bulk.wait_group 0
+
+
+attributes(global) subroutine test_tma_bulk_store_r2(c, n)
+  integer, value :: n
+  real(2), device :: c(n)
+  real(2), shared :: tmpa(1024)
+  integer(4) :: j, elem_count
+  call tma_bulk_store(tmpa, c(j), elem_count)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_r2
+! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3>
+! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group"
+! CHECK: nvvm.cp.async.bulk.wait_group 0
+
+attributes(global) subroutine test_tma_bulk_store_r4(c, n)
+  integer, value :: n
+  real(4), device :: c(n)
+  real(4), shared :: tmpa(1024)
+  integer(4) :: j, elem_count
+  call tma_bulk_store(tmpa, c(j), elem_count)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_r4
+! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3>
+! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group"
+! CHECK: nvvm.cp.async.bulk.wait_group 0
+
+attributes(global) subroutine test_tma_bulk_store_r8(c, n)
+  integer, value :: n
+  real(8), device :: c(n)
+  real(8), shared :: tmpa(1024)
+  integer(4) :: j, elem_count
+  call tma_bulk_store(tmpa, c(j), elem_count)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_r8
+! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3>
+! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group"
+! CHECK: nvvm.cp.async.bulk.wait_group 0
diff --git a/flang/test/Lower/OpenMP/atomic-read-complex.f90 b/flang/test/Lower/OpenMP/atomic-read-complex.f90
new file mode 100644
index 0000000000000..2f51f03820926
--- /dev/null
+++ b/flang/test/Lower/OpenMP/atomic-read-complex.f90
@@ -0,0 +1,34 @@
+! Test lowering of atomic read to LLVM IR for complex types.
+! This is a regression test for issue #165184.
+
+! RUN: %flang_fc1 -emit-llvm -fopenmp -o - %s | FileCheck %s
+
+! Test that atomic read operations with complex types emit the correct
+! size parameter to __atomic_load:
+! - complex(4) (8 bytes total): should call __atomic_load(i64 8, ...)
+! - complex(8) (16 bytes total): should call __atomic_load(i64 16, ...)
+
+program atomic_read_complex
+  implicit none
+
+  ! Test complex(4) - single precision (8 bytes)
+  complex(4) :: c41, c42
+  ! Test complex(8) - double precision (16 bytes)
+  complex(8) :: c81, c82
+  
+  c42 = (1.0_4, 1.0_4)
+  c82 = (1.0_8, 1.0_8)
+
+  ! CHECK-LABEL: define {{.*}} @_QQmain
+
+  ! Single precision complex: 8 bytes
+  ! CHECK: call void @__atomic_load(i64 8, ptr {{.*}}, ptr {{.*}}, i32 {{.*}})
+!$omp atomic read
+  c41 = c42
+  
+  ! Double precision complex: 16 bytes (this was broken before the fix)
+  ! CHECK: call void @__atomic_load(i64 16, ptr {{.*}}, ptr {{.*}}, i32 {{.*}})
+!$omp atomic read
+  c81 = c82
+
+end program atomic_read_complex
diff --git a/flang/test/Lower/OpenMP/atomic-write-complex.f90 b/flang/test/Lower/OpenMP/atomic-write-complex.f90
new file mode 100644
index 0000000000000..48cfe26ca5a49
--- /dev/null
+++ b/flang/test/Lower/OpenMP/atomic-write-complex.f90
@@ -0,0 +1,34 @@
+! Test lowering of atomic write to LLVM IR for complex types.
+! This is a regression test for issue #165184.
+
+! RUN: %flang_fc1 -emit-llvm -fopenmp -o - %s | FileCheck %s
+
+! Test that atomic write operations with complex types emit the correct
+! size parameter to __atomic_store:
+! - complex(4) (8 bytes total): should call __atomic_store(i64 8, ...)
+! - complex(8) (16 bytes total): should call __atomic_store(i64 16, ...)
+
+program atomic_write_complex
+  implicit none
+
+  ! Test complex(4) - single precision (8 bytes)
+  complex(4) :: c41, c42
+  ! Test complex(8) - double precision (16 bytes)  
+  complex(8) :: c81, c82
+  
+  c42 = (1.0_4, 1.0_4)
+  c82 = (1.0_8, 1.0_8)
+
+  ! CHECK-LABEL: define {{.*}} @_QQmain
+  
+  ! Single precision complex: 8 bytes
+  ! CHECK: call void @__atomic_store(i64 8, ptr {{.*}}, ptr {{.*}}, i32 {{.*}})
+!$omp atomic write
+  c41 = c42
+  
+  ! Double precision complex: 16 bytes (this was broken before the fix)
+  ! CHECK: call void @__atomic_store(i64 16, ptr {{.*}}, ptr {{.*}}, i32 {{.*}})
+!$omp atomic write
+  c81 = c82
+
+end program atomic_write_complex
diff --git a/flang/test/Lower/forall-polymorphic.f90 b/flang/test/Lower/forall-pointer-assignment.f90
similarity index 66%
rename from flang/test/Lower/forall-polymorphic.f90
rename to flang/test/Lower/forall-pointer-assignment.f90
index 2b7a51f9b549a..ec142e3f13ebc 100644
--- a/flang/test/Lower/forall-polymorphic.f90
+++ b/flang/test/Lower/forall-pointer-assignment.f90
@@ -1,6 +1,7 @@
-! Test lower of FORALL polymorphic pointer assignment 
+! Test lower of FORALL pointer assignment 
 ! RUN: bbc -emit-fir %s -o - | FileCheck %s
 
+
 !! Test when LHS is polymorphic and RHS is not polymorphic
 ! CHECK-LABEL: c.func @_QPforallpolymorphic
   subroutine forallPolymorphic()
@@ -46,6 +47,7 @@ subroutine forallPolymorphic()
 
   end subroutine forallPolymorphic
 
+
 !! Test when LHS is not polymorphic but RHS is polymorphic
 ! CHECK-LABEL: c.func @_QPforallpolymorphic2(
 ! CHECK-SAME: %arg0: !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>>> {fir.bindc_name = "tar1", fir.target}) {
@@ -87,3 +89,86 @@ subroutine forallPolymorphic2(Tar1)
 
   end subroutine forallPolymorphic2
 
+
+!! Test when LHS is unlimited polymorphic and RHS non-polymorphic intrinsic
+!! type target.
+! CHECK-LABEL: c.func @_QPforallpolymorphic3
+subroutine forallPolymorphic3()
+  TYPE :: DT
+    CLASS(*), POINTER    :: Ptr => NULL()
+  END TYPE
+
+  TYPE(DT) :: D1(10)
+  CHARACTER*1, TARGET :: TAR1(10)
+  INTEGER :: I
+
+  FORALL (I=1:10)
+    D1(I)%Ptr => Tar1(I)
+  END FORALL
+
+! CHECK: %[[V_7:[0-9]+]] = fir.alloca !fir.array<10x!fir.type<_QFforallpolymorphic3Tdt{ptr:!fir.class<!fir.ptr<none>>}>> {bindc_name = "d1", uniq_name = "_QFforallpolymorphic3Ed1"}
+! CHECK: %[[V_8:[0-9]+]] = fir.shape %c10 : (index) -> !fir.shape<1>
+! CHECK: %[[V_9:[0-9]+]] = fir.declare %[[V_7]](%[[V_8]]) {uniq_name = "_QFforallpolymorphic3Ed1"} : (!fir.ref<!fir.array<10x!fir.type<_QFforallpolymorphic3Tdt{ptr:!fir.class<!fir.ptr<none>>}>>>, !fir.shape<1>) -> !fir.ref<!fir.array<10x!fir.type<_QFforallpolymorphic3Tdt{ptr:!fir.class<!fir.ptr<none>>}>>>
+! CHECK: %[[V_16:[0-9]+]] = fir.alloca !fir.array<10x!fir.char<1>> {bindc_name = "tar1", fir.target, uniq_name = "_QFforallpolymorphic3Etar1"}
+! CHECK: %[[V_17:[0-9]+]] = fir.declare %[[V_16]](%[[V_8]]) typeparams %c1 {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFforallpolymorphic3Etar1"} : (!fir.ref<!fir.array<10x!fir.char<1>>>, !fir.shape<1>, index) -> !fir.ref<!fir.array<10x!fir.char<1>>>
+! CHECK: %[[V_24:[0-9]+]] = fir.convert %c1_i32 : (i32) -> index
+! CHECK: %[[V_25:[0-9]+]] = fir.convert %c10_i32 : (i32) -> index
+! CHECK: fir.do_loop %arg0 = %[[V_24]] to %[[V_25]] step %c1
+! CHECK: {
+! CHECK: %[[V_26:[0-9]+]] = fir.convert %arg0 : (index) -> i32
+! CHECK: %[[V_27:[0-9]+]] = fir.convert %[[V_26]] : (i32) -> i64
+! CHECK: %[[V_28:[0-9]+]] = fir.array_coor %[[V_9]](%[[V_8]]) %[[V_27]] : (!fir.ref<!fir.array<10x!fir.type<_QFforallpolymorphic3Tdt{ptr:!fir.class<!fir.ptr<none>>}>>>, !fir.shape<1>, i64) -> !fir.ref<!fir.type<_QFforallpolymorphic3Tdt{ptr:!fir.class<!fir.ptr<none>>}>>
+! CHECK: %[[V_29:[0-9]+]] = fir.field_index ptr, !fir.type<_QFforallpolymorphic3Tdt{ptr:!fir.class<!fir.ptr<none>>}>
+! CHECK: %[[V_30:[0-9]+]] = fir.coordinate_of %[[V_28]], ptr : (!fir.ref<!fir.type<_QFforallpolymorphic3Tdt{ptr:!fir.class<!fir.ptr<none>>}>>) -> !fir.ref<!fir.class<!fir.ptr<none>>>
+! CHECK: %[[V_31:[0-9]+]] = fir.convert %[[V_26]] : (i32) -> i64
+! CHECK: %[[V_32:[0-9]+]] = fir.array_coor %[[V_17]](%[[V_8]]) %31 : (!fir.ref<!fir.array<10x!fir.char<1>>>, !fir.shape<1>, i64) -> !fir.ref<!fir.char<1>>
+! CHECK: %[[V_33:[0-9]+]] = fir.embox %[[V_32]] : (!fir.ref<!fir.char<1>>) -> !fir.box<!fir.ptr<!fir.char<1>>>
+! CHECK: %[[V_34:[0-9]+]] = fir.rebox %[[V_33]] : (!fir.box<!fir.ptr<!fir.char<1>>>) -> !fir.class<!fir.ptr<none>>
+! CHECK: fir.store %[[V_34]] to %[[V_30]] : !fir.ref<!fir.class<!fir.ptr<none>>>
+! CHECK: }
+
+end subroutine forallPolymorphic3
+
+
+!! Test the LHS of a pointer assignment gets the isPointer flag from the
+!! RHS that is a reference to a function that returns a pointer.
+! CHECK-LABEL: c.func @_QPforallpointerassignment1
+  subroutine forallPointerAssignment1()
+    type base
+        real, pointer :: data => null()
+    end type
+
+    interface
+      pure function makeData (i)
+        real, pointer :: makeData
+        integer*4, intent(in) :: i
+      end function
+    end interface
+
+    type(base) :: co1(10)
+
+    forall (i=1:10)
+        co1(i)%data => makeData (i)
+    end forall
+
+! CHECK: %[[V_3:[0-9]+]] = fir.alloca i64
+! CHECK: %[[V_3:[0-9]+]] = fir.alloca i32 {bindc_name = "i"}
+! CHECK: %[[V_4:[0-9]+]] = fir.alloca !fir.box<!fir.ptr<f32>> {bindc_name = ".result"}
+! CHECK: %[[V_25:[0-9]+]] = fir.convert %c1_i32 : (i32) -> index
+! CHECK: %[[V_26:[0-9]+]] = fir.convert %c10_i32 : (i32) -> index
+! CHECK: %[[V_27:[0-9]+]] = fir.address_of(@{{_QQcl.*}}) : !fir.ref<!fir.char<1,{{.*}}>>
+! CHECK: %[[V_28:[0-9]+]] = fir.convert %[[V_27]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
+! CHECK: %[[V_29:[0-9]+]] = fir.call @_FortranACreateDescriptorStack(%[[V_28]], %c{{.*}}) : (!fir.ref<i8>, i32) -> !fir.llvm_ptr<i8>
+! CHECK: fir.do_loop %arg0 = %[[V_25]] to %[[V_26]] step %c1
+! CHECK: {
+! CHECK: %[[V_32:[0-9]+]] = fir.convert %arg0 : (index) -> i32
+! CHECK: fir.store %[[V_32]] to %[[V_3]] : !fir.ref<i32>
+! CHECK: %[[V_33:[0-9]+]] = fir.call @_QPmakedata(%[[V_3]]) proc_attrs<pure> fastmath<contract> : (!fir.ref<i32>) -> !fir.box<!fir.ptr<f32>>
+! CHECK: fir.save_result %[[V_33]] to %[[V_4]] : !fir.box<!fir.ptr<f32>>, !fir.ref<!fir.box<!fir.ptr<f32>>>
+! CHECK: %[[V_34:[0-9]+]] = fir.declare %[[V_4]] {uniq_name = ".tmp.func_result"} : (!fir.ref<!fir.box<!fir.ptr<f32>>>) -> !fir.ref<!fir.box<!fir.ptr<f32>>>
+! CHECK: %[[V_35:[0-9]+]] = fir.load %[[V_34]] : !fir.ref<!fir.box<!fir.ptr<f32>>>
+! CHECK: %[[V_36:[0-9]+]] = fir.convert %[[V_35]] : (!fir.box<!fir.ptr<f32>>) -> !fir.box<none>
+! CHECK: fir.call @_FortranAPushDescriptor(%[[V_29]], %[[V_36]]) : (!fir.llvm_ptr<i8>, !fir.box<none>) -> ()
+! CHECK: }
+
+  end subroutine forallPointerAssignment1
diff --git a/flang/test/Parser/OpenMP/declare-reduction-multi.f90 b/flang/test/Parser/OpenMP/declare-reduction-multi.f90
index a682958eb9128..88566613bd412 100644
--- a/flang/test/Parser/OpenMP/declare-reduction-multi.f90
+++ b/flang/test/Parser/OpenMP/declare-reduction-multi.f90
@@ -26,7 +26,8 @@ program omp_examples
   type(tt) :: values(n), sum, prod, big, small
 
   !$omp declare reduction(+:tt:omp_out%r = omp_out%r + omp_in%r) initializer(omp_priv%r = 0)
-!CHECK: !$OMP DECLARE REDUCTION(+:tt: omp_out%r = omp_out%r+omp_in%r) INITIALIZER(omp_priv%r = 0_4)
+!CHECK: !$OMP DECLARE REDUCTION(+:tt: omp_out%r = omp_out%r + omp_in%r) INITIALIZER(om&
+!CHECK-NEXT: !$OMP&p_priv%r = 0)
 
 !PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclareReductionConstruct -> OmpDirectiveSpecification
 !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = declare reduction
@@ -34,11 +35,39 @@ program omp_examples
 !PARSE-TREE: | | OmpReductionIdentifier -> DefinedOperator -> IntrinsicOperator = Add
 !PARSE-TREE: | | OmpTypeNameList -> OmpTypeName -> TypeSpec -> DerivedTypeSpec
 !PARSE-TREE: | | | Name = 'tt'
-!PARSE-TREE: | | OmpCombinerExpression -> AssignmentStmt = 'omp_out%r=omp_out%r+omp_in%r'
-!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> AssignmentStmt = 'omp_priv%r=0._4'
+!PARSE-TREE: | | OmpCombinerExpression -> OmpStylizedInstance
+!PARSE-TREE: | | | OmpStylizedDeclaration
+!PARSE-TREE: | | | OmpStylizedDeclaration
+!PARSE-TREE: | | | Instance -> AssignmentStmt = 'omp_out%r=omp_out%r+omp_in%r'
+!PARSE-TREE: | | | | Variable = 'omp_out%r'
+!PARSE-TREE: | | | | | Designator -> DataRef -> StructureComponent
+!PARSE-TREE: | | | | | | DataRef -> Name = 'omp_out'
+!PARSE-TREE: | | | | | | Name = 'r'
+!PARSE-TREE: | | | | Expr = 'omp_out%r+omp_in%r'
+!PARSE-TREE: | | | | | Add
+!PARSE-TREE: | | | | | | Expr = 'omp_out%r'
+!PARSE-TREE: | | | | | | | Designator -> DataRef -> StructureComponent
+!PARSE-TREE: | | | | | | | | DataRef -> Name = 'omp_out'
+!PARSE-TREE: | | | | | | | | Name = 'r'
+!PARSE-TREE: | | | | | | Expr = 'omp_in%r'
+!PARSE-TREE: | | | | | | | Designator -> DataRef -> StructureComponent
+!PARSE-TREE: | | | | | | | | DataRef -> Name = 'omp_in'
+!PARSE-TREE: | | | | | | | | Name = 'r'
+!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> OmpInitializerExpression -> OmpStylizedInstance
+!PARSE-TREE: | | OmpStylizedDeclaration
+!PARSE-TREE: | | OmpStylizedDeclaration
+!PARSE-TREE: | | Instance -> AssignmentStmt = 'omp_priv%r=0._4'
+!PARSE-TREE: | | | Variable = 'omp_priv%r'
+!PARSE-TREE: | | | | Designator -> DataRef -> StructureComponent
+!PARSE-TREE: | | | | | DataRef -> Name = 'omp_priv'
+!PARSE-TREE: | | | | | Name = 'r'
+!PARSE-TREE: | | | Expr = '0_4'
+!PARSE-TREE: | | | | LiteralConstant -> IntLiteralConstant = '0'
+!PARSE-TREE: | Flags = None
 
   !$omp declare reduction(*:tt:omp_out%r = omp_out%r * omp_in%r) initializer(omp_priv%r = 1)
-!CHECK-NEXT: !$OMP DECLARE REDUCTION(*:tt: omp_out%r = omp_out%r*omp_in%r) INITIALIZER(omp_priv%r = 1_4)
+!CHECK-NEXT: !$OMP DECLARE REDUCTION(*:tt: omp_out%r = omp_out%r * omp_in%r) INITIALIZER(om&
+!CHECK-NEXT: !$OMP&p_priv%r = 1)
 
 !PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclareReductionConstruct -> OmpDirectiveSpecification
 !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = declare reduction
@@ -46,11 +75,39 @@ program omp_examples
 !PARSE-TREE: | | OmpReductionIdentifier -> DefinedOperator -> IntrinsicOperator = Multiply
 !PARSE-TREE: | | OmpTypeNameList -> OmpTypeName -> TypeSpec -> DerivedTypeSpec
 !PARSE-TREE: | | | Name = 'tt'
-!PARSE-TREE: | | OmpCombinerExpression -> AssignmentStmt = 'omp_out%r=omp_out%r*omp_in%r'
-!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> AssignmentStmt = 'omp_priv%r=1._4'
+!PARSE-TREE: | | OmpCombinerExpression -> OmpStylizedInstance
+!PARSE-TREE: | | | OmpStylizedDeclaration
+!PARSE-TREE: | | | OmpStylizedDeclaration
+!PARSE-TREE: | | | Instance -> AssignmentStmt = 'omp_out%r=omp_out%r*omp_in%r'
+!PARSE-TREE: | | | | Variable = 'omp_out%r'
+!PARSE-TREE: | | | | | Designator -> DataRef -> StructureComponent
+!PARSE-TREE: | | | | | | DataRef -> Name = 'omp_out'
+!PARSE-TREE: | | | | | | Name = 'r'
+!PARSE-TREE: | | | | Expr = 'omp_out%r*omp_in%r'
+!PARSE-TREE: | | | | | Multiply
+!PARSE-TREE: | | | | | | Expr = 'omp_out%r'
+!PARSE-TREE: | | | | | | | Designator -> DataRef -> StructureComponent
+!PARSE-TREE: | | | | | | | | DataRef -> Name = 'omp_out'
+!PARSE-TREE: | | | | | | | | Name = 'r'
+!PARSE-TREE: | | | | | | Expr = 'omp_in%r'
+!PARSE-TREE: | | | | | | | Designator -> DataRef -> StructureComponent
+!PARSE-TREE: | | | | | | | | DataRef -> Name = 'omp_in'
+!PARSE-TREE: | | | | | | | | Name = 'r'
+!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> OmpInitializerExpression -> OmpStylizedInstance
+!PARSE-TREE: | | OmpStylizedDeclaration
+!PARSE-TREE: | | OmpStylizedDeclaration
+!PARSE-TREE: | | Instance -> AssignmentStmt = 'omp_priv%r=1._4'
+!PARSE-TREE: | | | Variable = 'omp_priv%r'
+!PARSE-TREE: | | | | Designator -> DataRef -> StructureComponent
+!PARSE-TREE: | | | | | DataRef -> Name = 'omp_priv'
+!PARSE-TREE: | | | | | Name = 'r'
+!PARSE-TREE: | | | Expr = '1_4'
+!PARSE-TREE: | | | | LiteralConstant -> IntLiteralConstant = '1'
+!PARSE-TREE: | Flags = None
 
   !$omp declare reduction(max:tt:omp_out = mymax(omp_out, omp_in)) initializer(omp_priv%r = 0)
-!CHECK-NEXT: !$OMP DECLARE REDUCTION(max:tt: omp_out = mymax(omp_out,omp_in)) INITIALIZER(omp_priv%r = 0_4)
+!CHECK-NEXT: !$OMP DECLARE REDUCTION(max:tt: omp_out = mymax(omp_out, omp_in)) INITIALIZER(&
+!CHECK-NEXT: !$OMP&omp_priv%r = 0)
 
 !PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclareReductionConstruct -> OmpDirectiveSpecification
 !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = declare reduction
@@ -58,11 +115,36 @@ program omp_examples
 !PARSE-TREE: | | OmpReductionIdentifier -> ProcedureDesignator -> Name = 'max'
 !PARSE-TREE: | | OmpTypeNameList -> OmpTypeName -> TypeSpec -> DerivedTypeSpec
 !PARSE-TREE: | | | Name = 'tt'
-!PARSE-TREE: | | OmpCombinerExpression -> AssignmentStmt = 'omp_out=mymax(omp_out,omp_in)'
-!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> AssignmentStmt = 'omp_priv%r=0._4'
+!PARSE-TREE: | | OmpCombinerExpression -> OmpStylizedInstance
+!PARSE-TREE: | | | OmpStylizedDeclaration
+!PARSE-TREE: | | | OmpStylizedDeclaration
+!PARSE-TREE: | | | Instance -> AssignmentStmt = 'omp_out=mymax(omp_out,omp_in)'
+!PARSE-TREE: | | | | Variable = 'omp_out'
+!PARSE-TREE: | | | | | Designator -> DataRef -> Name = 'omp_out'
+!PARSE-TREE: | | | | Expr = 'mymax(omp_out,omp_in)'
+!PARSE-TREE: | | | | | FunctionReference -> Call
+!PARSE-TREE: | | | | | | ProcedureDesignator -> Name = 'mymax'
+!PARSE-TREE: | | | | | | ActualArgSpec
+!PARSE-TREE: | | | | | | | ActualArg -> Expr = 'omp_out'
+!PARSE-TREE: | | | | | | | | Designator -> DataRef -> Name = 'omp_out'
+!PARSE-TREE: | | | | | | ActualArgSpec
+!PARSE-TREE: | | | | | | | ActualArg -> Expr = 'omp_in'
+!PARSE-TREE: | | | | | | | | Designator -> DataRef -> Name = 'omp_in'
+!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> OmpInitializerExpression -> OmpStylizedInstance
+!PARSE-TREE: | | OmpStylizedDeclaration
+!PARSE-TREE: | | OmpStylizedDeclaration
+!PARSE-TREE: | | Instance -> AssignmentStmt = 'omp_priv%r=0._4'
+!PARSE-TREE: | | | Variable = 'omp_priv%r'
+!PARSE-TREE: | | | | Designator -> DataRef -> StructureComponent
+!PARSE-TREE: | | | | | DataRef -> Name = 'omp_priv'
+!PARSE-TREE: | | | | | Name = 'r'
+!PARSE-TREE: | | | Expr = '0_4'
+!PARSE-TREE: | | | | LiteralConstant -> IntLiteralConstant = '0'
+!PARSE-TREE: | Flags = None
 
   !$omp declare reduction(min:tt:omp_out%r = min(omp_out%r, omp_in%r)) initializer(omp_priv%r = 1)
-!CHECK-NEXT: !$OMP DECLARE REDUCTION(min:tt: omp_out%r = min(omp_out%r,omp_in%r)) INITIALIZER(omp_priv%r = 1_4)
+!CHECK-NEXT: !$OMP DECLARE REDUCTION(min:tt: omp_out%r = min(omp_out%r, omp_in%r)) INITIALI&
+!CHECK-NEXT: !$OMP&ZER(omp_priv%r = 1)
 
 !PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclareReductionConstruct -> OmpDirectiveSpecification
 !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = declare reduction
@@ -70,8 +152,38 @@ program omp_examples
 !PARSE-TREE: | | OmpReductionIdentifier -> ProcedureDesignator -> Name = 'min'
 !PARSE-TREE: | | OmpTypeNameList -> OmpTypeName -> TypeSpec -> DerivedTypeSpec
 !PARSE-TREE: | | | Name = 'tt'
-!PARSE-TREE: | | OmpCombinerExpression -> AssignmentStmt = 'omp_out%r=min(omp_out%r,omp_in%r)'
-!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> AssignmentStmt = 'omp_priv%r=1._4'
+!PARSE-TREE: | | OmpCombinerExpression -> OmpStylizedInstance
+!PARSE-TREE: | | | OmpStylizedDeclaration
+!PARSE-TREE: | | | OmpStylizedDeclaration
+!PARSE-TREE: | | | Instance -> AssignmentStmt = 'omp_out%r=min(omp_out%r,omp_in%r)'
+!PARSE-TREE: | | | | Variable = 'omp_out%r'
+!PARSE-TREE: | | | | | Designator -> DataRef -> StructureComponent
+!PARSE-TREE: | | | | | | DataRef -> Name = 'omp_out'
+!PARSE-TREE: | | | | | | Name = 'r'
+!PARSE-TREE: | | | | Expr = 'min(omp_out%r,omp_in%r)'
+!PARSE-TREE: | | | | | FunctionReference -> Call
+!PARSE-TREE: | | | | | | ProcedureDesignator -> Name = 'min'
+!PARSE-TREE: | | | | | | ActualArgSpec
+!PARSE-TREE: | | | | | | | ActualArg -> Expr = 'omp_out%r'
+!PARSE-TREE: | | | | | | | | Designator -> DataRef -> StructureComponent
+!PARSE-TREE: | | | | | | | | | DataRef -> Name = 'omp_out'
+!PARSE-TREE: | | | | | | | | | Name = 'r'
+!PARSE-TREE: | | | | | | ActualArgSpec
+!PARSE-TREE: | | | | | | | ActualArg -> Expr = 'omp_in%r'
+!PARSE-TREE: | | | | | | | | Designator -> DataRef -> StructureComponent
+!PARSE-TREE: | | | | | | | | | DataRef -> Name = 'omp_in'
+!PARSE-TREE: | | | | | | | | | Name = 'r'
+!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> OmpInitializerExpression -> OmpStylizedInstance
+!PARSE-TREE: | | OmpStylizedDeclaration
+!PARSE-TREE: | | OmpStylizedDeclaration
+!PARSE-TREE: | | Instance -> AssignmentStmt = 'omp_priv%r=1._4'
+!PARSE-TREE: | | | Variable = 'omp_priv%r'
+!PARSE-TREE: | | | | Designator -> DataRef -> StructureComponent
+!PARSE-TREE: | | | | | DataRef -> Name = 'omp_priv'
+!PARSE-TREE: | | | | | Name = 'r'
+!PARSE-TREE: | | | Expr = '1_4'
+!PARSE-TREE: | | | | LiteralConstant -> IntLiteralConstant = '1'
+!PARSE-TREE: | Flags = None
 
   call random_number(values%r)
 
diff --git a/flang/test/Parser/OpenMP/declare-reduction-operator.f90 b/flang/test/Parser/OpenMP/declare-reduction-operator.f90
index e4d07c8265b1e..0d337c1ef42f3 100644
--- a/flang/test/Parser/OpenMP/declare-reduction-operator.f90
+++ b/flang/test/Parser/OpenMP/declare-reduction-operator.f90
@@ -16,7 +16,8 @@ subroutine reduce_1 ( n, tts )
   type(tt) :: tts(n)
   type(tt2) :: tts2(n)
 
-!CHECK: !$OMP DECLARE REDUCTION(+:tt: omp_out = tt(x=omp_out%x-omp_in%x,y=omp_out%y-omp_in%y)) INITIALIZER(omp_priv = tt(x=0_4,y=0_4))
+!CHECK: !$OMP DECLARE REDUCTION(+:tt: omp_out = tt(omp_out%x - omp_in%x , omp_out%y - &
+!CHECK: !$OMP&omp_in%y)) INITIALIZER(omp_priv = tt(0,0))
 
 !PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclareReductionConstruct -> OmpDirectiveSpecification
 !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = declare reduction
@@ -24,13 +25,60 @@ subroutine reduce_1 ( n, tts )
 !PARSE-TREE: | | OmpReductionIdentifier -> DefinedOperator -> IntrinsicOperator = Add
 !PARSE-TREE: | | OmpTypeNameList -> OmpTypeName -> TypeSpec -> DerivedTypeSpec
 !PARSE-TREE: | | | Name = 'tt'
-!PARSE-TREE: | | OmpCombinerExpression -> AssignmentStmt = 'omp_out=tt(x=omp_out%x-omp_in%x,y=omp_out%y-omp_in%y)'
-!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> AssignmentStmt = 'omp_priv=tt(x=0_4,y=0_4)'
-
+!PARSE-TREE: | | OmpCombinerExpression -> OmpStylizedInstance
+!PARSE-TREE: | | | OmpStylizedDeclaration
+!PARSE-TREE: | | | OmpStylizedDeclaration
+!PARSE-TREE: | | | Instance -> AssignmentStmt = 'omp_out=tt(x=omp_out%x-omp_in%x,y=omp_out%y-omp_in%y)'
+!PARSE-TREE: | | | | Variable = 'omp_out'
+!PARSE-TREE: | | | | | Designator -> DataRef -> Name = 'omp_out'
+!PARSE-TREE: | | | | Expr = 'tt(x=omp_out%x-omp_in%x,y=omp_out%y-omp_in%y)'
+!PARSE-TREE: | | | | | StructureConstructor
+!PARSE-TREE: | | | | | | DerivedTypeSpec
+!PARSE-TREE: | | | | | | | Name = 'tt'
+!PARSE-TREE: | | | | | | ComponentSpec
+!PARSE-TREE: | | | | | | | ComponentDataSource -> Expr = 'omp_out%x-omp_in%x'
+!PARSE-TREE: | | | | | | | | Subtract
+!PARSE-TREE: | | | | | | | | | Expr = 'omp_out%x'
+!PARSE-TREE: | | | | | | | | | | Designator -> DataRef -> StructureComponent
+!PARSE-TREE: | | | | | | | | | | | DataRef -> Name = 'omp_out'
+!PARSE-TREE: | | | | | | | | | | | Name = 'x'
+!PARSE-TREE: | | | | | | | | | Expr = 'omp_in%x'
+!PARSE-TREE: | | | | | | | | | | Designator -> DataRef -> StructureComponent
+!PARSE-TREE: | | | | | | | | | | | DataRef -> Name = 'omp_in'
+!PARSE-TREE: | | | | | | | | | | | Name = 'x'
+!PARSE-TREE: | | | | | | ComponentSpec
+!PARSE-TREE: | | | | | | | ComponentDataSource -> Expr = 'omp_out%y-omp_in%y'
+!PARSE-TREE: | | | | | | | | Subtract
+!PARSE-TREE: | | | | | | | | | Expr = 'omp_out%y'
+!PARSE-TREE: | | | | | | | | | | Designator -> DataRef -> StructureComponent
+!PARSE-TREE: | | | | | | | | | | | DataRef -> Name = 'omp_out'
+!PARSE-TREE: | | | | | | | | | | | Name = 'y'
+!PARSE-TREE: | | | | | | | | | Expr = 'omp_in%y'
+!PARSE-TREE: | | | | | | | | | | Designator -> DataRef -> StructureComponent
+!PARSE-TREE: | | | | | | | | | | | DataRef -> Name = 'omp_in'
+!PARSE-TREE: | | | | | | | | | | | Name = 'y'
+!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> OmpInitializerExpression -> OmpStylizedInstance
+!PARSE-TREE: | | OmpStylizedDeclaration
+!PARSE-TREE: | | OmpStylizedDeclaration
+!PARSE-TREE: | | Instance -> AssignmentStmt = 'omp_priv=tt(x=0_4,y=0_4)'
+!PARSE-TREE: | | | Variable = 'omp_priv'
+!PARSE-TREE: | | | | Designator -> DataRef -> Name = 'omp_priv'
+!PARSE-TREE: | | | Expr = 'tt(x=0_4,y=0_4)'
+!PARSE-TREE: | | | | StructureConstructor
+!PARSE-TREE: | | | | | DerivedTypeSpec
+!PARSE-TREE: | | | | | | Name = 'tt'
+!PARSE-TREE: | | | | | ComponentSpec
+!PARSE-TREE: | | | | | | ComponentDataSource -> Expr = '0_4'
+!PARSE-TREE: | | | | | | | LiteralConstant -> IntLiteralConstant = '0'
+!PARSE-TREE: | | | | | ComponentSpec
+!PARSE-TREE: | | | | | | ComponentDataSource -> Expr = '0_4'
+!PARSE-TREE: | | | | | | | LiteralConstant -> IntLiteralConstant = '0'
+!PARSE-TREE: | Flags = None
   !$omp declare reduction(+ : tt :  omp_out = tt(omp_out%x - omp_in%x , omp_out%y - omp_in%y)) initializer(omp_priv = tt(0,0))
 
   
-!CHECK: !$OMP DECLARE REDUCTION(+:tt2: omp_out = tt2(x=omp_out%x-omp_in%x,y=omp_out%y-omp_in%y)) INITIALIZER(omp_priv = tt2(x=0._8,y=0._8)
+!CHECK: !$OMP DECLARE REDUCTION(+:tt2: omp_out = tt2(omp_out%x - omp_in%x , omp_out%y &
+!CHECK: !$OMP&- omp_in%y)) INITIALIZER(omp_priv = tt2(0,0))
 
 !PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclareReductionConstruct -> OmpDirectiveSpecification
 !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = declare reduction
@@ -38,9 +86,55 @@ subroutine reduce_1 ( n, tts )
 !PARSE-TREE: | | OmpReductionIdentifier -> DefinedOperator -> IntrinsicOperator = Add
 !PARSE-TREE: | | OmpTypeNameList -> OmpTypeName -> TypeSpec -> DerivedTypeSpec
 !PARSE-TREE: | | | Name = 'tt2'
-!PARSE-TREE: | | OmpCombinerExpression -> AssignmentStmt = 'omp_out=tt2(x=omp_out%x-omp_in%x,y=omp_out%y-omp_in%y)'
-!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> AssignmentStmt = 'omp_priv=tt2(x=0._8,y=0._8)'
-
+!PARSE-TREE: | | OmpCombinerExpression -> OmpStylizedInstance
+!PARSE-TREE: | | | OmpStylizedDeclaration
+!PARSE-TREE: | | | OmpStylizedDeclaration
+!PARSE-TREE: | | | Instance -> AssignmentStmt = 'omp_out=tt2(x=omp_out%x-omp_in%x,y=omp_out%y-omp_in%y)'
+!PARSE-TREE: | | | | Variable = 'omp_out'
+!PARSE-TREE: | | | | | Designator -> DataRef -> Name = 'omp_out'
+!PARSE-TREE: | | | | Expr = 'tt2(x=omp_out%x-omp_in%x,y=omp_out%y-omp_in%y)'
+!PARSE-TREE: | | | | | StructureConstructor
+!PARSE-TREE: | | | | | | DerivedTypeSpec
+!PARSE-TREE: | | | | | | | Name = 'tt2'
+!PARSE-TREE: | | | | | | ComponentSpec
+!PARSE-TREE: | | | | | | | ComponentDataSource -> Expr = 'omp_out%x-omp_in%x'
+!PARSE-TREE: | | | | | | | | Subtract
+!PARSE-TREE: | | | | | | | | | Expr = 'omp_out%x'
+!PARSE-TREE: | | | | | | | | | | Designator -> DataRef -> StructureComponent
+!PARSE-TREE: | | | | | | | | | | | DataRef -> Name = 'omp_out'
+!PARSE-TREE: | | | | | | | | | | | Name = 'x'
+!PARSE-TREE: | | | | | | | | | Expr = 'omp_in%x'
+!PARSE-TREE: | | | | | | | | | | Designator -> DataRef -> StructureComponent
+!PARSE-TREE: | | | | | | | | | | | DataRef -> Name = 'omp_in'
+!PARSE-TREE: | | | | | | | | | | | Name = 'x'
+!PARSE-TREE: | | | | | | ComponentSpec
+!PARSE-TREE: | | | | | | | ComponentDataSource -> Expr = 'omp_out%y-omp_in%y'
+!PARSE-TREE: | | | | | | | | Subtract
+!PARSE-TREE: | | | | | | | | | Expr = 'omp_out%y'
+!PARSE-TREE: | | | | | | | | | | Designator -> DataRef -> StructureComponent
+!PARSE-TREE: | | | | | | | | | | | DataRef -> Name = 'omp_out'
+!PARSE-TREE: | | | | | | | | | | | Name = 'y'
+!PARSE-TREE: | | | | | | | | | Expr = 'omp_in%y'
+!PARSE-TREE: | | | | | | | | | | Designator -> DataRef -> StructureComponent
+!PARSE-TREE: | | | | | | | | | | | DataRef -> Name = 'omp_in'
+!PARSE-TREE: | | | | | | | | | | | Name = 'y'
+!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> OmpInitializerExpression -> OmpStylizedInstance
+!PARSE-TREE: | | OmpStylizedDeclaration
+!PARSE-TREE: | | OmpStylizedDeclaration
+!PARSE-TREE: | | Instance -> AssignmentStmt = 'omp_priv=tt2(x=0._8,y=0._8)'
+!PARSE-TREE: | | | Variable = 'omp_priv'
+!PARSE-TREE: | | | | Designator -> DataRef -> Name = 'omp_priv'
+!PARSE-TREE: | | | Expr = 'tt2(x=0._8,y=0._8)'
+!PARSE-TREE: | | | | StructureConstructor
+!PARSE-TREE: | | | | | DerivedTypeSpec
+!PARSE-TREE: | | | | | | Name = 'tt2'
+!PARSE-TREE: | | | | | ComponentSpec
+!PARSE-TREE: | | | | | | ComponentDataSource -> Expr = '0_4'
+!PARSE-TREE: | | | | | | | LiteralConstant -> IntLiteralConstant = '0'
+!PARSE-TREE: | | | | | ComponentSpec
+!PARSE-TREE: | | | | | | ComponentDataSource -> Expr = '0_4'
+!PARSE-TREE: | | | | | | | LiteralConstant -> IntLiteralConstant = '0'
+!PARSE-TREE: | Flags = None
   !$omp declare reduction(+ :tt2 :  omp_out = tt2(omp_out%x - omp_in%x , omp_out%y - omp_in%y)) initializer(omp_priv = tt2(0,0))
   
   type(tt) :: diffp = tt( 0, 0 )
diff --git a/flang/test/Parser/OpenMP/declare-reduction-unparse-with-symbols.f90 b/flang/test/Parser/OpenMP/declare-reduction-unparse-with-symbols.f90
index 455fc17871ad3..f026f15ddd88c 100644
--- a/flang/test/Parser/OpenMP/declare-reduction-unparse-with-symbols.f90
+++ b/flang/test/Parser/OpenMP/declare-reduction-unparse-with-symbols.f90
@@ -8,6 +8,6 @@ subroutine f00
 
 !CHECK: !DEF: /f00 (Subroutine) Subprogram
 !CHECK: subroutine f00
-!CHECK: !$omp declare reduction(fred:integer,real: omp_out = omp_in+omp_out)
+!CHECK: !$omp declare reduction(fred:integer, real: omp_out = omp_in + omp_out)
 !CHECK: end subroutine
 
diff --git a/flang/test/Parser/OpenMP/declare-reduction-unparse.f90 b/flang/test/Parser/OpenMP/declare-reduction-unparse.f90
index 73d7ccf489f01..7897eb0fb46f0 100644
--- a/flang/test/Parser/OpenMP/declare-reduction-unparse.f90
+++ b/flang/test/Parser/OpenMP/declare-reduction-unparse.f90
@@ -19,7 +19,8 @@ subroutine initme(x,n)
      end subroutine initme
   end interface
 !$omp declare reduction(red_add:integer(4):omp_out=omp_out+omp_in) initializer(initme(omp_priv,0))
-!CHECK: !$OMP DECLARE REDUCTION(red_add:INTEGER(KIND=4_4): omp_out = omp_out+omp_in) INITIALIZER(initme(omp_priv, 0_4))
+!CHECK: !$OMP DECLARE REDUCTION(red_add:INTEGER(KIND=4_4): omp_out=omp_out+omp_in) INITIA&
+!CHECKL !$OMP&LIZER(initme(omp_priv,0))
 
 !PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclareReductionConstruct -> OmpDirectiveSpecification
 !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = declare reduction
@@ -27,9 +28,31 @@ end subroutine initme
 !PARSE-TREE: | | OmpReductionIdentifier -> ProcedureDesignator -> Name = 'red_add'
 !PARSE-TREE: | | OmpTypeNameList -> OmpTypeName -> DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec -> KindSelector -> Scalar -> Integer -> Constant -> Expr = '4_4'
 !PARSE-TREE: | | | LiteralConstant -> IntLiteralConstant = '4'
-!PARSE-TREE: | | OmpCombinerExpression -> AssignmentStmt = 'omp_out=omp_out+omp_in'
-!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> OmpInitializerProc
-!PARSE-TREE: | | ProcedureDesignator -> Name = 'initme'
+!PARSE-TREE: | | OmpCombinerExpression -> OmpStylizedInstance
+!PARSE-TREE: | | | OmpStylizedDeclaration
+!PARSE-TREE: | | | OmpStylizedDeclaration
+!PARSE-TREE: | | | Instance -> AssignmentStmt = 'omp_out=omp_out+omp_in'
+!PARSE-TREE: | | | | Variable = 'omp_out'
+!PARSE-TREE: | | | | | Designator -> DataRef -> Name = 'omp_out'
+!PARSE-TREE: | | | | Expr = 'omp_out+omp_in'
+!PARSE-TREE: | | | | | Add
+!PARSE-TREE: | | | | | | Expr = 'omp_out'
+!PARSE-TREE: | | | | | | | Designator -> DataRef -> Name = 'omp_out'
+!PARSE-TREE: | | | | | | Expr = 'omp_in'
+!PARSE-TREE: | | | | | | | Designator -> DataRef -> Name = 'omp_in'
+!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> OmpInitializerExpression -> OmpStylizedInstance
+!PARSE-TREE: | | OmpStylizedDeclaration
+!PARSE-TREE: | | OmpStylizedDeclaration
+!PARSE-TREE: | | Instance -> CallStmt = 'CALL initme(omp_priv,0_4)'
+!PARSE-TREE: | | | Call
+!PARSE-TREE: | | | | ProcedureDesignator -> Name = 'initme'
+!PARSE-TREE: | | | | ActualArgSpec
+!PARSE-TREE: | | | | | ActualArg -> Expr = 'omp_priv'
+!PARSE-TREE: | | | | | | Designator -> DataRef -> Name = 'omp_priv'
+!PARSE-TREE: | | | | ActualArgSpec
+!PARSE-TREE: | | | | | ActualArg -> Expr = '0_4'
+!PARSE-TREE: | | | | | | LiteralConstant -> IntLiteralConstant = '0'
+!PARSE-TREE: | Flags = None
 
   res=init
 !$omp simd reduction(red_add:res)
@@ -59,7 +82,8 @@ end function func
 !CHECK-LABEL: program main
 program main
   integer :: my_var
-!CHECK: !$OMP DECLARE REDUCTION(my_add_red:INTEGER: omp_out = omp_out+omp_in) INITIALIZER(omp_priv = 0_4)
+!CHECK: !$OMP DECLARE REDUCTION(my_add_red:INTEGER: omp_out = omp_out + omp_in) INITIA&
+!CHECK: !$OMP&LIZER(omp_priv=0)
 
   !$omp declare reduction (my_add_red : integer : omp_out = omp_out + omp_in) initializer (omp_priv=0)
   my_var = 0
@@ -74,5 +98,24 @@ end program main
 !PARSE-TREE: | OmpArgumentList -> OmpArgument -> OmpReductionSpecifier
 !PARSE-TREE: | | OmpReductionIdentifier -> ProcedureDesignator -> Name = 'my_add_red'
 !PARSE-TREE: | | OmpTypeNameList -> OmpTypeName -> DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec ->
-!PARSE-TREE: | | OmpCombinerExpression -> AssignmentStmt = 'omp_out=omp_out+omp_in'
-!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> AssignmentStmt = 'omp_priv=0_4'
+!PARSE-TREE: | | OmpCombinerExpression -> OmpStylizedInstance
+!PARSE-TREE: | | | OmpStylizedDeclaration
+!PARSE-TREE: | | | OmpStylizedDeclaration
+!PARSE-TREE: | | | Instance -> AssignmentStmt = 'omp_out=omp_out+omp_in'
+!PARSE-TREE: | | | | Variable = 'omp_out'
+!PARSE-TREE: | | | | | Designator -> DataRef -> Name = 'omp_out'
+!PARSE-TREE: | | | | Expr = 'omp_out+omp_in'
+!PARSE-TREE: | | | | | Add
+!PARSE-TREE: | | | | | | Expr = 'omp_out'
+!PARSE-TREE: | | | | | | | Designator -> DataRef -> Name = 'omp_out'
+!PARSE-TREE: | | | | | | Expr = 'omp_in'
+!PARSE-TREE: | | | | | | | Designator -> DataRef -> Name = 'omp_in'
+!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> OmpInitializerExpression -> OmpStylizedInstance
+!PARSE-TREE: | | OmpStylizedDeclaration
+!PARSE-TREE: | | OmpStylizedDeclaration
+!PARSE-TREE: | | Instance -> AssignmentStmt = 'omp_priv=0_4'
+!PARSE-TREE: | | | Variable = 'omp_priv'
+!PARSE-TREE: | | | | Designator -> DataRef -> Name = 'omp_priv'
+!PARSE-TREE: | | | Expr = '0_4'
+!PARSE-TREE: | | | | LiteralConstant -> IntLiteralConstant = '0'
+!PARSE-TREE: | Flags = None
diff --git a/flang/test/Parser/OpenMP/metadirective-dirspec.f90 b/flang/test/Parser/OpenMP/metadirective-dirspec.f90
index c373001be8963..b64ceb1a98164 100644
--- a/flang/test/Parser/OpenMP/metadirective-dirspec.f90
+++ b/flang/test/Parser/OpenMP/metadirective-dirspec.f90
@@ -105,8 +105,8 @@ subroutine f03
 !UNPARSE:  TYPE :: tt2
 !UNPARSE:   REAL :: x
 !UNPARSE:  END TYPE
-!UNPARSE: !$OMP METADIRECTIVE WHEN(USER={CONDITION(.true._4)}: DECLARE REDUCTION(+:tt1,tt2: omp_out%x = omp_in%x+omp_out%x)&
-!UNPARSE: !$OMP&)
+!UNPARSE: !$OMP METADIRECTIVE WHEN(USER={CONDITION(.true._4)}: DECLARE REDUCTION(+:tt1, tt2: omp&
+!UNPARSE: !$OMP&_out%x = omp_in%x + omp_out%x))
 !UNPARSE: END SUBROUTINE
 
 !PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OmpMetadirectiveDirective
@@ -127,21 +127,44 @@ subroutine f03
 !PARSE-TREE: | | | | | Name = 'tt1'
 !PARSE-TREE: | | | | OmpTypeName -> TypeSpec -> DerivedTypeSpec
 !PARSE-TREE: | | | | | Name = 'tt2'
-!PARSE-TREE: | | | | OmpCombinerExpression -> AssignmentStmt = 'omp_out%x=omp_in%x+omp_out%x'
-!PARSE-TREE: | | | | | | Designator -> DataRef -> StructureComponent
-!PARSE-TREE: | | | | | | | DataRef -> Name = 'omp_out'
-!PARSE-TREE: | | | | | | | Name = 'x'
-!PARSE-TREE: | | | | | Expr = 'omp_in%x+omp_out%x'
-!PARSE-TREE: | | | | | | Add
-!PARSE-TREE: | | | | | | | Expr = 'omp_in%x'
-!PARSE-TREE: | | | | | | | | Designator -> DataRef -> StructureComponent
-!PARSE-TREE: | | | | | | | | | DataRef -> Name = 'omp_in'
-!PARSE-TREE: | | | | | | | | | Name = 'x'
-!PARSE-TREE: | | | | | | | Expr = 'omp_out%x'
-!PARSE-TREE: | | | | | | | | Designator -> DataRef -> StructureComponent
-!PARSE-TREE: | | | | | | | | | DataRef -> Name = 'omp_out'
-!PARSE-TREE: | | | | | | | | | Name = 'x'
+!PARSE-TREE: | | | | OmpCombinerExpression -> OmpStylizedInstance
+!PARSE-TREE: | | | | | OmpStylizedDeclaration
+!PARSE-TREE: | | | | | OmpStylizedDeclaration
+!PARSE-TREE: | | | | | Instance -> AssignmentStmt = 'omp_out%x=omp_in%x+omp_out%x'
+!PARSE-TREE: | | | | | | Variable = 'omp_out%x'
+!PARSE-TREE: | | | | | | | Designator -> DataRef -> StructureComponent
+!PARSE-TREE: | | | | | | | | DataRef -> Name = 'omp_out'
+!PARSE-TREE: | | | | | | | | Name = 'x'
+!PARSE-TREE: | | | | | | Expr = 'omp_in%x+omp_out%x'
+!PARSE-TREE: | | | | | | | Add
+!PARSE-TREE: | | | | | | | | Expr = 'omp_in%x'
+!PARSE-TREE: | | | | | | | | | Designator -> DataRef -> StructureComponent
+!PARSE-TREE: | | | | | | | | | | DataRef -> Name = 'omp_in'
+!PARSE-TREE: | | | | | | | | | | Name = 'x'
+!PARSE-TREE: | | | | | | | | Expr = 'omp_out%x'
+!PARSE-TREE: | | | | | | | | | Designator -> DataRef -> StructureComponent
+!PARSE-TREE: | | | | | | | | | | DataRef -> Name = 'omp_out'
+!PARSE-TREE: | | | | | | | | | | Name = 'x'
+!PARSE-TREE: | | | | OmpStylizedInstance
+!PARSE-TREE: | | | | | OmpStylizedDeclaration
+!PARSE-TREE: | | | | | OmpStylizedDeclaration
+!PARSE-TREE: | | | | | Instance -> AssignmentStmt = 'omp_out%x=omp_in%x+omp_out%x'
+!PARSE-TREE: | | | | | | Variable = 'omp_out%x'
+!PARSE-TREE: | | | | | | | Designator -> DataRef -> StructureComponent
+!PARSE-TREE: | | | | | | | | DataRef -> Name = 'omp_out'
+!PARSE-TREE: | | | | | | | | Name = 'x'
+!PARSE-TREE: | | | | | | Expr = 'omp_in%x+omp_out%x'
+!PARSE-TREE: | | | | | | | Add
+!PARSE-TREE: | | | | | | | | Expr = 'omp_in%x'
+!PARSE-TREE: | | | | | | | | | Designator -> DataRef -> StructureComponent
+!PARSE-TREE: | | | | | | | | | | DataRef -> Name = 'omp_in'
+!PARSE-TREE: | | | | | | | | | | Name = 'x'
+!PARSE-TREE: | | | | | | | | Expr = 'omp_out%x'
+!PARSE-TREE: | | | | | | | | | Designator -> DataRef -> StructureComponent
+!PARSE-TREE: | | | | | | | | | | DataRef -> Name = 'omp_out'
+!PARSE-TREE: | | | | | | | | | | Name = 'x'
 !PARSE-TREE: | | | OmpClauseList ->
+!PARSE-TREE: | | | Flags = None
 
 subroutine f04
   !$omp metadirective when(user={condition(.true.)}: &
diff --git a/flang/test/Parser/OpenMP/openmp6-directive-spellings.f90 b/flang/test/Parser/OpenMP/openmp6-directive-spellings.f90
index 39e8f059bbb24..50a38c6494aa6 100644
--- a/flang/test/Parser/OpenMP/openmp6-directive-spellings.f90
+++ b/flang/test/Parser/OpenMP/openmp6-directive-spellings.f90
@@ -79,7 +79,7 @@ subroutine f02
 !UNPARSE:  TYPE :: t
 !UNPARSE:   INTEGER :: x
 !UNPARSE:  END TYPE
-!UNPARSE: !$OMP DECLARE_REDUCTION(+:t: omp_out%x = omp_out%x+omp_in%x)
+!UNPARSE: !$OMP DECLARE_REDUCTION(+:t: omp_out%x = omp_out%x + omp_in%x)
 !UNPARSE: END SUBROUTINE
 
 !PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclareReductionConstruct -> OmpDirectiveSpecification
@@ -88,21 +88,24 @@ subroutine f02
 !PARSE-TREE: | | OmpReductionIdentifier -> DefinedOperator -> IntrinsicOperator = Add
 !PARSE-TREE: | | OmpTypeNameList -> OmpTypeName -> TypeSpec -> DerivedTypeSpec
 !PARSE-TREE: | | | Name = 't'
-!PARSE-TREE: | | OmpCombinerExpression -> AssignmentStmt = 'omp_out%x=omp_out%x+omp_in%x'
-!PARSE-TREE: | | | Variable = 'omp_out%x'
-!PARSE-TREE: | | | | Designator -> DataRef -> StructureComponent
-!PARSE-TREE: | | | | | DataRef -> Name = 'omp_out'
-!PARSE-TREE: | | | | | Name = 'x'
-!PARSE-TREE: | | | Expr = 'omp_out%x+omp_in%x'
-!PARSE-TREE: | | | | Add
-!PARSE-TREE: | | | | | Expr = 'omp_out%x'
-!PARSE-TREE: | | | | | | Designator -> DataRef -> StructureComponent
-!PARSE-TREE: | | | | | | | DataRef -> Name = 'omp_out'
-!PARSE-TREE: | | | | | | | Name = 'x'
-!PARSE-TREE: | | | | | Expr = 'omp_in%x'
-!PARSE-TREE: | | | | | | Designator -> DataRef -> StructureComponent
-!PARSE-TREE: | | | | | | | DataRef -> Name = 'omp_in'
-!PARSE-TREE: | | | | | | | Name = 'x'
+!PARSE-TREE: | | OmpCombinerExpression -> OmpStylizedInstance
+!PARSE-TREE: | | | OmpStylizedDeclaration
+!PARSE-TREE: | | | OmpStylizedDeclaration
+!PARSE-TREE: | | | Instance -> AssignmentStmt = 'omp_out%x=omp_out%x+omp_in%x'
+!PARSE-TREE: | | | | Variable = 'omp_out%x'
+!PARSE-TREE: | | | | | Designator -> DataRef -> StructureComponent
+!PARSE-TREE: | | | | | | DataRef -> Name = 'omp_out'
+!PARSE-TREE: | | | | | | Name = 'x'
+!PARSE-TREE: | | | | Expr = 'omp_out%x+omp_in%x'
+!PARSE-TREE: | | | | | Add
+!PARSE-TREE: | | | | | | Expr = 'omp_out%x'
+!PARSE-TREE: | | | | | | | Designator -> DataRef -> StructureComponent
+!PARSE-TREE: | | | | | | | | DataRef -> Name = 'omp_out'
+!PARSE-TREE: | | | | | | | | Name = 'x'
+!PARSE-TREE: | | | | | | Expr = 'omp_in%x'
+!PARSE-TREE: | | | | | | | Designator -> DataRef -> StructureComponent
+!PARSE-TREE: | | | | | | | | DataRef -> Name = 'omp_in'
+!PARSE-TREE: | | | | | | | | Name = 'x'
 !PARSE-TREE: | OmpClauseList ->
 !PARSE-TREE: | Flags = None
 
diff --git a/flang/test/Parser/inline-directives.f90 b/flang/test/Parser/inline-directives.f90
new file mode 100644
index 0000000000000..24d4f95759a6e
--- /dev/null
+++ b/flang/test/Parser/inline-directives.f90
@@ -0,0 +1,29 @@
+! RUN: %flang_fc1 -fdebug-unparse %s 2>&1 | FileCheck %s
+
+! Test that checks whether compiler directives can be inlined without mistaking it as comment.
+
+module m
+contains
+#define MACRO(X)  subroutine func1(X); real(2) :: X; !dir$ ignore_tkr(d) X; end subroutine func1;
+MACRO(foo)
+
+!CHECK: SUBROUTINE func1 (foo)
+!CHECK: !DIR$ IGNORE_TKR (d) foo
+!CHECK: END SUBROUTINE func1
+
+  subroutine func2(foo)
+    real(2) :: foo; !dir$ ignore_tkr(d) foo;
+  end subroutine func2
+
+!CHECK: SUBROUTINE func2 (foo)
+!CHECK: !DIR$ IGNORE_TKR (d) foo
+!CHECK: END SUBROUTINE func2
+
+  subroutine func3(foo)
+    real(2) :: foo; !dir$ ignore_tkr(d) foo; end subroutine func3;
+
+!CHECK: SUBROUTINE func3 (foo)
+!CHECK: !DIR$ IGNORE_TKR (d) foo
+!CHECK: END SUBROUTINE func3
+
+end module
diff --git a/flang/test/Preprocessing/bug136845.F b/flang/test/Preprocessing/bug136845.F
index ce52c2953bb57..311ee0a2d874c 100644
--- a/flang/test/Preprocessing/bug136845.F
+++ b/flang/test/Preprocessing/bug136845.F
@@ -18,7 +18,6 @@
 *$1   continue
       end
 
-!PREPRO:!$   &
 !PREPRO:              continue
 !PREPRO:      k=0
 !PREPRO:      k=0
diff --git a/flang/test/Preprocessing/cond-comment.f b/flang/test/Preprocessing/cond-comment.f
new file mode 100644
index 0000000000000..a484fcbfa8fa7
--- /dev/null
+++ b/flang/test/Preprocessing/cond-comment.f
@@ -0,0 +1,5 @@
+!RUN: %flang_fc1 -fopenmp -fdebug-unparse %s 2>&1 | FileCheck %s
+!CHECK: END
+!CHECK-NOT: error:
+      end
+c$      !
diff --git a/flang/test/Preprocessing/cond-comment.f90 b/flang/test/Preprocessing/cond-comment.f90
new file mode 100644
index 0000000000000..457614ae9372e
--- /dev/null
+++ b/flang/test/Preprocessing/cond-comment.f90
@@ -0,0 +1,5 @@
+!RUN: %flang_fc1 -fopenmp -fdebug-unparse %s 2>&1 | FileCheck %s
+!CHECK: END
+!CHECK-NOT: error:
+end
+!$ !
diff --git a/flang/test/Semantics/OpenMP/declare-reduction-error.f90 b/flang/test/Semantics/OpenMP/declare-reduction-error.f90
deleted file mode 100644
index 21f5cc186e037..0000000000000
--- a/flang/test/Semantics/OpenMP/declare-reduction-error.f90
+++ /dev/null
@@ -1,11 +0,0 @@
-! RUN: not %flang_fc1 -emit-obj -fopenmp -fopenmp-version=50 %s 2>&1 | FileCheck %s
-
-subroutine initme(x,n)
-  integer x,n
-  x=n
-end subroutine initme
-
-subroutine subr
-  !$omp declare reduction(red_add:integer(4):omp_out=omp_out+omp_in) initializer(initme(omp_priv,0))
-  !CHECK: error: Implicit subroutine declaration 'initme' in DECLARE REDUCTION
-end subroutine subr
diff --git a/flang/test/Semantics/OpenMP/declare-reduction-functions.f90 b/flang/test/Semantics/OpenMP/declare-reduction-functions.f90
index 000d323f522cf..89e0771e8abff 100644
--- a/flang/test/Semantics/OpenMP/declare-reduction-functions.f90
+++ b/flang/test/Semantics/OpenMP/declare-reduction-functions.f90
@@ -57,9 +57,10 @@ function functwo(x, n)
 !CHECK: adder: UserReductionDetails TYPE(two)
 !CHECK OtherConstruct scope
 !CHECK: omp_in size=8 offset=0: ObjectEntity type: TYPE(two)
-!CHECK: omp_orig size=8 offset=8: ObjectEntity type: TYPE(two)
-!CHECK: omp_out size=8 offset=16: ObjectEntity type: TYPE(two)
-!CHECK: omp_priv size=8 offset=24: ObjectEntity type: TYPE(two)
+!CHECK: omp_out size=8 offset=8: ObjectEntity type: TYPE(two)
+!CHECK OtherConstruct scope
+!CHECK: omp_orig size=8 offset=0: ObjectEntity type: TYPE(two)
+!CHECK: omp_priv size=8 offset=8: ObjectEntity type: TYPE(two)
     
   
     !$omp simd reduction(adder:res)
@@ -101,14 +102,16 @@ function functwothree(x, n)
 !CHECK: adder: UserReductionDetails TYPE(two) TYPE(three)
 !CHECK OtherConstruct scope
 !CHECK: omp_in size=8 offset=0: ObjectEntity type: TYPE(two)
-!CHECK: omp_orig size=8 offset=8: ObjectEntity type: TYPE(two)
-!CHECK: omp_out size=8 offset=16: ObjectEntity type: TYPE(two)
-!CHECK: omp_priv size=8 offset=24: ObjectEntity type: TYPE(two)
+!CHECK: omp_out size=8 offset=8: ObjectEntity type: TYPE(two)
+!CHECK OtherConstruct scope
+!CHECK: omp_orig size=8 offset=0: ObjectEntity type: TYPE(two)
+!CHECK: omp_priv size=8 offset=8: ObjectEntity type: TYPE(two)
 !CHECK OtherConstruct scope
 !CHECK: omp_in size=24 offset=0: ObjectEntity type: TYPE(three)
-!CHECK: omp_orig size=24 offset=24: ObjectEntity type: TYPE(three)
-!CHECK: omp_out size=24 offset=48: ObjectEntity type: TYPE(three)
-!CHECK: omp_priv size=24 offset=72: ObjectEntity type: TYPE(three)
+!CHECK: omp_out size=24 offset=24: ObjectEntity type: TYPE(three)
+!CHECK OtherConstruct scope
+!CHECK: omp_orig size=24 offset=0: ObjectEntity type: TYPE(three)
+!CHECK: omp_priv size=24 offset=24: ObjectEntity type: TYPE(three)
 
     !$omp simd reduction(adder:res3)
     do i=1,n
@@ -135,9 +138,10 @@ function funcBtwo(x, n)
 !CHECK: op.+: UserReductionDetails TYPE(two)
 !CHECK OtherConstruct scope
 !CHECK: omp_in size=8 offset=0: ObjectEntity type: TYPE(two)
-!CHECK: omp_orig size=8 offset=8: ObjectEntity type: TYPE(two)
-!CHECK: omp_out size=8 offset=16: ObjectEntity type: TYPE(two)
-!CHECK: omp_priv size=8 offset=24: ObjectEntity type: TYPE(two)
+!CHECK: omp_out size=8 offset=8: ObjectEntity type: TYPE(two)
+!CHECK OtherConstruct scope
+!CHECK: omp_orig size=8 offset=0: ObjectEntity type: TYPE(two)
+!CHECK: omp_priv size=8 offset=8: ObjectEntity type: TYPE(two)
     
   
     !$omp simd reduction(+:res)
@@ -163,14 +167,16 @@ function funcBtwothree(x, n)
 !CHECK: op.+: UserReductionDetails TYPE(two) TYPE(three)
 !CHECK OtherConstruct scope
 !CHECK: omp_in size=8 offset=0: ObjectEntity type: TYPE(two)
-!CHECK: omp_orig size=8 offset=8: ObjectEntity type: TYPE(two)
-!CHECK: omp_out size=8 offset=16: ObjectEntity type: TYPE(two)
-!CHECK: omp_priv size=8 offset=24: ObjectEntity type: TYPE(two)
+!CHECK: omp_out size=8 offset=8: ObjectEntity type: TYPE(two)
+!CHECK OtherConstruct scope
+!CHECK: omp_orig size=8 offset=0: ObjectEntity type: TYPE(two)
+!CHECK: omp_priv size=8 offset=8: ObjectEntity type: TYPE(two)
 !CHECK: OtherConstruct scope
 !CHECK: omp_in size=24 offset=0: ObjectEntity type: TYPE(three)
-!CHECK: omp_orig size=24 offset=24: ObjectEntity type: TYPE(three)
-!CHECK: omp_out size=24 offset=48: ObjectEntity type: TYPE(three)
-!CHECK: omp_priv size=24 offset=72: ObjectEntity type: TYPE(three)
+!CHECK: omp_out size=24 offset=24: ObjectEntity type: TYPE(three)
+!CHECK OtherConstruct scope
+!CHECK: omp_orig size=24 offset=0: ObjectEntity type: TYPE(three)
+!CHECK: omp_priv size=24 offset=24: ObjectEntity type: TYPE(three)
 
     !$omp simd reduction(+:res3)
     do i=1,n
@@ -183,6 +189,7 @@ function funcBtwothree(x, n)
     enddo
     res%t2 = res2
     res%t3 = res3
+    funcBtwothree = res
   end function funcBtwothree
 
   !! This is checking a special case, where a reduction is declared inside a
@@ -191,11 +198,12 @@ end function funcBtwothree
   pure logical function reduction()
 !CHECK: reduction size=4 offset=0: ObjectEntity funcResult type: LOGICAL(4)
 !CHECK: rr: UserReductionDetails INTEGER(4)
-!CHECK: OtherConstruct scope: size=16 alignment=4 sourceRange=0 bytes
+!CHECK: OtherConstruct scope: size=8 alignment=4 sourceRange=0 bytes
 !CHECK: omp_in size=4 offset=0: ObjectEntity type: INTEGER(4)
-!CHECK: omp_orig size=4 offset=4: ObjectEntity type: INTEGER(4)
-!CHECK: omp_out size=4 offset=8: ObjectEntity type: INTEGER(4)
-!CHECK: omp_priv size=4 offset=12: ObjectEntity type: INTEGER(4)
+!CHECK: omp_out size=4 offset=4: ObjectEntity type: INTEGER(4)
+!CHECK: OtherConstruct scope: size=8 alignment=4 sourceRange=0 bytes
+!CHECK: omp_orig size=4 offset=0: ObjectEntity type: INTEGER(4)
+!CHECK: omp_priv size=4 offset=4: ObjectEntity type: INTEGER(4)
     !$omp declare reduction (rr : integer : omp_out = omp_out + omp_in) initializer (omp_priv = 0)
     reduction = .false.
   end function reduction
diff --git a/flang/test/Semantics/OpenMP/declare-reduction-logical.f90 b/flang/test/Semantics/OpenMP/declare-reduction-logical.f90
index 7ab7cad473ac8..87fcecdbae2a5 100644
--- a/flang/test/Semantics/OpenMP/declare-reduction-logical.f90
+++ b/flang/test/Semantics/OpenMP/declare-reduction-logical.f90
@@ -18,9 +18,10 @@ function func(x, n)
 !CHECK: op.AND: UserReductionDetails TYPE(logicalwrapper)
 !CHECK OtherConstruct scope
 !CHECK: omp_in size=4 offset=0: ObjectEntity type: TYPE(logicalwrapper)
-!CHECK: omp_orig size=4 offset=4: ObjectEntity type: TYPE(logicalwrapper)
-!CHECK: omp_out size=4 offset=8: ObjectEntity type: TYPE(logicalwrapper)
-!CHECK: omp_priv size=4 offset=12: ObjectEntity type: TYPE(logicalwrapper)
+!CHECK: omp_out size=4 offset=4: ObjectEntity type: TYPE(logicalwrapper)
+!CHECK OtherConstruct scope
+!CHECK: omp_orig size=4 offset=0: ObjectEntity type: TYPE(logicalwrapper)
+!CHECK: omp_priv size=4 offset=4: ObjectEntity type: TYPE(logicalwrapper)
   
     !$omp simd reduction(.AND.:res)
     do i=1,n
diff --git a/flang/test/Semantics/OpenMP/declare-reduction-modfile.f90 b/flang/test/Semantics/OpenMP/declare-reduction-modfile.f90
index 0882de80fdcc6..763179cb52a13 100644
--- a/flang/test/Semantics/OpenMP/declare-reduction-modfile.f90
+++ b/flang/test/Semantics/OpenMP/declare-reduction-modfile.f90
@@ -6,13 +6,13 @@
 !type::t1
 !integer(4)::val
 !endtype
-!!$OMP DECLARE REDUCTION(*:t1:omp_out=omp_out*omp_in)INITIALIZER(omp_priv=&
-!!$OMP&t1(1))
+!!$OMP DECLARE REDUCTION(*:t1: omp_out=omp_out*omp_in) INITIALIZER(omp_priv=t1(&
+!!$OMP&1))
 !!$OMP METADIRECTIVE OTHERWISE(DECLARE REDUCTION(+:INTEGER))
-!!$OMP DECLARE REDUCTION(.fluffy.:t1:omp_out=omp_out.fluffy.omp_in)INITIALI&
-!!$OMP&ZER(omp_priv=t1(0))
-!!$OMP DECLARE REDUCTION(.mul.:t1:omp_out=omp_out.mul.omp_in)INITIALIZER(om&
-!!$OMP&p_priv=t1(1))
+!!$OMP DECLARE REDUCTION(.fluffy.:t1: omp_out=omp_out.fluffy.omp_in) INITIALIZE&
+!!$OMP&R(omp_priv=t1(0))
+!!$OMP DECLARE REDUCTION(.mul.:t1: omp_out=omp_out.mul.omp_in) INITIALIZER(omp_&
+!!$OMP&priv=t1(1))
 !interface operator(.mul.)
 !procedure::mul
 !end interface
diff --git a/flang/test/Semantics/OpenMP/declare-reduction-operator.f90 b/flang/test/Semantics/OpenMP/declare-reduction-operator.f90
index dc12332b80baf..5fc42054882f0 100644
--- a/flang/test/Semantics/OpenMP/declare-reduction-operator.f90
+++ b/flang/test/Semantics/OpenMP/declare-reduction-operator.f90
@@ -11,11 +11,9 @@ module m1
 !$omp declare reduction(.fluffy.:t1:omp_out=omp_out.fluffy.omp_in)
 !CHECK: op.fluffy., PUBLIC: UserReductionDetails TYPE(t1)
 !CHECK: t1, PUBLIC: DerivedType components: val
-!CHECK: OtherConstruct scope: size=16 alignment=4 sourceRange=0 bytes
+!CHECK: OtherConstruct scope: size=8 alignment=4 sourceRange=0 bytes
 !CHECK: omp_in size=4 offset=0: ObjectEntity type: TYPE(t1)
-!CHECK: omp_orig size=4 offset=4: ObjectEntity type: TYPE(t1)
-!CHECK: omp_out size=4 offset=8: ObjectEntity type: TYPE(t1)
-!CHECK: omp_priv size=4 offset=12: ObjectEntity type: TYPE(t1)
+!CHECK: omp_out size=4 offset=4: ObjectEntity type: TYPE(t1)
 contains
   function my_mul(x, y)
     type (t1), intent (in) :: x, y
diff --git a/flang/test/Semantics/OpenMP/declare-reduction-operators.f90 b/flang/test/Semantics/OpenMP/declare-reduction-operators.f90
index 84dbe1af01877..e0006bfb1fb6a 100644
--- a/flang/test/Semantics/OpenMP/declare-reduction-operators.f90
+++ b/flang/test/Semantics/OpenMP/declare-reduction-operators.f90
@@ -64,9 +64,10 @@ program test_vector
 
 !CHECK: OtherConstruct scope:
 !CHECK: omp_in size=12 offset=0: ObjectEntity type: TYPE(vector)
-!CHECK: omp_orig size=12 offset=12: ObjectEntity type: TYPE(vector)
-!CHECK: omp_out size=12 offset=24: ObjectEntity type: TYPE(vector)
-!CHECK: omp_priv size=12 offset=36: ObjectEntity type: TYPE(vector)
+!CHECK: omp_out size=12 offset=12: ObjectEntity type: TYPE(vector)
+!CHECK: OtherConstruct scope:
+!CHECK: omp_orig size=12 offset=0: ObjectEntity type: TYPE(vector)
+!CHECK: omp_priv size=12 offset=12: ObjectEntity type: TYPE(vector)
 
   v2 = Vector(0.0, 0.0, 0.0)
   v1 = Vector(1.0, 2.0, 3.0)
diff --git a/flang/test/Semantics/OpenMP/declare-reduction-renamedop.f90 b/flang/test/Semantics/OpenMP/declare-reduction-renamedop.f90
index 9cd638d796091..115fe517be181 100644
--- a/flang/test/Semantics/OpenMP/declare-reduction-renamedop.f90
+++ b/flang/test/Semantics/OpenMP/declare-reduction-renamedop.f90
@@ -33,11 +33,12 @@ program test_omp_reduction
   !$omp declare reduction (.modmul. : t1 : omp_out = omp_out .modmul. omp_in) initializer(omp_priv = t1(1.0))
 !CHECK: op.modmul.: UserReductionDetails TYPE(t1)
 !CHECK: t1: Use from t1 in module1
-!CHECK: OtherConstruct scope: size=16 alignment=4 sourceRange=0 bytes
+!CHECK: OtherConstruct scope: size=8 alignment=4 sourceRange=0 bytes
 !CHECK: omp_in size=4 offset=0: ObjectEntity type: TYPE(t1)
-!CHECK: omp_orig size=4 offset=4: ObjectEntity type: TYPE(t1)
-!CHECK: omp_out size=4 offset=8: ObjectEntity type: TYPE(t1)
-!CHECK: omp_priv size=4 offset=12: ObjectEntity type: TYPE(t1)
+!CHECK: omp_out size=4 offset=4: ObjectEntity type: TYPE(t1)
+!CHECK: OtherConstruct scope: size=8 alignment=4 sourceRange=0 bytes
+!CHECK: omp_orig size=4 offset=0: ObjectEntity type: TYPE(t1)
+!CHECK: omp_priv size=4 offset=4: ObjectEntity type: TYPE(t1)
   result = t1(1.0)
   !$omp parallel do reduction(.modmul.:result)
   do i = 1, 10
diff --git a/flang/test/Semantics/OpenMP/declare-reduction.f90 b/flang/test/Semantics/OpenMP/declare-reduction.f90
index 1f39c57c54ad1..c8dee5e240918 100644
--- a/flang/test/Semantics/OpenMP/declare-reduction.f90
+++ b/flang/test/Semantics/OpenMP/declare-reduction.f90
@@ -19,10 +19,12 @@ end subroutine initme
   !$omp declare reduction(red_add:integer(4):omp_out=omp_out+omp_in) initializer(initme(omp_priv,0))
 !CHECK: red_add: UserReductionDetails
 !CHECK: Subprogram scope: initme
+!CHECK: OtherConstruct scope:
 !CHECK: omp_in size=4 offset=0: ObjectEntity type: INTEGER(4)
-!CHECK: omp_orig size=4 offset=4: ObjectEntity type: INTEGER(4)
-!CHECK: omp_out size=4 offset=8: ObjectEntity type: INTEGER(4)
-!CHECK: omp_priv size=4 offset=12: ObjectEntity type: INTEGER(4)
+!CHECK: omp_out size=4 offset=4: ObjectEntity type: INTEGER(4)
+!CHECK: OtherConstruct scope:
+!CHECK: omp_orig size=4 offset=0: ObjectEntity type: INTEGER(4)
+!CHECK: omp_priv size=4 offset=4: ObjectEntity type: INTEGER(4)
 !$omp simd reduction(red_add:res)
   do i=1,n
      res=res+x(i)
@@ -36,9 +38,11 @@ program main
   !$omp declare reduction (my_add_red : integer : omp_out = omp_out + omp_in) initializer (omp_priv=0)
 
 !CHECK: my_add_red: UserReductionDetails
+!CHECK: OtherConstruct scope:
 !CHECK: omp_in size=4 offset=0: ObjectEntity type: INTEGER(4)
-!CHECK: omp_orig size=4 offset=4: ObjectEntity type: INTEGER(4)
-!CHECK: omp_out size=4 offset=8: ObjectEntity type: INTEGER(4)
-!CHECK: omp_priv size=4 offset=12: ObjectEntity type: INTEGER(4)
+!CHECK: omp_out size=4 offset=4: ObjectEntity type: INTEGER(4)
+!CHECK: OtherConstruct scope:
+!CHECK: omp_orig size=4 offset=0: ObjectEntity type: INTEGER(4)
+!CHECK: omp_priv size=4 offset=4: ObjectEntity type: INTEGER(4)
   
 end program main
diff --git a/flang/test/Semantics/allocate14.f90 b/flang/test/Semantics/allocate14.f90
new file mode 100644
index 0000000000000..a97cf5ad88b08
--- /dev/null
+++ b/flang/test/Semantics/allocate14.f90
@@ -0,0 +1,56 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1
+! Check for semantic errors in ALLOCATE statements
+
+program allocate14
+  
+  integer, allocatable :: i1, i2
+  character(200), allocatable :: msg1, msg2
+  type t
+    integer, allocatable :: i
+    character(10), allocatable :: msg
+  end type t
+  type(t) :: tt(2)
+  type(t), allocatable :: ts(:)
+
+  allocate(i1)
+  allocate(msg1)
+
+  allocate(i2, stat=i1, errmsg=msg1)
+  allocate(msg2, stat=i1, errmsg=msg1)
+  deallocate(i2, stat=i1, errmsg=msg1)
+  deallocate(msg2, stat=i1, errmsg=msg1)
+
+  !ERROR: STAT variable in ALLOCATE must not be the variable being allocated
+  allocate(i2, stat=i2, errmsg=msg2)
+  !ERROR: ERRMSG variable in ALLOCATE must not be the variable being allocated
+  allocate(msg2, stat=i2, errmsg=msg2)
+  !ERROR: STAT variable in DEALLOCATE must not be the variable being deallocated
+  deallocate(i2, stat=i2, errmsg=msg2)
+  !ERROR: ERRMSG variable in DEALLOCATE must not be the variable being deallocated
+  deallocate(msg2, stat=i2, errmsg=msg2)
+
+  allocate(tt(1)%i)
+  allocate(tt(1)%msg)
+
+  allocate(tt(2)%i, stat=tt(1)%i, errmsg=tt(1)%msg)
+  allocate(tt(2)%msg, stat=tt(1)%i, errmsg=tt(1)%msg)
+  deallocate(tt(2)%i, stat=tt(1)%i, errmsg=tt(1)%msg)
+  deallocate(tt(2)%msg, stat=tt(1)%i, errmsg=tt(1)%msg)
+
+  !ERROR: STAT variable in ALLOCATE must not be the variable being allocated
+  allocate(tt(2)%i, stat=tt(2)%i, errmsg=tt(2)%msg)
+  !ERROR: ERRMSG variable in ALLOCATE must not be the variable being allocated
+  allocate(tt(2)%msg, stat=tt(2)%i, errmsg=tt(2)%msg)
+  !ERROR: STAT variable in DEALLOCATE must not be the variable being deallocated
+  deallocate(tt(2)%i, stat=tt(2)%i, errmsg=tt(2)%msg)
+  !ERROR: ERRMSG variable in DEALLOCATE must not be the variable being deallocated
+  deallocate(tt(2)%msg, stat=tt(2)%i, errmsg=tt(2)%msg)
+
+  !TODO: STAT variable in ALLOCATE must not be the variable being allocated
+  !TODO: ERRMSG variable in ALLOCATE must not be the variable being allocated
+  allocate(ts(10), stat=ts(1)%i, errmsg=ts(1)%msg)
+  !TODO: STAT variable in DEALLOCATE must not be the variable being deallocated
+  !TODO: ERRMSG variable in DEALLOCATE must not be the variable being deallocated
+  deallocate(ts, stat=ts(1)%i, errmsg=ts(1)%msg)
+end program
+
diff --git a/flang/test/Semantics/ignore_tkr04.f90 b/flang/test/Semantics/ignore_tkr04.f90
new file mode 100644
index 0000000000000..8becc85857bb1
--- /dev/null
+++ b/flang/test/Semantics/ignore_tkr04.f90
@@ -0,0 +1,26 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1
+! Tests for ignore_tkr(p)
+module ignore_tkr_4_m
+interface
+  subroutine s(a)
+  real, pointer :: a(:)
+!dir$ ignore_tkr(p) a
+  end subroutine
+  subroutine s1(a)
+    real, allocatable :: a(:)
+!dir$ ignore_tkr(p) a
+  end subroutine
+end interface
+end module
+program t
+  use ignore_tkr_4_m
+  real, allocatable :: x(:)
+  real, pointer :: x1(:)
+  call s(x)
+!CHECK-NOT: error
+!CHECK-NOT: warning
+  call s1(x1)
+!CHECK-NOT: error
+!CHECK-NOT: warning
+end
+
diff --git a/flang/test/Semantics/resolve09.f90 b/flang/test/Semantics/resolve09.f90
index 2fe21aebf66bd..3384b05bf8f27 100644
--- a/flang/test/Semantics/resolve09.f90
+++ b/flang/test/Semantics/resolve09.f90
@@ -140,11 +140,11 @@ subroutine s9
     procedure(), nopass, pointer :: p1, p2
   end type
   type(t) x
+  !ERROR: Function result characteristics are not known
   print *, x%p1()
-  call x%p2
-  !ERROR: Cannot call function 'p1' like a subroutine
-  call x%p1
-  !ERROR: Cannot call subroutine 'p2' like a function
+  call x%p2 ! ok
+  call x%p1 ! ok
+  !ERROR: Function result characteristics are not known
   print *, x%p2()
 end subroutine
 
diff --git a/flang/test/lib/OpenACC/TestOpenACCInterfaces.cpp b/flang/test/lib/OpenACC/TestOpenACCInterfaces.cpp
index 9a80e3b1a9aee..072aee5ba269f 100644
--- a/flang/test/lib/OpenACC/TestOpenACCInterfaces.cpp
+++ b/flang/test/lib/OpenACC/TestOpenACCInterfaces.cpp
@@ -100,6 +100,10 @@ struct TestFIROpenACCInterfaces
             }
           }
 
+          llvm::errs() << "\t\tHas unknown dimensions: "
+                       << (mappableTy.hasUnknownDimensions() ? "true" : "false")
+                       << "\n";
+
           if (auto declareOp =
                   dyn_cast_if_present<hlfir::DeclareOp>(var.getDefiningOp())) {
             llvm::errs() << "\t\tShape: " << declareOp.getShape() << "\n";
diff --git a/flang/unittests/CMakeLists.txt b/flang/unittests/CMakeLists.txt
index db04923e2943a..2d612e58dae24 100644
--- a/flang/unittests/CMakeLists.txt
+++ b/flang/unittests/CMakeLists.txt
@@ -48,6 +48,7 @@ function(add_flang_nongtest_unittest test_name)
     llvm_map_components_to_libnames(llvm_libs Support)
   endif()
   target_link_libraries(${test_name}${suffix} ${llvm_libs} ${ARG_UNPARSED_ARGUMENTS})
+  set_unittest_link_flags(${test_name}${suffix})
 
   if(NOT ARG_SLOW_TEST)
     add_dependencies(FlangUnitTests ${test_name}${suffix})
diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt
index 14718e2090bde..ae555a256ba66 100644
--- a/libc/CMakeLists.txt
+++ b/libc/CMakeLists.txt
@@ -363,7 +363,7 @@ elseif(LLVM_LIBC_FULL_BUILD)
   message(FATAL_ERROR "${LIBC_CONFIG_PATH}/headers.txt file not found and fullbuild requested.")
 endif()
 
-# Check exclude.txt that appends to LIBC_EXCLUDE_ENTRYPOINTS list
+# Check exclude.txt that appends to TARGET_LLVMLIBC_REMOVED_ENTRYPOINTS list
 if(EXISTS "${LIBC_CONFIG_PATH}/exclude.txt")
     include("${LIBC_CONFIG_PATH}/exclude.txt")
 endif()
diff --git a/libc/config/linux/x86_64/exclude.txt b/libc/config/linux/x86_64/exclude.txt
index 2c218b753b176..a0686310d21ac 100644
--- a/libc/config/linux/x86_64/exclude.txt
+++ b/libc/config/linux/x86_64/exclude.txt
@@ -19,3 +19,11 @@ if(NOT has_sys_random)
     )
   endif()
 endif()
+
+include(CheckSymbolExists)
+check_symbol_exists(SYS_faccessat2 "sys/syscall.h" HAVE_SYS_FACCESSAT2)
+if(NOT HAVE_SYS_FACCESSAT2)
+  list(APPEND TARGET_LLVMLIBC_REMOVED_ENTRYPOINTS
+    libc.src.unistd.faccessat
+  )
+endif()
diff --git a/libc/include/llvm-libc-types/__barrier_type.h b/libc/include/llvm-libc-types/__barrier_type.h
index 59712619e917d..5752f832f04b9 100644
--- a/libc/include/llvm-libc-types/__barrier_type.h
+++ b/libc/include/llvm-libc-types/__barrier_type.h
@@ -9,6 +9,8 @@
 #ifndef LLVM_LIBC_TYPES__BARRIER_TYPE_H
 #define LLVM_LIBC_TYPES__BARRIER_TYPE_H
 
+#include <stdbool.h>
+
 typedef struct __attribute__((aligned(8 /* alignof (Barrier) */))) {
   unsigned expected;
   unsigned waiting;
diff --git a/libc/include/llvm-libc-types/pthread_barrierattr_t.h b/libc/include/llvm-libc-types/pthread_barrierattr_t.h
index 064be5bfb6721..b62fdc0f72e12 100644
--- a/libc/include/llvm-libc-types/pthread_barrierattr_t.h
+++ b/libc/include/llvm-libc-types/pthread_barrierattr_t.h
@@ -9,6 +9,8 @@
 #ifndef LLVM_LIBC_TYPES_PTHREAD_BARRIERATTR_T_H
 #define LLVM_LIBC_TYPES_PTHREAD_BARRIERATTR_T_H
 
+#include <stdbool.h>
+
 typedef struct {
   bool pshared;
 } pthread_barrierattr_t;
diff --git a/libc/include/locale.yaml b/libc/include/locale.yaml
index 4566984ad83af..3c3998eb07aa4 100644
--- a/libc/include/locale.yaml
+++ b/libc/include/locale.yaml
@@ -1,7 +1,7 @@
 header: locale.h
 header_template: locale.h.def
 macros:
-  - macro_name: NULL
+  - macro_name: "NULL"
     macro_header: null-macro.h
 types:
   - type_name: locale_t
diff --git a/libc/include/stdio.yaml b/libc/include/stdio.yaml
index 394437ba3bbcd..c50b4ecb0bf08 100644
--- a/libc/include/stdio.yaml
+++ b/libc/include/stdio.yaml
@@ -1,7 +1,7 @@
 header: stdio.h
 header_template: stdio.h.def
 macros:
-  - macro_name: NULL
+  - macro_name: "NULL"
     macro_header: null-macro.h
   - macro_name: stdout
     macro_value: stdout
diff --git a/libc/include/stdlib.yaml b/libc/include/stdlib.yaml
index 3b2ff13c684b1..495eb7e1317b6 100644
--- a/libc/include/stdlib.yaml
+++ b/libc/include/stdlib.yaml
@@ -5,7 +5,7 @@ standards:
 merge_yaml_files:
   - stdlib-malloc.yaml
 macros:
-  - macro_name: NULL
+  - macro_name: "NULL"
     macro_header: null-macro.h
 types:
   - type_name: __atexithandler_t
diff --git a/libc/include/string.yaml b/libc/include/string.yaml
index 0bf297ee747a4..22010f4afa812 100644
--- a/libc/include/string.yaml
+++ b/libc/include/string.yaml
@@ -2,7 +2,7 @@ header: string.h
 standards:
   - stdc
 macros:
-  - macro_name: NULL
+  - macro_name: "NULL"
     macro_header: null-macro.h
 types:
   - type_name: locale_t
diff --git a/libc/include/time.yaml b/libc/include/time.yaml
index 2f8024298fad1..88e50d1288238 100644
--- a/libc/include/time.yaml
+++ b/libc/include/time.yaml
@@ -1,7 +1,7 @@
 header: time.h
 header_template: time.h.def
 macros:
-  - macro_name: NULL
+  - macro_name: "NULL"
     macro_header: null-macro.h
 types:
   - type_name: struct_timeval
diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml
index b8a0a748cd3ad..fb5b19b523b31 100644
--- a/libc/include/wchar.yaml
+++ b/libc/include/wchar.yaml
@@ -1,11 +1,15 @@
 header: wchar.h
 header_template: wchar.h.def
 macros:
-  - macro_name: NULL
+  - macro_name: "NULL"
     macro_header: null-macro.h
 types:
   - type_name: FILE
   - type_name: size_t
+  # TODO: Remove this once we have a function declaration using "struct tm"
+  # (wcsftime). We're declaring it here now, since libc++ expects
+  # forward-declaration of "struct tm" in the <wchar.h> header.
+  - type_name: struct_tm
   - type_name: wint_t
   - type_name: wchar_t
   - type_name: mbstate_t
@@ -188,8 +192,8 @@ functions:
     standards:
       - stdc
     return_type: wchar_t *
-    arguments: 
-      - type: wchar_t *__restrict 
+    arguments:
+      - type: wchar_t *__restrict
       - type: const wchar_t *__restrict
       - type: size_t
   - name: wmemmove
@@ -212,7 +216,7 @@ functions:
     standards:
       - stdc
     return_type: wchar_t *
-    arguments: 
+    arguments:
       - type: wchar_t *__restrict
       - type: const wchar_t *__restrict
   - name: wcslcat
diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt
index 0ef09a9b8c9d0..b7af751ec3f27 100644
--- a/libc/src/__support/CMakeLists.txt
+++ b/libc/src/__support/CMakeLists.txt
@@ -179,19 +179,7 @@ add_header_library(
   DEPENDS
     .ctype_utils
     .str_to_num_result
-    libc.hdr.errno_macros
-    libc.src.__support.CPP.limits
-    libc.src.__support.CPP.type_traits
-    libc.src.__support.common
-)
-
-add_header_library(
-  wcs_to_integer
-  HDRS
-    wcs_to_integer.h
-  DEPENDS
     .wctype_utils
-    .str_to_num_result
     libc.hdr.errno_macros
     libc.src.__support.CPP.limits
     libc.src.__support.CPP.type_traits
diff --git a/libc/src/__support/str_to_integer.h b/libc/src/__support/str_to_integer.h
index d332c929f2c31..ba3f49fa2f47b 100644
--- a/libc/src/__support/str_to_integer.h
+++ b/libc/src/__support/str_to_integer.h
@@ -25,36 +25,63 @@
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_num_result.h"
 #include "src/__support/uint128.h"
+#include "src/__support/wctype_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 
 // Returns the idx to the first character in src that is not a whitespace
-// character (as determined by isspace())
+// character (as determined by isspace() / iswspace())
+template <typename CharType>
 LIBC_INLINE size_t
-first_non_whitespace(const char *__restrict src,
+first_non_whitespace(const CharType *__restrict src,
                      size_t src_len = cpp::numeric_limits<size_t>::max()) {
   size_t src_cur = 0;
-  while (src_cur < src_len && internal::isspace(src[src_cur])) {
+  while (src_cur < src_len) {
+    if constexpr (cpp::is_same_v<CharType, char>) {
+      if (!internal::isspace(src[src_cur]))
+        break;
+    } else {
+      if (!internal::iswspace(src[src_cur]))
+        break;
+    }
     ++src_cur;
   }
   return src_cur;
 }
 
+// Returns +1, -1, or 0 if 'src' starts with (respectively)
+// plus sign, minus sign, or neither.
+template <typename CharType>
+LIBC_INLINE static int get_sign(const CharType *__restrict src) {
+  if constexpr (cpp::is_same_v<CharType, char>) {
+    return (src[0] == '+') ? 1 : (src[0] == '-' ? -1 : 0);
+  } else {
+    return (src[0] == L'+') ? 1 : (src[0] == L'-' ? -1 : 0);
+  }
+}
+
 // checks if the next 3 characters of the string pointer are the start of a
 // hexadecimal number. Does not advance the string pointer.
-LIBC_INLINE bool
-is_hex_start(const char *__restrict src,
-             size_t src_len = cpp::numeric_limits<size_t>::max()) {
+template <typename CharType>
+LIBC_INLINE static bool is_hex_start(const CharType *__restrict src,
+                                     size_t src_len) {
   if (src_len < 3)
     return false;
-  return *src == '0' && tolower(*(src + 1)) == 'x' && isalnum(*(src + 2)) &&
-         b36_char_to_int(*(src + 2)) < 16;
+  if constexpr (cpp::is_same_v<CharType, char>) {
+    return src[0] == '0' && tolower(src[1]) == 'x' && isalnum(src[2]) &&
+           b36_char_to_int(src[2]) < 16;
+  } else {
+    return src[0] == L'0' && towlower(src[1]) == L'x' && iswalnum(src[2]) &&
+           b36_wchar_to_int(src[2]) < 16;
+  }
 }
 
 // Takes the address of the string pointer and parses the base from the start of
 // it.
-LIBC_INLINE int infer_base(const char *__restrict src, size_t src_len) {
+template <typename CharType>
+LIBC_INLINE static int infer_base(const CharType *__restrict src,
+                                  size_t src_len) {
   // A hexadecimal number is defined as "the prefix 0x or 0X followed by a
   // sequence of the decimal digits and the letters a (or A) through f (or F)
   // with values 10 through 15 respectively." (C standard 6.4.4.1)
@@ -63,8 +90,15 @@ LIBC_INLINE int infer_base(const char *__restrict src, size_t src_len) {
   // An octal number is defined as "the prefix 0 optionally followed by a
   // sequence of the digits 0 through 7 only" (C standard 6.4.4.1) and so any
   // number that starts with 0, including just 0, is an octal number.
-  if (src_len > 0 && src[0] == '0')
-    return 8;
+  if (src_len > 0) {
+    if constexpr (cpp::is_same_v<CharType, char>) {
+      if (src[0] == '0')
+        return 8;
+    } else {
+      if (src[0] == L'0')
+        return 8;
+    }
+  }
   // A decimal number is defined as beginning "with a nonzero digit and
   // consist[ing] of a sequence of decimal digits." (C standard 6.4.4.1)
   return 10;
@@ -77,32 +111,27 @@ LIBC_INLINE int infer_base(const char *__restrict src, size_t src_len) {
 // -----------------------------------------------------------------------------
 // Takes a pointer to a string and the base to convert to. This function is used
 // as the backend for all of the string to int functions.
-template <class T>
+template <typename T, typename CharType>
 LIBC_INLINE StrToNumResult<T>
-strtointeger(const char *__restrict src, int base,
+strtointeger(const CharType *__restrict src, int base,
              const size_t src_len = cpp::numeric_limits<size_t>::max()) {
   using ResultType = make_integral_or_big_int_unsigned_t<T>;
 
-  ResultType result = 0;
-
-  bool is_number = false;
-  size_t src_cur = 0;
-  int error_val = 0;
-
   if (src_len == 0)
     return {0, 0, 0};
 
   if (base < 0 || base == 1 || base > 36)
     return {0, 0, EINVAL};
 
-  src_cur = first_non_whitespace(src, src_len);
-
-  char result_sign = '+';
-  if (src[src_cur] == '+' || src[src_cur] == '-') {
-    result_sign = src[src_cur];
-    ++src_cur;
+  size_t src_cur = first_non_whitespace(src, src_len);
+  if (src_cur == src_len) {
+    return {0, 0, 0};
   }
 
+  int sign = get_sign(src + src_cur);
+  bool is_positive = (sign >= 0);
+  src_cur += (sign != 0);
+
   if (base == 0)
     base = infer_base(src + src_cur, src_len - src_cur);
 
@@ -110,8 +139,6 @@ strtointeger(const char *__restrict src, int base,
     src_cur = src_cur + 2;
 
   constexpr bool IS_UNSIGNED = cpp::is_unsigned_v<T>;
-  const bool is_positive = (result_sign == '+');
-
   ResultType constexpr NEGATIVE_MAX =
       !IS_UNSIGNED ? static_cast<ResultType>(cpp::numeric_limits<T>::max()) + 1
                    : cpp::numeric_limits<T>::max();
@@ -120,8 +147,21 @@ strtointeger(const char *__restrict src, int base,
   ResultType const abs_max_div_by_base =
       abs_max / static_cast<ResultType>(base);
 
-  while (src_cur < src_len && isalnum(src[src_cur])) {
-    int cur_digit = b36_char_to_int(src[src_cur]);
+  bool is_number = false;
+  int error_val = 0;
+  ResultType result = 0;
+  while (src_cur < src_len) {
+    int cur_digit;
+    if constexpr (cpp::is_same_v<CharType, char>) {
+      if (!isalnum(src[src_cur]))
+        break;
+      cur_digit = b36_char_to_int(src[src_cur]);
+    } else {
+      if (!iswalnum(src[src_cur]))
+        break;
+      cur_digit = b36_wchar_to_int(src[src_cur]);
+    }
+
     if (cur_digit >= base)
       break;
 
diff --git a/libc/src/__support/wcs_to_integer.h b/libc/src/__support/wcs_to_integer.h
deleted file mode 100644
index 4254bd860f77a..0000000000000
--- a/libc/src/__support/wcs_to_integer.h
+++ /dev/null
@@ -1,155 +0,0 @@
-//===-- Widechar string to integer conversion utils -------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC___SUPPORT_WCS_TO_INTEGER_H
-#define LLVM_LIBC_SRC___SUPPORT_WCS_TO_INTEGER_H
-
-#include "hdr/errno_macros.h" // For ERANGE
-#include "src/__support/CPP/limits.h"
-#include "src/__support/CPP/type_traits.h"
-#include "src/__support/CPP/type_traits/make_unsigned.h"
-#include "src/__support/big_int.h"
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
-#include "src/__support/str_to_num_result.h"
-#include "src/__support/uint128.h"
-#include "src/__support/wctype_utils.h"
-
-namespace LIBC_NAMESPACE_DECL {
-namespace internal {
-
-// Returns the idx of the first character in src that is not a whitespace
-// character (as determined by iswspace())
-LIBC_INLINE size_t
-first_non_whitespace(const wchar_t *__restrict src,
-                     size_t src_len = cpp::numeric_limits<size_t>::max()) {
-  size_t src_cur = 0;
-  while (src_cur < src_len && internal::iswspace(src[src_cur])) {
-    ++src_cur;
-  }
-  return src_cur;
-}
-
-// checks if the next 3 characters of the string pointer are the start of a
-// hexadecimal number. Does not advance the string pointer.
-LIBC_INLINE bool
-is_hex_start(const wchar_t *__restrict src,
-             size_t src_len = cpp::numeric_limits<size_t>::max()) {
-  if (src_len < 3)
-    return false;
-  return *src == L'0' && towlower(*(src + 1)) == L'x' && iswalnum(*(src + 2)) &&
-         b36_wchar_to_int(*(src + 2)) < 16;
-}
-
-// Takes the address of the string pointer and parses the base from the start of
-// it.
-LIBC_INLINE int infer_base(const wchar_t *__restrict src, size_t src_len) {
-  // A hexadecimal number is defined as "the prefix 0x or 0X followed by a
-  // sequence of the decimal digits and the letters a (or A) through f (or F)
-  // with values 10 through 15 respectively." (C standard 6.4.4.1)
-  if (is_hex_start(src, src_len))
-    return 16;
-  // An octal number is defined as "the prefix 0 optionally followed by a
-  // sequence of the digits 0 through 7 only" (C standard 6.4.4.1) and so any
-  // number that starts with 0, including just 0, is an octal number.
-  if (src_len > 0 && src[0] == L'0')
-    return 8;
-  // A decimal number is defined as beginning "with a nonzero digit and
-  // consist[ing] of a sequence of decimal digits." (C standard 6.4.4.1)
-  return 10;
-}
-
-template <class T>
-LIBC_INLINE StrToNumResult<T>
-wcstointeger(const wchar_t *__restrict src, int base,
-             const size_t src_len = cpp::numeric_limits<size_t>::max()) {
-  using ResultType = make_integral_or_big_int_unsigned_t<T>;
-
-  ResultType result = 0;
-
-  bool is_number = false;
-  size_t src_cur = 0;
-  int error_val = 0;
-
-  if (src_len == 0)
-    return {0, 0, 0};
-
-  if (base < 0 || base == 1 || base > 36)
-    return {0, 0, EINVAL};
-
-  src_cur = first_non_whitespace(src, src_len);
-
-  wchar_t result_sign = L'+';
-  if (src[src_cur] == L'+' || src[src_cur] == L'-') {
-    result_sign = src[src_cur];
-    ++src_cur;
-  }
-
-  if (base == 0)
-    base = infer_base(src + src_cur, src_len - src_cur);
-
-  if (base == 16 && is_hex_start(src + src_cur, src_len - src_cur))
-    src_cur = src_cur + 2;
-
-  constexpr bool IS_UNSIGNED = cpp::is_unsigned_v<T>;
-  const bool is_positive = (result_sign == L'+');
-
-  ResultType constexpr NEGATIVE_MAX =
-      !IS_UNSIGNED ? static_cast<ResultType>(cpp::numeric_limits<T>::max()) + 1
-                   : cpp::numeric_limits<T>::max();
-  ResultType const abs_max =
-      (is_positive ? cpp::numeric_limits<T>::max() : NEGATIVE_MAX);
-  ResultType const abs_max_div_by_base =
-      abs_max / static_cast<ResultType>(base);
-
-  while (src_cur < src_len && iswalnum(src[src_cur])) {
-    int cur_digit = b36_wchar_to_int(src[src_cur]);
-    if (cur_digit >= base)
-      break;
-
-    is_number = true;
-    ++src_cur;
-
-    // If the number has already hit the maximum value for the current type then
-    // the result cannot change, but we still need to advance src to the end of
-    // the number.
-    if (result == abs_max) {
-      error_val = ERANGE;
-      continue;
-    }
-
-    if (result > abs_max_div_by_base) {
-      result = abs_max;
-      error_val = ERANGE;
-    } else {
-      result = result * static_cast<ResultType>(base);
-    }
-    if (result > abs_max - static_cast<ResultType>(cur_digit)) {
-      result = abs_max;
-      error_val = ERANGE;
-    } else {
-      result = result + static_cast<ResultType>(cur_digit);
-    }
-  }
-
-  ptrdiff_t str_len = is_number ? static_cast<ptrdiff_t>(src_cur) : 0;
-
-  if (error_val == ERANGE) {
-    if (is_positive || IS_UNSIGNED)
-      return {cpp::numeric_limits<T>::max(), str_len, error_val};
-    else // T is signed and there is a negative overflow
-      return {cpp::numeric_limits<T>::min(), str_len, error_val};
-  }
-
-  return {static_cast<T>(is_positive ? result : -result), str_len, error_val};
-}
-
-} // namespace internal
-} // namespace LIBC_NAMESPACE_DECL
-
-#endif // LLVM_LIBC_SRC___SUPPORT_WCS_TO_INTEGER_H
diff --git a/libc/src/fenv/CMakeLists.txt b/libc/src/fenv/CMakeLists.txt
index c5431b1b9d55e..f368845977964 100644
--- a/libc/src/fenv/CMakeLists.txt
+++ b/libc/src/fenv/CMakeLists.txt
@@ -6,8 +6,6 @@ add_entrypoint_object(
     fegetround.h
   DEPENDS
     libc.src.__support.FPUtil.fenv_impl
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -18,8 +16,6 @@ add_entrypoint_object(
     fesetround.h
   DEPENDS
     libc.src.__support.FPUtil.fenv_impl
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -30,8 +26,6 @@ add_entrypoint_object(
     feclearexcept.h
   DEPENDS
     libc.src.__support.FPUtil.fenv_impl
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -42,8 +36,6 @@ add_entrypoint_object(
     feraiseexcept.h
   DEPENDS
     libc.src.__support.FPUtil.fenv_impl
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -54,8 +46,6 @@ add_entrypoint_object(
     fetestexcept.h
   DEPENDS
     libc.src.__support.FPUtil.fenv_impl
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -67,8 +57,6 @@ add_entrypoint_object(
   DEPENDS
     libc.hdr.types.fexcept_t
     libc.src.__support.FPUtil.fenv_impl
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -80,8 +68,6 @@ add_entrypoint_object(
   DEPENDS
     libc.hdr.types.fenv_t
     libc.src.__support.FPUtil.fenv_impl
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -93,8 +79,6 @@ add_entrypoint_object(
   DEPENDS
     libc.hdr.types.fenv_t
     libc.src.__support.FPUtil.fenv_impl
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -107,8 +91,6 @@ add_entrypoint_object(
     libc.hdr.fenv_macros
     libc.hdr.types.fexcept_t
     libc.src.__support.FPUtil.fenv_impl
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -119,8 +101,6 @@ add_entrypoint_object(
     fesetexcept.h
   DEPENDS
     libc.src.__support.FPUtil.fenv_impl
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -133,8 +113,6 @@ add_entrypoint_object(
     libc.hdr.fenv_macros
     libc.hdr.types.fexcept_t
     libc.src.__support.FPUtil.fenv_impl
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -147,8 +125,6 @@ add_entrypoint_object(
     libc.hdr.fenv_macros
     libc.hdr.types.fenv_t
     libc.src.__support.FPUtil.fenv_impl
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -161,8 +137,6 @@ add_entrypoint_object(
     libc.hdr.fenv_macros
     libc.hdr.types.fenv_t
     libc.src.__support.FPUtil.fenv_impl
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -173,8 +147,6 @@ add_entrypoint_object(
     feenableexcept.h
   DEPENDS
     libc.src.__support.FPUtil.fenv_impl
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -185,8 +157,6 @@ add_entrypoint_object(
     fedisableexcept.h
   DEPENDS
     libc.src.__support.FPUtil.fenv_impl
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -197,6 +167,4 @@ add_entrypoint_object(
     fegetexcept.h
   DEPENDS
     libc.src.__support.FPUtil.fenv_impl
-  COMPILE_OPTIONS
-    -O2
 )
diff --git a/libc/src/math/amdgpu/CMakeLists.txt b/libc/src/math/amdgpu/CMakeLists.txt
index e2cd3b99c3037..d05d519b74b4f 100644
--- a/libc/src/math/amdgpu/CMakeLists.txt
+++ b/libc/src/math/amdgpu/CMakeLists.txt
@@ -4,8 +4,6 @@ add_entrypoint_object(
     ceil.cpp
   HDRS
     ../ceil.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -14,8 +12,6 @@ add_entrypoint_object(
     ceilf.cpp
   HDRS
     ../ceilf.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -24,8 +20,6 @@ add_entrypoint_object(
     copysign.cpp
   HDRS
     ../copysign.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -34,8 +28,6 @@ add_entrypoint_object(
     copysignf.cpp
   HDRS
     ../copysignf.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -44,8 +36,6 @@ add_entrypoint_object(
     fabs.cpp
   HDRS
     ../fabs.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -54,8 +44,6 @@ add_entrypoint_object(
     fabsf.cpp
   HDRS
     ../fabsf.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -64,8 +52,6 @@ add_entrypoint_object(
     floor.cpp
   HDRS
     ../floor.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -74,8 +60,6 @@ add_entrypoint_object(
     floorf.cpp
   HDRS
     ../floorf.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -84,8 +68,6 @@ add_entrypoint_object(
     fma.cpp
   HDRS
     ../fma.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -94,8 +76,6 @@ add_entrypoint_object(
     fmaf.cpp
   HDRS
     ../fmaf.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -104,8 +84,6 @@ add_entrypoint_object(
     fmax.cpp
   HDRS
     ../fmax.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -114,8 +92,6 @@ add_entrypoint_object(
     fmaxf.cpp
   HDRS
     ../fmaxf.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -124,8 +100,6 @@ add_entrypoint_object(
     fmin.cpp
   HDRS
     ../fmin.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -134,8 +108,6 @@ add_entrypoint_object(
     fminf.cpp
   HDRS
     ../fminf.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -144,8 +116,6 @@ add_entrypoint_object(
     fmod.cpp
   HDRS
     ../fmod.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -154,8 +124,6 @@ add_entrypoint_object(
     fmodf.cpp
   HDRS
     ../fmodf.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -164,8 +132,6 @@ add_entrypoint_object(
     nearbyint.cpp
   HDRS
     ../nearbyint.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -174,8 +140,6 @@ add_entrypoint_object(
     nearbyintf.cpp
   HDRS
     ../nearbyintf.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -184,8 +148,6 @@ add_entrypoint_object(
     remainder.cpp
   HDRS
     ../remainder.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -194,8 +156,6 @@ add_entrypoint_object(
     remainderf.cpp
   HDRS
     ../remainderf.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -204,8 +164,6 @@ add_entrypoint_object(
     rint.cpp
   HDRS
     ../rint.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -214,8 +172,6 @@ add_entrypoint_object(
     rintf.cpp
   HDRS
     ../rintf.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -224,8 +180,6 @@ add_entrypoint_object(
     round.cpp
   HDRS
     ../round.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -234,8 +188,6 @@ add_entrypoint_object(
     sqrt.cpp
   HDRS
     ../sqrt.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -244,8 +196,6 @@ add_entrypoint_object(
     sqrtf.cpp
   HDRS
     ../sqrtf.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -254,8 +204,6 @@ add_entrypoint_object(
     trunc.cpp
   HDRS
     ../trunc.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -264,8 +212,6 @@ add_entrypoint_object(
     truncf.cpp
   HDRS
     ../truncf.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -274,8 +220,6 @@ add_entrypoint_object(
     frexp.cpp
   HDRS
     ../frexp.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -284,8 +228,6 @@ add_entrypoint_object(
     frexpf.cpp
   HDRS
     ../frexpf.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -294,8 +236,6 @@ add_entrypoint_object(
     scalbn.cpp
   HDRS
     ../scalbn.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -304,8 +244,6 @@ add_entrypoint_object(
     scalbnf.cpp
   HDRS
     ../scalbnf.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -314,8 +252,6 @@ add_entrypoint_object(
     ldexp.cpp
   HDRS
     ../ldexp.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -324,8 +260,6 @@ add_entrypoint_object(
     ldexpf.cpp
   HDRS
     ../ldexpf.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -336,7 +270,6 @@ add_entrypoint_object(
     ../tgamma.h
   COMPILE_OPTIONS
     ${bitcode_link_flags}
-    -O2
 )
 
 add_entrypoint_object(
@@ -347,7 +280,6 @@ add_entrypoint_object(
     ../tgammaf.h
   COMPILE_OPTIONS
     ${bitcode_link_flags}
-    -O2
 )
 
 add_entrypoint_object(
@@ -358,7 +290,6 @@ add_entrypoint_object(
     ../lgamma.h
   COMPILE_OPTIONS
     ${bitcode_link_flags}
-    -O2
 )
 
 add_entrypoint_object(
@@ -369,5 +300,4 @@ add_entrypoint_object(
     ../lgamma_r.h
   COMPILE_OPTIONS
     ${bitcode_link_flags}
-    -O2
 )
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 6068c36e558ef..c048a64db6bc2 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -2662,8 +2662,6 @@ add_entrypoint_object(
     ../fmaximum_mag.h
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -2674,8 +2672,6 @@ add_entrypoint_object(
     ../fmaximum_magf.h
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -2686,8 +2682,6 @@ add_entrypoint_object(
     ../fmaximum_magl.h
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -2735,8 +2729,6 @@ add_entrypoint_object(
     ../fmaximum_mag_num.h
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -2747,8 +2739,6 @@ add_entrypoint_object(
     ../fmaximum_mag_numf.h
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -2759,8 +2749,6 @@ add_entrypoint_object(
     ../fmaximum_mag_numl.h
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -2954,8 +2942,6 @@ add_entrypoint_object(
     ../fminimum_mag.h
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -2966,8 +2952,6 @@ add_entrypoint_object(
     ../fminimum_magf.h
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -2978,8 +2962,6 @@ add_entrypoint_object(
     ../fminimum_magl.h
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -3027,8 +3009,6 @@ add_entrypoint_object(
     ../fminimum_mag_num.h
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -3039,8 +3019,6 @@ add_entrypoint_object(
     ../fminimum_mag_numf.h
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -3051,8 +3029,6 @@ add_entrypoint_object(
     ../fminimum_mag_numl.h
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -4306,7 +4282,7 @@ add_entrypoint_object(
     libc.hdr.errno_macros
     libc.hdr.fenv_macros
     libc.src.__support.FPUtil.except_value_utils
-    libc.src.__support.FPUtil.fenv_impl 
+    libc.src.__support.FPUtil.fenv_impl
     libc.src.__support.FPUtil.fp_bits
     libc.src.__support.FPUtil.rounding_mode
     libc.src.__support.macros.optimization
@@ -4546,8 +4522,6 @@ add_entrypoint_object(
     atan.cpp
   HDRS
     ../atan.h
-  COMPILE_OPTIONS
-    -O3
   DEPENDS
     libc.src.__support.math.atan
 )
diff --git a/libc/src/math/nvptx/CMakeLists.txt b/libc/src/math/nvptx/CMakeLists.txt
index fcb2870b4bb1c..e27c316ff20ca 100644
--- a/libc/src/math/nvptx/CMakeLists.txt
+++ b/libc/src/math/nvptx/CMakeLists.txt
@@ -4,8 +4,6 @@ add_entrypoint_object(
     ceil.cpp
   HDRS
     ../ceil.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -14,8 +12,6 @@ add_entrypoint_object(
     ceilf.cpp
   HDRS
     ../ceilf.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -24,8 +20,6 @@ add_entrypoint_object(
     copysign.cpp
   HDRS
     ../copysign.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -34,8 +28,6 @@ add_entrypoint_object(
     copysignf.cpp
   HDRS
     ../copysignf.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -44,8 +36,6 @@ add_entrypoint_object(
     fabs.cpp
   HDRS
     ../fabs.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -54,8 +44,6 @@ add_entrypoint_object(
     fabsf.cpp
   HDRS
     ../fabsf.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -64,8 +52,6 @@ add_entrypoint_object(
     floor.cpp
   HDRS
     ../floor.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -74,8 +60,6 @@ add_entrypoint_object(
     floorf.cpp
   HDRS
     ../floorf.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -84,8 +68,6 @@ add_entrypoint_object(
     fma.cpp
   HDRS
     ../fma.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -94,8 +76,6 @@ add_entrypoint_object(
     fmaf.cpp
   HDRS
     ../fmaf.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -104,8 +84,6 @@ add_entrypoint_object(
     fmax.cpp
   HDRS
     ../fmax.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -114,8 +92,6 @@ add_entrypoint_object(
     fmaxf.cpp
   HDRS
     ../fmaxf.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -124,8 +100,6 @@ add_entrypoint_object(
     fmin.cpp
   HDRS
     ../fmin.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -134,8 +108,6 @@ add_entrypoint_object(
     fminf.cpp
   HDRS
     ../fminf.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -144,8 +116,6 @@ add_entrypoint_object(
     fmod.cpp
   HDRS
     ../fmod.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -154,8 +124,6 @@ add_entrypoint_object(
     fmodf.cpp
   HDRS
     ../fmodf.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -164,8 +132,6 @@ add_entrypoint_object(
     nearbyint.cpp
   HDRS
     ../nearbyint.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -174,8 +140,6 @@ add_entrypoint_object(
     nearbyintf.cpp
   HDRS
     ../nearbyintf.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -184,8 +148,6 @@ add_entrypoint_object(
     remainder.cpp
   HDRS
     ../remainder.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -194,8 +156,6 @@ add_entrypoint_object(
     remainderf.cpp
   HDRS
     ../remainderf.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -204,8 +164,6 @@ add_entrypoint_object(
     rint.cpp
   HDRS
     ../rint.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -214,8 +172,6 @@ add_entrypoint_object(
     rintf.cpp
   HDRS
     ../rintf.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -224,8 +180,6 @@ add_entrypoint_object(
     round.cpp
   HDRS
     ../round.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -234,8 +188,6 @@ add_entrypoint_object(
     sqrt.cpp
   HDRS
     ../sqrt.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -244,8 +196,6 @@ add_entrypoint_object(
     sqrtf.cpp
   HDRS
     ../sqrtf.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -254,8 +204,6 @@ add_entrypoint_object(
     trunc.cpp
   HDRS
     ../trunc.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -264,8 +212,6 @@ add_entrypoint_object(
     truncf.cpp
   HDRS
     ../truncf.h
-  COMPILE_OPTIONS
-    -O2
 )
 
 add_entrypoint_object(
@@ -276,7 +222,6 @@ add_entrypoint_object(
     ../tgamma.h
   COMPILE_OPTIONS
     ${bitcode_link_flags}
-    -O2
 )
 
 add_entrypoint_object(
@@ -287,7 +232,6 @@ add_entrypoint_object(
     ../tgammaf.h
   COMPILE_OPTIONS
     ${bitcode_link_flags}
-    -O2
 )
 
 add_entrypoint_object(
@@ -298,7 +242,6 @@ add_entrypoint_object(
     ../lgamma.h
   COMPILE_OPTIONS
     ${bitcode_link_flags}
-    -O2
 )
 
 add_entrypoint_object(
@@ -309,5 +252,4 @@ add_entrypoint_object(
     ../lgamma_r.h
   COMPILE_OPTIONS
     ${bitcode_link_flags}
-    -O2
 )
diff --git a/libc/src/time/strftime.cpp b/libc/src/time/strftime.cpp
index f36091bc9736e..89b7d9bb7c1b9 100644
--- a/libc/src/time/strftime.cpp
+++ b/libc/src/time/strftime.cpp
@@ -26,7 +26,7 @@ LLVM_LIBC_FUNCTION(size_t, strftime,
   int ret = strftime_core::strftime_main(&writer, format, timeptr);
   if (buffsz > 0) // if the buffsz is 0 the buffer may be a null pointer.
     wb.buff[wb.buff_cur] = '\0';
-  return (ret < 0 || static_cast<size_t>(ret) > buffsz) ? 0 : ret;
+  return (ret < 0 || static_cast<size_t>(ret) >= buffsz) ? 0 : ret;
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/time/strftime_l.cpp b/libc/src/time/strftime_l.cpp
index 201b85da39ee2..409f8683b7289 100644
--- a/libc/src/time/strftime_l.cpp
+++ b/libc/src/time/strftime_l.cpp
@@ -29,7 +29,7 @@ LLVM_LIBC_FUNCTION(size_t, strftime_l,
   int ret = strftime_core::strftime_main(&writer, format, timeptr);
   if (buffsz > 0) // if the buffsz is 0 the buffer may be a null pointer.
     wb.buff[wb.buff_cur] = '\0';
-  return (ret < 0 || static_cast<size_t>(ret) > buffsz) ? 0 : ret;
+  return (ret < 0 || static_cast<size_t>(ret) >= buffsz) ? 0 : ret;
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/wchar/CMakeLists.txt b/libc/src/wchar/CMakeLists.txt
index adde382bf0950..ba27cd77f6bac 100644
--- a/libc/src/wchar/CMakeLists.txt
+++ b/libc/src/wchar/CMakeLists.txt
@@ -63,7 +63,7 @@ add_entrypoint_object(
     wcstol.h
   DEPENDS
     libc.src.errno.errno
-    libc.src.__support.wcs_to_integer
+    libc.src.__support.str_to_integer
 )
 
 add_entrypoint_object(
@@ -74,7 +74,7 @@ add_entrypoint_object(
     wcstoll.h
   DEPENDS
     libc.src.errno.errno
-    libc.src.__support.wcs_to_integer
+    libc.src.__support.str_to_integer
 )
 
 add_entrypoint_object(
@@ -85,7 +85,7 @@ add_entrypoint_object(
     wcstoul.h
   DEPENDS
     libc.src.errno.errno
-    libc.src.__support.wcs_to_integer
+    libc.src.__support.str_to_integer
 )
 
 add_entrypoint_object(
@@ -96,7 +96,7 @@ add_entrypoint_object(
     wcstoull.h
   DEPENDS
     libc.src.errno.errno
-    libc.src.__support.wcs_to_integer
+    libc.src.__support.str_to_integer
 )
 
 add_entrypoint_object(
diff --git a/libc/src/wchar/wcstol.cpp b/libc/src/wchar/wcstol.cpp
index a05718f706dfd..a56b5f91272cd 100644
--- a/libc/src/wchar/wcstol.cpp
+++ b/libc/src/wchar/wcstol.cpp
@@ -10,14 +10,14 @@
 #include "src/__support/common.h"
 #include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/__support/wcs_to_integer.h"
+#include "src/__support/str_to_integer.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(long, wcstol,
                    (const wchar_t *__restrict str, wchar_t **__restrict str_end,
                     int base)) {
-  auto result = internal::wcstointeger<long>(str, base);
+  auto result = internal::strtointeger<long>(str, base);
   if (result.has_error())
     libc_errno = result.error;
 
diff --git a/libc/src/wchar/wcstoll.cpp b/libc/src/wchar/wcstoll.cpp
index de1299d681cdb..6229d24172b51 100644
--- a/libc/src/wchar/wcstoll.cpp
+++ b/libc/src/wchar/wcstoll.cpp
@@ -10,14 +10,14 @@
 #include "src/__support/common.h"
 #include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/__support/wcs_to_integer.h"
+#include "src/__support/str_to_integer.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(long long, wcstoll,
                    (const wchar_t *__restrict str, wchar_t **__restrict str_end,
                     int base)) {
-  auto result = internal::wcstointeger<long long>(str, base);
+  auto result = internal::strtointeger<long long>(str, base);
   if (result.has_error())
     libc_errno = result.error;
 
diff --git a/libc/src/wchar/wcstoul.cpp b/libc/src/wchar/wcstoul.cpp
index 79b8c9b5c9fa3..c5639bee1d649 100644
--- a/libc/src/wchar/wcstoul.cpp
+++ b/libc/src/wchar/wcstoul.cpp
@@ -10,14 +10,14 @@
 #include "src/__support/common.h"
 #include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/__support/wcs_to_integer.h"
+#include "src/__support/str_to_integer.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(unsigned long, wcstoul,
                    (const wchar_t *__restrict str, wchar_t **__restrict str_end,
                     int base)) {
-  auto result = internal::wcstointeger<unsigned long>(str, base);
+  auto result = internal::strtointeger<unsigned long>(str, base);
   if (result.has_error())
     libc_errno = result.error;
 
diff --git a/libc/src/wchar/wcstoull.cpp b/libc/src/wchar/wcstoull.cpp
index 768e03c4bd189..2ab24e9b2b2a1 100644
--- a/libc/src/wchar/wcstoull.cpp
+++ b/libc/src/wchar/wcstoull.cpp
@@ -10,14 +10,14 @@
 #include "src/__support/common.h"
 #include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/__support/wcs_to_integer.h"
+#include "src/__support/str_to_integer.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(unsigned long long, wcstoull,
                    (const wchar_t *__restrict str, wchar_t **__restrict str_end,
                     int base)) {
-  auto result = internal::wcstointeger<unsigned long long>(str, base);
+  auto result = internal::strtointeger<unsigned long long>(str, base);
   if (result.has_error())
     libc_errno = result.error;
 
diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index a02514106a307..138866b4cc869 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -151,7 +151,7 @@ add_libc_test(
     wcs_to_integer_test.cpp
   DEPENDS
     libc.src.__support.integer_literals
-    libc.src.__support.wcs_to_integer
+    libc.src.__support.str_to_integer
 )
 
 add_libc_test(
diff --git a/libc/test/src/__support/str_to_integer_test.cpp b/libc/test/src/__support/str_to_integer_test.cpp
index 1ec882b212b8a..e5ac1d6cbb7b3 100644
--- a/libc/test/src/__support/str_to_integer_test.cpp
+++ b/libc/test/src/__support/str_to_integer_test.cpp
@@ -49,12 +49,14 @@ TEST(LlvmLibcStrToIntegerTest, LeadingSpaces) {
   EXPECT_EQ(result.parsed_len, ptrdiff_t(7));
   ASSERT_EQ(result.value, 12);
 
-  result = LIBC_NAMESPACE::internal::strtointeger<int>("     12345", 10, 5);
+  // Use a non-null-terminated buffer to test for possible OOB access.
+  char buf[5] = {' ', ' ', ' ', ' ', ' '};
+  result = LIBC_NAMESPACE::internal::strtointeger<int>(buf, 10, 5);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(0));
   ASSERT_EQ(result.value, 0);
 
-  result = LIBC_NAMESPACE::internal::strtointeger<int>("     12345", 10, 0);
+  result = LIBC_NAMESPACE::internal::strtointeger<int>(buf, 10, 0);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(0));
   ASSERT_EQ(result.value, 0);
diff --git a/libc/test/src/__support/wcs_to_integer_test.cpp b/libc/test/src/__support/wcs_to_integer_test.cpp
index 4554968be67ce..38af778ca2440 100644
--- a/libc/test/src/__support/wcs_to_integer_test.cpp
+++ b/libc/test/src/__support/wcs_to_integer_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/__support/wcs_to_integer.h"
+#include "src/__support/str_to_integer.h"
 #include <stddef.h>
 
 #include "test/UnitTest/Test.h"
@@ -14,224 +14,226 @@
 // This file is for testing the src_len argument and other internal interface
 // features. Primary testing is done through the public interface.
 
-TEST(LlvmLibcStrToIntegerTest, SimpleLength) {
-  auto result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"12345", 10, 10);
+TEST(LlvmLibcWcsToIntegerTest, SimpleLength) {
+  auto result = LIBC_NAMESPACE::internal::strtointeger<int>(L"12345", 10, 10);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(5));
   ASSERT_EQ(result.value, 12345);
 
-  result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"12345", 10, 2);
+  result = LIBC_NAMESPACE::internal::strtointeger<int>(L"12345", 10, 2);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(2));
   ASSERT_EQ(result.value, 12);
 
-  result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"12345", 10, 0);
+  result = LIBC_NAMESPACE::internal::strtointeger<int>(L"12345", 10, 0);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(0));
   ASSERT_EQ(result.value, 0);
 }
 
-TEST(LlvmLibcStrToIntegerTest, LeadingSpaces) {
+TEST(LlvmLibcWcsToIntegerTest, LeadingSpaces) {
   auto result =
-      LIBC_NAMESPACE::internal::wcstointeger<int>(L"     12345", 10, 15);
+      LIBC_NAMESPACE::internal::strtointeger<int>(L"     12345", 10, 15);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(10));
   ASSERT_EQ(result.value, 12345);
 
-  result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"     12345", 10, 10);
+  result = LIBC_NAMESPACE::internal::strtointeger<int>(L"     12345", 10, 10);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(10));
   ASSERT_EQ(result.value, 12345);
 
-  result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"     12345", 10, 7);
+  result = LIBC_NAMESPACE::internal::strtointeger<int>(L"     12345", 10, 7);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(7));
   ASSERT_EQ(result.value, 12);
 
-  result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"     12345", 10, 5);
+  // Use a non-null-terminated buffer to test for possible OOB access.
+  wchar_t buf[5] = {L' ', L' ', L' ', L' ', L' '};
+  result = LIBC_NAMESPACE::internal::strtointeger<int>(buf, 10, 5);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(0));
   ASSERT_EQ(result.value, 0);
 
-  result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"     12345", 10, 0);
+  result = LIBC_NAMESPACE::internal::strtointeger<int>(buf, 10, 0);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(0));
   ASSERT_EQ(result.value, 0);
 }
 
-TEST(LlvmLibcStrToIntegerTest, LeadingSign) {
-  auto result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"+12345", 10, 10);
+TEST(LlvmLibcWcsToIntegerTest, LeadingSign) {
+  auto result = LIBC_NAMESPACE::internal::strtointeger<int>(L"+12345", 10, 10);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(6));
   ASSERT_EQ(result.value, 12345);
 
-  result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"-12345", 10, 10);
+  result = LIBC_NAMESPACE::internal::strtointeger<int>(L"-12345", 10, 10);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(6));
   ASSERT_EQ(result.value, -12345);
 
-  result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"+12345", 10, 6);
+  result = LIBC_NAMESPACE::internal::strtointeger<int>(L"+12345", 10, 6);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(6));
   ASSERT_EQ(result.value, 12345);
 
-  result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"-12345", 10, 6);
+  result = LIBC_NAMESPACE::internal::strtointeger<int>(L"-12345", 10, 6);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(6));
   ASSERT_EQ(result.value, -12345);
 
-  result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"+12345", 10, 3);
+  result = LIBC_NAMESPACE::internal::strtointeger<int>(L"+12345", 10, 3);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(3));
   ASSERT_EQ(result.value, 12);
 
-  result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"-12345", 10, 3);
+  result = LIBC_NAMESPACE::internal::strtointeger<int>(L"-12345", 10, 3);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(3));
   ASSERT_EQ(result.value, -12);
 
-  result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"+12345", 10, 1);
+  result = LIBC_NAMESPACE::internal::strtointeger<int>(L"+12345", 10, 1);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(0));
   ASSERT_EQ(result.value, 0);
 
-  result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"-12345", 10, 1);
+  result = LIBC_NAMESPACE::internal::strtointeger<int>(L"-12345", 10, 1);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(0));
   ASSERT_EQ(result.value, 0);
 
-  result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"+12345", 10, 0);
+  result = LIBC_NAMESPACE::internal::strtointeger<int>(L"+12345", 10, 0);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(0));
   ASSERT_EQ(result.value, 0);
 
-  result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"-12345", 10, 0);
+  result = LIBC_NAMESPACE::internal::strtointeger<int>(L"-12345", 10, 0);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(0));
   ASSERT_EQ(result.value, 0);
 }
 
-TEST(LlvmLibcStrToIntegerTest, Base16PrefixAutoSelect) {
-  auto result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 0, 10);
+TEST(LlvmLibcWcsToIntegerTest, Base16PrefixAutoSelect) {
+  auto result = LIBC_NAMESPACE::internal::strtointeger<int>(L"0x12345", 0, 10);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(7));
   ASSERT_EQ(result.value, 0x12345);
 
-  result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 0, 7);
+  result = LIBC_NAMESPACE::internal::strtointeger<int>(L"0x12345", 0, 7);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(7));
   ASSERT_EQ(result.value, 0x12345);
 
-  result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 0, 5);
+  result = LIBC_NAMESPACE::internal::strtointeger<int>(L"0x12345", 0, 5);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(5));
   ASSERT_EQ(result.value, 0x123);
 
-  result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 0, 2);
+  result = LIBC_NAMESPACE::internal::strtointeger<int>(L"0x12345", 0, 2);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(1));
   ASSERT_EQ(result.value, 0);
 
-  result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 0, 0);
+  result = LIBC_NAMESPACE::internal::strtointeger<int>(L"0x12345", 0, 0);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(0));
   ASSERT_EQ(result.value, 0);
 }
 
-TEST(LlvmLibcStrToIntegerTest, Base16PrefixManualSelect) {
-  auto result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 16, 10);
+TEST(LlvmLibcWcsToIntegerTest, Base16PrefixManualSelect) {
+  auto result = LIBC_NAMESPACE::internal::strtointeger<int>(L"0x12345", 16, 10);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(7));
   ASSERT_EQ(result.value, 0x12345);
 
-  result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 16, 7);
+  result = LIBC_NAMESPACE::internal::strtointeger<int>(L"0x12345", 16, 7);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(7));
   ASSERT_EQ(result.value, 0x12345);
 
-  result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 16, 5);
+  result = LIBC_NAMESPACE::internal::strtointeger<int>(L"0x12345", 16, 5);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(5));
   ASSERT_EQ(result.value, 0x123);
 
-  result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 16, 2);
+  result = LIBC_NAMESPACE::internal::strtointeger<int>(L"0x12345", 16, 2);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(1));
   ASSERT_EQ(result.value, 0);
 
-  result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 16, 0);
+  result = LIBC_NAMESPACE::internal::strtointeger<int>(L"0x12345", 16, 0);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(0));
   ASSERT_EQ(result.value, 0);
 }
 
-TEST(LlvmLibcStrToIntegerTest, Base8PrefixAutoSelect) {
-  auto result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 0, 10);
+TEST(LlvmLibcWcsToIntegerTest, Base8PrefixAutoSelect) {
+  auto result = LIBC_NAMESPACE::internal::strtointeger<int>(L"012345", 0, 10);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(6));
   ASSERT_EQ(result.value, 012345);
 
-  result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 0, 6);
+  result = LIBC_NAMESPACE::internal::strtointeger<int>(L"012345", 0, 6);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(6));
   ASSERT_EQ(result.value, 012345);
 
-  result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 0, 4);
+  result = LIBC_NAMESPACE::internal::strtointeger<int>(L"012345", 0, 4);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(4));
   ASSERT_EQ(result.value, 0123);
 
-  result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 0, 1);
+  result = LIBC_NAMESPACE::internal::strtointeger<int>(L"012345", 0, 1);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(1));
   ASSERT_EQ(result.value, 0);
 
-  result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 0, 0);
+  result = LIBC_NAMESPACE::internal::strtointeger<int>(L"012345", 0, 0);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(0));
   ASSERT_EQ(result.value, 0);
 }
 
-TEST(LlvmLibcStrToIntegerTest, Base8PrefixManualSelect) {
-  auto result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 8, 10);
+TEST(LlvmLibcWcsToIntegerTest, Base8PrefixManualSelect) {
+  auto result = LIBC_NAMESPACE::internal::strtointeger<int>(L"012345", 8, 10);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(6));
   ASSERT_EQ(result.value, 012345);
 
-  result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 8, 6);
+  result = LIBC_NAMESPACE::internal::strtointeger<int>(L"012345", 8, 6);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(6));
   ASSERT_EQ(result.value, 012345);
 
-  result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 8, 4);
+  result = LIBC_NAMESPACE::internal::strtointeger<int>(L"012345", 8, 4);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(4));
   ASSERT_EQ(result.value, 0123);
 
-  result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 8, 1);
+  result = LIBC_NAMESPACE::internal::strtointeger<int>(L"012345", 8, 1);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(1));
   ASSERT_EQ(result.value, 0);
 
-  result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 8, 0);
+  result = LIBC_NAMESPACE::internal::strtointeger<int>(L"012345", 8, 0);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(0));
   ASSERT_EQ(result.value, 0);
 }
 
-TEST(LlvmLibcStrToIntegerTest, CombinedTests) {
+TEST(LlvmLibcWcsToIntegerTest, CombinedTests) {
   auto result =
-      LIBC_NAMESPACE::internal::wcstointeger<int>(L"    -0x123", 0, 10);
+      LIBC_NAMESPACE::internal::strtointeger<int>(L"    -0x123", 0, 10);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(10));
   ASSERT_EQ(result.value, -0x123);
 
-  result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"    -0x123", 0, 8);
+  result = LIBC_NAMESPACE::internal::strtointeger<int>(L"    -0x123", 0, 8);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(8));
   ASSERT_EQ(result.value, -0x1);
 
-  result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"    -0x123", 0, 7);
+  result = LIBC_NAMESPACE::internal::strtointeger<int>(L"    -0x123", 0, 7);
   EXPECT_FALSE(result.has_error());
   EXPECT_EQ(result.parsed_len, ptrdiff_t(6));
   ASSERT_EQ(result.value, 0);
diff --git a/libc/test/src/time/strftime_test.cpp b/libc/test/src/time/strftime_test.cpp
index cac7560b2b945..5222152791905 100644
--- a/libc/test/src/time/strftime_test.cpp
+++ b/libc/test/src/time/strftime_test.cpp
@@ -2326,3 +2326,24 @@ TEST(LlvmLibcStrftimeTest, TimeFormatFullDateTime) {
 //    size_t written = 0;
 //    SimplePaddedNum spn;
 //  }
+
+TEST(LlvmLibcStrftimeTest, BufferTooSmall) {
+  struct tm time;
+  char tiny_buffer[1];
+
+  time.tm_year = get_adjusted_year(2025);
+  time.tm_mon = 10;
+  time.tm_mday = 24;
+
+  size_t written =
+      LIBC_NAMESPACE::strftime(tiny_buffer, sizeof(tiny_buffer), "%F", &time);
+  EXPECT_EQ(written, size_t{0});
+
+  char small_buffer[10];
+
+  // The string "2025-11-24" is 10 chars,
+  // so strftime needs 10 + 1 bytes to write the string and the null terminator.
+  written =
+      LIBC_NAMESPACE::strftime(small_buffer, sizeof(small_buffer), "%F", &time);
+  EXPECT_EQ(written, size_t{0});
+}
diff --git a/libc/utils/hdrgen/hdrgen/enumeration.py b/libc/utils/hdrgen/hdrgen/enumeration.py
index 198720826720c..1e0f64aec1eda 100644
--- a/libc/utils/hdrgen/hdrgen/enumeration.py
+++ b/libc/utils/hdrgen/hdrgen/enumeration.py
@@ -6,24 +6,14 @@
 #
 # ==-------------------------------------------------------------------------==#
 
-from functools import total_ordering
+from hdrgen.symbol import Symbol
 
 
-@total_ordering
-class Enumeration:
+class Enumeration(Symbol):
     def __init__(self, name, value):
-        self.name = name
+        super().__init__(name)
         self.value = value
 
-    def __eq__(self, other):
-        return self.name == other.name
-
-    def __lt__(self, other):
-        return self.name < other.name
-
-    def __hash__(self):
-        return self.name.__hash__()
-
     def __str__(self):
         if self.value != None:
             return f"{self.name} = {self.value}"
diff --git a/libc/utils/hdrgen/hdrgen/function.py b/libc/utils/hdrgen/hdrgen/function.py
index f039996584e31..4de3406cc408e 100644
--- a/libc/utils/hdrgen/hdrgen/function.py
+++ b/libc/utils/hdrgen/hdrgen/function.py
@@ -7,7 +7,7 @@
 # ==-------------------------------------------------------------------------==#
 
 import re
-from functools import total_ordering
+from hdrgen.symbol import Symbol
 from hdrgen.type import Type
 
 
@@ -37,14 +37,13 @@
 NONIDENTIFIER = re.compile("[^a-zA-Z0-9_]+")
 
 
-@total_ordering
-class Function:
+class Function(Symbol):
     def __init__(
         self, return_type, name, arguments, standards, guard=None, attributes=[]
     ):
+        super().__init__(name)
         assert return_type
         self.return_type = return_type
-        self.name = name
         self.arguments = [
             arg if isinstance(arg, str) else arg["type"] for arg in arguments
         ]
@@ -53,15 +52,6 @@ def __init__(
         self.guard = guard
         self.attributes = attributes or []
 
-    def __eq__(self, other):
-        return self.name == other.name
-
-    def __lt__(self, other):
-        return self.name < other.name
-
-    def __hash__(self):
-        return self.name.__hash__()
-
     def signature_types(self):
         def collapse(type_string):
             assert type_string
diff --git a/libc/utils/hdrgen/hdrgen/header.py b/libc/utils/hdrgen/hdrgen/header.py
index 2118db6e5fb75..f592327f06ad6 100644
--- a/libc/utils/hdrgen/hdrgen/header.py
+++ b/libc/utils/hdrgen/hdrgen/header.py
@@ -35,6 +35,13 @@
 
 COMMON_HEADER = PurePosixPath("__llvm-libc-common.h")
 
+# These "attributes" are known macros defined in COMMON_HEADER.
+# Others are found in "llvm-libc-macros/{name}.h".
+COMMON_ATTRIBUTES = {
+    "_Noreturn",
+    "_Returns_twice",
+}
+
 # All the canonical identifiers are in lowercase for easy maintenance.
 # This maps them to the pretty descriptions to generate in header comments.
 LIBRARY_DESCRIPTIONS = {
@@ -50,9 +57,7 @@
 HEADER_TEMPLATE = """\
 //===-- {library} header <{header}> --===//
 //
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+{license_lines}
 //
 //===---------------------------------------------------------------------===//
 
@@ -64,6 +69,12 @@
 #endif // {guard}
 """
 
+LLVM_LICENSE_TEXT = [
+    "Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.",
+    "See https://llvm.org/LICENSE.txt for license information.",
+    "SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception",
+]
+
 
 class HeaderFile:
     def __init__(self, name):
@@ -74,8 +85,10 @@ def __init__(self, name):
         self.enumerations = []
         self.objects = []
         self.functions = []
+        self.extra_standards = {}
         self.standards = []
         self.merge_yaml_files = []
+        self.license_text = []
 
     def add_macro(self, macro):
         self.macros.append(macro)
@@ -98,6 +111,11 @@ def merge(self, other):
         self.enumerations = sorted(set(self.enumerations) | set(other.enumerations))
         self.objects = sorted(set(self.objects) | set(other.objects))
         self.functions = sorted(set(self.functions) | set(other.functions))
+        self.extra_standards |= other.extra_standards
+        if self.license_text:
+            assert not other.license_text, "only one `license_text` allowed"
+        else:
+            self.license_text = other.license_text
 
     def all_types(self):
         return reduce(
@@ -106,6 +124,13 @@ def all_types(self):
             set(self.types),
         )
 
+    def all_attributes(self):
+        return reduce(
+            lambda a, b: a | b,
+            [set(f.attributes) for f in self.functions],
+            set(),
+        )
+
     def all_standards(self):
         # FIXME: Only functions have the "standard" field, but all the entity
         # types should have one too.
@@ -114,16 +139,24 @@ def all_standards(self):
         )
 
     def includes(self):
-        return {
-            PurePosixPath("llvm-libc-macros") / macro.header
-            for macro in self.macros
-            if macro.header is not None
-        } | {
-            COMPILER_HEADER_TYPES.get(
-                typ.type_name, PurePosixPath("llvm-libc-types") / f"{typ.type_name}.h"
-            )
-            for typ in self.all_types()
-        }
+        return (
+            {
+                PurePosixPath("llvm-libc-macros") / macro.header
+                for macro in self.macros
+                if macro.header is not None
+            }
+            | {
+                COMPILER_HEADER_TYPES.get(
+                    typ.name,
+                    PurePosixPath("llvm-libc-types") / f"{typ.name}.h",
+                )
+                for typ in self.all_types()
+            }
+            | {
+                PurePosixPath("llvm-libc-macros") / f"{attr}.h"
+                for attr in self.all_attributes() - COMMON_ATTRIBUTES
+            }
+        )
 
     def header_guard(self):
         return "_LLVM_LIBC_" + "_".join(
@@ -131,24 +164,29 @@ def header_guard(self):
         )
 
     def library_description(self):
+        descriptions = LIBRARY_DESCRIPTIONS | self.extra_standards
         # If the header itself is in standard C, just call it that.
         if "stdc" in self.standards:
-            return LIBRARY_DESCRIPTIONS["stdc"]
+            return descriptions["stdc"]
         # If the header itself is in POSIX, just call it that.
         if "posix" in self.standards:
-            return LIBRARY_DESCRIPTIONS["posix"]
+            return descriptions["posix"]
         # Otherwise, consider the standards for each symbol as well.
         standards = self.all_standards()
         # Otherwise, it's described by all those that apply, but ignoring
         # "stdc" and "posix" since this is not a "stdc" or "posix" header.
         return " / ".join(
             sorted(
-                LIBRARY_DESCRIPTIONS[standard]
+                descriptions[standard]
                 for standard in standards
                 if standard not in {"stdc", "posix"}
             )
         )
 
+    def license_lines(self):
+        lines = self.license_text or LLVM_LICENSE_TEXT
+        return "\n".join([f"// {line}" for line in lines])
+
     def template(self, dir, files_read):
         if self.template_file is not None:
             # There's a custom template file, so just read it in and record
@@ -162,6 +200,7 @@ def template(self, dir, files_read):
             library=self.library_description(),
             header=self.name,
             guard=self.header_guard(),
+            license_lines=self.license_lines(),
         )
 
     def public_api(self):
@@ -188,7 +227,7 @@ def relpath(file):
             )
         ]
 
-        for macro in self.macros:
+        for macro in sorted(self.macros):
             # When there is nothing to define, the Macro object converts to str
             # as an empty string.  Don't emit a blank line for those cases.
             if str(macro):
@@ -203,7 +242,12 @@ def relpath(file):
         content.append("\n__BEGIN_C_DECLS\n")
 
         current_guard = None
-        for function in self.functions:
+        last_name = None
+        for function in sorted(self.functions):
+            # If the last function's name was the same after underscores,
+            # elide the blank line between the declarations.
+            if last_name == function.name_without_underscores():
+                content.pop()
             if function.guard == None and current_guard == None:
                 content.append(str(function) + " __NOEXCEPT;")
                 content.append("")
@@ -225,6 +269,7 @@ def relpath(file):
                         content.append(f"#ifdef {current_guard}")
                     content.append(str(function) + " __NOEXCEPT;")
                     content.append("")
+            last_name = function.name_without_underscores()
         if current_guard != None:
             content.pop()
             content.append(f"#endif // {current_guard}")
@@ -241,7 +286,5 @@ def json_data(self):
         return {
             "name": self.name,
             "standards": self.standards,
-            "includes": [
-                str(file) for file in sorted({COMMON_HEADER} | self.includes())
-            ],
+            "includes": sorted(str(file) for file in {COMMON_HEADER} | self.includes()),
         }
diff --git a/libc/utils/hdrgen/hdrgen/macro.py b/libc/utils/hdrgen/hdrgen/macro.py
index e42e82845694d..4664d9fb00494 100644
--- a/libc/utils/hdrgen/hdrgen/macro.py
+++ b/libc/utils/hdrgen/hdrgen/macro.py
@@ -6,25 +6,15 @@
 #
 # ==-------------------------------------------------------------------------==#
 
-from functools import total_ordering
+from hdrgen.symbol import Symbol
 
 
-@total_ordering
-class Macro:
+class Macro(Symbol):
     def __init__(self, name, value=None, header=None):
-        self.name = name
+        super().__init__(name)
         self.value = value
         self.header = header
 
-    def __eq__(self, other):
-        return self.name == other.name
-
-    def __lt__(self, other):
-        return self.name < other.name
-
-    def __hash__(self):
-        return self.name.__hash__()
-
     def __str__(self):
         if self.header != None:
             return ""
diff --git a/libc/utils/hdrgen/hdrgen/main.py b/libc/utils/hdrgen/hdrgen/main.py
index 25df41e506a1f..c12e89ef771d1 100755
--- a/libc/utils/hdrgen/hdrgen/main.py
+++ b/libc/utils/hdrgen/hdrgen/main.py
@@ -105,6 +105,7 @@ def merge_from(paths):
                 return 2
             header.merge(merge_from_header)
 
+        assert header.name, f"`header: name.h` line is required in {yaml_file}"
         return header
 
     if args.json:
diff --git a/libc/utils/hdrgen/hdrgen/object.py b/libc/utils/hdrgen/hdrgen/object.py
index a311c37168d60..a2ab496bed013 100644
--- a/libc/utils/hdrgen/hdrgen/object.py
+++ b/libc/utils/hdrgen/hdrgen/object.py
@@ -6,23 +6,13 @@
 #
 # ==-------------------------------------------------------------------------==#
 
-from functools import total_ordering
+from hdrgen.symbol import Symbol
 
 
-@total_ordering
-class Object:
+class Object(Symbol):
     def __init__(self, name, type):
-        self.name = name
+        super().__init__(name)
         self.type = type
 
-    def __eq__(self, other):
-        return self.name == other.name
-
-    def __lt__(self, other):
-        return self.name < other.name
-
-    def __hash__(self):
-        return self.name.__hash__()
-
     def __str__(self):
         return f"extern {self.type} {self.name};"
diff --git a/libc/utils/hdrgen/hdrgen/symbol.py b/libc/utils/hdrgen/hdrgen/symbol.py
new file mode 100644
index 0000000000000..28e9def128e47
--- /dev/null
+++ b/libc/utils/hdrgen/hdrgen/symbol.py
@@ -0,0 +1,41 @@
+# ====-- Symbol class for libc function headers----------------*- python -*--==#
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# ==-------------------------------------------------------------------------==#
+
+from functools import total_ordering
+
+
+@total_ordering
+class Symbol:
+    """
+    Symbol is the common superclass for each kind of entity named by an
+    identifier.  It provides the name field, and defines sort ordering,
+    hashing, and equality based only on the name.  The sorting is pretty
+    presentation order for identifiers, which is to say it first sorts
+    lexically but ignores leading underscores and secondarily sorts with the
+    fewest underscores first.
+    """
+
+    def __init__(self, name):
+        assert name
+        self.name = name
+
+    def __eq__(self, other):
+        return self.name == other.name
+
+    def __hash__(self):
+        return self.name.__hash__()
+
+    def name_without_underscores(self):
+        return self.name.lstrip("_")
+
+    def name_sort_key(self):
+        ident = self.name_without_underscores()
+        return ident, len(self.name) - len(ident)
+
+    def __lt__(self, other):
+        return self.name_sort_key() < other.name_sort_key()
diff --git a/libc/utils/hdrgen/hdrgen/type.py b/libc/utils/hdrgen/hdrgen/type.py
index 0c0af8569c61e..20c1881a9379a 100644
--- a/libc/utils/hdrgen/hdrgen/type.py
+++ b/libc/utils/hdrgen/hdrgen/type.py
@@ -6,20 +6,10 @@
 #
 # ==-------------------------------------------------------------------------==#
 
-from functools import total_ordering
+from hdrgen.symbol import Symbol
 
 
-@total_ordering
-class Type:
-    def __init__(self, type_name):
-        assert type_name
-        self.type_name = type_name
-
-    def __eq__(self, other):
-        return self.type_name == other.type_name
-
-    def __lt__(self, other):
-        return self.type_name < other.type_name
-
-    def __hash__(self):
-        return self.type_name.__hash__()
+class Type(Symbol):
+    # A type so far carries no specific information beyond its name.
+    def __init__(self, name):
+        super().__init__(name)
diff --git a/libc/utils/hdrgen/hdrgen/yaml_to_classes.py b/libc/utils/hdrgen/hdrgen/yaml_to_classes.py
index ebe7781d449f7..9eddbe615cbba 100644
--- a/libc/utils/hdrgen/hdrgen/yaml_to_classes.py
+++ b/libc/utils/hdrgen/hdrgen/yaml_to_classes.py
@@ -37,6 +37,8 @@ def yaml_to_classes(yaml_data, header_class, entry_points=None):
     header = header_class(header_name)
     header.template_file = yaml_data.get("header_template")
     header.standards = yaml_data.get("standards", [])
+    header.extra_standards = yaml_data.get("extra_standards", {})
+    header.license_text = yaml_data.get("license_text", [])
     header.merge_yaml_files = yaml_data.get("merge_yaml_files", [])
 
     for macro_data in yaml_data.get("macros", []):
diff --git a/libc/utils/hdrgen/tests/expected_output/custom.h b/libc/utils/hdrgen/tests/expected_output/custom.h
new file mode 100644
index 0000000000000..5f9ed231490fd
--- /dev/null
+++ b/libc/utils/hdrgen/tests/expected_output/custom.h
@@ -0,0 +1,21 @@
+//===-- Wile E. Coyote header <custom.h> --===//
+//
+// Caveat emptor.
+// I never studied law.
+//
+//===---------------------------------------------------------------------===//
+
+#ifndef _LLVM_LIBC_CUSTOM_H
+#define _LLVM_LIBC_CUSTOM_H
+
+#include "__llvm-libc-common.h"
+#include "llvm-libc-types/meep.h"
+#include "llvm-libc-types/road.h"
+
+__BEGIN_C_DECLS
+
+road runner(meep, meep) __NOEXCEPT;
+
+__END_C_DECLS
+
+#endif // _LLVM_LIBC_CUSTOM_H
diff --git a/libc/utils/hdrgen/tests/expected_output/sorting.h b/libc/utils/hdrgen/tests/expected_output/sorting.h
new file mode 100644
index 0000000000000..a091a421b2c3f
--- /dev/null
+++ b/libc/utils/hdrgen/tests/expected_output/sorting.h
@@ -0,0 +1,24 @@
+//===-- Standard C header <sorting.h> --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===---------------------------------------------------------------------===//
+
+#ifndef _LLVM_LIBC_SORTING_H
+#define _LLVM_LIBC_SORTING_H
+
+#include "__llvm-libc-common.h"
+
+__BEGIN_C_DECLS
+
+void func_with_aliases(int) __NOEXCEPT;
+void _func_with_aliases(int) __NOEXCEPT;
+void __func_with_aliases(int) __NOEXCEPT;
+
+void gunk(const char *) __NOEXCEPT;
+
+__END_C_DECLS
+
+#endif // _LLVM_LIBC_SORTING_H
diff --git a/libc/utils/hdrgen/tests/expected_output/test_header.h b/libc/utils/hdrgen/tests/expected_output/test_header.h
index 748c09808c128..49112a353f7b6 100644
--- a/libc/utils/hdrgen/tests/expected_output/test_header.h
+++ b/libc/utils/hdrgen/tests/expected_output/test_header.h
@@ -12,6 +12,7 @@
 #include "__llvm-libc-common.h"
 #include "llvm-libc-macros/float16-macros.h"
 
+#include "llvm-libc-macros/CONST_FUNC_A.h"
 #include "llvm-libc-macros/test_more-macros.h"
 #include "llvm-libc-macros/test_small-macros.h"
 #include "llvm-libc-types/float128.h"
diff --git a/libc/utils/hdrgen/tests/expected_output/test_small.json b/libc/utils/hdrgen/tests/expected_output/test_small.json
index 9cc73d013a679..8502df23b9a41 100644
--- a/libc/utils/hdrgen/tests/expected_output/test_small.json
+++ b/libc/utils/hdrgen/tests/expected_output/test_small.json
@@ -4,6 +4,7 @@
     "standards": [],
     "includes": [
       "__llvm-libc-common.h",
+      "llvm-libc-macros/CONST_FUNC_A.h",
       "llvm-libc-macros/test_more-macros.h",
       "llvm-libc-macros/test_small-macros.h",
       "llvm-libc-types/float128.h",
diff --git a/libc/utils/hdrgen/tests/input/custom-common.yaml b/libc/utils/hdrgen/tests/input/custom-common.yaml
new file mode 100644
index 0000000000000..909a3ba5163a5
--- /dev/null
+++ b/libc/utils/hdrgen/tests/input/custom-common.yaml
@@ -0,0 +1,6 @@
+license_text:
+  - Caveat emptor.
+  - I never studied law.
+
+extra_standards:
+  acme: Wile E. Coyote
diff --git a/libc/utils/hdrgen/tests/input/custom.yaml b/libc/utils/hdrgen/tests/input/custom.yaml
new file mode 100644
index 0000000000000..7d3ff8ec421dd
--- /dev/null
+++ b/libc/utils/hdrgen/tests/input/custom.yaml
@@ -0,0 +1,13 @@
+merge_yaml_files:
+  - custom-common.yaml
+
+header: custom.h
+standards:
+  - acme
+
+functions:
+  - name: runner
+    return_type: road
+    arguments:
+      - type: meep
+      - type: meep
diff --git a/libc/utils/hdrgen/tests/input/sorting.yaml b/libc/utils/hdrgen/tests/input/sorting.yaml
new file mode 100644
index 0000000000000..3c26cde9e6c41
--- /dev/null
+++ b/libc/utils/hdrgen/tests/input/sorting.yaml
@@ -0,0 +1,20 @@
+header: sorting.h
+standards:
+  - stdc
+functions:
+  - name: gunk
+    return_type: void
+    arguments:
+      - type: const char *
+  - name: _func_with_aliases
+    return_type: void
+    arguments:
+      - type: int
+  - name: func_with_aliases
+    return_type: void
+    arguments:
+      - type: int
+  - name: __func_with_aliases
+    return_type: void
+    arguments:
+      - type: int
diff --git a/libc/utils/hdrgen/tests/test_integration.py b/libc/utils/hdrgen/tests/test_integration.py
index bf393d26a8101..b975d8ff007b1 100644
--- a/libc/utils/hdrgen/tests/test_integration.py
+++ b/libc/utils/hdrgen/tests/test_integration.py
@@ -59,6 +59,13 @@ def test_generate_subdir_header(self):
         self.run_script(yaml_file, output_file)
         self.compare_files(output_file, expected_output_file)
 
+    def test_custom_license_and_standards(self):
+        yaml_file = self.source_dir / "input" / "custom.yaml"
+        expected_output_file = self.source_dir / "expected_output" / "custom.h"
+        output_file = self.output_dir / "custom.h"
+        self.run_script(yaml_file, output_file)
+        self.compare_files(output_file, expected_output_file)
+
     def test_generate_json(self):
         yaml_file = self.source_dir / "input/test_small.yaml"
         expected_output_file = self.source_dir / "expected_output/test_small.json"
@@ -68,6 +75,13 @@ def test_generate_json(self):
 
         self.compare_files(output_file, expected_output_file)
 
+    def test_sorting(self):
+        yaml_file = self.source_dir / "input" / "sorting.yaml"
+        expected_output_file = self.source_dir / "expected_output" / "sorting.h"
+        output_file = self.output_dir / "sorting.h"
+        self.run_script(yaml_file, output_file)
+        self.compare_files(output_file, expected_output_file)
+
 
 def main():
     parser = argparse.ArgumentParser(description="TestHeaderGenIntegration arguments")
diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst
index 8fba6db871f08..d5ed9188b1b23 100644
--- a/libcxx/docs/FeatureTestMacroTable.rst
+++ b/libcxx/docs/FeatureTestMacroTable.rst
@@ -426,6 +426,10 @@ Status
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_constexpr_algorithms``                         ``202306L``
     ---------------------------------------------------------- -----------------
+    ``__cpp_lib_constexpr_flat_map``                           ``202502L``
+    ---------------------------------------------------------- -----------------
+    ``__cpp_lib_constexpr_flat_set``                           ``202502L``
+    ---------------------------------------------------------- -----------------
     ``__cpp_lib_constexpr_forward_list``                       ``202502L``
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_constexpr_list``                               ``202502L``
@@ -474,7 +478,7 @@ Status
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_is_virtual_base_of``                           ``202406L``
     ---------------------------------------------------------- -----------------
-    ``__cpp_lib_is_within_lifetime``                           *unimplemented*
+    ``__cpp_lib_is_within_lifetime``                           ``202306L``
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_linalg``                                       *unimplemented*
     ---------------------------------------------------------- -----------------
diff --git a/libcxx/docs/ReleaseNotes/22.rst b/libcxx/docs/ReleaseNotes/22.rst
index 25d33a9c2eb50..58e0ee9993065 100644
--- a/libcxx/docs/ReleaseNotes/22.rst
+++ b/libcxx/docs/ReleaseNotes/22.rst
@@ -43,6 +43,8 @@ Implemented Papers
 - P3044R2: sub-``string_view`` from ``string`` (`Github <https://llvm.org/PR148140>`__)
 - P3223R2: Making ``std::istream::ignore`` less surprising (`Github <https://llvm.org/PR148178>`__)
 - P3060R3: Add ``std::views::indices(n)`` (`Github <https://llvm.org/PR148175>`__)
+- P2641R4: Checking if a ``union`` alternative is active (``std::is_within_lifetime``)
+  (`Github <https://llvm.org/PR105381>`__)
 - P2835R7: Expose ``std::atomic_ref``'s object address (`Github <https://llvm.org/PR118377>`__)
 - P2944R3: Comparisons for ``reference_wrapper`` (`Github <https://llvm.org/PR105424>`__)
 - P3168R2: Give ``std::optional`` Range Support (`Github <https://llvm.org/PR105430>`__)
@@ -76,8 +78,9 @@ Improvements and New Features
 - The ``std::{fill, fill_n}`` and ``std::ranges::{fill, fill_n}`` algorithms have been optimized for segmented iterators,
   resulting in a performance improvement of at least 10x for ``std::deque<int>`` iterators and
   ``std::join_view<std::vector<std::vector<int>>>`` iterators.
-- The ``std::generate`` and ``std::generate_n`` algorithms have been optimized for segmented iterators, resulting in a
-  performance improvement for ``std::deque<short>`` and ``std::join_view<vector<vector<short>>>`` iterators.
+- The ``std::{generate, generate_n}`` and ``std::ranges::generate_n`` algorithms have been optimized for segmented
+  iterators, resulting in a performance improvement for ``std::deque<short>`` and
+  ``std::join_view<vector<vector<short>>>`` iterators.
 
 Deprecations and Removals
 -------------------------
diff --git a/libcxx/docs/Status/Cxx2cPapers.csv b/libcxx/docs/Status/Cxx2cPapers.csv
index a5423acf0d419..e0e47b864d38f 100644
--- a/libcxx/docs/Status/Cxx2cPapers.csv
+++ b/libcxx/docs/Status/Cxx2cPapers.csv
@@ -18,7 +18,7 @@
 "`P2874R2 <https://wg21.link/P2874R2>`__","P2874R2: Mandating Annex D Require No More","2023-06 (Varna)","|Complete|","12","`#105377 <https://github.com/llvm/llvm-project/issues/105377>`__",""
 "`P2757R3 <https://wg21.link/P2757R3>`__","Type-checking format args","2023-06 (Varna)","","","`#105378 <https://github.com/llvm/llvm-project/issues/105378>`__",""
 "`P2637R3 <https://wg21.link/P2637R3>`__","Member ``visit``","2023-06 (Varna)","|Complete|","19","`#105380 <https://github.com/llvm/llvm-project/issues/105380>`__","Change of ``__cpp_lib_variant`` is completed in LLVM 20. Change of ``__cpp_lib_format`` is blocked by `P2419R2 <https://wg21.link/P2419R2>`__."
-"`P2641R4 <https://wg21.link/P2641R4>`__","Checking if a ``union`` alternative is active","2023-06 (Varna)","","","`#105381 <https://github.com/llvm/llvm-project/issues/105381>`__",""
+"`P2641R4 <https://wg21.link/P2641R4>`__","Checking if a ``union`` alternative is active","2023-06 (Varna)","|Complete|","22","`#105381 <https://github.com/llvm/llvm-project/issues/105381>`__",""
 "`P1759R6 <https://wg21.link/P1759R6>`__","Native handles and file streams","2023-06 (Varna)","|Complete|","18","`#105382 <https://github.com/llvm/llvm-project/issues/105382>`__",""
 "`P2697R1 <https://wg21.link/P2697R1>`__","Interfacing ``bitset`` with ``string_view``","2023-06 (Varna)","|Complete|","18","`#105384 <https://github.com/llvm/llvm-project/issues/105384>`__",""
 "`P1383R2 <https://wg21.link/P1383R2>`__","More ``constexpr`` for ``<cmath>`` and ``<complex>``","2023-06 (Varna)","","","`#105385 <https://github.com/llvm/llvm-project/issues/105385>`__",""
diff --git a/libcxx/docs/index.rst b/libcxx/docs/index.rst
index 495ccceb31cef..03dfb9d41aa1a 100644
--- a/libcxx/docs/index.rst
+++ b/libcxx/docs/index.rst
@@ -132,7 +132,7 @@ velocity, libc++ drops support for older compilers as newer ones are released.
 ============ =================== ========================== =====================
 Compiler     Versions            Restrictions               Support policy
 ============ =================== ========================== =====================
-Clang        19, 20, 21-git                                 latest two stable releases per `LLVM's release page <https://releases.llvm.org>`_ and the development version
+Clang        20, 21, 22-git                                 latest two stable releases per `LLVM's release page <https://releases.llvm.org>`_ and the development version
 AppleClang   26.0                                           latest stable release per `Xcode's release page <https://developer.apple.com/documentation/xcode-release-notes>`_
 Open XL      17.1.3 (AIX)                                   latest stable release per `Open XL's documentation page <https://www.ibm.com/docs/en/openxl-c-and-cpp-aix>`_
 GCC          15                  In C++11 or later only     latest stable release per `GCC's release page <https://gcc.gnu.org/releases.html>`_
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 37259a7e6e7dd..57032ce26d4fd 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -529,6 +529,7 @@ set(files
   __locale_dir/support/freebsd.h
   __locale_dir/support/fuchsia.h
   __locale_dir/support/linux.h
+  __locale_dir/support/netbsd.h
   __locale_dir/support/no_locale/characters.h
   __locale_dir/support/no_locale/strtonum.h
   __locale_dir/support/windows.h
@@ -877,6 +878,7 @@ set(files
   __type_traits/is_valid_expansion.h
   __type_traits/is_void.h
   __type_traits/is_volatile.h
+  __type_traits/is_within_lifetime.h
   __type_traits/lazy.h
   __type_traits/make_32_64_or_128_bit.h
   __type_traits/make_const_lvalue_ref.h
diff --git a/libcxx/include/__algorithm/generate_n.h b/libcxx/include/__algorithm/generate_n.h
index e9da133f0570a..23899e49e0b65 100644
--- a/libcxx/include/__algorithm/generate_n.h
+++ b/libcxx/include/__algorithm/generate_n.h
@@ -13,22 +13,34 @@
 #include <__config>
 #include <__functional/identity.h>
 #include <__utility/forward.h>
+#include <__utility/move.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
 #endif
 
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
+
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _OutputIterator, class _Size, class _Generator>
 inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator
-generate_n(_OutputIterator __first, _Size __orig_n, _Generator __gen) {
+__generate_n(_OutputIterator __first, _Size __orig_n, _Generator& __gen) {
   using __iter_ref = decltype(*__first);
   __identity __proj;
   auto __f = [&](__iter_ref __element) { std::forward<__iter_ref>(__element) = __gen(); };
-  return std::__for_each_n(__first, __orig_n, __f, __proj);
+  return std::__for_each_n(std::move(__first), __orig_n, __f, __proj);
+}
+
+template <class _OutputIterator, class _Size, class _Generator>
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator
+generate_n(_OutputIterator __first, _Size __orig_n, _Generator __gen) {
+  return std::__generate_n(std::move(__first), __orig_n, __gen);
 }
 
 _LIBCPP_END_NAMESPACE_STD
 
+_LIBCPP_POP_MACROS
+
 #endif // _LIBCPP___ALGORITHM_GENERATE_N_H
diff --git a/libcxx/include/__algorithm/ranges_generate_n.h b/libcxx/include/__algorithm/ranges_generate_n.h
index a318994d0eaf8..0cc9ce7b1193b 100644
--- a/libcxx/include/__algorithm/ranges_generate_n.h
+++ b/libcxx/include/__algorithm/ranges_generate_n.h
@@ -9,6 +9,7 @@
 #ifndef _LIBCPP___ALGORITHM_RANGES_GENERATE_N_H
 #define _LIBCPP___ALGORITHM_RANGES_GENERATE_N_H
 
+#include <__algorithm/generate_n.h>
 #include <__concepts/constructible.h>
 #include <__concepts/invocable.h>
 #include <__config>
@@ -38,12 +39,7 @@ struct __generate_n {
     requires invocable<_Func&> && indirectly_writable<_OutIter, invoke_result_t<_Func&>>
   _LIBCPP_HIDE_FROM_ABI constexpr _OutIter
   operator()(_OutIter __first, iter_difference_t<_OutIter> __n, _Func __gen) const {
-    for (; __n > 0; --__n) {
-      *__first = __gen();
-      ++__first;
-    }
-
-    return __first;
+    return std::__generate_n(std::move(__first), __n, __gen);
   }
 };
 
diff --git a/libcxx/include/__config b/libcxx/include/__config
index b4c081dcdff1b..357f77b7d27d6 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -1050,8 +1050,7 @@ typedef __char32_t char32_t;
 #    define _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(_ClassName) static_assert(true, "")
 #  endif
 
-// TODO(LLVM 22): Remove the workaround
-#  if defined(__OBJC__) && (!defined(_LIBCPP_CLANG_VER) || _LIBCPP_CLANG_VER < 2001)
+#  if defined(__OBJC__) && defined(_LIBCPP_APPLE_CLANG_VER)
 #    define _LIBCPP_WORKAROUND_OBJCXX_COMPILER_INTRINSICS
 #  endif
 
@@ -1255,14 +1254,6 @@ typedef __char32_t char32_t;
 #    define _LIBCPP_DIAGNOSE_NULLPTR
 #  endif
 
-// TODO(LLVM 22): Remove this macro once LLVM19 support ends. __cpp_explicit_this_parameter has been set in LLVM20.
-// Clang-18 has support for deducing this, but it does not set the FTM.
-#  if defined(__cpp_explicit_this_parameter) || (defined(_LIBCPP_CLANG_VER) && _LIBCPP_CLANG_VER >= 1800)
-#    define _LIBCPP_HAS_EXPLICIT_THIS_PARAMETER 1
-#  else
-#    define _LIBCPP_HAS_EXPLICIT_THIS_PARAMETER 0
-#  endif
-
 #endif // __cplusplus
 
 #endif // _LIBCPP___CONFIG
diff --git a/libcxx/include/__configuration/abi.h b/libcxx/include/__configuration/abi.h
index c9936df30ff7f..38b85c6ac70de 100644
--- a/libcxx/include/__configuration/abi.h
+++ b/libcxx/include/__configuration/abi.h
@@ -61,14 +61,6 @@
 // According to the Standard, `bitset::operator[] const` returns bool
 #  define _LIBCPP_ABI_BITSET_VECTOR_BOOL_CONST_SUBSCRIPT_RETURN_BOOL
 
-// In LLVM 20, we've changed to take these ABI breaks unconditionally. These flags only exist in case someone is running
-// into the static_asserts we added to catch the ABI break and don't care that it is one.
-// TODO(LLVM 22): Remove these flags
-#  define _LIBCPP_ABI_LIST_REMOVE_NODE_POINTER_UB
-#  define _LIBCPP_ABI_TREE_REMOVE_NODE_POINTER_UB
-#  define _LIBCPP_ABI_FIX_UNORDERED_NODE_POINTER_UB
-#  define _LIBCPP_ABI_FORWARD_LIST_REMOVE_NODE_POINTER_UB
-
 // These flags are documented in ABIGuarantees.rst
 #  define _LIBCPP_ABI_ALTERNATE_STRING_LAYOUT
 #  define _LIBCPP_ABI_DO_NOT_EXPORT_BASIC_STRING_COMMON
diff --git a/libcxx/include/__configuration/availability.h b/libcxx/include/__configuration/availability.h
index d0414ecfac2bb..5433df872fa39 100644
--- a/libcxx/include/__configuration/availability.h
+++ b/libcxx/include/__configuration/availability.h
@@ -118,14 +118,40 @@
 #  define _LIBCPP_INTRODUCED_IN_LLVM_21_ATTRIBUTE __attribute__((unavailable))
 
 // LLVM 20
-// TODO: Fill this in
-#  define _LIBCPP_INTRODUCED_IN_LLVM_20 0
-#  define _LIBCPP_INTRODUCED_IN_LLVM_20_ATTRIBUTE __attribute__((unavailable))
+//
+// Note that versions for most Apple OSes were bumped forward and aligned in that release.
+#  if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 260000) ||       \
+      (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 260000) ||     \
+      (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 260000) ||             \
+      (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 260000) ||       \
+      (defined(__ENVIRONMENT_BRIDGE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_BRIDGE_OS_VERSION_MIN_REQUIRED__ < 100000)
+#    define _LIBCPP_INTRODUCED_IN_LLVM_20 0
+#  else
+#    define _LIBCPP_INTRODUCED_IN_LLVM_20 1
+#  endif
+#  define _LIBCPP_INTRODUCED_IN_LLVM_20_ATTRIBUTE                                                                 \
+    __attribute__((availability(macos, strict, introduced = 26.0)))                                               \
+    __attribute__((availability(ios, strict, introduced = 26.0)))                                                 \
+    __attribute__((availability(tvos, strict, introduced = 26.0)))                                                \
+    __attribute__((availability(watchos, strict, introduced = 26.0)))                                             \
+    __attribute__((availability(bridgeos, strict, introduced = 10.0)))
 
 // LLVM 19
-// TODO: Fill this in
-#  define _LIBCPP_INTRODUCED_IN_LLVM_19 0
-#  define _LIBCPP_INTRODUCED_IN_LLVM_19_ATTRIBUTE __attribute__((unavailable))
+#  if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 150400) ||       \
+      (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 180400) ||     \
+      (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 180400) ||             \
+      (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 110400) ||       \
+      (defined(__ENVIRONMENT_BRIDGE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_BRIDGE_OS_VERSION_MIN_REQUIRED__ < 90400)
+#    define _LIBCPP_INTRODUCED_IN_LLVM_19 0
+#  else
+#    define _LIBCPP_INTRODUCED_IN_LLVM_19 1
+#  endif
+#  define _LIBCPP_INTRODUCED_IN_LLVM_19_ATTRIBUTE                                                                 \
+    __attribute__((availability(macos, strict, introduced = 15.4)))                                               \
+    __attribute__((availability(ios, strict, introduced = 18.4)))                                                 \
+    __attribute__((availability(tvos, strict, introduced = 18.4)))                                                \
+    __attribute__((availability(watchos, strict, introduced = 11.4)))                                             \
+    __attribute__((availability(bridgeos, strict, introduced = 9.4)))
 
 // LLVM 18
 #  if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 150000) ||       \
diff --git a/libcxx/include/__configuration/compiler.h b/libcxx/include/__configuration/compiler.h
index 11c07ed0dc474..7cd81e03b05ba 100644
--- a/libcxx/include/__configuration/compiler.h
+++ b/libcxx/include/__configuration/compiler.h
@@ -33,16 +33,16 @@
 // Warn if a compiler version is used that is not supported anymore
 // LLVM RELEASE Update the minimum compiler versions
 #  if defined(_LIBCPP_CLANG_VER)
-#    if _LIBCPP_CLANG_VER < 1900
-#      warning "Libc++ only supports Clang 19 and later"
+#    if _LIBCPP_CLANG_VER < 2001
+#      warning "Libc++ only supports Clang 20 and later"
 #    endif
 #  elif defined(_LIBCPP_APPLE_CLANG_VER)
-#    if _LIBCPP_APPLE_CLANG_VER < 1600
-#      warning "Libc++ only supports AppleClang 15 and later"
+#    if _LIBCPP_APPLE_CLANG_VER < 1700
+#      warning "Libc++ only supports AppleClang 26 and later"
 #    endif
 #  elif defined(_LIBCPP_GCC_VER)
-#    if _LIBCPP_GCC_VER < 1400
-#      warning "Libc++ only supports GCC 14 and later"
+#    if _LIBCPP_GCC_VER < 1500
+#      warning "Libc++ only supports GCC 15 and later"
 #    endif
 #  endif
 
diff --git a/libcxx/include/__flat_set/flat_multiset.h b/libcxx/include/__flat_set/flat_multiset.h
index 7be0b2d20c54d..0f6bae584ca90 100644
--- a/libcxx/include/__flat_set/flat_multiset.h
+++ b/libcxx/include/__flat_set/flat_multiset.h
@@ -95,16 +95,16 @@ class flat_multiset {
 
 public:
   // [flat.multiset.cons], constructors
-  _LIBCPP_HIDE_FROM_ABI flat_multiset() noexcept(is_nothrow_default_constructible_v<_KeyContainer> &&
-                                                 is_nothrow_default_constructible_v<_Compare>)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset() noexcept(
+      is_nothrow_default_constructible_v<_KeyContainer> && is_nothrow_default_constructible_v<_Compare>)
       : __keys_(), __compare_() {}
 
-  _LIBCPP_HIDE_FROM_ABI flat_multiset(const flat_multiset&) = default;
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(const flat_multiset&) = default;
 
   // The copy/move constructors are not specified in the spec, which means they should be defaulted.
   // However, the move constructor can potentially leave a moved-from object in an inconsistent
   // state if an exception is thrown.
-  _LIBCPP_HIDE_FROM_ABI flat_multiset(flat_multiset&& __other) noexcept(
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(flat_multiset&& __other) noexcept(
       is_nothrow_move_constructible_v<_KeyContainer> && is_nothrow_move_constructible_v<_Compare>)
 #  if _LIBCPP_HAS_EXCEPTIONS
       try
@@ -121,14 +121,16 @@ class flat_multiset {
 #  endif // _LIBCPP_HAS_EXCEPTIONS
   }
 
-  _LIBCPP_HIDE_FROM_ABI explicit flat_multiset(const key_compare& __comp) : __keys_(), __compare_(__comp) {}
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 explicit flat_multiset(const key_compare& __comp)
+      : __keys_(), __compare_(__comp) {}
 
-  _LIBCPP_HIDE_FROM_ABI explicit flat_multiset(container_type __keys, const key_compare& __comp = key_compare())
+  _LIBCPP_HIDE_FROM_ABI
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 explicit flat_multiset(container_type __keys, const key_compare& __comp = key_compare())
       : __keys_(std::move(__keys)), __compare_(__comp) {
     ranges::sort(__keys_, __compare_);
   }
 
-  _LIBCPP_HIDE_FROM_ABI
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
   flat_multiset(sorted_equivalent_t, container_type __keys, const key_compare& __comp = key_compare())
       : __keys_(std::move(__keys)), __compare_(__comp) {
     _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys_, __compare_), "Key container is not sorted");
@@ -136,7 +138,7 @@ class flat_multiset {
 
   template <class _InputIterator>
     requires __has_input_iterator_category<_InputIterator>::value
-  _LIBCPP_HIDE_FROM_ABI
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
   flat_multiset(_InputIterator __first, _InputIterator __last, const key_compare& __comp = key_compare())
       : __keys_(), __compare_(__comp) {
     insert(__first, __last);
@@ -144,48 +146,53 @@ class flat_multiset {
 
   template <class _InputIterator>
     requires __has_input_iterator_category<_InputIterator>::value
-  _LIBCPP_HIDE_FROM_ABI flat_multiset(
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(
       sorted_equivalent_t, _InputIterator __first, _InputIterator __last, const key_compare& __comp = key_compare())
       : __keys_(__first, __last), __compare_(__comp) {
     _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys_, __compare_), "Key container is not sorted");
   }
 
   template <_ContainerCompatibleRange<value_type> _Range>
-  _LIBCPP_HIDE_FROM_ABI flat_multiset(from_range_t __fr, _Range&& __rg)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(from_range_t __fr, _Range&& __rg)
       : flat_multiset(__fr, std::forward<_Range>(__rg), key_compare()) {}
 
   template <_ContainerCompatibleRange<value_type> _Range>
-  _LIBCPP_HIDE_FROM_ABI flat_multiset(from_range_t, _Range&& __rg, const key_compare& __comp) : flat_multiset(__comp) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
+  flat_multiset(from_range_t, _Range&& __rg, const key_compare& __comp)
+      : flat_multiset(__comp) {
     insert_range(std::forward<_Range>(__rg));
   }
 
-  _LIBCPP_HIDE_FROM_ABI flat_multiset(initializer_list<value_type> __il, const key_compare& __comp = key_compare())
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
+  flat_multiset(initializer_list<value_type> __il, const key_compare& __comp = key_compare())
       : flat_multiset(__il.begin(), __il.end(), __comp) {}
 
-  _LIBCPP_HIDE_FROM_ABI
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
   flat_multiset(sorted_equivalent_t, initializer_list<value_type> __il, const key_compare& __comp = key_compare())
       : flat_multiset(sorted_equivalent, __il.begin(), __il.end(), __comp) {}
 
   template <class _Allocator>
     requires uses_allocator<container_type, _Allocator>::value
-  _LIBCPP_HIDE_FROM_ABI explicit flat_multiset(const _Allocator& __alloc)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 explicit flat_multiset(const _Allocator& __alloc)
       : __keys_(std::make_obj_using_allocator<container_type>(__alloc)), __compare_() {}
 
   template <class _Allocator>
     requires uses_allocator<container_type, _Allocator>::value
-  _LIBCPP_HIDE_FROM_ABI flat_multiset(const key_compare& __comp, const _Allocator& __alloc)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
+  flat_multiset(const key_compare& __comp, const _Allocator& __alloc)
       : __keys_(std::make_obj_using_allocator<container_type>(__alloc)), __compare_(__comp) {}
 
   template <class _Allocator>
     requires uses_allocator<container_type, _Allocator>::value
-  _LIBCPP_HIDE_FROM_ABI flat_multiset(const container_type& __keys, const _Allocator& __alloc)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
+  flat_multiset(const container_type& __keys, const _Allocator& __alloc)
       : __keys_(std::make_obj_using_allocator<container_type>(__alloc, __keys)), __compare_() {
     ranges::sort(__keys_, __compare_);
   }
 
   template <class _Allocator>
     requires uses_allocator<container_type, _Allocator>::value
-  _LIBCPP_HIDE_FROM_ABI
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
   flat_multiset(const container_type& __keys, const key_compare& __comp, const _Allocator& __alloc)
       : __keys_(std::make_obj_using_allocator<container_type>(__alloc, __keys)), __compare_(__comp) {
     ranges::sort(__keys_, __compare_);
@@ -193,14 +200,15 @@ class flat_multiset {
 
   template <class _Allocator>
     requires uses_allocator<container_type, _Allocator>::value
-  _LIBCPP_HIDE_FROM_ABI flat_multiset(sorted_equivalent_t, const container_type& __keys, const _Allocator& __alloc)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
+  flat_multiset(sorted_equivalent_t, const container_type& __keys, const _Allocator& __alloc)
       : __keys_(std::make_obj_using_allocator<container_type>(__alloc, __keys)), __compare_() {
     _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys_, __compare_), "Key container is not sorted");
   }
 
   template <class _Allocator>
     requires uses_allocator<container_type, _Allocator>::value
-  _LIBCPP_HIDE_FROM_ABI
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
   flat_multiset(sorted_equivalent_t, const container_type& __keys, const key_compare& __comp, const _Allocator& __alloc)
       : __keys_(std::make_obj_using_allocator<container_type>(__alloc, __keys)), __compare_(__comp) {
     _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys_, __compare_), "Key container is not sorted");
@@ -208,13 +216,14 @@ class flat_multiset {
 
   template <class _Allocator>
     requires uses_allocator<container_type, _Allocator>::value
-  _LIBCPP_HIDE_FROM_ABI flat_multiset(const flat_multiset& __other, const _Allocator& __alloc)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
+  flat_multiset(const flat_multiset& __other, const _Allocator& __alloc)
       : __keys_(std::make_obj_using_allocator<container_type>(__alloc, __other.__keys_)),
         __compare_(__other.__compare_) {}
 
   template <class _Allocator>
     requires uses_allocator<container_type, _Allocator>::value
-  _LIBCPP_HIDE_FROM_ABI flat_multiset(flat_multiset&& __other, const _Allocator& __alloc)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(flat_multiset&& __other, const _Allocator& __alloc)
 #  if _LIBCPP_HAS_EXCEPTIONS
       try
 #  endif // _LIBCPP_HAS_EXCEPTIONS
@@ -230,14 +239,15 @@ class flat_multiset {
 
   template <class _InputIterator, class _Allocator>
     requires(__has_input_iterator_category<_InputIterator>::value && uses_allocator<container_type, _Allocator>::value)
-  _LIBCPP_HIDE_FROM_ABI flat_multiset(_InputIterator __first, _InputIterator __last, const _Allocator& __alloc)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
+  flat_multiset(_InputIterator __first, _InputIterator __last, const _Allocator& __alloc)
       : __keys_(std::make_obj_using_allocator<container_type>(__alloc)), __compare_() {
     insert(__first, __last);
   }
 
   template <class _InputIterator, class _Allocator>
     requires(__has_input_iterator_category<_InputIterator>::value && uses_allocator<container_type, _Allocator>::value)
-  _LIBCPP_HIDE_FROM_ABI
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
   flat_multiset(_InputIterator __first, _InputIterator __last, const key_compare& __comp, const _Allocator& __alloc)
       : __keys_(std::make_obj_using_allocator<container_type>(__alloc)), __compare_(__comp) {
     insert(__first, __last);
@@ -245,7 +255,7 @@ class flat_multiset {
 
   template <class _InputIterator, class _Allocator>
     requires(__has_input_iterator_category<_InputIterator>::value && uses_allocator<container_type, _Allocator>::value)
-  _LIBCPP_HIDE_FROM_ABI
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
   flat_multiset(sorted_equivalent_t, _InputIterator __first, _InputIterator __last, const _Allocator& __alloc)
       : __keys_(std::make_obj_using_allocator<container_type>(__alloc, __first, __last)), __compare_() {
     _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys_, __compare_), "Key container is not sorted");
@@ -253,53 +263,57 @@ class flat_multiset {
 
   template <class _InputIterator, class _Allocator>
     requires(__has_input_iterator_category<_InputIterator>::value && uses_allocator<container_type, _Allocator>::value)
-  _LIBCPP_HIDE_FROM_ABI
-  flat_multiset(sorted_equivalent_t,
-                _InputIterator __first,
-                _InputIterator __last,
-                const key_compare& __comp,
-                const _Allocator& __alloc)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(
+      sorted_equivalent_t,
+      _InputIterator __first,
+      _InputIterator __last,
+      const key_compare& __comp,
+      const _Allocator& __alloc)
       : __keys_(std::make_obj_using_allocator<container_type>(__alloc, __first, __last)), __compare_(__comp) {
     _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys_, __compare_), "Key container is not sorted");
   }
 
   template <_ContainerCompatibleRange<value_type> _Range, class _Allocator>
     requires uses_allocator<container_type, _Allocator>::value
-  _LIBCPP_HIDE_FROM_ABI flat_multiset(from_range_t, _Range&& __rg, const _Allocator& __alloc)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
+  flat_multiset(from_range_t, _Range&& __rg, const _Allocator& __alloc)
       : __keys_(std::make_obj_using_allocator<container_type>(__alloc)), __compare_() {
     insert_range(std::forward<_Range>(__rg));
   }
 
   template <_ContainerCompatibleRange<value_type> _Range, class _Allocator>
     requires uses_allocator<container_type, _Allocator>::value
-  _LIBCPP_HIDE_FROM_ABI flat_multiset(from_range_t, _Range&& __rg, const key_compare& __comp, const _Allocator& __alloc)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
+  flat_multiset(from_range_t, _Range&& __rg, const key_compare& __comp, const _Allocator& __alloc)
       : __keys_(std::make_obj_using_allocator<container_type>(__alloc)), __compare_(__comp) {
     insert_range(std::forward<_Range>(__rg));
   }
 
   template <class _Allocator>
     requires uses_allocator<container_type, _Allocator>::value
-  _LIBCPP_HIDE_FROM_ABI flat_multiset(initializer_list<value_type> __il, const _Allocator& __alloc)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
+  flat_multiset(initializer_list<value_type> __il, const _Allocator& __alloc)
       : flat_multiset(__il.begin(), __il.end(), __alloc) {}
 
   template <class _Allocator>
     requires uses_allocator<container_type, _Allocator>::value
-  _LIBCPP_HIDE_FROM_ABI
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
   flat_multiset(initializer_list<value_type> __il, const key_compare& __comp, const _Allocator& __alloc)
       : flat_multiset(__il.begin(), __il.end(), __comp, __alloc) {}
 
   template <class _Allocator>
     requires uses_allocator<container_type, _Allocator>::value
-  _LIBCPP_HIDE_FROM_ABI flat_multiset(sorted_equivalent_t, initializer_list<value_type> __il, const _Allocator& __alloc)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
+  flat_multiset(sorted_equivalent_t, initializer_list<value_type> __il, const _Allocator& __alloc)
       : flat_multiset(sorted_equivalent, __il.begin(), __il.end(), __alloc) {}
 
   template <class _Allocator>
     requires uses_allocator<container_type, _Allocator>::value
-  _LIBCPP_HIDE_FROM_ABI flat_multiset(
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(
       sorted_equivalent_t, initializer_list<value_type> __il, const key_compare& __comp, const _Allocator& __alloc)
       : flat_multiset(sorted_equivalent, __il.begin(), __il.end(), __comp, __alloc) {}
 
-  _LIBCPP_HIDE_FROM_ABI flat_multiset& operator=(initializer_list<value_type> __il) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset& operator=(initializer_list<value_type> __il) {
     clear();
     insert(__il);
     return *this;
@@ -308,9 +322,9 @@ class flat_multiset {
   // copy/move assignment are not specified in the spec (defaulted)
   // but move assignment can potentially leave moved from object in an inconsistent
   // state if an exception is thrown
-  _LIBCPP_HIDE_FROM_ABI flat_multiset& operator=(const flat_multiset&) = default;
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset& operator=(const flat_multiset&) = default;
 
-  _LIBCPP_HIDE_FROM_ABI flat_multiset& operator=(flat_multiset&& __other) noexcept(
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset& operator=(flat_multiset&& __other) noexcept(
       is_nothrow_move_assignable_v<_KeyContainer> && is_nothrow_move_assignable_v<_Compare>) {
     auto __clear_other_guard = std::__make_scope_guard([&]() noexcept { __other.clear() /* noexcept */; });
     auto __clear_self_guard  = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; });
@@ -321,30 +335,52 @@ class flat_multiset {
   }
 
   // iterators
-  _LIBCPP_HIDE_FROM_ABI iterator begin() noexcept { return iterator(std::as_const(__keys_).begin()); }
-  _LIBCPP_HIDE_FROM_ABI const_iterator begin() const noexcept { return const_iterator(__keys_.begin()); }
-  _LIBCPP_HIDE_FROM_ABI iterator end() noexcept { return iterator(std::as_const(__keys_).end()); }
-  _LIBCPP_HIDE_FROM_ABI const_iterator end() const noexcept { return const_iterator(__keys_.end()); }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator begin() noexcept {
+    return iterator(std::as_const(__keys_).begin());
+  }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator begin() const noexcept {
+    return const_iterator(__keys_.begin());
+  }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator end() noexcept {
+    return iterator(std::as_const(__keys_).end());
+  }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator end() const noexcept {
+    return const_iterator(__keys_.end());
+  }
 
-  _LIBCPP_HIDE_FROM_ABI reverse_iterator rbegin() noexcept { return reverse_iterator(end()); }
-  _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rbegin() const noexcept { return const_reverse_iterator(end()); }
-  _LIBCPP_HIDE_FROM_ABI reverse_iterator rend() noexcept { return reverse_iterator(begin()); }
-  _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rend() const noexcept { return const_reverse_iterator(begin()); }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 reverse_iterator rbegin() noexcept {
+    return reverse_iterator(end());
+  }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_reverse_iterator rbegin() const noexcept {
+    return const_reverse_iterator(end());
+  }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 reverse_iterator rend() noexcept {
+    return reverse_iterator(begin());
+  }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_reverse_iterator rend() const noexcept {
+    return const_reverse_iterator(begin());
+  }
 
-  _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const noexcept { return begin(); }
-  _LIBCPP_HIDE_FROM_ABI const_iterator cend() const noexcept { return end(); }
-  _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crbegin() const noexcept { return const_reverse_iterator(end()); }
-  _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crend() const noexcept { return const_reverse_iterator(begin()); }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator cbegin() const noexcept { return begin(); }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator cend() const noexcept { return end(); }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_reverse_iterator crbegin() const noexcept {
+    return const_reverse_iterator(end());
+  }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_reverse_iterator crend() const noexcept {
+    return const_reverse_iterator(begin());
+  }
 
   // capacity
-  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI bool empty() const noexcept { return __keys_.empty(); }
-  _LIBCPP_HIDE_FROM_ABI size_type size() const noexcept { return __keys_.size(); }
-  _LIBCPP_HIDE_FROM_ABI size_type max_size() const noexcept { return __keys_.max_size(); }
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool empty() const noexcept {
+    return __keys_.empty();
+  }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type size() const noexcept { return __keys_.size(); }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type max_size() const noexcept { return __keys_.max_size(); }
 
   // [flat.multiset.modifiers], modifiers
   template <class... _Args>
     requires is_constructible_v<value_type, _Args...>
-  _LIBCPP_HIDE_FROM_ABI iterator emplace(_Args&&... __args) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator emplace(_Args&&... __args) {
     if constexpr (sizeof...(__args) == 1 && (is_same_v<remove_cvref_t<_Args>, _Key> && ...)) {
       return __emplace(std::forward<_Args>(__args)...);
     } else {
@@ -354,7 +390,7 @@ class flat_multiset {
 
   template <class... _Args>
     requires is_constructible_v<value_type, _Args...>
-  _LIBCPP_HIDE_FROM_ABI iterator emplace_hint(const_iterator __hint, _Args&&... __args) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator emplace_hint(const_iterator __hint, _Args&&... __args) {
     if constexpr (sizeof...(__args) == 1 && (is_same_v<remove_cvref_t<_Args>, _Key> && ...)) {
       return __emplace_hint(std::move(__hint), std::forward<_Args>(__args)...);
     } else {
@@ -362,21 +398,23 @@ class flat_multiset {
     }
   }
 
-  _LIBCPP_HIDE_FROM_ABI iterator insert(const value_type& __x) { return emplace(__x); }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(const value_type& __x) { return emplace(__x); }
 
-  _LIBCPP_HIDE_FROM_ABI iterator insert(value_type&& __x) { return emplace(std::move(__x)); }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(value_type&& __x) {
+    return emplace(std::move(__x));
+  }
 
-  _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __hint, const value_type& __x) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(const_iterator __hint, const value_type& __x) {
     return emplace_hint(__hint, __x);
   }
 
-  _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __hint, value_type&& __x) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(const_iterator __hint, value_type&& __x) {
     return emplace_hint(__hint, std::move(__x));
   }
 
   template <class _InputIterator>
     requires __has_input_iterator_category<_InputIterator>::value
-  _LIBCPP_HIDE_FROM_ABI void insert(_InputIterator __first, _InputIterator __last) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void insert(_InputIterator __first, _InputIterator __last) {
     if constexpr (sized_sentinel_for<_InputIterator, _InputIterator>) {
       __reserve(__last - __first);
     }
@@ -385,7 +423,8 @@ class flat_multiset {
 
   template <class _InputIterator>
     requires __has_input_iterator_category<_InputIterator>::value
-  _LIBCPP_HIDE_FROM_ABI void insert(sorted_equivalent_t, _InputIterator __first, _InputIterator __last) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void
+  insert(sorted_equivalent_t, _InputIterator __first, _InputIterator __last) {
     if constexpr (sized_sentinel_for<_InputIterator, _InputIterator>) {
       __reserve(__last - __first);
     }
@@ -394,7 +433,7 @@ class flat_multiset {
   }
 
   template <_ContainerCompatibleRange<value_type> _Range>
-  _LIBCPP_HIDE_FROM_ABI void insert_range(_Range&& __range) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void insert_range(_Range&& __range) {
     if constexpr (ranges::sized_range<_Range>) {
       __reserve(ranges::size(__range));
     }
@@ -402,26 +441,29 @@ class flat_multiset {
     __append_sort_merge</*WasSorted = */ false>(std::forward<_Range>(__range));
   }
 
-  _LIBCPP_HIDE_FROM_ABI void insert(initializer_list<value_type> __il) { insert(__il.begin(), __il.end()); }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void insert(initializer_list<value_type> __il) {
+    insert(__il.begin(), __il.end());
+  }
 
-  _LIBCPP_HIDE_FROM_ABI void insert(sorted_equivalent_t, initializer_list<value_type> __il) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void
+  insert(sorted_equivalent_t, initializer_list<value_type> __il) {
     insert(sorted_equivalent, __il.begin(), __il.end());
   }
 
-  _LIBCPP_HIDE_FROM_ABI container_type extract() && {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 container_type extract() && {
     auto __guard = std::__make_scope_guard([&]() noexcept { clear() /* noexcept */; });
     auto __ret   = std::move(__keys_);
     return __ret;
   }
 
-  _LIBCPP_HIDE_FROM_ABI void replace(container_type&& __keys) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void replace(container_type&& __keys) {
     _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys, __compare_), "Key container is not sorted");
     auto __guard = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; });
     __keys_      = std::move(__keys);
     __guard.__complete();
   }
 
-  _LIBCPP_HIDE_FROM_ABI iterator erase(iterator __position) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator erase(iterator __position) {
     auto __on_failure = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; });
     auto __key_iter   = __keys_.erase(__position.__base());
     __on_failure.__complete();
@@ -431,7 +473,7 @@ class flat_multiset {
   // The following overload is the same as the iterator overload
   // iterator erase(const_iterator __position);
 
-  _LIBCPP_HIDE_FROM_ABI size_type erase(const key_type& __x) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type erase(const key_type& __x) {
     auto [__first, __last] = equal_range(__x);
     auto __res             = __last - __first;
     erase(__first, __last);
@@ -441,21 +483,21 @@ class flat_multiset {
   template <class _Kp>
     requires(__is_transparent_v<_Compare> && !is_convertible_v<_Kp &&, iterator> &&
              !is_convertible_v<_Kp &&, const_iterator>)
-  _LIBCPP_HIDE_FROM_ABI size_type erase(_Kp&& __x) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type erase(_Kp&& __x) {
     auto [__first, __last] = equal_range(__x);
     auto __res             = __last - __first;
     erase(__first, __last);
     return __res;
   }
 
-  _LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __first, const_iterator __last) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator erase(const_iterator __first, const_iterator __last) {
     auto __on_failure = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; });
     auto __key_it     = __keys_.erase(__first.__base(), __last.__base());
     __on_failure.__complete();
     return iterator(std::move(__key_it));
   }
 
-  _LIBCPP_HIDE_FROM_ABI void swap(flat_multiset& __y) noexcept {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void swap(flat_multiset& __y) noexcept {
     // warning: The spec has unconditional noexcept, which means that
     // if any of the following functions throw an exception,
     // std::terminate will be called
@@ -464,126 +506,139 @@ class flat_multiset {
     ranges::swap(__keys_, __y.__keys_);
   }
 
-  _LIBCPP_HIDE_FROM_ABI void clear() noexcept { __keys_.clear(); }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void clear() noexcept { __keys_.clear(); }
 
   // observers
-  _LIBCPP_HIDE_FROM_ABI key_compare key_comp() const { return __compare_; }
-  _LIBCPP_HIDE_FROM_ABI value_compare value_comp() const { return __compare_; }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 key_compare key_comp() const { return __compare_; }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 value_compare value_comp() const { return __compare_; }
 
   // map operations
-  _LIBCPP_HIDE_FROM_ABI iterator find(const key_type& __x) { return __find_impl(*this, __x); }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator find(const key_type& __x) {
+    return __find_impl(*this, __x);
+  }
 
-  _LIBCPP_HIDE_FROM_ABI const_iterator find(const key_type& __x) const { return __find_impl(*this, __x); }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator find(const key_type& __x) const {
+    return __find_impl(*this, __x);
+  }
 
   template <class _Kp>
     requires __is_transparent_v<_Compare>
-  _LIBCPP_HIDE_FROM_ABI iterator find(const _Kp& __x) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator find(const _Kp& __x) {
     return __find_impl(*this, __x);
   }
 
   template <class _Kp>
     requires __is_transparent_v<_Compare>
-  _LIBCPP_HIDE_FROM_ABI const_iterator find(const _Kp& __x) const {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator find(const _Kp& __x) const {
     return __find_impl(*this, __x);
   }
 
-  _LIBCPP_HIDE_FROM_ABI size_type count(const key_type& __x) const {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type count(const key_type& __x) const {
     auto [__first, __last] = equal_range(__x);
     return __last - __first;
   }
 
   template <class _Kp>
     requires __is_transparent_v<_Compare>
-  _LIBCPP_HIDE_FROM_ABI size_type count(const _Kp& __x) const {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type count(const _Kp& __x) const {
     auto [__first, __last] = equal_range(__x);
     return __last - __first;
   }
 
-  _LIBCPP_HIDE_FROM_ABI bool contains(const key_type& __x) const { return find(__x) != end(); }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool contains(const key_type& __x) const {
+    return find(__x) != end();
+  }
 
   template <class _Kp>
     requires __is_transparent_v<_Compare>
-  _LIBCPP_HIDE_FROM_ABI bool contains(const _Kp& __x) const {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool contains(const _Kp& __x) const {
     return find(__x) != end();
   }
 
-  _LIBCPP_HIDE_FROM_ABI iterator lower_bound(const key_type& __x) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator lower_bound(const key_type& __x) {
     const auto& __keys = __keys_;
     return iterator(std::lower_bound(__keys.begin(), __keys.end(), __x, __compare_));
   }
 
-  _LIBCPP_HIDE_FROM_ABI const_iterator lower_bound(const key_type& __x) const {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator lower_bound(const key_type& __x) const {
     return const_iterator(std::lower_bound(__keys_.begin(), __keys_.end(), __x, __compare_));
   }
 
   template <class _Kp>
     requires __is_transparent_v<_Compare>
-  _LIBCPP_HIDE_FROM_ABI iterator lower_bound(const _Kp& __x) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator lower_bound(const _Kp& __x) {
     const auto& __keys = __keys_;
     return iterator(std::lower_bound(__keys.begin(), __keys.end(), __x, __compare_));
   }
 
   template <class _Kp>
     requires __is_transparent_v<_Compare>
-  _LIBCPP_HIDE_FROM_ABI const_iterator lower_bound(const _Kp& __x) const {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator lower_bound(const _Kp& __x) const {
     return const_iterator(std::lower_bound(__keys_.begin(), __keys_.end(), __x, __compare_));
   }
 
-  _LIBCPP_HIDE_FROM_ABI iterator upper_bound(const key_type& __x) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator upper_bound(const key_type& __x) {
     const auto& __keys = __keys_;
     return iterator(std::upper_bound(__keys.begin(), __keys.end(), __x, __compare_));
   }
 
-  _LIBCPP_HIDE_FROM_ABI const_iterator upper_bound(const key_type& __x) const {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator upper_bound(const key_type& __x) const {
     return const_iterator(std::upper_bound(__keys_.begin(), __keys_.end(), __x, __compare_));
   }
 
   template <class _Kp>
     requires __is_transparent_v<_Compare>
-  _LIBCPP_HIDE_FROM_ABI iterator upper_bound(const _Kp& __x) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator upper_bound(const _Kp& __x) {
     const auto& __keys = __keys_;
     return iterator(std::upper_bound(__keys.begin(), __keys.end(), __x, __compare_));
   }
 
   template <class _Kp>
     requires __is_transparent_v<_Compare>
-  _LIBCPP_HIDE_FROM_ABI const_iterator upper_bound(const _Kp& __x) const {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator upper_bound(const _Kp& __x) const {
     return const_iterator(std::upper_bound(__keys_.begin(), __keys_.end(), __x, __compare_));
   }
 
-  _LIBCPP_HIDE_FROM_ABI pair<iterator, iterator> equal_range(const key_type& __x) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair<iterator, iterator> equal_range(const key_type& __x) {
     return __equal_range_impl(*this, __x);
   }
 
-  _LIBCPP_HIDE_FROM_ABI pair<const_iterator, const_iterator> equal_range(const key_type& __x) const {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair<const_iterator, const_iterator>
+  equal_range(const key_type& __x) const {
     return __equal_range_impl(*this, __x);
   }
 
   template <class _Kp>
     requires __is_transparent_v<_Compare>
-  _LIBCPP_HIDE_FROM_ABI pair<iterator, iterator> equal_range(const _Kp& __x) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair<iterator, iterator> equal_range(const _Kp& __x) {
     return __equal_range_impl(*this, __x);
   }
   template <class _Kp>
     requires __is_transparent_v<_Compare>
-  _LIBCPP_HIDE_FROM_ABI pair<const_iterator, const_iterator> equal_range(const _Kp& __x) const {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair<const_iterator, const_iterator>
+  equal_range(const _Kp& __x) const {
     return __equal_range_impl(*this, __x);
   }
 
-  friend _LIBCPP_HIDE_FROM_ABI bool operator==(const flat_multiset& __x, const flat_multiset& __y) {
+  friend _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool
+  operator==(const flat_multiset& __x, const flat_multiset& __y) {
     return ranges::equal(__x, __y);
   }
 
-  friend _LIBCPP_HIDE_FROM_ABI auto operator<=>(const flat_multiset& __x, const flat_multiset& __y) {
+  friend _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 auto
+  operator<=>(const flat_multiset& __x, const flat_multiset& __y) {
     return std::lexicographical_compare_three_way(
         __x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way);
   }
 
-  friend _LIBCPP_HIDE_FROM_ABI void swap(flat_multiset& __x, flat_multiset& __y) noexcept { __x.swap(__y); }
+  friend _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void
+  swap(flat_multiset& __x, flat_multiset& __y) noexcept {
+    __x.swap(__y);
+  }
 
 private:
   template <bool _WasSorted, class... _Args>
-  _LIBCPP_HIDE_FROM_ABI void __append_sort_merge(_Args&&... __args) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void __append_sort_merge(_Args&&... __args) {
     auto __on_failure    = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; });
     size_type __old_size = size();
     __flat_set_utils::__append(*this, std::forward<_Args>(__args)...);
@@ -598,13 +653,13 @@ class flat_multiset {
   }
 
   template <class _Kp>
-  _LIBCPP_HIDE_FROM_ABI iterator __emplace(_Kp&& __key) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator __emplace(_Kp&& __key) {
     auto __it = upper_bound(__key);
     return __flat_set_utils::__emplace_exact_pos(*this, __it, std::forward<_Kp>(__key));
   }
 
   template <class _Kp>
-  _LIBCPP_HIDE_FROM_ABI iterator __emplace_hint(const_iterator __hint, _Kp&& __key) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator __emplace_hint(const_iterator __hint, _Kp&& __key) {
     auto __prev_larger  = __hint != cbegin() && __compare_(__key, *std::prev(__hint));
     auto __next_smaller = __hint != cend() && __compare_(*__hint, __key);
 
@@ -636,7 +691,7 @@ class flat_multiset {
   }
 
   template <class _Self, class _Kp>
-  _LIBCPP_HIDE_FROM_ABI static auto __find_impl(_Self&& __self, const _Kp& __key) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 static auto __find_impl(_Self&& __self, const _Kp& __key) {
     auto __it   = __self.lower_bound(__key);
     auto __last = __self.end();
     if (__it == __last || __self.__compare_(__key, *__it)) {
@@ -646,29 +701,30 @@ class flat_multiset {
   }
 
   template <class _Self, class _Kp>
-  _LIBCPP_HIDE_FROM_ABI static auto __equal_range_impl(_Self&& __self, const _Kp& __key) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 static auto __equal_range_impl(_Self&& __self, const _Kp& __key) {
     using __iter = _If<is_const_v<__libcpp_remove_reference_t<_Self>>, const_iterator, iterator>;
     auto [__key_first, __key_last] =
         std::equal_range(__self.__keys_.begin(), __self.__keys_.end(), __key, __self.__compare_);
     return std::make_pair(__iter(__key_first), __iter(__key_last));
   }
 
-  _LIBCPP_HIDE_FROM_ABI void __reserve(size_t __size) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void __reserve(size_t __size) {
     if constexpr (__container_traits<_KeyContainer>::__reservable) {
       __keys_.reserve(__size);
     }
   }
 
   template <class _Key2, class _Compare2, class _KeyContainer2, class _Predicate>
-  friend typename flat_multiset<_Key2, _Compare2, _KeyContainer2>::size_type
+  friend typename flat_multiset<_Key2, _Compare2, _KeyContainer2>::size_type _LIBCPP_CONSTEXPR_SINCE_CXX26
   erase_if(flat_multiset<_Key2, _Compare2, _KeyContainer2>&, _Predicate);
 
   _KeyContainer __keys_;
   _LIBCPP_NO_UNIQUE_ADDRESS key_compare __compare_;
 
   struct __key_equiv {
-    _LIBCPP_HIDE_FROM_ABI __key_equiv(key_compare __c) : __comp_(__c) {}
-    _LIBCPP_HIDE_FROM_ABI bool operator()(const_reference __x, const_reference __y) const {
+    _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 __key_equiv(key_compare __c) : __comp_(__c) {}
+    _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool
+    operator()(const_reference __x, const_reference __y) const {
       return !__comp_(std::get<0>(__x), std::get<0>(__y)) && !__comp_(std::get<0>(__y), std::get<0>(__x));
     }
     key_compare __comp_;
@@ -757,7 +813,7 @@ struct uses_allocator<flat_multiset<_Key, _Compare, _KeyContainer>, _Allocator>
     : bool_constant<uses_allocator_v<_KeyContainer, _Allocator> > {};
 
 template <class _Key, class _Compare, class _KeyContainer, class _Predicate>
-_LIBCPP_HIDE_FROM_ABI typename flat_multiset<_Key, _Compare, _KeyContainer>::size_type
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 typename flat_multiset<_Key, _Compare, _KeyContainer>::size_type
 erase_if(flat_multiset<_Key, _Compare, _KeyContainer>& __flat_multiset, _Predicate __pred) {
   auto __guard = std::__make_exception_guard([&] { __flat_multiset.clear(); });
   auto __it =
diff --git a/libcxx/include/__format/format_arg.h b/libcxx/include/__format/format_arg.h
index ed5e76275ea87..19794f0f084ce 100644
--- a/libcxx/include/__format/format_arg.h
+++ b/libcxx/include/__format/format_arg.h
@@ -149,7 +149,7 @@ _LIBCPP_HIDE_FROM_ABI decltype(auto) __visit_format_arg(_Visitor&& __vis, basic_
   __libcpp_unreachable();
 }
 
-#  if _LIBCPP_STD_VER >= 26 && _LIBCPP_HAS_EXPLICIT_THIS_PARAMETER
+#  if _LIBCPP_STD_VER >= 26
 
 template <class _Rp, class _Visitor, class _Context>
 _LIBCPP_HIDE_FROM_ABI _Rp __visit_format_arg(_Visitor&& __vis, basic_format_arg<_Context> __arg) {
@@ -200,7 +200,7 @@ _LIBCPP_HIDE_FROM_ABI _Rp __visit_format_arg(_Visitor&& __vis, basic_format_arg<
   __libcpp_unreachable();
 }
 
-#  endif // _LIBCPP_STD_VER >= 26 && _LIBCPP_HAS_EXPLICIT_THIS_PARAMETER
+#  endif // _LIBCPP_STD_VER >= 26
 
 /// Contains the values used in basic_format_arg.
 ///
@@ -285,7 +285,7 @@ class _LIBCPP_NO_SPECIALIZATIONS basic_format_arg {
 
   _LIBCPP_HIDE_FROM_ABI explicit operator bool() const noexcept { return __type_ != __format::__arg_t::__none; }
 
-#  if _LIBCPP_STD_VER >= 26 && _LIBCPP_HAS_EXPLICIT_THIS_PARAMETER
+#  if _LIBCPP_STD_VER >= 26
 
   // This function is user facing, so it must wrap the non-standard types of
   // the "variant" in a handle to stay conforming. See __arg_t for more details.
@@ -329,7 +329,7 @@ class _LIBCPP_NO_SPECIALIZATIONS basic_format_arg {
     }
   }
 
-#  endif // _LIBCPP_STD_VER >= 26 && _LIBCPP_HAS_EXPLICIT_THIS_PARAMETER
+#  endif // _LIBCPP_STD_VER >= 26
 
 private:
   using char_type = typename _Context::char_type;
@@ -371,11 +371,8 @@ class basic_format_arg<_Context>::handle {
 // This function is user facing, so it must wrap the non-standard types of
 // the "variant" in a handle to stay conforming. See __arg_t for more details.
 template <class _Visitor, class _Context>
-#  if _LIBCPP_STD_VER >= 26 && _LIBCPP_HAS_EXPLICIT_THIS_PARAMETER
-_LIBCPP_DEPRECATED_IN_CXX26
-#  endif
-    _LIBCPP_HIDE_FROM_ABI decltype(auto)
-    visit_format_arg(_Visitor&& __vis, basic_format_arg<_Context> __arg) {
+_LIBCPP_DEPRECATED_IN_CXX26 _LIBCPP_HIDE_FROM_ABI decltype(auto)
+visit_format_arg(_Visitor&& __vis, basic_format_arg<_Context> __arg) {
   switch (__arg.__type_) {
 #  if _LIBCPP_HAS_INT128
   case __format::__arg_t::__i128: {
@@ -387,7 +384,7 @@ _LIBCPP_DEPRECATED_IN_CXX26
     typename __basic_format_arg_value<_Context>::__handle __h{__arg.__value_.__u128_};
     return std::invoke(std::forward<_Visitor>(__vis), typename basic_format_arg<_Context>::handle{__h});
   }
-#  endif // _LIBCPP_STD_VER >= 26 && _LIBCPP_HAS_EXPLICIT_THIS_PARAMETER
+#  endif // _LIBCPP_HAS_INT128
   default:
     return std::__visit_format_arg(std::forward<_Visitor>(__vis), __arg);
   }
diff --git a/libcxx/include/__format/format_context.h b/libcxx/include/__format/format_context.h
index e672ee7ad0581..1771dd34b82fb 100644
--- a/libcxx/include/__format/format_context.h
+++ b/libcxx/include/__format/format_context.h
@@ -175,13 +175,13 @@ class basic_format_context<typename __format::__retarget_buffer<_CharT>::__itera
                   __format::__determine_arg_t<basic_format_context, decltype(__arg)>(),
                   __basic_format_arg_value<basic_format_context>(__arg)};
           };
-#  if _LIBCPP_STD_VER >= 26 && _LIBCPP_HAS_EXPLICIT_THIS_PARAMETER
+#  if _LIBCPP_STD_VER >= 26
           return static_cast<_Context*>(__c)->arg(__id).visit(std::move(__visitor));
 #  else
           _LIBCPP_SUPPRESS_DEPRECATED_PUSH
           return std::visit_format_arg(std::move(__visitor), static_cast<_Context*>(__c)->arg(__id));
           _LIBCPP_SUPPRESS_DEPRECATED_POP
-#  endif // _LIBCPP_STD_VER >= 26 && _LIBCPP_HAS_EXPLICIT_THIS_PARAMETER
+#  endif // _LIBCPP_STD_VER >= 26
         }) {
   }
 
diff --git a/libcxx/include/__format/formatter_output.h b/libcxx/include/__format/formatter_output.h
index d53b6cec707d8..63dd7fcacdcc9 100644
--- a/libcxx/include/__format/formatter_output.h
+++ b/libcxx/include/__format/formatter_output.h
@@ -151,45 +151,41 @@ _LIBCPP_HIDE_FROM_ABI _OutIt __fill(_OutIt __out_it, size_t __n, _CharT __value)
   }
 }
 
-#  if _LIBCPP_HAS_UNICODE
 template <__fmt_char_type _CharT, output_iterator<const _CharT&> _OutIt>
-  requires(same_as<_CharT, char>)
 _LIBCPP_HIDE_FROM_ABI _OutIt __fill(_OutIt __out_it, size_t __n, __format_spec::__code_point<_CharT> __value) {
-  std::size_t __bytes = std::countl_one(static_cast<unsigned char>(__value.__data[0]));
-  if (__bytes == 0)
-    return __formatter::__fill(std::move(__out_it), __n, __value.__data[0]);
-
-  for (size_t __i = 0; __i < __n; ++__i)
-    __out_it = __formatter::__copy(
-        std::addressof(__value.__data[0]), std::addressof(__value.__data[0]) + __bytes, std::move(__out_it));
-  return __out_it;
-}
-
+#  if _LIBCPP_HAS_UNICODE
+  if constexpr (same_as<_CharT, char>) {
+    std::size_t __bytes = std::countl_one(static_cast<unsigned char>(__value.__data[0]));
+    if (__bytes == 0)
+      return __formatter::__fill(std::move(__out_it), __n, __value.__data[0]);
+
+    for (size_t __i = 0; __i < __n; ++__i)
+      __out_it = __formatter::__copy(
+          std::addressof(__value.__data[0]), std::addressof(__value.__data[0]) + __bytes, std::move(__out_it));
+    return __out_it;
 #    if _LIBCPP_HAS_WIDE_CHARACTERS
-template <__fmt_char_type _CharT, output_iterator<const _CharT&> _OutIt>
-  requires(same_as<_CharT, wchar_t> && sizeof(wchar_t) == 2)
-_LIBCPP_HIDE_FROM_ABI _OutIt __fill(_OutIt __out_it, size_t __n, __format_spec::__code_point<_CharT> __value) {
-  if (!__unicode::__is_high_surrogate(__value.__data[0]))
-    return __formatter::__fill(std::move(__out_it), __n, __value.__data[0]);
-
-  for (size_t __i = 0; __i < __n; ++__i)
-    __out_it = __formatter::__copy(
-        std::addressof(__value.__data[0]), std::addressof(__value.__data[0]) + 2, std::move(__out_it));
-  return __out_it;
-}
-
-template <__fmt_char_type _CharT, output_iterator<const _CharT&> _OutIt>
-  requires(same_as<_CharT, wchar_t> && sizeof(wchar_t) == 4)
-_LIBCPP_HIDE_FROM_ABI _OutIt __fill(_OutIt __out_it, size_t __n, __format_spec::__code_point<_CharT> __value) {
-  return __formatter::__fill(std::move(__out_it), __n, __value.__data[0]);
-}
+  } else if constexpr (same_as<_CharT, wchar_t>) {
+    if constexpr (sizeof(wchar_t) == 2) {
+      if (!__unicode::__is_high_surrogate(__value.__data[0]))
+        return __formatter::__fill(std::move(__out_it), __n, __value.__data[0]);
+
+      for (size_t __i = 0; __i < __n; ++__i)
+        __out_it = __formatter::__copy(
+            std::addressof(__value.__data[0]), std::addressof(__value.__data[0]) + 2, std::move(__out_it));
+      return __out_it;
+    } else if constexpr (sizeof(wchar_t) == 4) {
+      return __formatter::__fill(std::move(__out_it), __n, __value.__data[0]);
+    } else {
+      static_assert(false, "expected sizeof(wchar_t) to be 2 or 4");
+    }
 #    endif // _LIBCPP_HAS_WIDE_CHARACTERS
-#  else    // _LIBCPP_HAS_UNICODE
-template <__fmt_char_type _CharT, output_iterator<const _CharT&> _OutIt>
-_LIBCPP_HIDE_FROM_ABI _OutIt __fill(_OutIt __out_it, size_t __n, __format_spec::__code_point<_CharT> __value) {
+  } else {
+    static_assert(false, "Unexpected CharT");
+  }
+#  else  // _LIBCPP_HAS_UNICODE
   return __formatter::__fill(std::move(__out_it), __n, __value.__data[0]);
+#  endif // _LIBCPP_HAS_UNICODE
 }
-#  endif   // _LIBCPP_HAS_UNICODE
 
 /// Writes the input to the output with the required padding.
 ///
diff --git a/libcxx/include/__functional/identity.h b/libcxx/include/__functional/identity.h
index 1b1c6cf73c378..02dde2b4f323d 100644
--- a/libcxx/include/__functional/identity.h
+++ b/libcxx/include/__functional/identity.h
@@ -44,7 +44,7 @@ struct __is_identity<reference_wrapper<const __identity> > : true_type {};
 
 struct identity {
   template <class _Tp>
-  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _Tp&& operator()(_Tp&& __t) const noexcept {
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _Tp&& operator()(_LIBCPP_LIFETIMEBOUND _Tp&& __t) const noexcept {
     return std::forward<_Tp>(__t);
   }
 
diff --git a/libcxx/include/__hash_table b/libcxx/include/__hash_table
index 5432abb4ab39d..e1897949a47e6 100644
--- a/libcxx/include/__hash_table
+++ b/libcxx/include/__hash_table
@@ -83,18 +83,6 @@ struct __hash_node_base {
   typedef _NodePtr __node_pointer;
   typedef __node_base_pointer __next_pointer;
 
-// TODO(LLVM 22): Remove this check
-#ifndef _LIBCPP_ABI_FIX_UNORDERED_NODE_POINTER_UB
-  static_assert(sizeof(__node_base_pointer) == sizeof(__node_pointer) && _LIBCPP_ALIGNOF(__node_base_pointer) ==
-                    _LIBCPP_ALIGNOF(__node_pointer),
-                "It looks like you are using std::__hash_table (an implementation detail for the unordered containers) "
-                "with a fancy pointer type that thas a different representation depending on whether it points to a "
-                "__hash_table base pointer or a __hash_table node pointer (both of which are implementation details of "
-                "the standard library). This means that your ABI is being broken between LLVM 19 and LLVM 20. If you "
-                "don't care about your ABI being broken, define the _LIBCPP_ABI_TREE_REMOVE_NODE_POINTER_UB macro to "
-                "silence this diagnostic.");
-#endif
-
   __next_pointer __next_;
 
   _LIBCPP_HIDE_FROM_ABI __next_pointer __ptr() _NOEXCEPT {
diff --git a/libcxx/include/__iterator/concepts.h b/libcxx/include/__iterator/concepts.h
index f38688734b38a..3b43920443636 100644
--- a/libcxx/include/__iterator/concepts.h
+++ b/libcxx/include/__iterator/concepts.h
@@ -117,15 +117,12 @@ template <class _Tp>
 concept __signed_integer_like = signed_integral<_Tp>;
 
 template <class _Ip>
-concept weakly_incrementable =
-    // TODO: remove this once the clang bug is fixed (https://llvm.org/PR48173).
-    !same_as<_Ip, bool> && // Currently, clang does not handle bool correctly.
-    movable<_Ip> && requires(_Ip __i) {
-      typename iter_difference_t<_Ip>;
-      requires __signed_integer_like<iter_difference_t<_Ip>>;
-      { ++__i } -> same_as<_Ip&>; // not required to be equality-preserving
-      __i++;                      // not required to be equality-preserving
-    };
+concept weakly_incrementable = movable<_Ip> && requires(_Ip __i) {
+  typename iter_difference_t<_Ip>;
+  requires __signed_integer_like<iter_difference_t<_Ip>>;
+  { ++__i } -> same_as<_Ip&>; // not required to be equality-preserving
+  __i++;                      // not required to be equality-preserving
+};
 
 // [iterator.concept.inc]
 template <class _Ip>
diff --git a/libcxx/include/__locale_dir/locale_base_api.h b/libcxx/include/__locale_dir/locale_base_api.h
index 9f3ce02a3af20..8c8f00061d1ed 100644
--- a/libcxx/include/__locale_dir/locale_base_api.h
+++ b/libcxx/include/__locale_dir/locale_base_api.h
@@ -115,6 +115,8 @@
 #    include <__locale_dir/support/apple.h>
 #  elif defined(__FreeBSD__)
 #    include <__locale_dir/support/freebsd.h>
+#  elif defined(__NetBSD__)
+#    include <__locale_dir/support/netbsd.h>
 #  elif defined(_LIBCPP_MSVCRT_LIKE)
 #    include <__locale_dir/support/windows.h>
 #  elif defined(__Fuchsia__)
diff --git a/libcxx/include/__locale_dir/support/bsd_like.h b/libcxx/include/__locale_dir/support/bsd_like.h
index ac402924709e5..9d4bdd1d5775f 100644
--- a/libcxx/include/__locale_dir/support/bsd_like.h
+++ b/libcxx/include/__locale_dir/support/bsd_like.h
@@ -24,7 +24,9 @@
 #  include <wctype.h>
 #endif
 
-#include <xlocale.h>
+#if __has_include(<xlocale.h>)
+#  include <xlocale.h>
+#endif
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
diff --git a/libcxx/include/__locale_dir/support/netbsd.h b/libcxx/include/__locale_dir/support/netbsd.h
new file mode 100644
index 0000000000000..190857f6f84fe
--- /dev/null
+++ b/libcxx/include/__locale_dir/support/netbsd.h
@@ -0,0 +1,20 @@
+//===-----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___LOCALE_DIR_SUPPORT_NETBSD_H
+#define _LIBCPP___LOCALE_DIR_SUPPORT_NETBSD_H
+
+#include <__config>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#include <__locale_dir/support/bsd_like.h>
+
+#endif // _LIBCPP___LOCALE_DIR_SUPPORT_NETBSD_H
diff --git a/libcxx/include/__math/traits.h b/libcxx/include/__math/traits.h
index 00db2a8289fb3..ff22cee7305d7 100644
--- a/libcxx/include/__math/traits.h
+++ b/libcxx/include/__math/traits.h
@@ -25,33 +25,26 @@ namespace __math {
 
 // signbit
 
-// TODO(LLVM 22): Remove conditional once support for Clang 19 is dropped.
-#if defined(_LIBCPP_COMPILER_GCC) || __has_constexpr_builtin(__builtin_signbit)
-#  define _LIBCPP_SIGNBIT_CONSTEXPR _LIBCPP_CONSTEXPR_SINCE_CXX23
-#else
-#  define _LIBCPP_SIGNBIT_CONSTEXPR
-#endif
-
 // The universal C runtime (UCRT) in the WinSDK provides floating point overloads
 // for std::signbit(). By defining our overloads as templates, we can work around
 // this issue as templates are less preferred than non-template functions.
 template <class = void>
-[[__nodiscard__]] inline _LIBCPP_SIGNBIT_CONSTEXPR _LIBCPP_HIDE_FROM_ABI bool signbit(float __x) _NOEXCEPT {
+[[__nodiscard__]] inline _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI bool signbit(float __x) _NOEXCEPT {
   return __builtin_signbit(__x);
 }
 
 template <class = void>
-[[__nodiscard__]] inline _LIBCPP_SIGNBIT_CONSTEXPR _LIBCPP_HIDE_FROM_ABI bool signbit(double __x) _NOEXCEPT {
+[[__nodiscard__]] inline _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI bool signbit(double __x) _NOEXCEPT {
   return __builtin_signbit(__x);
 }
 
 template <class = void>
-[[__nodiscard__]] inline _LIBCPP_SIGNBIT_CONSTEXPR _LIBCPP_HIDE_FROM_ABI bool signbit(long double __x) _NOEXCEPT {
+[[__nodiscard__]] inline _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI bool signbit(long double __x) _NOEXCEPT {
   return __builtin_signbit(__x);
 }
 
 template <class _A1, __enable_if_t<is_integral<_A1>::value, int> = 0>
-[[__nodiscard__]] inline _LIBCPP_SIGNBIT_CONSTEXPR _LIBCPP_HIDE_FROM_ABI bool signbit(_A1 __x) _NOEXCEPT {
+[[__nodiscard__]] inline _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI bool signbit(_A1 __x) _NOEXCEPT {
   return __x < 0;
 }
 
diff --git a/libcxx/include/__memory/construct_at.h b/libcxx/include/__memory/construct_at.h
index 658269158d945..5378c03abab3a 100644
--- a/libcxx/include/__memory/construct_at.h
+++ b/libcxx/include/__memory/construct_at.h
@@ -14,7 +14,6 @@
 #include <__config>
 #include <__memory/addressof.h>
 #include <__new/placement_new_delete.h>
-#include <__type_traits/enable_if.h>
 #include <__type_traits/is_array.h>
 #include <__utility/declval.h>
 #include <__utility/forward.h>
@@ -55,35 +54,25 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp* __construct_at(_Tp* __l
 // The internal functions are available regardless of the language version (with the exception of the `__destroy_at`
 // taking an array).
 
-template <class _Tp, __enable_if_t<!is_array<_Tp>::value, int> = 0>
+template <class _Tp>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __destroy_at(_Tp* __loc) {
   _LIBCPP_ASSERT_NON_NULL(__loc != nullptr, "null pointer given to destroy_at");
-  __loc->~_Tp();
-}
-
 #if _LIBCPP_STD_VER >= 20
-template <class _Tp, __enable_if_t<is_array<_Tp>::value, int> = 0>
-_LIBCPP_HIDE_FROM_ABI constexpr void __destroy_at(_Tp* __loc) {
-  _LIBCPP_ASSERT_NON_NULL(__loc != nullptr, "null pointer given to destroy_at");
-  for (auto&& __val : *__loc)
-    std::__destroy_at(std::addressof(__val));
-}
+  if constexpr (is_array_v<_Tp>) {
+    for (auto&& __val : *__loc)
+      std::__destroy_at(std::addressof(__val));
+  } else
 #endif
+  {
+    __loc->~_Tp();
+  }
+}
 
 #if _LIBCPP_STD_VER >= 17
-
-template <class _Tp, enable_if_t<!is_array_v<_Tp>, int> = 0>
+template <class _Tp>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void destroy_at(_Tp* _LIBCPP_DIAGNOSE_NULLPTR __loc) {
   std::__destroy_at(__loc);
 }
-
-#  if _LIBCPP_STD_VER >= 20
-template <class _Tp, enable_if_t<is_array_v<_Tp>, int> = 0>
-_LIBCPP_HIDE_FROM_ABI constexpr void destroy_at(_Tp* _LIBCPP_DIAGNOSE_NULLPTR __loc) {
-  std::__destroy_at(__loc);
-}
-#  endif
-
 #endif // _LIBCPP_STD_VER >= 17
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__new/align_val_t.h b/libcxx/include/__new/align_val_t.h
index 03ab7cb143a2b..d8ce5283345fb 100644
--- a/libcxx/include/__new/align_val_t.h
+++ b/libcxx/include/__new/align_val_t.h
@@ -16,6 +16,12 @@
 #  pragma GCC system_header
 #endif
 
+// <vcruntime_exception.h> defines its own std::align_val_t type,
+// which we use in order to be ABI-compatible with other STLs on Windows.
+#if _LIBCPP_HAS_LIBRARY_ALIGNED_ALLOCATION && defined(_LIBCPP_ABI_VCRUNTIME)
+#  include <vcruntime_new.h>
+#endif
+
 _LIBCPP_BEGIN_UNVERSIONED_NAMESPACE_STD
 #if _LIBCPP_HAS_LIBRARY_ALIGNED_ALLOCATION && !defined(_LIBCPP_ABI_VCRUNTIME)
 #  ifndef _LIBCPP_CXX03_LANG
diff --git a/libcxx/include/__new/exceptions.h b/libcxx/include/__new/exceptions.h
index 86951818b7aa2..483e5e3811182 100644
--- a/libcxx/include/__new/exceptions.h
+++ b/libcxx/include/__new/exceptions.h
@@ -17,6 +17,12 @@
 #  pragma GCC system_header
 #endif
 
+// <vcruntime_exception.h> defines its own std::bad_alloc type,
+// which we use in order to be ABI-compatible with other STLs on Windows.
+#if defined(_LIBCPP_ABI_VCRUNTIME)
+#  include <vcruntime_exception.h>
+#endif
+
 _LIBCPP_BEGIN_UNVERSIONED_NAMESPACE_STD
 #if !defined(_LIBCPP_ABI_VCRUNTIME)
 
diff --git a/libcxx/include/__ranges/transform_view.h b/libcxx/include/__ranges/transform_view.h
index ae85dfa452d72..ab1adf9cdbe68 100644
--- a/libcxx/include/__ranges/transform_view.h
+++ b/libcxx/include/__ranges/transform_view.h
@@ -13,7 +13,6 @@
 #include <__compare/three_way_comparable.h>
 #include <__concepts/constructible.h>
 #include <__concepts/convertible_to.h>
-#include <__concepts/copyable.h>
 #include <__concepts/derived_from.h>
 #include <__concepts/equality_comparable.h>
 #include <__concepts/invocable.h>
@@ -64,7 +63,7 @@ concept __regular_invocable_with_range_ref = regular_invocable<_Fn, range_refere
 template <class _View, class _Fn>
 concept __transform_view_constraints =
     view<_View> && is_object_v<_Fn> && regular_invocable<_Fn&, range_reference_t<_View>> &&
-    __is_referenceable_v<invoke_result_t<_Fn&, range_reference_t<_View>>>;
+    __referenceable<invoke_result_t<_Fn&, range_reference_t<_View>>>;
 
 #  if _LIBCPP_STD_VER >= 23
 template <input_range _View, move_constructible _Fn>
diff --git a/libcxx/include/__tree b/libcxx/include/__tree
index 0738c8c6a5e2b..694796922c914 100644
--- a/libcxx/include/__tree
+++ b/libcxx/include/__tree
@@ -823,18 +823,6 @@ public:
   using __node_allocator _LIBCPP_NODEBUG = __rebind_alloc<__alloc_traits, __node>;
   using __node_traits _LIBCPP_NODEBUG    = allocator_traits<__node_allocator>;
 
-// TODO(LLVM 22): Remove this check
-#ifndef _LIBCPP_ABI_TREE_REMOVE_NODE_POINTER_UB
-  static_assert(sizeof(__node_base_pointer) == sizeof(__end_node_pointer) && _LIBCPP_ALIGNOF(__node_base_pointer) ==
-                    _LIBCPP_ALIGNOF(__end_node_pointer),
-                "It looks like you are using std::__tree (an implementation detail for (multi)map/set) with a fancy "
-                "pointer type that thas a different representation depending on whether it points to a __tree base "
-                "pointer or a __tree node pointer (both of which are implementation details of the standard library). "
-                "This means that your ABI is being broken between LLVM 19 and LLVM 20. If you don't care about your "
-                "ABI being broken, define the _LIBCPP_ABI_TREE_REMOVE_NODE_POINTER_UB macro to silence this "
-                "diagnostic.");
-#endif
-
 private:
   // check for sane allocator pointer rebinding semantics. Rebinding the
   // allocator for a new pointer type should be exactly the same as rebinding
diff --git a/libcxx/include/__type_traits/is_within_lifetime.h b/libcxx/include/__type_traits/is_within_lifetime.h
new file mode 100644
index 0000000000000..242f2adaf357b
--- /dev/null
+++ b/libcxx/include/__type_traits/is_within_lifetime.h
@@ -0,0 +1,29 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___TYPE_TRAITS_IS_WITHIN_LIFETIME_H
+#define _LIBCPP___TYPE_TRAITS_IS_WITHIN_LIFETIME_H
+
+#include <__config>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+#if _LIBCPP_STD_VER >= 26 && __has_builtin(__builtin_is_within_lifetime)
+template <class _Tp>
+_LIBCPP_HIDE_FROM_ABI consteval bool is_within_lifetime(const _Tp* __p) noexcept {
+  return __builtin_is_within_lifetime(__p);
+}
+#endif
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP___TYPE_TRAITS_IS_WITHIN_LIFETIME_H
diff --git a/libcxx/include/__type_traits/reference_constructs_from_temporary.h b/libcxx/include/__type_traits/reference_constructs_from_temporary.h
index 2ff549b4e15ce..3d097ce90cb09 100644
--- a/libcxx/include/__type_traits/reference_constructs_from_temporary.h
+++ b/libcxx/include/__type_traits/reference_constructs_from_temporary.h
@@ -30,14 +30,8 @@ _LIBCPP_NO_SPECIALIZATIONS inline constexpr bool reference_constructs_from_tempo
 
 #endif
 
-#if __has_builtin(__reference_constructs_from_temporary)
 template <class _Tp, class _Up>
 inline const bool __reference_constructs_from_temporary_v = __reference_constructs_from_temporary(_Tp, _Up);
-#else
-// TODO(LLVM 22): Remove this as all supported compilers should have __reference_constructs_from_temporary implemented.
-template <class _Tp, class _Up>
-inline const bool __reference_constructs_from_temporary_v = __reference_binds_to_temporary(_Tp, _Up);
-#endif
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/deque b/libcxx/include/deque
index 3e7ee8d8565b6..ab41b9db9de26 100644
--- a/libcxx/include/deque
+++ b/libcxx/include/deque
@@ -193,7 +193,6 @@ template <class T, class Allocator, class Predicate>
 #  include <__algorithm/move_backward.h>
 #  include <__algorithm/remove.h>
 #  include <__algorithm/remove_if.h>
-#  include <__algorithm/unwrap_iter.h>
 #  include <__assert>
 #  include <__config>
 #  include <__debug_utils/sanitizers.h>
@@ -220,11 +219,9 @@ template <class T, class Allocator, class Predicate>
 #  include <__ranges/concepts.h>
 #  include <__ranges/container_compatible_range.h>
 #  include <__ranges/from_range.h>
-#  include <__ranges/size.h>
 #  include <__split_buffer>
 #  include <__type_traits/conditional.h>
 #  include <__type_traits/container_traits.h>
-#  include <__type_traits/disjunction.h>
 #  include <__type_traits/enable_if.h>
 #  include <__type_traits/is_allocator.h>
 #  include <__type_traits/is_convertible.h>
diff --git a/libcxx/include/forward_list b/libcxx/include/forward_list
index df7da20cfb611..272e52d68f46a 100644
--- a/libcxx/include/forward_list
+++ b/libcxx/include/forward_list
@@ -223,14 +223,12 @@ template <class T, class Allocator, class Predicate>
 #  include <__ranges/concepts.h>
 #  include <__ranges/container_compatible_range.h>
 #  include <__ranges/from_range.h>
-#  include <__type_traits/conditional.h>
 #  include <__type_traits/container_traits.h>
 #  include <__type_traits/enable_if.h>
 #  include <__type_traits/is_allocator.h>
 #  include <__type_traits/is_const.h>
 #  include <__type_traits/is_nothrow_assignable.h>
 #  include <__type_traits/is_nothrow_constructible.h>
-#  include <__type_traits/is_pointer.h>
 #  include <__type_traits/is_same.h>
 #  include <__type_traits/is_swappable.h>
 #  include <__type_traits/remove_cv.h>
@@ -284,17 +282,6 @@ struct __forward_node_traits {
   typedef _NodePtr __node_pointer;
   typedef __forward_begin_node<_NodePtr> __begin_node;
   typedef __rebind_pointer_t<_NodePtr, __begin_node> __begin_node_pointer;
-
-// TODO(LLVM 22): Remove this check
-#  ifndef _LIBCPP_ABI_FORWARD_LIST_REMOVE_NODE_POINTER_UB
-  static_assert(sizeof(__begin_node_pointer) == sizeof(__node_pointer) && _LIBCPP_ALIGNOF(__begin_node_pointer) ==
-                    _LIBCPP_ALIGNOF(__node_pointer),
-                "It looks like you are using std::forward_list with a fancy pointer type that thas a different "
-                "representation depending on whether it points to a forward_list base pointer or a forward_list node "
-                "pointer (both of which are implementation details of the standard library). This means that your ABI "
-                "is being broken between LLVM 19 and LLVM 20. If you don't care about your ABI being broken, define "
-                "the _LIBCPP_ABI_FORWARD_LIST_REMOVE_NODE_POINTER_UB macro to silence this diagnostic.");
-#  endif
 };
 
 template <class _NodePtr>
diff --git a/libcxx/include/list b/libcxx/include/list
index c5c2a8508999c..2898a45da0029 100644
--- a/libcxx/include/list
+++ b/libcxx/include/list
@@ -228,13 +228,11 @@ template <class T, class Allocator, class Predicate>
 #  include <__ranges/concepts.h>
 #  include <__ranges/container_compatible_range.h>
 #  include <__ranges/from_range.h>
-#  include <__type_traits/conditional.h>
 #  include <__type_traits/container_traits.h>
 #  include <__type_traits/enable_if.h>
 #  include <__type_traits/is_allocator.h>
 #  include <__type_traits/is_nothrow_assignable.h>
 #  include <__type_traits/is_nothrow_constructible.h>
-#  include <__type_traits/is_pointer.h>
 #  include <__type_traits/is_same.h>
 #  include <__type_traits/type_identity.h>
 #  include <__utility/exception_guard.h>
@@ -276,17 +274,6 @@ template <class _Tp, class _VoidPtr>
 struct __list_node_pointer_traits {
   typedef __rebind_pointer_t<_VoidPtr, __list_node<_Tp, _VoidPtr> > __node_pointer;
   typedef __rebind_pointer_t<_VoidPtr, __list_node_base<_Tp, _VoidPtr> > __base_pointer;
-
-// TODO(LLVM 22): Remove this check
-#  ifndef _LIBCPP_ABI_LIST_REMOVE_NODE_POINTER_UB
-  static_assert(sizeof(__node_pointer) == sizeof(__node_pointer) && _LIBCPP_ALIGNOF(__base_pointer) ==
-                    _LIBCPP_ALIGNOF(__node_pointer),
-                "It looks like you are using std::list with a fancy pointer type that thas a different representation "
-                "depending on whether it points to a list base pointer or a list node pointer (both of which are "
-                "implementation details of the standard library). This means that your ABI is being broken between "
-                "LLVM 19 and LLVM 20. If you don't care about your ABI being broken, define the "
-                "_LIBCPP_ABI_LIST_REMOVE_NODE_POINTER_UB macro to silence this diagnostic.");
-#  endif
 };
 
 template <class _Tp, class _VoidPtr>
diff --git a/libcxx/include/map b/libcxx/include/map
index 3ff849afcde09..cc8b8769189d1 100644
--- a/libcxx/include/map
+++ b/libcxx/include/map
@@ -600,9 +600,7 @@ erase_if(multimap<Key, T, Compare, Allocator>& c, Predicate pred);  // C++20
 #  include <__ranges/from_range.h>
 #  include <__tree>
 #  include <__type_traits/container_traits.h>
-#  include <__type_traits/desugars_to.h>
 #  include <__type_traits/is_allocator.h>
-#  include <__type_traits/is_convertible.h>
 #  include <__type_traits/make_transparent.h>
 #  include <__type_traits/remove_const.h>
 #  include <__type_traits/type_identity.h>
diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in
index a86d6c6a43d0e..24a2fe761943a 100644
--- a/libcxx/include/module.modulemap.in
+++ b/libcxx/include/module.modulemap.in
@@ -350,6 +350,7 @@ module std_core [system] {
       header "__type_traits/is_volatile.h"
       export std_core.type_traits.integral_constant
     }
+    module is_within_lifetime                         { header "__type_traits/is_within_lifetime.h" }
     module lazy                                       { header "__type_traits/lazy.h" }
     module make_32_64_or_128_bit                      { header "__type_traits/make_32_64_or_128_bit.h" }
     module make_const_lvalue_ref                      { header "__type_traits/make_const_lvalue_ref.h" }
@@ -1587,6 +1588,7 @@ module std [system] {
       textual header "__locale_dir/support/freebsd.h"
       textual header "__locale_dir/support/fuchsia.h"
       textual header "__locale_dir/support/linux.h"
+      textual header "__locale_dir/support/netbsd.h"
       textual header "__locale_dir/support/no_locale/characters.h"
       textual header "__locale_dir/support/no_locale/strtonum.h"
       textual header "__locale_dir/support/windows.h"
diff --git a/libcxx/include/set b/libcxx/include/set
index 59ed0155c1def..d58b6e96b061d 100644
--- a/libcxx/include/set
+++ b/libcxx/include/set
@@ -524,7 +524,6 @@ erase_if(multiset<Key, Compare, Allocator>& c, Predicate pred);  // C++20
 #  include <__functional/operations.h>
 #  include <__iterator/erase_if_container.h>
 #  include <__iterator/iterator_traits.h>
-#  include <__iterator/ranges_iterator_traits.h>
 #  include <__iterator/reverse_iterator.h>
 #  include <__memory/allocator.h>
 #  include <__memory/allocator_traits.h>
@@ -538,7 +537,6 @@ erase_if(multiset<Key, Compare, Allocator>& c, Predicate pred);  // C++20
 #  include <__type_traits/container_traits.h>
 #  include <__type_traits/enable_if.h>
 #  include <__type_traits/is_allocator.h>
-#  include <__type_traits/is_nothrow_assignable.h>
 #  include <__type_traits/is_nothrow_constructible.h>
 #  include <__type_traits/is_same.h>
 #  include <__type_traits/is_swappable.h>
diff --git a/libcxx/include/tuple b/libcxx/include/tuple
index 5f3bb72e0678b..466f501b5f4f8 100644
--- a/libcxx/include/tuple
+++ b/libcxx/include/tuple
@@ -301,7 +301,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 bool __tuple_compare_equal(c
 template <class _Tp, class _Up, class _IndexSeq = make_index_sequence<tuple_size_v<_Tp>>>
 inline constexpr bool __can_tuple_compare_equal = false;
 
-// TODO(LLVM 22): Remove `tuple_size_v<_Tp> == tuple_size_v<_Up>` here once once LLVM-20 support ends
+// TODO(LLVM 23): Remove `tuple_size_v<_Tp> == tuple_size_v<_Up>` here once once LLVM-20 support ends
 // because the resolution of CWG2369 landed in LLVM-21.
 template <class _Tp, class _Up, size_t... _Is>
   requires(tuple_size_v<_Tp> == tuple_size_v<_Up>)
@@ -328,7 +328,7 @@ concept __tuple_like_no_tuple = __tuple_like<_Tp> && !__is_tuple_v<_Tp>;
 template <class _Tp, class _Up, class _IndexSeq>
 struct __tuple_common_comparison_category_impl {};
 
-// TODO(LLVM 22): Remove `tuple_size_v<_Tp> == tuple_size_v<_Up>` here once once LLVM-20 support ends
+// TODO(LLVM 23): Remove `tuple_size_v<_Tp> == tuple_size_v<_Up>` here once once LLVM-20 support ends
 // because the resolution of CWG2369 landed in LLVM-21.
 template <class _Tp, class _Up, size_t... _Is>
   requires(tuple_size_v<_Tp> == tuple_size_v<_Up>) && requires {
diff --git a/libcxx/include/type_traits b/libcxx/include/type_traits
index a6e0c1867566b..dab0c0640c389 100644
--- a/libcxx/include/type_traits
+++ b/libcxx/include/type_traits
@@ -454,6 +454,10 @@ namespace std
       template<class B> inline constexpr bool negation_v
         = negation<B>::value;                                   // since C++17
 
+      // [meta.const.eval], constant evaluation context
+      constexpr bool is_constant_evaluated() noexcept;                   // C++20
+      template<class T>
+        consteval bool is_within_lifetime(const T*) noexcept;            // C++26
 }
 
 */
@@ -559,6 +563,10 @@ namespace std
 #    include <__type_traits/reference_converts_from_temporary.h>
 #  endif
 
+#  if _LIBCPP_STD_VER >= 26
+#    include <__type_traits/is_within_lifetime.h>
+#  endif
+
 #  include <version>
 
 #  if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/unordered_set b/libcxx/include/unordered_set
index 4d0e2ac21e125..9873f1ec70664 100644
--- a/libcxx/include/unordered_set
+++ b/libcxx/include/unordered_set
@@ -544,8 +544,6 @@ template <class Value, class Hash, class Pred, class Alloc>
 #  include <__iterator/distance.h>
 #  include <__iterator/erase_if_container.h>
 #  include <__iterator/iterator_traits.h>
-#  include <__iterator/ranges_iterator_traits.h>
-#  include <__memory/addressof.h>
 #  include <__memory/allocator.h>
 #  include <__memory/allocator_traits.h>
 #  include <__memory_resource/polymorphic_allocator.h>
@@ -558,7 +556,6 @@ template <class Value, class Hash, class Pred, class Alloc>
 #  include <__type_traits/invoke.h>
 #  include <__type_traits/is_allocator.h>
 #  include <__type_traits/is_integral.h>
-#  include <__type_traits/is_nothrow_assignable.h>
 #  include <__type_traits/is_nothrow_constructible.h>
 #  include <__type_traits/is_same.h>
 #  include <__type_traits/is_swappable.h>
diff --git a/libcxx/include/variant b/libcxx/include/variant
index 9beef146f203c..8e958581a6b07 100644
--- a/libcxx/include/variant
+++ b/libcxx/include/variant
@@ -1299,7 +1299,7 @@ public:
     __impl_.__swap(__that.__impl_);
   }
 
-#    if _LIBCPP_STD_VER >= 26 && _LIBCPP_HAS_EXPLICIT_THIS_PARAMETER
+#    if _LIBCPP_STD_VER >= 26
   // Helper class to implement [variant.visit]/10
   //   Constraints: The call to visit does not use an explicit template-argument-list
   //   that begins with a type template-argument.
diff --git a/libcxx/include/version b/libcxx/include/version
index 0fef1bb87cf60..b0030602f854a 100644
--- a/libcxx/include/version
+++ b/libcxx/include/version
@@ -71,6 +71,8 @@ __cpp_lib_constexpr_charconv                            202207L <charconv>
 __cpp_lib_constexpr_cmath                               202202L <cmath> <cstdlib>
 __cpp_lib_constexpr_complex                             201711L <complex>
 __cpp_lib_constexpr_dynamic_alloc                       201907L <memory>
+__cpp_lib_constexpr_flat_map                            202502L <flat_map>
+__cpp_lib_constexpr_flat_set                            202502L <flat_set>
 __cpp_lib_constexpr_forward_list                        202502L <forward_list>
 __cpp_lib_constexpr_functional                          201907L <functional>
 __cpp_lib_constexpr_iterator                            201811L <iterator>
@@ -552,6 +554,8 @@ __cpp_lib_void_t                                        201411L <type_traits>
 # define __cpp_lib_bitset                               202306L
 # undef  __cpp_lib_constexpr_algorithms
 # define __cpp_lib_constexpr_algorithms                 202306L
+# define __cpp_lib_constexpr_flat_map                   202502L
+# define __cpp_lib_constexpr_flat_set                   202502L
 # define __cpp_lib_constexpr_forward_list               202502L
 # define __cpp_lib_constexpr_list                       202502L
 # if !defined(_LIBCPP_ABI_VCRUNTIME)
@@ -582,7 +586,9 @@ __cpp_lib_void_t                                        201411L <type_traits>
 # if __has_builtin(__builtin_is_virtual_base_of)
 #   define __cpp_lib_is_virtual_base_of                 202406L
 # endif
-// # define __cpp_lib_is_within_lifetime                   202306L
+# if __has_builtin(__builtin_is_within_lifetime)
+#   define __cpp_lib_is_within_lifetime                 202306L
+# endif
 // # define __cpp_lib_linalg                               202311L
 # undef  __cpp_lib_mdspan
 # define __cpp_lib_mdspan                               202406L
diff --git a/libcxx/modules/std/type_traits.inc b/libcxx/modules/std/type_traits.inc
index 6823c86ed153b..4e49ed8f255c7 100644
--- a/libcxx/modules/std/type_traits.inc
+++ b/libcxx/modules/std/type_traits.inc
@@ -330,6 +330,9 @@ export namespace std {
 
   // [meta.const.eval], constant evaluation context
   using std::is_constant_evaluated;
+#if _LIBCPP_STD_VER >= 26 && __has_builtin(__builtin_is_within_lifetime)
+  using std::is_within_lifetime;
+#endif
 
   // [depr.meta.types]
   using std::aligned_storage;
diff --git a/libcxx/test/libcxx-03/utilities/meta/is_referenceable.compile.pass.cpp b/libcxx/test/libcxx-03/utilities/meta/is_referenceable.compile.pass.cpp
index 093bbae289723..f39d1a5da41af 100644
--- a/libcxx/test/libcxx-03/utilities/meta/is_referenceable.compile.pass.cpp
+++ b/libcxx/test/libcxx-03/utilities/meta/is_referenceable.compile.pass.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 
-// __is_referenceable_v<Tp>
+// __libcpp_is_referenceable<Tp>
 //
 // [defns.referenceable] defines "a referenceable type" as:
 // An object type, a function type that does not have cv-qualifiers
diff --git a/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert.temporary.pass.cpp b/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert.temporary.pass.cpp
index 248f282209fd7..acd20ce525a0d 100644
--- a/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert.temporary.pass.cpp
+++ b/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert.temporary.pass.cpp
@@ -21,7 +21,7 @@
 #include "../flat_helpers.h"
 #include "test_macros.h"
 
-bool test() {
+constexpr bool test() {
   using M = std::flat_multiset<TrackCopyMove>;
   {
     M m;
@@ -43,6 +43,9 @@ bool test() {
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert_range.pass.cpp b/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert_range.pass.cpp
index 57a581c6c5cb9..c2fcd86fcf913 100644
--- a/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert_range.pass.cpp
+++ b/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert_range.pass.cpp
@@ -20,27 +20,36 @@
 #include <cassert>
 #include <flat_set>
 #include <ranges>
-#include <sstream>
 #include <vector>
 
 #include "../flat_helpers.h"
+#include "test_iterators.h"
 #include "test_macros.h"
 
-void test() {
+constexpr bool test() {
   NotQuiteSequenceContainer<int> v;
   std::flat_multiset s(v);
-  std::istringstream ints("0 1 1 0");
-  auto r = std::ranges::subrange(std::istream_iterator<int>(ints), std::istream_iterator<int>()) |
-           std::views::transform([](int i) { return i * i; });
+
+  int ar[]   = {0, 1, 1, 0};
+  using Iter = cpp20_input_iterator<const int*>;
+  using Sent = sentinel_wrapper<Iter>;
+  using R    = std::ranges::subrange<Iter, Sent>;
+  auto r     = R(Iter(ar), Sent(Iter(ar + 4)));
+
   static_assert(
       ![](auto& t) { return requires { t.insert_range(t.end(), r); }; }(v),
       "This test is to test the case where the underlying container does not provide insert_range");
   s.insert_range(r);
   assert(std::ranges::equal(s, std::vector<int>{0, 0, 1, 1}));
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/libcxx/input.output/iostreams.base/ios.base/ios.base.cons/dtor.uninitialized.pass.cpp b/libcxx/test/libcxx/input.output/iostreams.base/ios.base/ios.base.cons/dtor.uninitialized.pass.cpp
index f17c1483c4a99..e5d48a35f4fd7 100644
--- a/libcxx/test/libcxx/input.output/iostreams.base/ios.base/ios.base.cons/dtor.uninitialized.pass.cpp
+++ b/libcxx/test/libcxx/input.output/iostreams.base/ios.base/ios.base.cons/dtor.uninitialized.pass.cpp
@@ -6,9 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// TODO(mordante) Investigate
-// UNSUPPORTED: apple-clang
-
 // UNSUPPORTED: no-exceptions
 
 // The fix for issue 57964 requires an updated dylib due to explicit
diff --git a/libcxx/test/libcxx/numerics/c.math/constexpr-cxx23-clang.pass.cpp b/libcxx/test/libcxx/numerics/c.math/constexpr-cxx23-clang.pass.cpp
index 3f17f21e8c108..20887b8cf2678 100644
--- a/libcxx/test/libcxx/numerics/c.math/constexpr-cxx23-clang.pass.cpp
+++ b/libcxx/test/libcxx/numerics/c.math/constexpr-cxx23-clang.pass.cpp
@@ -220,16 +220,9 @@ int main(int, char**) {
   ASSERT_CONSTEXPR_CXX23(std::isnormal(-1.0) == 1);
   ASSERT_CONSTEXPR_CXX23(std::isnormal(-1.0L) == 1);
 
-// TODO(LLVM 22): Remove `__has_constexpr_builtin` conditional once support for Clang 19 is dropped.
-#if !__has_constexpr_builtin(__builtin_signbit)
-  ASSERT_NOT_CONSTEXPR_CXX23(std::signbit(-1.0f) == 1);
-  ASSERT_NOT_CONSTEXPR_CXX23(std::signbit(-1.0) == 1);
-  ASSERT_NOT_CONSTEXPR_CXX23(std::signbit(-1.0L) == 1);
-#else
   ASSERT_CONSTEXPR_CXX23(std::signbit(-1.0f) == 1);
   ASSERT_CONSTEXPR_CXX23(std::signbit(-1.0) == 1);
   ASSERT_CONSTEXPR_CXX23(std::signbit(-1.0L) == 1);
-#endif
 
   ASSERT_NOT_CONSTEXPR_CXX23(std::isgreater(-1.0f, 0.0f) == 0);
   ASSERT_NOT_CONSTEXPR_CXX23(std::isgreater(-1.0, 0.0) == 0);
diff --git a/libcxx/test/libcxx/utilities/expected/expected.expected/transform_error.mandates.verify.cpp b/libcxx/test/libcxx/utilities/expected/expected.expected/transform_error.mandates.verify.cpp
index 09ebd0069b3a9..3e9bdd98cd394 100644
--- a/libcxx/test/libcxx/utilities/expected/expected.expected/transform_error.mandates.verify.cpp
+++ b/libcxx/test/libcxx/utilities/expected/expected.expected/transform_error.mandates.verify.cpp
@@ -8,15 +8,6 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
 
-// With clang-cl, some warnings have a 'which is a Microsoft extension' suffix
-// which break the tests. But #102851 will turn it into an error, making the test pass.
-// However, upstream libcxx buildbots do not build clang from source while testing, so
-// this tests still expected to fail on these bots.
-//
-// TODO(LLVM 22): Remove '0-1' from 'expected-error-re@*:* 0-1 {{union member {{.*}} has reference type {{.*}}}}'
-// and remove 'expected-warning-re@*:* 0-1 {{union member {{.*}} has reference type {{.*}}, which is a Microsoft extension}}'
-// once LLVM 22 releases. See https://llvm.org/PR104885.
-
 // Test the mandates
 
 // template<class F> constexpr auto transform_error(F&& f) &;
@@ -55,41 +46,39 @@ void test() {
   {
     std::expected<int, int> e;
     e.transform_error(return_unexpected<int&>); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}}
-    // expected-error-re@*:* 0-1 {{no matching constructor for initialization of{{.*}}}}
     // expected-error-re@*:* {{static assertion failed {{.*}}[expected.object.general] A program that instantiates the definition of template expected<T, E> for {{.*}} is ill-formed.}}
-    // expected-error-re@*:* 0-1 {{union member {{.*}} has reference type {{.*}}}}
+    // expected-error-re@*:* {{union member {{.*}} has reference type {{.*}}}}
 
     e.transform_error(return_no_object<int&>); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}}
-    // expected-error-re@*:* 0-1 {{no matching constructor for initialization of{{.*}}}}
+    // expected-error-re@*:* {{no matching constructor for initialization of{{.*}}}}
     // expected-error-re@*:* {{static assertion failed {{.*}}[expected.object.general] A program that instantiates the definition of template expected<T, E> for {{.*}} is ill-formed.}}
-    // expected-warning-re@*:* 0-1 {{union member {{.*}} has reference type {{.*}}, which is a Microsoft extension}}
   }
 
   // Test const& overload
   {
     const std::expected<int, int> e;
     e.transform_error(return_unexpected<const int &>); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}}
-    // expected-error-re@*:* 0-2 {{no matching constructor for initialization of{{.*}}}}
+    // expected-error-re@*:* {{no matching constructor for initialization of{{.*}}}}
     e.transform_error(return_no_object<const int &>); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}}
-    // expected-error-re@*:* 0-2 {{no matching constructor for initialization of{{.*}}}}
+    // expected-error-re@*:* {{no matching constructor for initialization of{{.*}}}}
   }
 
   // Test && overload
   {
     std::expected<int, int> e;
     std::move(e).transform_error(return_unexpected<int&&>); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}}
-    // expected-error-re@*:* 0-2 {{no matching constructor for initialization of{{.*}}}}
+    // expected-error-re@*:* {{no matching constructor for initialization of{{.*}}}}
     std::move(e).transform_error(return_no_object<int&&>); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}}
-    // expected-error-re@*:* 0-2 {{no matching constructor for initialization of{{.*}}}}
+    // expected-error-re@*:* {{no matching constructor for initialization of{{.*}}}}
   }
 
   // Test const&& overload
   {
     const std::expected<int, int> e;
     std::move(e).transform_error(return_unexpected<const int&&>); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}}
-    // expected-error-re@*:* 0-2 {{no matching constructor for initialization of{{.*}}}}
+    // expected-error-re@*:* {{no matching constructor for initialization of{{.*}}}}
     std::move(e).transform_error(return_no_object<const int&&>); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}}
-    // expected-error-re@*:* 0-2 {{no matching constructor for initialization of{{.*}}}}
+    // expected-error-re@*:* {{no matching constructor for initialization of{{.*}}}}
   }
 }
 // clang-format on
diff --git a/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp b/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp
index 9fd7452af64fb..c5acc27af03ea 100644
--- a/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp
+++ b/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp
@@ -8,16 +8,6 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
 
-// With clang-cl, some warnings have a 'which is a Microsoft extension' suffix
-// which break the tests. But #102851 will turn it into an error, making the test pass.
-// However, upstream libcxx buildbots do not build clang from source while testing, so
-// this tests still expected to fail on these bots.
-//
-// TODO(LLVM 22): Remove '0-1' from 'expected-error-re@*:* 0-1 {{union member {{.*}} has reference type {{.*}}}}'
-// and remove 'expected-warning-re@*:* 0-1 {{union member {{.*}} has reference type {{.*}}, which is a Microsoft extension}}'
-// and remove 'expected-error-re@*:* 0-1 {{call to deleted constructor of {{.*}}}}'
-// once LLVM 22 releases. See See https://llvm.org/PR104885.
-
 // Test the mandates
 
 // template<class F> constexpr auto transform_error(F&& f) &;
@@ -56,43 +46,36 @@ void test() {
   {
     std::expected<void, int> e;
     e.transform_error(return_unexpected<int&>); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}}
-    // expected-error-re@*:* 0-1 {{no matching constructor for initialization of{{.*}}}}
+    // expected-error-re@*:* {{no matching constructor for initialization of{{.*}}}}
     // expected-error-re@*:* {{static assertion failed {{.*}}A program that instantiates expected<T, E> with a E that is not a valid argument for unexpected<E> is ill-formed}}
-    // expected-error-re@*:* 0-1 {{call to deleted constructor of {{.*}}}}
-    // expected-error-re@*:* 0-1 {{union member {{.*}} has reference type {{.*}}}}
+    // expected-error-re@*:* {{union member {{.*}} has reference type {{.*}}}}
 
     e.transform_error(return_no_object<int&>); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}}
-    // expected-error-re@*:* 0-1 {{no matching constructor for initialization of{{.*}}}}
+    // expected-error-re@*:* {{no matching constructor for initialization of{{.*}}}}
     // expected-error-re@*:* {{static assertion failed {{.*}}A program that instantiates expected<T, E> with a E that is not a valid argument for unexpected<E> is ill-formed}}
-    // expected-warning-re@*:* 0-1 {{union member {{.*}} has reference type {{.*}}, which is a Microsoft extension}}
   }
 
   // Test const& overload
   {
     const std::expected<void, int> e;
     e.transform_error(return_unexpected<const int &>); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}}
-    // expected-error-re@*:* 0-1 {{no matching constructor for initialization of{{.*}}}}
+    // expected-error-re@*:* {{no matching constructor for initialization of{{.*}}}}
     e.transform_error(return_no_object<const int &>); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}}
-    // expected-error-re@*:* 0-1 {{no matching constructor for initialization of{{.*}}}}
-    // expected-error-re@*:* 0-1 {{call to deleted constructor of {{.*}}}}
+    // expected-error-re@*:* {{no matching constructor for initialization of{{.*}}}}
   }
 
   // Test && overload
   {
     std::expected<void, int> e;
     std::move(e).transform_error(return_unexpected<int&&>); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}}
-    // expected-error-re@*:* 0-1 {{no matching constructor for initialization of{{.*}}}}
     std::move(e).transform_error(return_no_object<int&&>); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}}
-    // expected-error-re@*:* 0-1 {{no matching constructor for initialization of{{.*}}}}
   }
 
   // Test const&& overload
   {
     const std::expected<void, int> e;
     std::move(e).transform_error(return_unexpected<const int&&>); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}}
-    // expected-error-re@*:* 0-1 {{no matching constructor for initialization of{{.*}}}}
     std::move(e).transform_error(return_no_object<const int&&>); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}}
-    // expected-error-re@*:* 0-1 {{no matching constructor for initialization of{{.*}}}}
   }
 }
 // clang-format on
diff --git a/libcxx/test/libcxx/utilities/function.objects/lifetimebound.verify.cpp b/libcxx/test/libcxx/utilities/function.objects/lifetimebound.verify.cpp
new file mode 100644
index 0000000000000..5c66bc11fca4c
--- /dev/null
+++ b/libcxx/test/libcxx/utilities/function.objects/lifetimebound.verify.cpp
@@ -0,0 +1,20 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// ADDITIONAL_COMPILE_FLAGS: -Wno-pessimizing-move -Wno-unused-variable
+
+#include <functional>
+
+#include "test_macros.h"
+
+// clang-format off
+
+void func() {
+  auto&& v1 = std::identity()(1); // expected-warning {{temporary bound to local reference 'v1' will be destroyed at the end of the full-expression}}
+}
diff --git a/libcxx/test/libcxx/utilities/meta/is_within_lifetime.verify.cpp b/libcxx/test/libcxx/utilities/meta/is_within_lifetime.verify.cpp
new file mode 100644
index 0000000000000..ff3ecfbbc120c
--- /dev/null
+++ b/libcxx/test/libcxx/utilities/meta/is_within_lifetime.verify.cpp
@@ -0,0 +1,26 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23
+// UNSUPPORTED: gcc-15, apple-clang-17
+
+// <type_traits>
+
+// LWG4138 <https://cplusplus.github.io/LWG/issue4138>
+// std::is_within_lifetime shouldn't work when a function type is
+// explicitly specified, even if it isn't evaluated
+
+#include <type_traits>
+
+template <class T>
+consteval bool checked_is_within_lifetime(T* p) {
+  return p ? std::is_within_lifetime<T>(p) : false;
+}
+static_assert(!checked_is_within_lifetime<int>(nullptr));
+static_assert(!checked_is_within_lifetime<void()>(nullptr));
+// expected-error@*:* {{function pointer argument to '__builtin_is_within_lifetime' is not allowed}}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/empty.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/empty.pass.cpp
index 52f77438df2ce..88a76d3c1c8b8 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/empty.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/empty.pass.cpp
@@ -24,7 +24,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, std::less<int>, KeyContainer>;
   M m;
@@ -38,15 +38,23 @@ void test_one() {
   assert(m.empty());
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/max_size.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/max_size.pass.cpp
index 4e3d1414b28af..fb9c38f592262 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/max_size.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/max_size.pass.cpp
@@ -24,7 +24,7 @@
 #include "test_allocator.h"
 #include "test_macros.h"
 
-void test() {
+constexpr bool test() {
   {
     using A1 = limited_allocator<int, 10>;
     using C  = std::flat_multiset<int, std::less<int>, std::vector<int, A1>>;
@@ -59,10 +59,15 @@ void test() {
     assert(c.max_size() <= max_dist);
     assert(c.max_size() <= alloc_max_size(std::allocator<char>()));
   }
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/size.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/size.pass.cpp
index 4aff08b8127b6..156bb27fae992 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/size.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/size.pass.cpp
@@ -7,6 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=200000000
+// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-ops-limit): -fconstexpr-ops-limit=800000000
 
 // <flat_set>
 
@@ -23,7 +25,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using M = std::flat_multiset<int, std::less<int>, KeyContainer>;
   using S = typename M::size_type;
   {
@@ -46,7 +48,7 @@ void test_one() {
   }
   {
     M m;
-    S s = 500000;
+    S s = 5000;
     for (std::size_t i = 0u; i < s; ++i) {
       m.emplace(i);
       m.emplace(i);
@@ -57,15 +59,23 @@ void test_one() {
   }
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/alloc.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/alloc.pass.cpp
index 4fffcb304d20a..2426fbc0fc063 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/alloc.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/alloc.pass.cpp
@@ -14,6 +14,7 @@
 //   explicit flat_multiset(const Allocator& a);
 
 #include <cassert>
+#include <deque>
 #include <flat_set>
 #include <functional>
 #include <vector>
@@ -22,7 +23,8 @@
 #include "test_allocator.h"
 #include "../../../test_compare.h"
 
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
   {
     // The constructors in this subclause shall not participate in overload
     // resolution unless uses_allocator_v<container_type, Alloc> is true
@@ -30,8 +32,8 @@ void test() {
     using C  = test_less<int>;
     using A1 = test_allocator<int>;
     using A2 = other_allocator<int>;
-    using V1 = std::vector<int, A1>;
-    using V2 = std::vector<int, A2>;
+    using V1 = KeyContainer<int, A1>;
+    using V2 = KeyContainer<int, A2>;
     using M1 = std::flat_multiset<int, C, V1>;
     using M2 = std::flat_multiset<int, C, V2>;
     static_assert(std::is_constructible_v<M1, const A1&>);
@@ -39,25 +41,38 @@ void test() {
     static_assert(!std::is_constructible_v<M1, const A2&>);
     static_assert(!std::is_constructible_v<M2, const A1&>);
   }
-  {
-    // explicit
-    using M = std::flat_multiset<int, std::less<int>, std::vector<int, test_allocator<int>>>;
-
-    static_assert(std::is_constructible_v<M, test_allocator<int>>);
-    static_assert(!std::is_convertible_v<test_allocator<int>, M>);
-  }
   {
     using A = test_allocator<short>;
-    using M = std::flat_multiset<int, std::less<int>, std::vector<int, test_allocator<int>>>;
+    using M = std::flat_multiset<int, std::less<int>, KeyContainer<int, test_allocator<int>>>;
     M m(A(0, 5));
     assert(m.empty());
     assert(m.begin() == m.end());
     assert(std::move(m).extract().get_allocator().get_id() == 5);
   }
+  {
+    // explicit
+    using M = std::flat_multiset<int, std::less<int>, KeyContainer<int, test_allocator<int>>>;
+
+    static_assert(std::is_constructible_v<M, test_allocator<int>>);
+    static_assert(!std::is_convertible_v<test_allocator<int>, M>);
+  }
+}
+
+constexpr bool test() {
+  test<std::vector>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test<std::deque>();
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/assign_initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/assign_initializer_list.pass.cpp
index ae81ab044932d..a895117517ef4 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/assign_initializer_list.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/assign_initializer_list.pass.cpp
@@ -26,7 +26,7 @@
 #include "test_allocator.h"
 
 template <class KeyContainer>
-void test() {
+constexpr void test() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
   {
@@ -53,16 +53,24 @@ void test() {
   }
 }
 
-void test() {
+constexpr bool test() {
   test<std::vector<int>>();
   test<std::vector<double>>();
-  test<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test<std::deque<int>>();
   test<MinSequenceContainer<int>>();
   test<std::vector<int, min_allocator<int>>>();
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/compare.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/compare.pass.cpp
index 6b68589e6814f..43ebea740f66c 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/compare.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/compare.pass.cpp
@@ -20,11 +20,35 @@
 #include <type_traits>
 #include <vector>
 
+#include "MinSequenceContainer.h"
+#include "min_allocator.h"
 #include "test_macros.h"
 #include "../../../test_compare.h"
 #include "test_allocator.h"
 
-void test() {
+template <class KeyContainer>
+constexpr void test_compare() {
+  using Key = typename KeyContainer::value_type;
+  {
+    // The one-argument ctor is explicit.
+    using C = test_less<Key>;
+    static_assert(std::is_constructible_v<std::flat_multiset<Key, C>, C>);
+    static_assert(!std::is_convertible_v<C, std::flat_multiset<Key, C>>);
+
+    static_assert(std::is_constructible_v<std::flat_multiset<Key>, std::less<Key>>);
+    static_assert(!std::is_convertible_v<std::less<Key>, std::flat_multiset<Key>>);
+  }
+  {
+    using C = test_less<Key>;
+    auto m  = std::flat_multiset<Key, C>(C(3));
+    assert(m.empty());
+    assert(m.begin() == m.end());
+    assert(m.key_comp() == C(3));
+  }
+}
+
+template <template <class...> class KeyContainer>
+constexpr void test_compare_alloc() {
   {
     // The constructors in this subclause shall not participate in overload
     // resolution unless uses_allocator_v<container_type, Alloc> is true
@@ -32,8 +56,8 @@ void test() {
     using C  = test_less<int>;
     using A1 = test_allocator<int>;
     using A2 = other_allocator<int>;
-    using V1 = std::vector<int, A1>;
-    using V2 = std::vector<int, A2>;
+    using V1 = KeyContainer<int, A1>;
+    using V2 = KeyContainer<int, A2>;
     using M1 = std::flat_multiset<int, C, V1>;
     using M2 = std::flat_multiset<int, C, V2>;
     static_assert(std::is_constructible_v<M1, const C&, const A1&>);
@@ -41,26 +65,10 @@ void test() {
     static_assert(!std::is_constructible_v<M1, const C&, const A2&>);
     static_assert(!std::is_constructible_v<M2, const C&, const A1&>);
   }
-  {
-    using C = test_less<int>;
-    auto m  = std::flat_multiset<int, C>(C(3));
-    assert(m.empty());
-    assert(m.begin() == m.end());
-    assert(m.key_comp() == C(3));
-  }
-  {
-    // The one-argument ctor is explicit.
-    using C = test_less<int>;
-    static_assert(std::is_constructible_v<std::flat_multiset<int, C>, C>);
-    static_assert(!std::is_convertible_v<C, std::flat_multiset<int, C>>);
-
-    static_assert(std::is_constructible_v<std::flat_multiset<int>, std::less<int>>);
-    static_assert(!std::is_convertible_v<std::less<int>, std::flat_multiset<int>>);
-  }
   {
     using C  = test_less<int>;
     using A1 = test_allocator<int>;
-    auto m   = std::flat_multiset<int, C, std::vector<int, A1>>(C(4), A1(5));
+    auto m   = std::flat_multiset<int, C, KeyContainer<int, A1>>(C(4), A1(5));
     assert(m.empty());
     assert(m.begin() == m.end());
     assert(m.key_comp() == C(4));
@@ -68,9 +76,9 @@ void test() {
   }
   {
     // explicit(false)
-    using C                                           = test_less<int>;
-    using A1                                          = test_allocator<int>;
-    std::flat_multiset<int, C, std::deque<int, A1>> m = {C(4), A1(5)};
+    using C                                             = test_less<int>;
+    using A1                                            = test_allocator<int>;
+    std::flat_multiset<int, C, KeyContainer<int, A1>> m = {C(4), A1(5)};
     assert(m.empty());
     assert(m.begin() == m.end());
     assert(m.key_comp() == C(4));
@@ -78,8 +86,29 @@ void test() {
   }
 }
 
+constexpr bool test() {
+  test_compare<std::vector<int>>();
+  test_compare<MinSequenceContainer<int>>();
+  test_compare<std::vector<int, min_allocator<int>>>();
+
+  test_compare_alloc<std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test_compare<std::deque<int>>();
+    test_compare_alloc<std::deque>();
+  }
+
+  return true;
+}
+
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/containers.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/containers.pass.cpp
index 78eac420a8f22..1a476009e45d3 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/containers.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/containers.pass.cpp
@@ -35,7 +35,8 @@ void conversion_test(T);
 template <class T, class... Args>
 concept ImplicitlyConstructible = requires(Args&&... args) { conversion_test<T>({std::forward<Args>(args)...}); };
 
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
   {
     // The constructors in this subclause shall not participate in overload
     // resolution unless uses_allocator_v<container_type, Alloc> is true
@@ -43,8 +44,8 @@ void test() {
     using C  = test_less<int>;
     using A1 = test_allocator<int>;
     using A2 = other_allocator<int>;
-    using V1 = std::vector<int, A1>;
-    using V2 = std::vector<int, A2>;
+    using V1 = KeyContainer<int, A1>;
+    using V2 = KeyContainer<int, A2>;
     using M1 = std::flat_multiset<int, C, V1>;
     using M2 = std::flat_multiset<int, C, V2>;
     static_assert(std::is_constructible_v<M1, const V1&, const A1&>);
@@ -59,15 +60,15 @@ void test() {
   }
   {
     // flat_multiset(container_type)
-    using M             = std::flat_multiset<int>;
-    std::vector<int> ks = {1, 1, 1, 2, 2, 3, 2, 3, 3};
-    auto m              = M(ks);
-    int expected[]      = {1, 1, 1, 2, 2, 2, 3, 3, 3};
+    using M              = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
+    KeyContainer<int> ks = {1, 1, 1, 2, 2, 3, 2, 3, 3};
+    auto m               = M(ks);
+    int expected[]       = {1, 1, 1, 2, 2, 2, 3, 3, 3};
     assert(std::ranges::equal(m, expected));
 
     // explicit(false)
-    static_assert(std::is_constructible_v<M, const std::vector<int>&>);
-    static_assert(!ImplicitlyConstructible<M, const std::vector<int>&>);
+    static_assert(std::is_constructible_v<M, const KeyContainer<int>&>);
+    static_assert(!ImplicitlyConstructible<M, const KeyContainer<int>&>);
 
     m = M(std::move(ks));
     assert(ks.empty()); // it was moved-from
@@ -77,7 +78,7 @@ void test() {
     // flat_multiset(container_type)
     // move-only
     int expected[] = {3, 3, 2, 1};
-    using Ks       = std::deque<MoveOnly, min_allocator<MoveOnly>>;
+    using Ks       = KeyContainer<MoveOnly, min_allocator<MoveOnly>>;
     using M        = std::flat_multiset<MoveOnly, std::greater<MoveOnly>, Ks>;
     Ks ks;
     ks.push_back(1);
@@ -92,8 +93,8 @@ void test() {
     // flat_multiset(container_type)
     // container's allocators are used
     using A = test_allocator<int>;
-    using M = std::flat_multiset<int, std::less<int>, std::deque<int, A>>;
-    auto ks = std::deque<int, A>({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(5));
+    using M = std::flat_multiset<int, std::less<int>, KeyContainer<int, A>>;
+    auto ks = KeyContainer<int, A>({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(5));
     auto m  = M(std::move(ks));
     assert(ks.empty()); // it was moved-from
     assert((m == M{1, 1, 1, 2, 2, 2, 3, 3, 3}));
@@ -102,22 +103,22 @@ void test() {
   }
   {
     // flat_multiset(container_type, key_compare)
-    using C             = test_less<int>;
-    using M             = std::flat_multiset<int, C>;
-    std::vector<int> ks = {1, 1, 1, 2, 2, 3, 2, 3, 3};
-    auto m              = M(ks, C(4));
+    using C              = test_less<int>;
+    using M              = std::flat_multiset<int, C, KeyContainer<int>>;
+    KeyContainer<int> ks = {1, 1, 1, 2, 2, 3, 2, 3, 3};
+    auto m               = M(ks, C(4));
     assert(std::ranges::equal(m, std::vector<int>{1, 1, 1, 2, 2, 2, 3, 3, 3}));
     assert(m.key_comp() == C(4));
 
     // explicit
-    static_assert(std::is_constructible_v<M, const std::vector<int>&, const C&>);
-    static_assert(!ImplicitlyConstructible<M, const std::vector<int>&, const C&>);
+    static_assert(std::is_constructible_v<M, const KeyContainer<int>&, const C&>);
+    static_assert(!ImplicitlyConstructible<M, const KeyContainer<int>&, const C&>);
   }
   {
     // flat_multiset(container_type , const Allocator&)
     using A = test_allocator<int>;
-    using M = std::flat_multiset<int, std::less<int>, std::deque<int, A>>;
-    auto ks = std::deque<int, A>({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(5));
+    using M = std::flat_multiset<int, std::less<int>, KeyContainer<int, A>>;
+    auto ks = KeyContainer<int, A>({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(5));
     auto m  = M(ks, A(4)); // replaces the allocators
     assert(!ks.empty());   // it was an lvalue above
     assert((m == M{1, 1, 1, 2, 2, 2, 3, 3, 3}));
@@ -125,7 +126,7 @@ void test() {
     assert(keys.get_allocator() == A(4));
 
     // explicit(false)
-    static_assert(ImplicitlyConstructible<M, const std::deque<int, A>&, const A&>);
+    static_assert(ImplicitlyConstructible<M, const KeyContainer<int, A>&, const A&>);
     M m2 = {ks, A(4)};   // implicit ctor
     assert(!ks.empty()); // it was an lvalue above
     assert(m2 == m);
@@ -134,19 +135,19 @@ void test() {
   }
   {
     // flat_multiset(container_type , const Allocator&)
-    using C                = test_less<int>;
-    using A                = test_allocator<int>;
-    using M                = std::flat_multiset<int, C, std::vector<int, A>>;
-    std::vector<int, A> ks = {1, 1, 1, 2, 2, 3, 2, 3, 3};
-    auto m                 = M(ks, C(4), A(5));
-    assert(std::ranges::equal(m, std::vector<int, A>{1, 1, 1, 2, 2, 2, 3, 3, 3}));
+    using C                 = test_less<int>;
+    using A                 = test_allocator<int>;
+    using M                 = std::flat_multiset<int, C, KeyContainer<int, A>>;
+    KeyContainer<int, A> ks = {1, 1, 1, 2, 2, 3, 2, 3, 3};
+    auto m                  = M(ks, C(4), A(5));
+    assert(std::ranges::equal(m, KeyContainer<int, A>{1, 1, 1, 2, 2, 2, 3, 3, 3}));
     assert(m.key_comp() == C(4));
     auto m_copy = m;
     auto keys   = std::move(m_copy).extract();
     assert(keys.get_allocator() == A(5));
 
     // explicit(false)
-    static_assert(ImplicitlyConstructible<M, const std::vector<int, A>&, const A&>);
+    static_assert(ImplicitlyConstructible<M, const KeyContainer<int, A>&, const A&>);
     M m2 = {ks, C(4), A(5)};
     assert(m2 == m);
     assert(m2.key_comp() == C(4));
@@ -155,8 +156,22 @@ void test() {
   }
 }
 
+constexpr bool test() {
+  test<std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test<std::deque>();
+
+  return true;
+}
+
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy.pass.cpp
index b4f7220e1bac7..55f3defc5ddff 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy.pass.cpp
@@ -14,6 +14,7 @@
 
 #include <algorithm>
 #include <cassert>
+#include <deque>
 #include <flat_set>
 #include <vector>
 
@@ -21,10 +22,11 @@
 #include "../../../test_compare.h"
 #include "test_allocator.h"
 
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
   {
     using C = test_less<int>;
-    std::vector<int, test_allocator<int>> ks({1, 3, 5, 3, 1}, test_allocator<int>(6));
+    KeyContainer<int, test_allocator<int>> ks({1, 3, 5, 3, 1}, test_allocator<int>(6));
     const int expected[] = {1, 1, 3, 3, 5};
     using M              = std::flat_multiset<int, C, decltype(ks)>;
     auto mo              = M(ks, C(5));
@@ -43,7 +45,7 @@ void test() {
   }
   {
     using C              = test_less<int>;
-    using Ks             = std::vector<int, other_allocator<int>>;
+    using Ks             = KeyContainer<int, other_allocator<int>>;
     auto ks              = Ks({1, 3, 5, 3, 1}, other_allocator<int>(6));
     const int expected[] = {1, 1, 3, 3, 5};
     using M              = std::flat_multiset<int, C, Ks>;
@@ -63,8 +65,22 @@ void test() {
   }
 }
 
+constexpr bool test() {
+  test<std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test<std::deque>();
+
+  return true;
+}
+
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_alloc.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_alloc.pass.cpp
index ec8ad824ea14b..ec9f14ecab6bd 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_alloc.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_alloc.pass.cpp
@@ -23,7 +23,8 @@
 #include "../../../test_compare.h"
 #include "test_allocator.h"
 
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
   {
     // The constructors in this subclause shall not participate in overload
     // resolution unless uses_allocator_v<container_type, Alloc> is true.
@@ -31,8 +32,8 @@ void test() {
     using C  = test_less<int>;
     using A1 = test_allocator<int>;
     using A2 = other_allocator<int>;
-    using V1 = std::vector<int, A1>;
-    using V2 = std::vector<int, A2>;
+    using V1 = KeyContainer<int, A1>;
+    using V2 = KeyContainer<int, A2>;
     using M1 = std::flat_multiset<int, C, V1>;
     using M2 = std::flat_multiset<int, C, V2>;
     static_assert(std::is_constructible_v<M1, const M1&, const A1&>);
@@ -42,7 +43,7 @@ void test() {
   }
   {
     using C = test_less<int>;
-    std::vector<int, test_allocator<int>> ks({1, 3, 5, 5}, test_allocator<int>(6));
+    KeyContainer<int, test_allocator<int>> ks({1, 3, 5, 5}, test_allocator<int>(6));
     using M = std::flat_multiset<int, C, decltype(ks)>;
     auto mo = M(ks, C(5));
     auto m  = M(mo, test_allocator<int>(3));
@@ -59,8 +60,23 @@ void test() {
     assert(keys2.get_allocator() == test_allocator<int>(6));
   }
 }
+
+constexpr bool test() {
+  test<std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test<std::deque>();
+
+  return true;
+}
+
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_assign.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_assign.pass.cpp
index 2b6176ac915a7..2e63a004ffa88 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_assign.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_assign.pass.cpp
@@ -13,6 +13,7 @@
 // flat_multiset& operator=(const flat_multiset& m);
 
 #include <algorithm>
+#include <deque>
 #include <flat_set>
 #include <functional>
 #include <vector>
@@ -22,11 +23,12 @@
 #include "../../../test_compare.h"
 #include "test_allocator.h"
 
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
   {
     // test_allocator is not propagated
     using C = test_less<int>;
-    std::vector<int, test_allocator<int>> ks({1, 3, 5, 5}, test_allocator<int>(6));
+    KeyContainer<int, test_allocator<int>> ks({1, 3, 5, 5}, test_allocator<int>(6));
     using M = std::flat_multiset<int, C, decltype(ks)>;
     auto mo = M(ks, C(5));
     auto m  = M({{3, 4, 5, 4}}, C(3), test_allocator<int>(2));
@@ -46,7 +48,7 @@ void test() {
   {
     // other_allocator is propagated
     using C              = test_less<int>;
-    using Ks             = std::vector<int, other_allocator<int>>;
+    using Ks             = KeyContainer<int, other_allocator<int>>;
     auto ks              = Ks({1, 3, 5, 3}, other_allocator<int>(6));
     const int expected[] = {1, 3, 3, 5};
     using M              = std::flat_multiset<int, C, Ks>;
@@ -65,7 +67,7 @@ void test() {
     auto keys2 = std::move(mo).extract();
     assert(keys2.get_allocator() == other_allocator<int>(6));
   }
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     // comparator is copied and invariant is preserved
     using M = std::flat_multiset<int, std::function<bool(int, int)>>;
     M mo    = M({1, 2}, std::less<int>());
@@ -103,8 +105,22 @@ void test() {
   }
 }
 
+constexpr bool test() {
+  test<std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test<std::deque>();
+
+  return true;
+}
+
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/default.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/default.pass.cpp
index 16f90322cd31a..3a7ff86c6c040 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/default.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/default.pass.cpp
@@ -25,28 +25,29 @@
 #include "test_macros.h"
 
 struct DefaultCtableComp {
-  explicit DefaultCtableComp() { default_constructed_ = true; }
-  bool operator()(int, int) const { return false; }
+  constexpr explicit DefaultCtableComp() { default_constructed_ = true; }
+  constexpr bool operator()(int, int) const { return false; }
   bool default_constructed_ = false;
 };
 
 struct ThrowingCtorComp {
-  ThrowingCtorComp() noexcept(false) {}
-  bool operator()(const auto&, const auto&) const { return false; }
+  constexpr ThrowingCtorComp() noexcept(false) {}
+  constexpr bool operator()(const auto&, const auto&) const { return false; }
 };
 
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
   {
-    std::flat_multiset<int> m;
+    std::flat_multiset<int, std::less<int>, KeyContainer<int>> m;
     assert(m.empty());
   }
   {
     // explicit(false)
-    std::flat_multiset<int> m = {};
+    std::flat_multiset<int, std::less<int>, KeyContainer<int>> m = {};
     assert(m.empty());
   }
   {
-    std::flat_multiset<int, DefaultCtableComp, std::deque<int, min_allocator<int>>> m;
+    std::flat_multiset<int, DefaultCtableComp, KeyContainer<int, min_allocator<int>>> m;
     assert(m.empty());
     assert(m.begin() == m.end());
     assert(m.key_comp().default_constructed_);
@@ -54,7 +55,7 @@ void test() {
   {
     using A1 = explicit_allocator<int>;
     {
-      std::flat_multiset<int, DefaultCtableComp, std::vector<int, A1>> m;
+      std::flat_multiset<int, DefaultCtableComp, KeyContainer<int, A1>> m;
       assert(m.empty());
       assert(m.key_comp().default_constructed_);
     }
@@ -67,30 +68,46 @@ void test() {
   }
 #if defined(_LIBCPP_VERSION)
   {
-    using C = std::flat_multiset<MoveOnly>;
+    using C = std::flat_multiset<MoveOnly, std::less<MoveOnly>>;
     static_assert(std::is_nothrow_default_constructible_v<C>);
     C c;
   }
   {
-    using C = std::flat_multiset<MoveOnly, std::less<MoveOnly>, std::vector<MoveOnly, test_allocator<MoveOnly>>>;
+    using C = std::flat_multiset<MoveOnly, std::less<MoveOnly>, KeyContainer<MoveOnly, test_allocator<MoveOnly>>>;
     static_assert(std::is_nothrow_default_constructible_v<C>);
     C c;
   }
 #endif // _LIBCPP_VERSION
   {
-    using C = std::flat_multiset<MoveOnly, std::less<MoveOnly>, std::vector<MoveOnly, other_allocator<MoveOnly>>>;
+    using C = std::flat_multiset<MoveOnly, std::less<MoveOnly>, KeyContainer<MoveOnly, other_allocator<MoveOnly>>>;
     static_assert(!std::is_nothrow_default_constructible_v<C>);
     C c;
   }
   {
-    using C = std::flat_multiset<MoveOnly, ThrowingCtorComp>;
+    using C = std::flat_multiset<MoveOnly, ThrowingCtorComp, KeyContainer<MoveOnly>>;
     static_assert(!std::is_nothrow_default_constructible_v<C>);
     C c;
   }
 }
 
+constexpr bool test() {
+  test<std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque>();
+  }
+
+  return true;
+}
+
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/dtor_noexcept.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/dtor_noexcept.pass.cpp
index f852f2f85572c..f7243fa7e7fb3 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/dtor_noexcept.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/dtor_noexcept.pass.cpp
@@ -23,39 +23,56 @@
 #include "test_allocator.h"
 
 struct ThrowingDtorComp {
-  bool operator()(const auto&, const auto&) const;
-  ~ThrowingDtorComp() noexcept(false) {}
+  constexpr bool operator()(const auto&, const auto&) const;
+  constexpr ~ThrowingDtorComp() noexcept(false) {}
 };
 
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
   {
-    using C = std::flat_multiset<MoveOnly, MoveOnly>;
+    using C = std::flat_multiset<MoveOnly, std::less<MoveOnly>, KeyContainer<MoveOnly>>;
     static_assert(std::is_nothrow_destructible_v<C>);
     C c;
   }
   {
-    using V = std::vector<MoveOnly, test_allocator<MoveOnly>>;
+    using V = KeyContainer<MoveOnly, test_allocator<MoveOnly>>;
     using C = std::flat_multiset<MoveOnly, std::less<MoveOnly>, V>;
     static_assert(std::is_nothrow_destructible_v<C>);
     C c;
   }
   {
-    using V = std::deque<MoveOnly, other_allocator<MoveOnly>>;
+    using V = KeyContainer<MoveOnly, other_allocator<MoveOnly>>;
     using C = std::flat_multiset<MoveOnly, std::greater<MoveOnly>, V>;
     static_assert(std::is_nothrow_destructible_v<C>);
     C c;
   }
 #if defined(_LIBCPP_VERSION)
   {
-    using C = std::flat_multiset<MoveOnly, ThrowingDtorComp>;
+    using C = std::flat_multiset<MoveOnly, ThrowingDtorComp, KeyContainer<MoveOnly>>;
     static_assert(!std::is_nothrow_destructible_v<C>);
     C c;
   }
 #endif // _LIBCPP_VERSION
 }
 
+constexpr bool test() {
+  test<std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque>();
+  }
+
+  return true;
+}
+
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/initializer_list.pass.cpp
index 10638d75bbd14..36f5def21c14c 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/initializer_list.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/initializer_list.pass.cpp
@@ -32,12 +32,13 @@
 #include "../../../test_compare.h"
 
 struct DefaultCtableComp {
-  explicit DefaultCtableComp() { default_constructed_ = true; }
-  bool operator()(int, int) const { return false; }
+  constexpr explicit DefaultCtableComp() { default_constructed_ = true; }
+  constexpr bool operator()(int, int) const { return false; }
   bool default_constructed_ = false;
 };
 
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
   {
     // The constructors in this subclause shall not participate in overload
     // resolution unless uses_allocator_v<container_type, Alloc> is true.
@@ -45,8 +46,8 @@ void test() {
     using C  = test_less<int>;
     using A1 = test_allocator<int>;
     using A2 = other_allocator<int>;
-    using V1 = std::vector<int, A1>;
-    using V2 = std::vector<int, A2>;
+    using V1 = KeyContainer<int, A1>;
+    using V2 = KeyContainer<int, A2>;
     using M1 = std::flat_multiset<int, C, V1>;
     using M2 = std::flat_multiset<int, C, V2>;
     using IL = std::initializer_list<int>;
@@ -60,10 +61,9 @@ void test() {
     static_assert(!std::is_constructible_v<M1, IL, const C&, const A2&>);
     static_assert(!std::is_constructible_v<M2, IL, const C&, const A1&>);
   }
-
   {
     // initializer_list<value_type> needs to match exactly
-    using M = std::flat_multiset<int>;
+    using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
     using C = typename M::key_compare;
     static_assert(std::is_constructible_v<M, std::initializer_list<int>>);
     static_assert(std::is_constructible_v<M, std::initializer_list<int>, C>);
@@ -78,11 +78,10 @@ void test() {
     static_assert(!std::is_constructible_v<M, std::initializer_list<const int>, C, std::allocator<int>>);
     static_assert(!std::is_constructible_v<M, std::initializer_list<const int>, std::allocator<int>>);
   }
-
   int expected[] = {1, 2, 2, 3, 3, 5};
   {
     // flat_multiset(initializer_list<value_type>);
-    using M                       = std::flat_multiset<int>;
+    using M                       = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
     std::initializer_list<int> il = {5, 2, 2, 3, 1, 3};
     M m(il);
     assert(std::ranges::equal(m, expected));
@@ -90,13 +89,13 @@ void test() {
   {
     // flat_multiset(initializer_list<value_type>);
     // explicit(false)
-    using M = std::flat_multiset<int>;
+    using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
     M m     = {5, 2, 2, 3, 1, 3};
     assert(std::ranges::equal(m, expected));
   }
   {
     // flat_multiset(initializer_list<value_type>);
-    using M = std::flat_multiset<int, std::greater<int>, std::deque<int, min_allocator<int>>>;
+    using M = std::flat_multiset<int, std::greater<int>, KeyContainer<int, min_allocator<int>>>;
     M m     = {5, 2, 2, 3, 1, 3};
     assert(std::ranges::equal(m, expected | std::views::reverse));
   }
@@ -105,15 +104,14 @@ void test() {
     {
       // flat_multiset(initializer_list<value_type>);
       // different comparator
-      using M = std::flat_multiset<int, DefaultCtableComp, std::vector<int, A>>;
+      using M = std::flat_multiset<int, DefaultCtableComp, KeyContainer<int, A>>;
       M m     = {1, 2, 3};
       assert(m.size() == 3);
-      LIBCPP_ASSERT(*m.begin() == 1);
       assert(m.key_comp().default_constructed_);
     }
     {
       // flat_multiset(initializer_list<value_type>, const Allocator&);
-      using M = std::flat_multiset<int, std::greater<int>, std::deque<int, A>>;
+      using M = std::flat_multiset<int, std::greater<int>, KeyContainer<int, A>>;
       A a;
       M m({5, 2, 2, 3, 1, 3}, a);
       assert(std::ranges::equal(m, expected | std::views::reverse));
@@ -122,7 +120,7 @@ void test() {
   {
     // flat_multiset(initializer_list<value_type>, const key_compare&);
     using C = test_less<int>;
-    using M = std::flat_multiset<int, C>;
+    using M = std::flat_multiset<int, C, KeyContainer<int>>;
     auto m  = M({5, 2, 2, 3, 1, 3}, C(10));
     assert(std::ranges::equal(m, expected));
     assert(m.key_comp() == C(10));
@@ -132,10 +130,10 @@ void test() {
     assert(m2 == m);
     assert(m2.key_comp() == C(10));
   }
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     // flat_multiset(initializer_list<value_type>, const key_compare&);
     // Sorting uses the comparator that was passed in
-    using M = std::flat_multiset<int, std::function<bool(int, int)>, std::deque<int, min_allocator<int>>>;
+    using M = std::flat_multiset<int, std::function<bool(int, int)>, KeyContainer<int, min_allocator<int>>>;
     auto m  = M({5, 2, 2, 1, 3, 3}, std::greater<int>());
     assert(std::ranges::equal(m, expected | std::views::reverse));
     assert(m.key_comp()(2, 1) == true);
@@ -143,15 +141,31 @@ void test() {
   {
     // flat_multiset(initializer_list<value_type> il, const key_compare& comp, const Alloc& a);
     using A = explicit_allocator<int>;
-    using M = std::flat_multiset<int, std::greater<int>, std::deque<int, A>>;
+    using M = std::flat_multiset<int, std::greater<int>, KeyContainer<int, A>>;
     A a;
     M m({5, 2, 2, 3, 1, 3}, {}, a);
     assert(std::ranges::equal(m, expected | std::views::reverse));
   }
 }
 
+constexpr bool test() {
+  test<std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque>();
+  }
+
+  return true;
+}
+
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/iter_iter.pass.cpp
index da9aef3dc36cd..0f757db3eb9ac 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/iter_iter.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/iter_iter.pass.cpp
@@ -30,7 +30,8 @@
 #include "test_macros.h"
 #include "../../../test_compare.h"
 
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
   {
     // The constructors in this subclause shall not participate in overload
     // resolution unless uses_allocator_v<container_type, Alloc> is true.
@@ -38,8 +39,8 @@ void test() {
     using C     = test_less<int>;
     using A1    = test_allocator<int>;
     using A2    = other_allocator<int>;
-    using V1    = std::vector<int, A1>;
-    using V2    = std::vector<int, A2>;
+    using V1    = KeyContainer<int, A1>;
+    using V2    = KeyContainer<int, A2>;
     using M1    = std::flat_multiset<int, C, V1>;
     using M2    = std::flat_multiset<int, C, V2>;
     using Iter1 = typename M1::iterator;
@@ -60,7 +61,7 @@ void test() {
   {
     // flat_multiset(InputIterator , InputIterator)
     // cpp17_input_iterator
-    using M = std::flat_multiset<int>;
+    using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
     auto m  = M(cpp17_input_iterator<const int*>(ar), cpp17_input_iterator<const int*>(ar + 9));
     assert(std::ranges::equal(m, expected));
 
@@ -71,21 +72,21 @@ void test() {
   {
     // flat_multiset(InputIterator , InputIterator)
     // greater
-    using M = std::flat_multiset<int, std::greater<int>, std::deque<int, min_allocator<int>>>;
+    using M = std::flat_multiset<int, std::greater<int>, KeyContainer<int, min_allocator<int>>>;
     auto m  = M(cpp17_input_iterator<const int*>(ar), cpp17_input_iterator<const int*>(ar + 9));
     assert(std::ranges::equal(m, expected | std::views::reverse));
   }
   {
     // flat_multiset(InputIterator , InputIterator)
     // Test when the operands are of array type (also contiguous iterator type)
-    using M = std::flat_multiset<int, std::greater<int>, std::vector<int, min_allocator<int>>>;
+    using M = std::flat_multiset<int, std::greater<int>, KeyContainer<int, min_allocator<int>>>;
     auto m  = M(ar, ar);
     assert(m.empty());
   }
   {
     // flat_multiset(InputIterator , InputIterator, const key_compare&)
     using C = test_less<int>;
-    using M = std::flat_multiset<int, C, std::vector<int>>;
+    using M = std::flat_multiset<int, C, KeyContainer<int>>;
     auto m  = M(ar, ar + 9, C(3));
     assert(std::ranges::equal(m, expected));
     assert(m.key_comp() == C(3));
@@ -98,7 +99,7 @@ void test() {
   {
     // flat_multiset(InputIterator , InputIterator, const Allocator&)
     using A1 = test_allocator<int>;
-    using M  = std::flat_multiset<int, std::less<int>, std::vector<int, A1>>;
+    using M  = std::flat_multiset<int, std::less<int>, KeyContainer<int, A1>>;
     auto m   = M(ar, ar + 9, A1(5));
     assert(std::ranges::equal(m, expected));
     assert(std::move(m).extract().get_allocator() == A1(5));
@@ -107,7 +108,7 @@ void test() {
     // flat_multiset(InputIterator , InputIterator, const Allocator&)
     // explicit(false)
     using A1 = test_allocator<int>;
-    using M  = std::flat_multiset<int, std::less<int>, std::vector<int, A1>>;
+    using M  = std::flat_multiset<int, std::less<int>, KeyContainer<int, A1>>;
     M m      = {ar, ar + 9, A1(5)}; // implicit ctor
     assert(std::ranges::equal(m, expected));
     assert(std::move(m).extract().get_allocator() == A1(5));
@@ -116,7 +117,7 @@ void test() {
     // flat_multiset(InputIterator , InputIterator, const key_compare&, const Allocator&)
     using C  = test_less<int>;
     using A1 = test_allocator<int>;
-    using M  = std::flat_multiset<int, C, std::vector<int, A1>>;
+    using M  = std::flat_multiset<int, C, KeyContainer<int, A1>>;
     auto m   = M(ar, ar + 9, C(3), A1(5));
     assert(std::ranges::equal(m, expected));
     assert(m.key_comp() == C(3));
@@ -126,7 +127,7 @@ void test() {
     // flat_multiset(InputIterator , InputIterator, const key_compare&, const Allocator&)
     // explicit(false)
     using A1 = test_allocator<int>;
-    using M  = std::flat_multiset<int, std::less<int>, std::deque<int, A1>>;
+    using M  = std::flat_multiset<int, std::less<int>, KeyContainer<int, A1>>;
     M m      = {ar, ar + 9, {}, A1(5)}; // implicit ctor
     assert(std::ranges::equal(m, expected));
     LIBCPP_ASSERT(std::ranges::equal(m, expected));
@@ -134,8 +135,21 @@ void test() {
   }
 }
 
+constexpr bool test() {
+  test<std::vector>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test<std::deque>();
+
+  return true;
+}
+
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move.pass.cpp
index 825ad75cc8f4c..7fb0c0e9c3fd0 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move.pass.cpp
@@ -25,11 +25,12 @@
 #include "test_allocator.h"
 #include "min_allocator.h"
 
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
   {
     using C = test_less<int>;
     using A = test_allocator<int>;
-    using M = std::flat_multiset<int, C, std::deque<int, A>>;
+    using M = std::flat_multiset<int, C, KeyContainer<int, A>>;
     M mo    = M({1, 2, 1, 3}, C(5), A(7));
     M m     = std::move(mo);
     assert((m == M{1, 1, 2, 3}));
@@ -43,7 +44,7 @@ void test() {
   {
     using C = test_less<int>;
     using A = min_allocator<int>;
-    using M = std::flat_multiset<int, C, std::vector<int, A>>;
+    using M = std::flat_multiset<int, C, KeyContainer<int, A>>;
     M mo    = M({1, 2, 1, 3}, C(5), A());
     M m     = std::move(mo);
     assert((m == M{1, 1, 2, 3}));
@@ -54,9 +55,9 @@ void test() {
     assert(mo.key_comp() == C(5));
     assert(std::move(mo).extract().get_allocator() == A());
   }
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     // A moved-from flat_multiset maintains its class invariant in the presence of moved-from comparators.
-    using M = std::flat_multiset<int, std::function<bool(int, int)>>;
+    using M = std::flat_multiset<int, std::function<bool(int, int)>, KeyContainer<int>>;
     M mo    = M({1, 2, 1, 3}, std::less<int>());
     M m     = std::move(mo);
     assert(m.size() == 4);
@@ -81,6 +82,16 @@ void test() {
   }
 }
 
+constexpr bool test() {
+  test<std::vector>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test<std::deque>();
+
+  return true;
+}
+
 template <class T>
 struct ThrowingMoveAllocator {
   using value_type                                    = T;
@@ -179,6 +190,9 @@ void test_move_exception() {
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
   test_move_noexcept();
 #if !defined(TEST_HAS_NO_EXCEPTIONS)
   test_move_exception();
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_alloc.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_alloc.pass.cpp
index ee8258e5ac846..1f095edb73370 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_alloc.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_alloc.pass.cpp
@@ -24,7 +24,8 @@
 #include "../../../test_compare.h"
 #include "test_allocator.h"
 
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
   {
     // The constructors in this subclause shall not participate in overload
     // resolution unless uses_allocator_v<container_type, Alloc> is true.
@@ -32,8 +33,8 @@ void test() {
     using C  = test_less<int>;
     using A1 = test_allocator<int>;
     using A2 = other_allocator<int>;
-    using V1 = std::vector<int, A1>;
-    using V2 = std::vector<int, A2>;
+    using V1 = KeyContainer<int, A1>;
+    using V2 = KeyContainer<int, A2>;
     using M1 = std::flat_multiset<int, C, V1>;
     using M2 = std::flat_multiset<int, C, V2>;
     static_assert(std::is_constructible_v<M1, M1&&, const A1&>);
@@ -45,7 +46,7 @@ void test() {
     int expected[] = {1, 1, 2, 2, 3};
     using C        = test_less<int>;
     using A        = test_allocator<int>;
-    using M        = std::flat_multiset<int, C, std::deque<int, A>>;
+    using M        = std::flat_multiset<int, C, KeyContainer<int, A>>;
     auto mo        = M(expected, expected + 5, C(5), A(7));
     auto m         = M(std::move(mo), A(3));
 
@@ -72,8 +73,21 @@ void test() {
   }
 }
 
+constexpr bool test() {
+  test<std::vector>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test<std::deque>();
+
+  return true;
+}
+
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_assign.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_assign.pass.cpp
index 96e046e38668f..62e21811e4962 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_assign.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_assign.pass.cpp
@@ -187,25 +187,12 @@ void test_move_assign_no_except() {
   }
 }
 
-void test() {
-  {
-    using C                           = test_less<int>;
-    using A1                          = test_allocator<int>;
-    using M                           = std::flat_multiset<int, C, std::vector<int, A1>>;
-    M mo                              = M({1, 1, 2, 3}, C(5), A1(7));
-    M m                               = M({}, C(3), A1(7));
-    std::same_as<M&> decltype(auto) r = m = std::move(mo);
-    assert(&r == &m);
-    assert((m == M{1, 1, 2, 3}));
-    assert(m.key_comp() == C(5));
-    auto ks = std::move(m).extract();
-    assert(ks.get_allocator() == A1(7));
-    assert(mo.empty());
-  }
+template <template <class...> class KeyContainer>
+constexpr void test() {
   {
     using C                           = test_less<int>;
     using A1                          = other_allocator<int>;
-    using M                           = std::flat_multiset<int, C, std::deque<int, A1>>;
+    using M                           = std::flat_multiset<int, C, KeyContainer<int, A1>>;
     M mo                              = M({4, 4, 5}, C(5), A1(7));
     M m                               = M({1, 1, 2, 3, 4}, C(3), A1(7));
     std::same_as<M&> decltype(auto) r = m = std::move(mo);
@@ -218,7 +205,7 @@ void test() {
   }
   {
     using A                           = min_allocator<int>;
-    using M                           = std::flat_multiset<int, std::greater<int>, std::vector<int, A>>;
+    using M                           = std::flat_multiset<int, std::greater<int>, KeyContainer<int, A>>;
     M mo                              = M({5, 3, 4, 3}, A());
     M m                               = M({4, 1, 3, 2, 1}, A());
     std::same_as<M&> decltype(auto) r = m = std::move(mo);
@@ -228,10 +215,37 @@ void test() {
     assert(ks.get_allocator() == A());
     assert(mo.empty());
   }
+  {
+    using C                           = test_less<int>;
+    using A1                          = test_allocator<int>;
+    using M                           = std::flat_multiset<int, C, KeyContainer<int, A1>>;
+    M mo                              = M({1, 1, 2, 3}, C(5), A1(7));
+    M m                               = M({}, C(3), A1(7));
+    std::same_as<M&> decltype(auto) r = m = std::move(mo);
+    assert(&r == &m);
+    assert((m == M{1, 1, 2, 3}));
+    assert(m.key_comp() == C(5));
+    auto ks = std::move(m).extract();
+    assert(ks.get_allocator() == A1(7));
+    assert(mo.empty());
+  }
+}
+
+constexpr bool test() {
+  test<std::vector>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test<std::deque>();
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
   test_move_assign_clears();
   test_move_assign_no_except();
 
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/range.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/range.pass.cpp
index 76485b47ec5ea..36501a566fbd6 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/range.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/range.pass.cpp
@@ -56,7 +56,8 @@ static_assert(
     !std::
         is_constructible_v<Set, std::from_range_t, RangeOf<std::pair<int, int>>, std::less<int>, std::allocator<int>>);
 
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
   {
     // The constructors in this subclause shall not participate in overload
     // resolution unless uses_allocator_v<container_type, Alloc> is true.
@@ -64,8 +65,8 @@ void test() {
     using C  = test_less<int>;
     using A1 = test_allocator<int>;
     using A2 = other_allocator<int>;
-    using V1 = std::vector<int, A1>;
-    using V2 = std::vector<int, A2>;
+    using V1 = KeyContainer<int, A1>;
+    using V2 = KeyContainer<int, A2>;
     using M1 = std::flat_multiset<int, C, V1>;
     using M2 = std::flat_multiset<int, C, V2>;
     static_assert(std::is_constructible_v<M1, std::from_range_t, M1, const A1&>);
@@ -84,7 +85,7 @@ void test() {
   {
     // flat_multiset(from_range_t, R&&)
     // input_range && !common
-    using M    = std::flat_multiset<int>;
+    using M    = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
     using Iter = cpp20_input_iterator<const int*>;
     using Sent = sentinel_wrapper<Iter>;
     using R    = std::ranges::subrange<Iter, Sent>;
@@ -98,17 +99,17 @@ void test() {
   {
     // flat_multiset(from_range_t, R&&)
     // greater
-    using M    = std::flat_multiset<int, std::greater<int>, std::deque<int, min_allocator<int>>>;
+    using M    = std::flat_multiset<int, std::greater<int>, KeyContainer<int, min_allocator<int>>>;
     using Iter = cpp20_input_iterator<const int*>;
     using Sent = sentinel_wrapper<Iter>;
     using R    = std::ranges::subrange<Iter, Sent>;
     auto m     = M(std::from_range, R(Iter(ar), Sent(Iter(ar + 9))));
-    assert(std::ranges::equal(m, std::deque<int, min_allocator<int>>{3, 3, 3, 2, 2, 2, 1, 1, 1}));
+    assert(std::ranges::equal(m, KeyContainer<int, min_allocator<int>>{3, 3, 3, 2, 2, 2, 1, 1, 1}));
   }
   {
     // flat_multiset(from_range_t, R&&)
     // contiguous range
-    using M = std::flat_multiset<int>;
+    using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
     using R = std::ranges::subrange<const int*>;
     auto m  = M(std::from_range, R(ar, ar + 9));
     assert(std::ranges::equal(m, expected));
@@ -116,7 +117,7 @@ void test() {
   {
     // flat_multiset(from_range_t, R&&, const key_compare&)
     using C = test_less<int>;
-    using M = std::flat_multiset<int, C, std::vector<int>>;
+    using M = std::flat_multiset<int, C, KeyContainer<int>>;
     using R = std::ranges::subrange<const int*>;
     auto m  = M(std::from_range, R(ar, ar + 9), C(3));
     assert(std::ranges::equal(m, expected));
@@ -130,7 +131,7 @@ void test() {
   {
     // flat_multiset(from_range_t, R&&, const Allocator&)
     using A1 = test_allocator<int>;
-    using M  = std::flat_multiset<int, std::less<int>, std::vector<int, A1>>;
+    using M  = std::flat_multiset<int, std::less<int>, KeyContainer<int, A1>>;
     using R  = std::ranges::subrange<const int*>;
     auto m   = M(std::from_range, R(ar, ar + 9), A1(5));
     assert(std::ranges::equal(m, expected));
@@ -140,7 +141,7 @@ void test() {
     // flat_multiset(from_range_t, R&&, const Allocator&)
     // explicit(false)
     using A1 = test_allocator<int>;
-    using M  = std::flat_multiset<int, std::less<int>, std::deque<int, A1>>;
+    using M  = std::flat_multiset<int, std::less<int>, KeyContainer<int, A1>>;
     using R  = std::ranges::subrange<const int*>;
     M m      = {std::from_range, R(ar, ar + 9), A1(5)}; // implicit ctor
     assert(std::ranges::equal(m, expected));
@@ -150,7 +151,7 @@ void test() {
     // flat_multiset(from_range_t, R&&, const key_compare&, const Allocator&)
     using C  = test_less<int>;
     using A1 = test_allocator<int>;
-    using M  = std::flat_multiset<int, C, std::vector<int, A1>>;
+    using M  = std::flat_multiset<int, C, KeyContainer<int, A1>>;
     using R  = std::ranges::subrange<const int*>;
     auto m   = M(std::from_range, R(ar, ar + 9), C(3), A1(5));
     assert(std::ranges::equal(m, expected));
@@ -161,7 +162,7 @@ void test() {
     // flat_multiset(from_range_t, R&&, const key_compare&, const Allocator&)
     // explicit(false)
     using A1 = test_allocator<int>;
-    using M  = std::flat_multiset<int, std::less<int>, std::deque<int, A1>>;
+    using M  = std::flat_multiset<int, std::less<int>, KeyContainer<int, A1>>;
     using R  = std::ranges::subrange<const int*>;
     M m      = {std::from_range, R(ar, ar + 9), {}, A1(5)}; // implicit ctor
     assert(std::ranges::equal(m, expected));
@@ -169,8 +170,21 @@ void test() {
   }
 }
 
+constexpr bool test() {
+  test<std::vector>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test<std::deque>();
+
+  return true;
+}
+
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_container.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_container.pass.cpp
index 76759be7da8e3..60fd70abc83b5 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_container.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_container.pass.cpp
@@ -30,7 +30,8 @@
 #include "test_macros.h"
 #include "../../../test_compare.h"
 
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
   {
     // The constructors in this subclause shall not participate in overload
     // resolution unless uses_allocator_v<container_type, Alloc> is true.
@@ -38,8 +39,8 @@ void test() {
     using C  = test_less<int>;
     using A1 = test_allocator<int>;
     using A2 = other_allocator<int>;
-    using V1 = std::vector<int, A1>;
-    using V2 = std::vector<int, A2>;
+    using V1 = KeyContainer<int, A1>;
+    using V2 = KeyContainer<int, A2>;
     using M1 = std::flat_multiset<int, C, V1>;
     using M2 = std::flat_multiset<int, C, V2>;
     static_assert(std::is_constructible_v<M1, std::sorted_equivalent_t, const V1&, const A1&>);
@@ -52,11 +53,12 @@ void test() {
     static_assert(!std::is_constructible_v<M1, std::sorted_equivalent_t, const V1&, const C&, const A2&>);
     static_assert(!std::is_constructible_v<M2, std::sorted_equivalent_t, const V2&, const C&, const A1&>);
   }
+
   {
     // flat_multiset(sorted_equivalent_t, container_type)
-    using M             = std::flat_multiset<int>;
-    std::vector<int> ks = {1, 2, 2, 4, 10};
-    auto ks2            = ks;
+    using M              = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
+    KeyContainer<int> ks = {1, 2, 2, 4, 10};
+    auto ks2             = ks;
 
     auto m = M(std::sorted_equivalent, ks);
     assert((m == M{1, 2, 2, 4, 10}));
@@ -71,7 +73,7 @@ void test() {
   {
     // flat_multiset(sorted_equivalent_t, container_type)
     // non-default container, comparator and allocator type
-    using Ks = std::deque<int, min_allocator<int>>;
+    using Ks = KeyContainer<int, min_allocator<int>>;
     using M  = std::flat_multiset<int, std::greater<int>, Ks>;
     Ks ks    = {10, 4, 4, 2, 1};
     auto m   = M(std::sorted_equivalent, ks);
@@ -84,8 +86,8 @@ void test() {
     // flat_multiset(sorted_equivalent_t, container_type)
     // allocator copied into the containers
     using A = test_allocator<int>;
-    using M = std::flat_multiset<int, std::less<int>, std::deque<int, A>>;
-    auto ks = std::deque<int, A>({1, 2, 2, 4, 10}, A(4));
+    using M = std::flat_multiset<int, std::less<int>, KeyContainer<int, A>>;
+    auto ks = KeyContainer<int, A>({1, 2, 2, 4, 10}, A(4));
     auto m  = M(std::sorted_equivalent, std::move(ks));
     assert(ks.empty()); // it was moved-from
     assert((m == M{1, 2, 2, 4, 10}));
@@ -93,9 +95,9 @@ void test() {
   }
   {
     // flat_multiset(sorted_equivalent_t, container_type ,  key_compare)
-    using C             = test_less<int>;
-    using M             = std::flat_multiset<int, C>;
-    std::vector<int> ks = {1, 2, 2, 4, 10};
+    using C              = test_less<int>;
+    using M              = std::flat_multiset<int, C, KeyContainer<int>>;
+    KeyContainer<int> ks = {1, 2, 2, 4, 10};
 
     auto m = M(std::sorted_equivalent, ks, C(4));
     assert((m == M{1, 2, 2, 4, 10}));
@@ -108,11 +110,11 @@ void test() {
   }
   {
     // flat_multiset(sorted_equivalent_t, container_type , key_compare, const Allocator&)
-    using C                = test_less<int>;
-    using A                = test_allocator<int>;
-    using M                = std::flat_multiset<int, C, std::vector<int, A>>;
-    std::vector<int, A> ks = {1, 2, 2, 4, 10};
-    auto m                 = M(std::sorted_equivalent, ks, C(4), A(5));
+    using C                 = test_less<int>;
+    using A                 = test_allocator<int>;
+    using M                 = std::flat_multiset<int, C, KeyContainer<int, A>>;
+    KeyContainer<int, A> ks = {1, 2, 2, 4, 10};
+    auto m                  = M(std::sorted_equivalent, ks, C(4), A(5));
     assert((m == M{1, 2, 2, 4, 10}));
     assert(m.key_comp() == C(4));
     assert(M(m).extract().get_allocator() == A(5));
@@ -126,8 +128,8 @@ void test() {
   {
     // flat_multiset(sorted_equivalent_t, container_type , const Allocator&)
     using A = test_allocator<int>;
-    using M = std::flat_multiset<int, std::less<int>, std::deque<int, A>>;
-    auto ks = std::deque<int, A>({1, 2, 2, 4, 10}, A(4));
+    using M = std::flat_multiset<int, std::less<int>, KeyContainer<int, A>>;
+    auto ks = KeyContainer<int, A>({1, 2, 2, 4, 10}, A(4));
     auto m  = M(std::sorted_equivalent, ks, A(6)); // replaces the allocators
     assert(!ks.empty());                           // it was an lvalue above
     assert((m == M{1, 2, 2, 4, 10}));
@@ -140,8 +142,22 @@ void test() {
   }
 }
 
+constexpr bool test() {
+  test<std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test<std::deque>();
+
+  return true;
+}
+
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_initializer_list.pass.cpp
index 955662dd233ef..ff10c97c7bd3f 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_initializer_list.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_initializer_list.pass.cpp
@@ -31,12 +31,13 @@
 #include "../../../test_compare.h"
 
 template <class T>
-std::initializer_list<T> il = {1, 2, 4, 4, 5};
+constexpr std::initializer_list<T> il = {1, 2, 4, 4, 5};
 
-void test() {
-  const auto il1 = il<int>;
-  const auto il2 = il<short>;
+constexpr auto il1 = il<int>;
+constexpr auto il2 = il<short>;
 
+template <template <class...> class KeyContainer>
+constexpr void test() {
   {
     // The constructors in this subclause shall not participate in overload
     // resolution unless uses_allocator_v<container_type, Alloc> is true.
@@ -44,8 +45,8 @@ void test() {
     using C  = test_less<int>;
     using A1 = test_allocator<int>;
     using A2 = other_allocator<int>;
-    using V1 = std::vector<int, A1>;
-    using V2 = std::vector<int, A2>;
+    using V1 = KeyContainer<int, A1>;
+    using V2 = KeyContainer<int, A2>;
     using M1 = std::flat_multiset<int, C, V1>;
     using M2 = std::flat_multiset<int, C, V2>;
     using IL = std::initializer_list<int>;
@@ -62,7 +63,7 @@ void test() {
   }
   {
     // initializer_list<value_type> needs to match exactly
-    using M = std::flat_multiset<int>;
+    using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
     using C = typename M::key_compare;
     static_assert(std::is_constructible_v<M, std::sorted_equivalent_t, std::initializer_list<int>>);
     static_assert(std::is_constructible_v<M, std::sorted_equivalent_t, std::initializer_list<int>, C>);
@@ -88,7 +89,7 @@ void test() {
 
   {
     // flat_multiset(sorted_equivalent_t, initializer_list<value_type>);
-    using M       = std::flat_multiset<int>;
+    using M       = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
     auto m        = M(std::sorted_equivalent, il1);
     auto expected = M{1, 2, 4, 4, 5};
     assert(m == expected);
@@ -97,9 +98,9 @@ void test() {
     M m2 = {std::sorted_equivalent, il1};
     assert(m2 == m);
   }
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     // flat_multiset(sorted_equivalent_t, initializer_list<value_type>, const key_compare&);
-    using M = std::flat_multiset<int, std::function<bool(int, int)>>;
+    using M = std::flat_multiset<int, std::function<bool(int, int)>, KeyContainer<int>>;
     auto m  = M(std::sorted_equivalent, il1, std::less<int>());
     assert(m == M({1, 2, 4, 4, 5}, std::less<>()));
     assert(m.key_comp()(1, 2) == true);
@@ -111,7 +112,7 @@ void test() {
   {
     // flat_multiset(sorted_equivalent_t, initializer_list<value_type>, const key_compare&);
     // greater
-    using M = std::flat_multiset<int, std::greater<int>, std::deque<int, min_allocator<int>>>;
+    using M = std::flat_multiset<int, std::greater<int>, KeyContainer<int, min_allocator<int>>>;
     std::initializer_list<int> il4{5, 4, 4, 2, 1};
     auto m = M(std::sorted_equivalent, il4, std::greater<int>());
     assert((m == M{5, 4, 4, 2, 1}));
@@ -119,7 +120,7 @@ void test() {
   {
     // flat_multiset(sorted_equivalent_t, initializer_list<value_type>,  const Allocator&)
     using A1      = test_allocator<short>;
-    using M       = std::flat_multiset<short, std::less<int>, std::deque<short, A1>>;
+    using M       = std::flat_multiset<short, std::less<int>, KeyContainer<short, A1>>;
     auto m        = M(std::sorted_equivalent, il2, A1(5));
     auto expected = M{1, 2, 4, 4, 5};
     assert(m == expected);
@@ -134,7 +135,7 @@ void test() {
     // flat_multiset(sorted_equivalent_t, initializer_list<value_type>, const key_compare&, const Allocator&);
     using C  = test_less<int>;
     using A1 = test_allocator<short>;
-    using M  = std::flat_multiset<short, C, std::vector<short, A1>>;
+    using M  = std::flat_multiset<short, C, KeyContainer<short, A1>>;
     auto m   = M(std::sorted_equivalent, il2, C(3), A1(5));
     assert((m == M{1, 2, 4, 4, 5}));
     assert(m.key_comp() == C(3));
@@ -144,15 +145,29 @@ void test() {
     // flat_multiset(sorted_equivalent_t, initializer_list<value_type>, const key_compare&, const Allocator&);
     // explicit(false)
     using A1 = test_allocator<short>;
-    using M  = std::flat_multiset<short, std::less<int>, std::deque<short, A1>>;
+    using M  = std::flat_multiset<short, std::less<int>, KeyContainer<short, A1>>;
     M m      = {std::sorted_equivalent, il2, {}, A1(5)}; // implicit ctor
     assert((m == M{1, 2, 4, 4, 5}));
     assert(std::move(m).extract().get_allocator() == A1(5));
   }
 }
 
+constexpr bool test() {
+  test<std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test<std::deque>();
+
+  return true;
+}
+
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_iter_iter.pass.cpp
index 9ebe45d71d667..a3c998114ad5b 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_iter_iter.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_iter_iter.pass.cpp
@@ -28,7 +28,8 @@
 #include "test_macros.h"
 #include "../../../test_compare.h"
 
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
   {
     // The constructors in this subclause shall not participate in overload
     // resolution unless uses_allocator_v<container_type, Alloc> is true.
@@ -36,8 +37,8 @@ void test() {
     using C     = test_less<int>;
     using A1    = test_allocator<int>;
     using A2    = other_allocator<int>;
-    using V1    = std::vector<int, A1>;
-    using V2    = std::vector<int, A2>;
+    using V1    = KeyContainer<int, A1>;
+    using V2    = KeyContainer<int, A2>;
     using M1    = std::flat_multiset<int, C, V1>;
     using M2    = std::flat_multiset<int, C, V2>;
     using Iter1 = typename M1::iterator;
@@ -52,10 +53,12 @@ void test() {
     static_assert(!std::is_constructible_v<M1, std::sorted_equivalent_t, Iter1, Iter1, const C&, const A2&>);
     static_assert(!std::is_constructible_v<M2, std::sorted_equivalent_t, Iter2, Iter2, const C&, const A1&>);
   }
+
   {
     // flat_multiset(sorted_equivalent_t, InputIterator, InputIterator);
     // cpp17_input_iterator
-    using M  = std::flat_multiset<int>;
+    using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
+
     int ar[] = {1, 2, 2, 4, 5};
     auto m = M(std::sorted_equivalent, cpp17_input_iterator<const int*>(ar), cpp17_input_iterator<const int*>(ar + 5));
     auto expected = M{1, 2, 2, 4, 5};
@@ -69,16 +72,16 @@ void test() {
     // flat_multiset(sorted_equivalent_t, InputIterator, InputIterator);
     // contiguous iterator
     using C       = test_less<int>;
-    using M       = std::flat_multiset<int, C, std::vector<int, min_allocator<int>>>;
+    using M       = std::flat_multiset<int, C, KeyContainer<int, min_allocator<int>>>;
     int ar[]      = {1, 2, 4, 4, 5};
     auto m        = M(std::sorted_equivalent, ar, ar + 5);
     auto expected = M{1, 2, 4, 4, 5};
     assert(m == expected);
   }
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     // flat_multiset(sorted_equivalent_t, InputIterator, InputIterator, const key_compare&);
     // cpp_17_input_iterator
-    using M  = std::flat_multiset<int, std::function<bool(int, int)>>;
+    using M  = std::flat_multiset<int, std::function<bool(int, int)>, KeyContainer<int>>;
     int ar[] = {1, 2, 4, 4, 5};
     auto m   = M(std::sorted_equivalent,
                cpp17_input_iterator<const int*>(ar),
@@ -97,7 +100,7 @@ void test() {
   {
     // flat_multiset(sorted_equivalent_t, InputIterator, InputIterator, const key_compare&);
     // greater
-    using M  = std::flat_multiset<int, std::greater<int>, std::deque<int, min_allocator<int>>>;
+    using M  = std::flat_multiset<int, std::greater<int>, KeyContainer<int, min_allocator<int>>>;
     int ar[] = {5, 4, 4, 2, 1};
     auto m   = M(std::sorted_equivalent,
                cpp17_input_iterator<const int*>(ar),
@@ -109,7 +112,7 @@ void test() {
     // flat_multiset(sorted_equivalent_t, InputIterator, InputIterator, const key_compare&);
     // contiguous iterator
     using C   = test_less<int>;
-    using M   = std::flat_multiset<int, C, std::vector<int, min_allocator<int>>>;
+    using M   = std::flat_multiset<int, C, KeyContainer<int, min_allocator<int>>>;
     int ar[1] = {42};
     auto m    = M(std::sorted_equivalent, ar, ar, C(5));
     assert(m.empty());
@@ -118,7 +121,7 @@ void test() {
   {
     // flat_multiset(sorted_equivalent_t, InputIterator , InputIterator, const Allocator&)
     using A1      = test_allocator<int>;
-    using M       = std::flat_multiset<int, std::less<int>, std::vector<int, A1>>;
+    using M       = std::flat_multiset<int, std::less<int>, KeyContainer<int, A1>>;
     int ar[]      = {1, 2, 4, 4, 5};
     auto m        = M(std::sorted_equivalent, ar, ar + 5, A1(5));
     auto expected = M{1, 2, 4, 4, 5};
@@ -134,7 +137,7 @@ void test() {
     // flat_multiset(sorted_equivalent_t, InputIterator, InputIterator, const key_compare&, const Allocator&);
     using C  = test_less<int>;
     using A1 = test_allocator<int>;
-    using M  = std::flat_multiset<int, C, std::deque<int, A1>>;
+    using M  = std::flat_multiset<int, C, KeyContainer<int, A1>>;
     int ar[] = {1, 2, 4, 4, 5};
     auto m   = M(std::sorted_equivalent, ar, ar + 5, C(3), A1(5));
     assert((m == M{1, 2, 4, 4, 5}));
@@ -145,7 +148,7 @@ void test() {
     // flat_multiset(sorted_equivalent_t, InputIterator, InputIterator, const key_compare&, const Allocator&);
     // explicit(false)
     using A1 = test_allocator<short>;
-    using M  = std::flat_multiset<short, std::less<int>, std::deque<short, A1>>;
+    using M  = std::flat_multiset<short, std::less<int>, KeyContainer<short, A1>>;
     int ar[] = {1, 2, 4, 4, 5};
     M m      = {std::sorted_equivalent, ar, ar + 5, {}, A1(5)}; // implicit ctor
     assert((m == M{1, 2, 4, 4, 5}));
@@ -153,8 +156,22 @@ void test() {
   }
 }
 
+constexpr bool test() {
+  test<std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test<std::deque>();
+
+  return true;
+}
+
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.erasure/erase_if.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.erasure/erase_if.pass.cpp
index 21f3c918dec0d..337ad04c9cd48 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.erasure/erase_if.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.erasure/erase_if.pass.cpp
@@ -32,7 +32,7 @@ static_assert(HasStdErase<std::vector<int>>);
 static_assert(!HasStdErase<std::flat_multiset<int>>);
 
 template <class M>
-M make(std::initializer_list<int> vals) {
+constexpr M make(std::initializer_list<int> vals) {
   M ret;
   for (int v : vals)
     ret.emplace(v);
@@ -40,8 +40,8 @@ M make(std::initializer_list<int> vals) {
 }
 
 template <class M, class Pred>
-void test0(
-    std::initializer_list<int> vals, Pred p, std::initializer_list<int> expected, std::size_t expected_erased_count) {
+constexpr void
+test0(std::initializer_list<int> vals, Pred p, std::initializer_list<int> expected, std::size_t expected_erased_count) {
   M s = make<M>(vals);
   ASSERT_SAME_TYPE(typename M::size_type, decltype(std::erase_if(s, p)));
   assert(expected_erased_count == std::erase_if(s, p));
@@ -50,11 +50,11 @@ void test0(
 
 struct NotBool {
   bool b;
-  explicit operator bool() const { return b; }
+  explicit constexpr operator bool() const { return b; }
 };
 
 template <class S>
-void test_one() {
+constexpr void test_one() {
   // Test all the plausible signatures for this predicate.
   auto is1        = [](typename S::const_reference v) { return v == 1; };
   auto is2        = [](typename S::value_type v) { return v == 2; };
@@ -96,18 +96,28 @@ void test_one() {
   test0<S>({1, 1, 2, 2, 3}, nonBoolIs1, {2, 2, 3}, 2);
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::flat_multiset<int>>();
   test_one<std::flat_multiset<int, std::less<int>, std::vector<int, min_allocator<int>>>>();
   test_one<std::flat_multiset<int, std::greater<int>, std::vector<int, test_allocator<int>>>>();
-  test_one<std::flat_multiset<int, std::less<int>, std::deque<int, min_allocator<int>>>>();
-  test_one<std::flat_multiset<int, std::greater<int>, std::deque<int, test_allocator<int>>>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test_one<std::flat_multiset<int, std::less<int>, std::deque<int, min_allocator<int>>>>();
+    test_one<std::flat_multiset<int, std::greater<int>, std::deque<int, test_allocator<int>>>>();
+  }
   test_one<std::flat_multiset<long>>();
   test_one<std::flat_multiset<double>>();
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator.pass.cpp
index 809f03df47977..878b2b2094f71 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator.pass.cpp
@@ -30,7 +30,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
 
@@ -68,9 +68,12 @@ void test_one() {
   assert(i == m.begin());
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
 
@@ -89,10 +92,15 @@ void test() {
     assert(!(ii1 != cii));
     assert(!(cii != ii1));
   }
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator_comparison.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator_comparison.pass.cpp
index cbf69d6e04904..ff4ad3f8f0279 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator_comparison.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator_comparison.pass.cpp
@@ -24,7 +24,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
   using I   = M::iterator;
@@ -141,15 +141,23 @@ void test_one() {
   assert(cri2 <=> cri1 == std::strong_ordering::greater);
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/reverse_iterator.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/reverse_iterator.pass.cpp
index e25d786d9b3b4..678109b88f9fb 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/reverse_iterator.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/reverse_iterator.pass.cpp
@@ -25,46 +25,59 @@
 
 #include <iterator>
 
+#include "MinSequenceContainer.h"
 #include "test_macros.h"
+#include "min_allocator.h"
 
-void test() {
-  {
-    using M        = std::flat_multiset<int, std::less<int>, std::deque<int>>;
-    M m            = {1, 1, 2, 2, 3, 4};
-    int expected[] = {1, 1, 2, 2, 3, 4};
-    const M& cm    = m;
-    ASSERT_SAME_TYPE(decltype(m.rbegin()), M::reverse_iterator);
-    ASSERT_SAME_TYPE(decltype(m.crbegin()), M::const_reverse_iterator);
-    ASSERT_SAME_TYPE(decltype(cm.rbegin()), M::const_reverse_iterator);
-    ASSERT_SAME_TYPE(decltype(m.rend()), M::reverse_iterator);
-    ASSERT_SAME_TYPE(decltype(m.crend()), M::const_reverse_iterator);
-    ASSERT_SAME_TYPE(decltype(cm.rend()), M::const_reverse_iterator);
-    static_assert(noexcept(m.rbegin()));
-    static_assert(noexcept(cm.rbegin()));
-    static_assert(noexcept(m.crbegin()));
-    static_assert(noexcept(m.rend()));
-    static_assert(noexcept(cm.rend()));
-    static_assert(noexcept(m.crend()));
-    assert(m.size() == 6);
-    assert(std::distance(m.rbegin(), m.rend()) == 6);
-    assert(std::distance(cm.rbegin(), cm.rend()) == 6);
-    assert(std::distance(m.crbegin(), m.crend()) == 6);
-    assert(std::distance(cm.crbegin(), cm.crend()) == 6);
-    M::reverse_iterator i; // default-construct
-    ASSERT_SAME_TYPE(decltype(*i), const int&);
-    i                           = m.rbegin(); // move-assignment
-    M::const_reverse_iterator k = i;          // converting constructor
-    assert(i == k);                           // comparison
-    for (int j = 5; j >= 0; --j, ++i) {       // pre-increment
-      assert(*i == expected[j]);
-    }
-    assert(i == m.rend());
-    for (int j = 0; j <= 5; ++j) {
-      --i; // pre-decrement
-      assert(*i == expected[j]);
-    }
-    assert(i == m.rbegin());
+template <class KeyContainer>
+constexpr void test_one() {
+  using Key      = typename KeyContainer::value_type;
+  using M        = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
+  M m            = {1, 1, 2, 2, 3, 4};
+  int expected[] = {1, 1, 2, 2, 3, 4};
+  const M& cm    = m;
+  ASSERT_SAME_TYPE(decltype(m.rbegin()), typename M::reverse_iterator);
+  ASSERT_SAME_TYPE(decltype(m.crbegin()), typename M::const_reverse_iterator);
+  ASSERT_SAME_TYPE(decltype(cm.rbegin()), typename M::const_reverse_iterator);
+  ASSERT_SAME_TYPE(decltype(m.rend()), typename M::reverse_iterator);
+  ASSERT_SAME_TYPE(decltype(m.crend()), typename M::const_reverse_iterator);
+  ASSERT_SAME_TYPE(decltype(cm.rend()), typename M::const_reverse_iterator);
+  static_assert(noexcept(m.rbegin()));
+  static_assert(noexcept(cm.rbegin()));
+  static_assert(noexcept(m.crbegin()));
+  static_assert(noexcept(m.rend()));
+  static_assert(noexcept(cm.rend()));
+  static_assert(noexcept(m.crend()));
+  assert(m.size() == 6);
+  assert(std::distance(m.rbegin(), m.rend()) == 6);
+  assert(std::distance(cm.rbegin(), cm.rend()) == 6);
+  assert(std::distance(m.crbegin(), m.crend()) == 6);
+  assert(std::distance(cm.crbegin(), cm.crend()) == 6);
+  typename M::reverse_iterator i; // default-construct
+  ASSERT_SAME_TYPE(decltype(*i), const int&);
+  i                                    = m.rbegin(); // move-assignment
+  typename M::const_reverse_iterator k = i;          // converting constructor
+  assert(i == k);                                    // comparison
+  for (int j = 5; j >= 0; --j, ++i) {                // pre-increment
+    assert(*i == expected[j]);
+  }
+  assert(i == m.rend());
+  for (int j = 0; j <= 5; ++j) {
+    --i; // pre-decrement
+    assert(*i == expected[j]);
   }
+  assert(i == m.rbegin());
+}
+
+constexpr bool test() {
+  test_one<std::vector<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
+  test_one<MinSequenceContainer<int>>();
+  test_one<std::vector<int, min_allocator<int>>>();
+
   {
     // N3644 testing
     using C = std::flat_multiset<int>;
@@ -80,10 +93,15 @@ void test() {
     assert(!(ii1 != cii));
     assert(!(cii != ii1));
   }
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/clear.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/clear.pass.cpp
index 4d01ece7ed6a6..088a8838ad8ae 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/clear.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/clear.pass.cpp
@@ -38,7 +38,7 @@ static_assert(NoExceptClear<std::flat_multiset<int, std::less<int>, ThrowOnMoveC
 #endif
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
   {
@@ -58,17 +58,25 @@ void test_one() {
   }
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
   test_one<std::vector<int, min_allocator<int>>>();
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace.pass.cpp
index 3ef13964c905e..6772e17378b70 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace.pass.cpp
@@ -28,7 +28,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
   using R   = typename M::iterator;
@@ -91,7 +91,7 @@ void test_one() {
 }
 
 template <class KeyContainer>
-void test_emplaceable() {
+constexpr void test_emplaceable() {
   using M = std::flat_multiset<Emplaceable, std::less<Emplaceable>, KeyContainer>;
   using R = typename M::iterator;
 
@@ -111,16 +111,24 @@ void test_emplaceable() {
   assert(*r == Emplaceable(1, 3.5));
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
 
   test_emplaceable<std::vector<Emplaceable>>();
-  test_emplaceable<std::deque<Emplaceable>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_emplaceable<std::deque<Emplaceable>>();
   test_emplaceable<MinSequenceContainer<Emplaceable>>();
   test_emplaceable<std::vector<Emplaceable, min_allocator<Emplaceable>>>();
+
+  return true;
 }
 
 void test_exception() {
@@ -130,6 +138,9 @@ void test_exception() {
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
   test_exception();
 
   return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace_hint.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace_hint.pass.cpp
index 41a2e9c4ce115..ec99a9fcc1d9b 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace_hint.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace_hint.pass.cpp
@@ -27,11 +27,11 @@
 #include "../helpers.h"
 
 struct CompareTensDigit {
-  bool operator()(auto lhs, auto rhs) const { return (lhs / 10) < (rhs / 10); }
+  constexpr bool operator()(auto lhs, auto rhs) const { return (lhs / 10) < (rhs / 10); }
 };
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
   using R   = M::iterator;
@@ -179,7 +179,6 @@ void test_one() {
     assert(r == m.begin() + 2);
     assert(m.size() == 7);
     assert(*r == 23);
-    assert(*std::next(r) == 20);
   }
   {
     // hint incorrect and after the last duplicate
@@ -196,7 +195,7 @@ void test_one() {
 }
 
 template <class KeyContainer>
-void test_emplaceable() {
+constexpr void test_emplaceable() {
   using M = std::flat_multiset<Emplaceable, std::less<Emplaceable>, KeyContainer>;
   using R = M::iterator;
 
@@ -216,9 +215,12 @@ void test_emplaceable() {
   assert(*r == Emplaceable(1, 3.5));
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
 
@@ -226,6 +228,8 @@ void test() {
   test_emplaceable<std::vector<Emplaceable>>();
   test_emplaceable<MinSequenceContainer<Emplaceable>>();
   test_emplaceable<std::vector<Emplaceable, min_allocator<Emplaceable>>>();
+
+  return true;
 }
 
 void test_exception() {
@@ -235,6 +239,9 @@ void test_exception() {
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
   test_exception();
 
   return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter.pass.cpp
index 8418efa67bb23..f2cb151d8661b 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter.pass.cpp
@@ -27,7 +27,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
   using I   = M::iterator;
@@ -94,11 +94,16 @@ void test_one() {
   assert(i8 == m.end());
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
+
+  return true;
 }
 
 void test_exception() {
@@ -108,6 +113,9 @@ void test_exception() {
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
   test_exception();
 
   return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter_iter.pass.cpp
index 2d54fef17b6c0..76078920af1bf 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter_iter.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter_iter.pass.cpp
@@ -26,7 +26,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
   using I   = M::iterator;
@@ -78,11 +78,16 @@ void test_one() {
   assert(i5 == m.end());
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
+
+  return true;
 }
 
 void test_exception() {
@@ -92,6 +97,9 @@ void test_exception() {
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
   test_exception();
 
   return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key.pass.cpp
index 8175afa5b626e..7ddd3d8657066 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key.pass.cpp
@@ -26,7 +26,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer, class Compare = std::less<>>
-void test_one() {
+constexpr void test_one() {
   using M = std::flat_multiset<int, Compare, KeyContainer>;
 
   auto make = [](std::initializer_list<int> il) {
@@ -74,12 +74,17 @@ void test_one() {
   assert(m.empty());
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
   test_one<std::vector<int>, std::greater<>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
+
+  return true;
 }
 
 void test_exception() {
@@ -94,6 +99,9 @@ void test_exception() {
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
   test_exception();
 
   return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key_transparent.pass.cpp
index a8765495d91d4..0613744ec5e39 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key_transparent.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key_transparent.pass.cpp
@@ -38,10 +38,10 @@ static_assert(!CanErase<const NonTransparentSet>);
 
 template <class Key, class It>
 struct HeterogeneousKey {
-  explicit HeterogeneousKey(Key key, It it) : key_(key), it_(it) {}
-  operator It() && { return it_; }
-  auto operator<=>(Key key) const { return key_ <=> key; }
-  friend bool operator<(const HeterogeneousKey&, const HeterogeneousKey&) {
+  constexpr explicit HeterogeneousKey(Key key, It it) : key_(key), it_(it) {}
+  constexpr operator It() && { return it_; }
+  constexpr auto operator<=>(Key key) const { return key_ <=> key; }
+  constexpr friend bool operator<(const HeterogeneousKey&, const HeterogeneousKey&) {
     assert(false);
     return false;
   }
@@ -50,7 +50,7 @@ struct HeterogeneousKey {
 };
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
 
@@ -70,7 +70,7 @@ void test_one() {
 }
 
 template <class KeyContainer>
-void test_transparent_comparator() {
+constexpr void test_transparent_comparator() {
   using M = std::flat_multiset<std::string, TransparentComparator, KeyContainer>;
   {
     M m = {"alpha", "beta", "beta", "epsilon", "epsilon", "epsilon", "eta", "eta", "gamma"};
@@ -95,14 +95,20 @@ void test_transparent_comparator() {
   }
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
 
   test_transparent_comparator<std::vector<std::string>>();
-  test_transparent_comparator<std::deque<std::string>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_transparent_comparator<std::deque<std::string>>();
   test_transparent_comparator<MinSequenceContainer<std::string>>();
   test_transparent_comparator<std::vector<std::string, min_allocator<std::string>>>();
 
@@ -146,6 +152,8 @@ void test() {
     assert(n == 2);
     assert((m == M{"alpha", "epsilon", "eta", "gamma"}));
   }
+
+  return true;
 }
 
 void test_exception() {
@@ -159,6 +167,9 @@ void test_exception() {
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
   test_exception();
 
   return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/extract.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/extract.pass.cpp
index 8a66431396916..bb41cedf85497 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/extract.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/extract.pass.cpp
@@ -33,7 +33,7 @@ static_assert(!CanExtract<std::flat_multiset<int> const&>);
 static_assert(!CanExtract<std::flat_multiset<int> const&&>);
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using M = std::flat_multiset<int, std::less<int>, KeyContainer>;
   {
     M m = M({1, 1, 3});
@@ -55,9 +55,12 @@ void test_one() {
   }
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
 
@@ -70,6 +73,8 @@ void test() {
     check_invariant(m);
     LIBCPP_ASSERT(m.empty());
   }
+
+  return true;
 }
 
 void test_exception() {
@@ -96,6 +101,9 @@ void test_exception() {
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
   test_exception();
 
   return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_cv.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_cv.pass.cpp
index eeb1bdd26ca16..5128a40ada694 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_cv.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_cv.pass.cpp
@@ -23,7 +23,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
   using R   = typename M::iterator;
@@ -61,11 +61,16 @@ void test_one() {
   assert(*r == 1);
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
+
+  return true;
 }
 
 void test_exception() {
@@ -79,6 +84,9 @@ void test_exception() {
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
   test_exception();
 
   return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_initializer_list.pass.cpp
index 9c56d3bfb750b..f0b1eaf377c5d 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_initializer_list.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_initializer_list.pass.cpp
@@ -23,7 +23,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
 
@@ -65,11 +65,16 @@ void test_one() {
   }
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
+
+  return true;
 }
 
 void test_exception() {
@@ -84,6 +89,9 @@ void test_exception() {
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
   test_exception();
 
   return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_cv.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_cv.pass.cpp
index 61f00f5138118..55a77d576dacc 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_cv.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_cv.pass.cpp
@@ -23,7 +23,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
   using R   = typename M::iterator;
@@ -61,11 +61,16 @@ void test_one() {
   assert(*r == 1);
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
+
+  return true;
 }
 
 void test_exception() {
@@ -80,6 +85,9 @@ void test_exception() {
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
   test_exception();
 
   return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_iter.pass.cpp
index 93815686787c4..9b10bf3fbb1a4 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_iter.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_iter.pass.cpp
@@ -37,7 +37,7 @@ static_assert(!CanInsert<Set, int, int>);
 static_assert(!CanInsert<Set, cpp20_input_iterator<int*>, cpp20_input_iterator<int*>>);
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using M = std::flat_multiset<int, std::less<int>, KeyContainer>;
 
   int ar1[] = {
@@ -75,9 +75,12 @@ void test_one() {
   assert(m == expected2);
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
   {
@@ -86,6 +89,8 @@ void test() {
     m.insert(v.begin(), v.end());
     assert(std::ranges::equal(m, std::vector<int>{1, 2, 3, 4}));
   }
+
+  return true;
 }
 
 void test_exception() {
@@ -95,6 +100,9 @@ void test_exception() {
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
   test_exception();
 
   return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_rv.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_rv.pass.cpp
index 9976c04c9973a..8bbc6c80e4ef7 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_rv.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_rv.pass.cpp
@@ -22,7 +22,7 @@
 #include "test_macros.h"
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
   using V   = Key;
@@ -59,15 +59,22 @@ void test_one() {
   assert(*r == V(1));
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
   test_one<std::vector<MoveOnly>>();
-  test_one<std::deque<int>>();
-  test_one<std::deque<MoveOnly>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test_one<std::deque<int>>();
+    test_one<std::deque<MoveOnly>>();
+  }
   test_one<MinSequenceContainer<int>>();
   test_one<MinSequenceContainer<MoveOnly>>();
   test_one<std::vector<int, min_allocator<int>>>();
   test_one<std::vector<MoveOnly, min_allocator<MoveOnly>>>();
+
+  return true;
 }
 
 void test_exception() {
@@ -82,6 +89,9 @@ void test_exception() {
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
   test_exception();
 
   return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_range.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_range.pass.cpp
index 566be3921bf77..a9d8f7e330fed 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_range.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_range.pass.cpp
@@ -39,7 +39,7 @@ static_assert(!CanInsertRange<Set, std::ranges::subrange<std::pair<int, int>*>>)
 static_assert(!CanInsertRange<Set, std::ranges::subrange<std::pair<short, short>*>>);
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
 
   {
@@ -72,9 +72,12 @@ void test_one() {
   }
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
   {
@@ -85,6 +88,8 @@ void test() {
     MoveOnly expected[] = {1, 1, 3, 4, 5};
     assert(std::ranges::equal(m, expected));
   }
+
+  return true;
 }
 
 void test_exception() {
@@ -94,6 +99,9 @@ void test_exception() {
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
   test_exception();
 
   return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_rv.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_rv.pass.cpp
index 9328c42fb0cda..67f3036a8dae7 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_rv.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_rv.pass.cpp
@@ -25,7 +25,7 @@
 #include "../helpers.h"
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, TransparentComparator, KeyContainer>;
   using R   = typename M::iterator;
@@ -63,15 +63,22 @@ void test_one() {
   assert(*r == V(1));
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
   test_one<std::vector<MoveOnly>>();
-  test_one<std::deque<int>>();
-  test_one<std::deque<MoveOnly>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test_one<std::deque<int>>();
+    test_one<std::deque<MoveOnly>>();
+  }
   test_one<MinSequenceContainer<int>>();
   test_one<MinSequenceContainer<MoveOnly>>();
   test_one<std::vector<int, min_allocator<int>>>();
   test_one<std::vector<MoveOnly, min_allocator<MoveOnly>>>();
+
+  return true;
 }
 
 void test_exception() {
@@ -86,6 +93,9 @@ void test_exception() {
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
   test_exception();
 
   return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_initializer_list.pass.cpp
index 11af199c3d1ee..81b7e4e196b30 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_initializer_list.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_initializer_list.pass.cpp
@@ -23,7 +23,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
   {
@@ -42,11 +42,16 @@ void test_one() {
   }
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
+
+  return true;
 }
 
 void test_exception() {
@@ -61,6 +66,9 @@ void test_exception() {
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
   test_exception();
 
   return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_iter_iter.pass.cpp
index 07b62d04e0ebc..bfb230718fb6f 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_iter_iter.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_iter_iter.pass.cpp
@@ -36,7 +36,7 @@ static_assert(!CanInsert<Set, std::sorted_equivalent_t, int, int>);
 static_assert(!CanInsert<Set, std::sorted_equivalent_t, cpp20_input_iterator<int*>, cpp20_input_iterator<int*>>);
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
 
@@ -60,11 +60,16 @@ void test_one() {
   assert(m == expected2);
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
+
+  return true;
 }
 
 void test_exception() {
@@ -76,6 +81,9 @@ void test_exception() {
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
   test_exception();
 
   return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/replace.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/replace.pass.cpp
index 5fe61389d72a1..3c74cf6ebe995 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/replace.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/replace.pass.cpp
@@ -31,7 +31,7 @@ static_assert(CanReplace<Set, std::vector<int>>);
 static_assert(!CanReplace<Set, const std::vector<int>&>);
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
   {
@@ -53,11 +53,16 @@ void test_one() {
   }
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
+
+  return true;
 }
 
 void test_exception() {
@@ -82,6 +87,9 @@ void test_exception() {
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
   test_exception();
 
   return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_free.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_free.pass.cpp
index 2e3ed02c3c00e..241f2cf9e0a73 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_free.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_free.pass.cpp
@@ -38,7 +38,7 @@ static_assert(NoExceptAdlSwap<std::flat_multiset<int, std::less<int>, ThrowOnMov
 #endif
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
 
@@ -84,15 +84,23 @@ void test_one() {
   }
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_member.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_member.pass.cpp
index 1d0d9152d1c1f..7ad96ed340955 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_member.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_member.pass.cpp
@@ -37,7 +37,7 @@ static_assert(NoExceptMemberSwap<std::flat_multiset<int, std::less<int>, ThrowOn
 #endif
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
   {
@@ -82,15 +82,23 @@ void test_one() {
   }
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.observers/comp.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.observers/comp.pass.cpp
index 4ca64516e242f..74c92f3a3f843 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.observers/comp.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.observers/comp.pass.cpp
@@ -21,7 +21,7 @@
 
 #include "test_macros.h"
 
-void test() {
+constexpr bool test() {
   {
     using M    = std::flat_multiset<int>;
     using Comp = std::less<int>; // the default
@@ -36,7 +36,7 @@ void test() {
     assert(vc(1, 2));
     assert(!vc(2, 1));
   }
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     using Comp = std::function<bool(int, int)>;
     using M    = std::flat_multiset<int, Comp>;
     Comp comp  = std::greater<int>();
@@ -67,10 +67,15 @@ void test() {
     assert(vc(1, 2));
     assert(!vc(2, 1));
   }
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains.pass.cpp
index 00fda6c2edd88..a178dfd3d0cb5 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains.pass.cpp
@@ -23,7 +23,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   {
     using M = std::flat_multiset<Key, std::less<>, KeyContainer>;
@@ -66,15 +66,23 @@ void test_one() {
   }
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains_transparent.pass.cpp
index abee2b1bb12f9..3222762122f88 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains_transparent.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains_transparent.pass.cpp
@@ -35,7 +35,7 @@ static_assert(!CanContains<NonTransparentSet>);
 static_assert(!CanContains<const NonTransparentSet>);
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, TransparentComparator, KeyContainer>;
 
@@ -60,9 +60,12 @@ void test_one() {
   }
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<std::string>>();
-  test_one<std::deque<std::string>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<std::string>>();
   test_one<MinSequenceContainer<std::string>>();
   test_one<std::vector<std::string, min_allocator<std::string>>>();
 
@@ -82,10 +85,15 @@ void test() {
     assert(m.contains("beta"));
     assert(!m.contains("charlie"));
   }
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count.pass.cpp
index 1752dab0e0e3a..8b034dfa1423c 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count.pass.cpp
@@ -23,7 +23,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using S   = typename KeyContainer::size_type;
 
@@ -66,15 +66,23 @@ void test_one() {
   }
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count_transparent.pass.cpp
index a9160aebb7517..a1a0d6b1f0310 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count_transparent.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count_transparent.pass.cpp
@@ -35,7 +35,7 @@ static_assert(!CanCount<NonTransparentSet>);
 static_assert(!CanCount<const NonTransparentSet>);
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, TransparentComparator, KeyContainer>;
   {
@@ -59,9 +59,12 @@ void test_one() {
   }
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<std::string>>();
-  test_one<std::deque<std::string>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<std::string>>();
   test_one<MinSequenceContainer<std::string>>();
   test_one<std::vector<std::string, min_allocator<std::string>>>();
 
@@ -81,10 +84,15 @@ void test() {
     auto n  = m.count("beta");
     assert(n == 2);
   }
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range.pass.cpp
index 54ae27e9ba19c..b105d1914113a 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range.pass.cpp
@@ -24,7 +24,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   {
     using M  = std::flat_multiset<Key, std::less<>, KeyContainer>;
@@ -74,15 +74,23 @@ void test_one() {
   }
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range_transparent.pass.cpp
index ae16ec1127f31..65bff7a095dc6 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range_transparent.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range_transparent.pass.cpp
@@ -36,7 +36,7 @@ static_assert(!CanEqualRange<NonTransparentSet>);
 static_assert(!CanEqualRange<const NonTransparentSet>);
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, TransparentComparator, KeyContainer>;
 
@@ -90,9 +90,12 @@ void test_one() {
   }
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<std::string>>();
-  test_one<std::deque<std::string>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<std::string>>();
   test_one<MinSequenceContainer<std::string>>();
   test_one<std::vector<std::string, min_allocator<std::string>>>();
 
@@ -113,10 +116,15 @@ void test() {
     assert(first == m.begin() + 1);
     assert(last == m.begin() + 3);
   }
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find.pass.cpp
index 49386a6f77fae..bc9a439eecbb9 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find.pass.cpp
@@ -25,7 +25,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, std::less<>, KeyContainer>;
   {
@@ -50,15 +50,23 @@ void test_one() {
   }
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find_transparent.pass.cpp
index 9d0b75c7b52bc..4c9c403464634 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find_transparent.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find_transparent.pass.cpp
@@ -36,7 +36,7 @@ static_assert(!CanFind<NonTransparentSet>);
 static_assert(!CanFind<const NonTransparentSet>);
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, TransparentComparator, KeyContainer>;
 
@@ -77,9 +77,12 @@ void test_one() {
   }
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<std::string>>();
-  test_one<std::deque<std::string>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<std::string>>();
   test_one<MinSequenceContainer<std::string>>();
   test_one<std::vector<std::string, min_allocator<std::string>>>();
 
@@ -101,10 +104,15 @@ void test() {
     auto it2 = m.find("charlie");
     assert(it2 == m.end());
   }
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound.pass.cpp
index ba41b822fda74..07f053316ad32 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound.pass.cpp
@@ -24,7 +24,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   {
     using M = std::flat_multiset<Key, std::less<>, KeyContainer>;
@@ -66,15 +66,23 @@ void test_one() {
   }
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound_transparent.pass.cpp
index c03fb27a7c27e..e674c85ab30e6 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound_transparent.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound_transparent.pass.cpp
@@ -36,7 +36,7 @@ static_assert(!CanLowerBound<NonTransparentSet>);
 static_assert(!CanLowerBound<const NonTransparentSet>);
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, TransparentComparator, KeyContainer>;
 
@@ -83,9 +83,12 @@ void test_one() {
   }
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<std::string>>();
-  test_one<std::deque<std::string>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<std::string>>();
   test_one<MinSequenceContainer<std::string>>();
   test_one<std::vector<std::string, min_allocator<std::string>>>();
 
@@ -107,10 +110,15 @@ void test() {
     auto it2 = m.lower_bound("charlie");
     assert(it2 == m.begin() + 3);
   }
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound.pass.cpp
index 7828f0500c8b9..d4d19926571d7 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound.pass.cpp
@@ -24,7 +24,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   {
     using M = std::flat_multiset<Key, std::less<>, KeyContainer>;
@@ -67,15 +67,23 @@ void test_one() {
   }
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound_transparent.pass.cpp
index de517fd7e520a..75140a780cceb 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound_transparent.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound_transparent.pass.cpp
@@ -36,7 +36,7 @@ static_assert(!CanUpperBound<NonTransparentSet>);
 static_assert(!CanUpperBound<const NonTransparentSet>);
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, TransparentComparator, KeyContainer>;
 
@@ -83,9 +83,12 @@ void test_one() {
   }
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<std::string>>();
-  test_one<std::deque<std::string>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<std::string>>();
   test_one<MinSequenceContainer<std::string>>();
   test_one<std::vector<std::string, min_allocator<std::string>>>();
 
@@ -105,10 +108,15 @@ void test() {
     auto it = m.upper_bound("beta");
     assert(it == m.begin() + 3);
   }
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/helpers.h b/libcxx/test/std/containers/container.adaptors/flat.multiset/helpers.h
index e7ed8a091d3be..82f917756e92c 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/helpers.h
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/helpers.h
@@ -20,7 +20,7 @@
 #include "test_macros.h"
 
 template <class... Args>
-void check_invariant(const std::flat_multiset<Args...>& m) {
+constexpr void check_invariant(const std::flat_multiset<Args...>& m) {
   assert(std::is_sorted(m.begin(), m.end(), m.key_comp()));
 }
 
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/op_compare.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/op_compare.pass.cpp
index 94f0f2b34abcc..606cdfc3ba7d2 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/op_compare.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/op_compare.pass.cpp
@@ -31,7 +31,7 @@
 #include "test_container_comparisons.h"
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
 
   {
@@ -64,9 +64,12 @@ void test_one() {
   }
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
 
@@ -81,7 +84,7 @@ void test() {
   {
     // Comparisons use value_type's native operators, not the comparator
     struct StrongComp {
-      bool operator()(double a, double b) const { return std::strong_order(a, b) < 0; }
+      constexpr bool operator()(double a, double b) const { return std::strong_order(a, b) < 0; }
     };
     using C = std::flat_multiset<double, StrongComp>;
     C s1    = {1};
@@ -96,10 +99,15 @@ void test() {
     assert(s1 != s2);
     assert((s1 <=> s2) == std::partial_ordering::unordered);
   }
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/setbuf.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/setbuf.pass.cpp
index 9d14abcedd423..72af0a2db1180 100644
--- a/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/setbuf.pass.cpp
+++ b/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/setbuf.pass.cpp
@@ -6,9 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// TODO(mordante) Investigate
-// UNSUPPORTED: apple-clang
-
 // <fstream>
 
 // basic_streambuf<charT, traits>* setbuf(char_type* s, streamsize n) override;
diff --git a/libcxx/test/std/input.output/iostream.format/input.streams/istream.unformatted/sync.pass.cpp b/libcxx/test/std/input.output/iostream.format/input.streams/istream.unformatted/sync.pass.cpp
index 3b685950d36a6..79d20ce68d11b 100644
--- a/libcxx/test/std/input.output/iostream.format/input.streams/istream.unformatted/sync.pass.cpp
+++ b/libcxx/test/std/input.output/iostream.format/input.streams/istream.unformatted/sync.pass.cpp
@@ -6,9 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// TODO(mordante) Investigate
-// UNSUPPORTED: apple-clang
-
 // <istream>
 
 // int sync();
diff --git a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.elem/arrow.pass.cpp b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.elem/arrow.pass.cpp
index 665a1a89223bc..a238b753d1f15 100644
--- a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.elem/arrow.pass.cpp
+++ b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.elem/arrow.pass.cpp
@@ -17,10 +17,10 @@
 // LWG 198 was superseded by LWG 2360
 //    http://www.open-std.org/jtc1/sc22/wg21/docs/lwg-defects.html#2360
 
-
+#include <cassert>
 #include <iterator>
 #include <list>
-#include <cassert>
+#include <type_traits>
 
 #include "test_macros.h"
 
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/flat_map.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/flat_map.version.compile.pass.cpp
index 9c06eee27e0c8..26c8e1bc7d66f 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/flat_map.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/flat_map.version.compile.pass.cpp
@@ -20,30 +20,50 @@
 
 #if TEST_STD_VER < 14
 
+#  ifdef __cpp_lib_constexpr_flat_map
+#    error "__cpp_lib_constexpr_flat_map should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_flat_map
 #    error "__cpp_lib_flat_map should not be defined before c++23"
 #  endif
 
 #elif TEST_STD_VER == 14
 
+#  ifdef __cpp_lib_constexpr_flat_map
+#    error "__cpp_lib_constexpr_flat_map should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_flat_map
 #    error "__cpp_lib_flat_map should not be defined before c++23"
 #  endif
 
 #elif TEST_STD_VER == 17
 
+#  ifdef __cpp_lib_constexpr_flat_map
+#    error "__cpp_lib_constexpr_flat_map should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_flat_map
 #    error "__cpp_lib_flat_map should not be defined before c++23"
 #  endif
 
 #elif TEST_STD_VER == 20
 
+#  ifdef __cpp_lib_constexpr_flat_map
+#    error "__cpp_lib_constexpr_flat_map should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_flat_map
 #    error "__cpp_lib_flat_map should not be defined before c++23"
 #  endif
 
 #elif TEST_STD_VER == 23
 
+#  ifdef __cpp_lib_constexpr_flat_map
+#    error "__cpp_lib_constexpr_flat_map should not be defined before c++26"
+#  endif
+
 #  ifndef __cpp_lib_flat_map
 #    error "__cpp_lib_flat_map should be defined in c++23"
 #  endif
@@ -53,6 +73,13 @@
 
 #elif TEST_STD_VER > 23
 
+#  ifndef __cpp_lib_constexpr_flat_map
+#    error "__cpp_lib_constexpr_flat_map should be defined in c++26"
+#  endif
+#  if __cpp_lib_constexpr_flat_map != 202502L
+#    error "__cpp_lib_constexpr_flat_map should have the value 202502L in c++26"
+#  endif
+
 #  ifndef __cpp_lib_flat_map
 #    error "__cpp_lib_flat_map should be defined in c++26"
 #  endif
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/flat_set.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/flat_set.version.compile.pass.cpp
index 5985bdc2d7d4f..b29da9fdbe649 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/flat_set.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/flat_set.version.compile.pass.cpp
@@ -20,30 +20,50 @@
 
 #if TEST_STD_VER < 14
 
+#  ifdef __cpp_lib_constexpr_flat_set
+#    error "__cpp_lib_constexpr_flat_set should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_flat_set
 #    error "__cpp_lib_flat_set should not be defined before c++23"
 #  endif
 
 #elif TEST_STD_VER == 14
 
+#  ifdef __cpp_lib_constexpr_flat_set
+#    error "__cpp_lib_constexpr_flat_set should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_flat_set
 #    error "__cpp_lib_flat_set should not be defined before c++23"
 #  endif
 
 #elif TEST_STD_VER == 17
 
+#  ifdef __cpp_lib_constexpr_flat_set
+#    error "__cpp_lib_constexpr_flat_set should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_flat_set
 #    error "__cpp_lib_flat_set should not be defined before c++23"
 #  endif
 
 #elif TEST_STD_VER == 20
 
+#  ifdef __cpp_lib_constexpr_flat_set
+#    error "__cpp_lib_constexpr_flat_set should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_flat_set
 #    error "__cpp_lib_flat_set should not be defined before c++23"
 #  endif
 
 #elif TEST_STD_VER == 23
 
+#  ifdef __cpp_lib_constexpr_flat_set
+#    error "__cpp_lib_constexpr_flat_set should not be defined before c++26"
+#  endif
+
 #  ifndef __cpp_lib_flat_set
 #    error "__cpp_lib_flat_set should be defined in c++23"
 #  endif
@@ -53,6 +73,13 @@
 
 #elif TEST_STD_VER > 23
 
+#  ifndef __cpp_lib_constexpr_flat_set
+#    error "__cpp_lib_constexpr_flat_set should be defined in c++26"
+#  endif
+#  if __cpp_lib_constexpr_flat_set != 202502L
+#    error "__cpp_lib_constexpr_flat_set should have the value 202502L in c++26"
+#  endif
+
 #  ifndef __cpp_lib_flat_set
 #    error "__cpp_lib_flat_set should be defined in c++26"
 #  endif
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/type_traits.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/type_traits.version.compile.pass.cpp
index 0074f3bf4cc57..cb5c008f16bb3 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/type_traits.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/type_traits.version.compile.pass.cpp
@@ -918,7 +918,7 @@
 #    endif
 #  endif
 
-#  if !defined(_LIBCPP_VERSION)
+#  if __has_builtin(__builtin_is_within_lifetime)
 #    ifndef __cpp_lib_is_within_lifetime
 #      error "__cpp_lib_is_within_lifetime should be defined in c++26"
 #    endif
@@ -927,7 +927,7 @@
 #    endif
 #  else
 #    ifdef __cpp_lib_is_within_lifetime
-#      error "__cpp_lib_is_within_lifetime should not be defined because it is unimplemented in libc++!"
+#      error "__cpp_lib_is_within_lifetime should not be defined when the requirement '__has_builtin(__builtin_is_within_lifetime)' is not met!"
 #    endif
 #  endif
 
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
index 05af1fb0cf14b..8189c5c4e5985 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
@@ -204,6 +204,14 @@
 #    error "__cpp_lib_constexpr_dynamic_alloc should not be defined before c++20"
 #  endif
 
+#  ifdef __cpp_lib_constexpr_flat_map
+#    error "__cpp_lib_constexpr_flat_map should not be defined before c++26"
+#  endif
+
+#  ifdef __cpp_lib_constexpr_flat_set
+#    error "__cpp_lib_constexpr_flat_set should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_constexpr_forward_list
 #    error "__cpp_lib_constexpr_forward_list should not be defined before c++26"
 #  endif
@@ -1116,6 +1124,14 @@
 #    error "__cpp_lib_constexpr_dynamic_alloc should not be defined before c++20"
 #  endif
 
+#  ifdef __cpp_lib_constexpr_flat_map
+#    error "__cpp_lib_constexpr_flat_map should not be defined before c++26"
+#  endif
+
+#  ifdef __cpp_lib_constexpr_flat_set
+#    error "__cpp_lib_constexpr_flat_set should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_constexpr_forward_list
 #    error "__cpp_lib_constexpr_forward_list should not be defined before c++26"
 #  endif
@@ -2130,6 +2146,14 @@
 #    error "__cpp_lib_constexpr_dynamic_alloc should not be defined before c++20"
 #  endif
 
+#  ifdef __cpp_lib_constexpr_flat_map
+#    error "__cpp_lib_constexpr_flat_map should not be defined before c++26"
+#  endif
+
+#  ifdef __cpp_lib_constexpr_flat_set
+#    error "__cpp_lib_constexpr_flat_set should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_constexpr_forward_list
 #    error "__cpp_lib_constexpr_forward_list should not be defined before c++26"
 #  endif
@@ -3384,6 +3408,14 @@
 #    error "__cpp_lib_constexpr_dynamic_alloc should have the value 201907L in c++20"
 #  endif
 
+#  ifdef __cpp_lib_constexpr_flat_map
+#    error "__cpp_lib_constexpr_flat_map should not be defined before c++26"
+#  endif
+
+#  ifdef __cpp_lib_constexpr_flat_set
+#    error "__cpp_lib_constexpr_flat_set should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_constexpr_forward_list
 #    error "__cpp_lib_constexpr_forward_list should not be defined before c++26"
 #  endif
@@ -4860,6 +4892,14 @@
 #    error "__cpp_lib_constexpr_dynamic_alloc should have the value 201907L in c++23"
 #  endif
 
+#  ifdef __cpp_lib_constexpr_flat_map
+#    error "__cpp_lib_constexpr_flat_map should not be defined before c++26"
+#  endif
+
+#  ifdef __cpp_lib_constexpr_flat_set
+#    error "__cpp_lib_constexpr_flat_set should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_constexpr_forward_list
 #    error "__cpp_lib_constexpr_forward_list should not be defined before c++26"
 #  endif
@@ -6549,6 +6589,20 @@
 #    error "__cpp_lib_constexpr_dynamic_alloc should have the value 201907L in c++26"
 #  endif
 
+#  ifndef __cpp_lib_constexpr_flat_map
+#    error "__cpp_lib_constexpr_flat_map should be defined in c++26"
+#  endif
+#  if __cpp_lib_constexpr_flat_map != 202502L
+#    error "__cpp_lib_constexpr_flat_map should have the value 202502L in c++26"
+#  endif
+
+#  ifndef __cpp_lib_constexpr_flat_set
+#    error "__cpp_lib_constexpr_flat_set should be defined in c++26"
+#  endif
+#  if __cpp_lib_constexpr_flat_set != 202502L
+#    error "__cpp_lib_constexpr_flat_set should have the value 202502L in c++26"
+#  endif
+
 #  ifndef __cpp_lib_constexpr_forward_list
 #    error "__cpp_lib_constexpr_forward_list should be defined in c++26"
 #  endif
@@ -7256,7 +7310,7 @@
 #    endif
 #  endif
 
-#  if !defined(_LIBCPP_VERSION)
+#  if __has_builtin(__builtin_is_within_lifetime)
 #    ifndef __cpp_lib_is_within_lifetime
 #      error "__cpp_lib_is_within_lifetime should be defined in c++26"
 #    endif
@@ -7265,7 +7319,7 @@
 #    endif
 #  else
 #    ifdef __cpp_lib_is_within_lifetime
-#      error "__cpp_lib_is_within_lifetime should not be defined because it is unimplemented in libc++!"
+#      error "__cpp_lib_is_within_lifetime should not be defined when the requirement '__has_builtin(__builtin_is_within_lifetime)' is not met!"
 #    endif
 #  endif
 
diff --git a/libcxx/test/std/localization/locale.categories/category.collate/locale.collate.byname/compare.pass.cpp b/libcxx/test/std/localization/locale.categories/category.collate/locale.collate.byname/compare.pass.cpp
index 4905ed40f4a24..8ae6bc2d3ba66 100644
--- a/libcxx/test/std/localization/locale.categories/category.collate/locale.collate.byname/compare.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.collate/locale.collate.byname/compare.pass.cpp
@@ -6,9 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// TODO(mordante) Investigate
-// UNSUPPORTED: apple-clang
-
 // Bionic has minimal locale support, investigate this later.
 // XFAIL: LIBCXX-ANDROID-FIXME
 
@@ -56,14 +53,7 @@ int main(int, char**)
             ASSERT_COMPARE(std::string, "AAA", "BBB", -1);
             ASSERT_COMPARE(std::string, "bbb", "aaa", 1);
             ASSERT_COMPARE(std::string, "ccc", "ccc", 0);
-
-#if defined(__APPLE__)
-            // Apple's default collation is case-sensitive
-            ASSERT_COMPARE(std::string, "aaaaaaA", "BaaaaaA", 1);
-#else
-            // Glibc, Windows, and FreeBSD's default collation is case-insensitive
             ASSERT_COMPARE(std::string, "aaaaaaA", "BaaaaaA", -1);
-#endif
         }
 #ifndef TEST_HAS_NO_WIDE_CHARACTERS
         {
@@ -73,13 +63,7 @@ int main(int, char**)
             ASSERT_COMPARE(std::wstring, L"AAA", L"BBB", -1);
             ASSERT_COMPARE(std::wstring, L"bbb", L"aaa", 1);
             ASSERT_COMPARE(std::wstring, L"ccc", L"ccc", 0);
-#if defined(__APPLE__)
-            // Apple's default collation is case-sensitive
-            ASSERT_COMPARE(std::wstring, L"aaaaaaA", L"BaaaaaA", 1);
-#else
-            // Glibc, Windows, and FreeBSD's default collation is case-insensitive
             ASSERT_COMPARE(std::wstring, L"aaaaaaA", L"BaaaaaA", -1);
-#endif
         }
 #endif
     }
diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_fr_FR.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_fr_FR.pass.cpp
index ea6b07934510a..c9ed59f3cb9aa 100644
--- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_fr_FR.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_fr_FR.pass.cpp
@@ -6,11 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// TODO(mordante) Investigate
-// UNSUPPORTED: apple-clang
-
-// XFAIL: darwin
-
 // NetBSD does not support LC_MONETARY at the moment
 // XFAIL: netbsd
 
diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_ru_RU.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_ru_RU.pass.cpp
index f98758d086de1..371cf0e90c8d3 100644
--- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_ru_RU.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_ru_RU.pass.cpp
@@ -6,9 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// TODO(mordante) Investigate
-// UNSUPPORTED: apple-clang
-
 // NetBSD does not support LC_MONETARY at the moment
 // XFAIL: netbsd
 
diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_zh_CN.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_zh_CN.pass.cpp
index 6980b7ae77db0..c86df7e6b53bf 100644
--- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_zh_CN.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_zh_CN.pass.cpp
@@ -6,9 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// TODO(mordante) Investigate
-// UNSUPPORTED: apple-clang
-
 // NetBSD does not support LC_MONETARY at the moment
 // XFAIL: netbsd
 
@@ -158,7 +155,7 @@ int main(int, char**)
             std::noshowbase(ios);
         }
         {   // negative one, showbase
-#ifdef _AIX
+#if defined(_AIX) || defined(__APPLE__)
           std::string v = "-" + currency_symbol + "0.01";
 #else
           std::string v = currency_symbol + "-0.01";
@@ -172,7 +169,7 @@ int main(int, char**)
           assert(ex == -1);
         }
         {   // negative one, showbase
-#ifdef _AIX
+#if defined(_AIX) || defined(__APPLE__)
           std::string v = "-" + currency_symbol + "0.01";
 #else
           std::string v = currency_symbol + "-0.01";
@@ -212,7 +209,7 @@ int main(int, char**)
             std::noshowbase(ios);
         }
         {   // negative, showbase
-#ifdef _AIX
+#if defined(_AIX) || defined(__APPLE__)
           std::string v = "-" + currency_symbol + "1,234,567.89";
 #else
           std::string v = currency_symbol + "-1,234,567.89";
@@ -333,7 +330,7 @@ int main(int, char**)
             std::noshowbase(ios);
         }
         {   // negative one, showbase
-#if defined(TEST_HAS_GLIBC) || defined(_AIX)
+#if defined(TEST_HAS_GLIBC) || defined(_AIX) || defined(__APPLE__)
           std::string v = "-" + currency_name + "0.01";
 #else
             std::string v = currency_name + "-0.01";
@@ -348,7 +345,7 @@ int main(int, char**)
             assert(ex == -1);
         }
         {   // negative one, showbase
-#if defined(TEST_HAS_GLIBC) || defined(_AIX)
+#if defined(TEST_HAS_GLIBC) || defined(_AIX) || defined(__APPLE__)
           std::string v = "-" + currency_name + "0.01";
 #else
             std::string v = currency_name + "-0.01";
@@ -389,7 +386,7 @@ int main(int, char**)
             std::noshowbase(ios);
         }
         {   // negative, showbase
-#if defined(TEST_HAS_GLIBC) || defined(_AIX)
+#if defined(TEST_HAS_GLIBC) || defined(_AIX) || defined(__APPLE__)
           std::string v = "-" + currency_name + "1,234,567.89";
 #else
             std::string v = currency_name + "-1,234,567.89";
@@ -518,7 +515,7 @@ int main(int, char**)
             std::noshowbase(ios);
         }
         {   // negative one, showbase
-#  ifdef _AIX
+#  if defined(_AIX) || defined(__APPLE__)
           std::wstring v = L"-" + w_currency_symbol + L"0.01";
 #  else
           std::wstring v = w_currency_symbol + L"-0.01";
@@ -532,7 +529,7 @@ int main(int, char**)
           assert(ex == -1);
         }
         {   // negative one, showbase
-#  ifdef _AIX
+#  if defined(_AIX) || defined(__APPLE__)
           std::wstring v = L"-" + w_currency_symbol + L"0.01";
 #  else
           std::wstring v = w_currency_symbol + L"-0.01";
@@ -572,7 +569,7 @@ int main(int, char**)
             std::noshowbase(ios);
         }
         {   // negative, showbase
-#  ifdef _AIX
+#  if defined(_AIX) || defined(__APPLE__)
           std::wstring v = L"-" + w_currency_symbol + L"1,234,567.89";
 #  else
           std::wstring v = w_currency_symbol + L"-1,234,567.89";
@@ -693,7 +690,7 @@ int main(int, char**)
             std::noshowbase(ios);
         }
         {   // negative one, showbase
-#  if defined(TEST_HAS_GLIBC) || defined(_AIX)
+#  if defined(TEST_HAS_GLIBC) || defined(_AIX) || defined(__APPLE__)
           std::wstring v = L"-" + w_currency_name + L"0.01";
 #  else
           std::wstring v = w_currency_name + L"-0.01";
@@ -707,7 +704,7 @@ int main(int, char**)
           assert(ex == -1);
         }
         {   // negative one, showbase
-#  if defined(TEST_HAS_GLIBC) || defined(_AIX)
+#  if defined(TEST_HAS_GLIBC) || defined(_AIX) || defined(__APPLE__)
           std::wstring v = L"-" + w_currency_name + L"0.01";
 #  else
           std::wstring v = w_currency_name + L"-0.01";
@@ -747,7 +744,7 @@ int main(int, char**)
             std::noshowbase(ios);
         }
         {   // negative, showbase
-#  if defined(TEST_HAS_GLIBC) || defined(_AIX)
+#  if defined(TEST_HAS_GLIBC) || defined(_AIX) || defined(__APPLE__)
           std::wstring v = L"-" + w_currency_name + L"1,234,567.89";
 #  else
           std::wstring v = w_currency_name + L"-1,234,567.89";
diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_fr_FR.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_fr_FR.pass.cpp
index 14745996b9fd1..f9d7998b07ff4 100644
--- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_fr_FR.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_fr_FR.pass.cpp
@@ -6,11 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// TODO(mordante) Investigate
-// UNSUPPORTED: apple-clang
-
-// XFAIL: darwin
-
 // NetBSD does not support LC_MONETARY at the moment
 // XFAIL: netbsd
 
diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_ru_RU.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_ru_RU.pass.cpp
index 0455e5949c44a..be1e397488468 100644
--- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_ru_RU.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_ru_RU.pass.cpp
@@ -6,9 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// TODO(mordante) Investigate
-// UNSUPPORTED: apple-clang
-
 // NetBSD does not support LC_MONETARY at the moment
 // XFAIL: netbsd
 
diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_zh_CN.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_zh_CN.pass.cpp
index 68640fabb73b0..25046a8417083 100644
--- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_zh_CN.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_zh_CN.pass.cpp
@@ -6,9 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// TODO(mordante) Investigate
-// UNSUPPORTED: apple-clang
-
 // NetBSD does not support LC_MONETARY at the moment
 // XFAIL: netbsd
 
@@ -122,7 +119,7 @@ int main(int, char**)
         char str[100];
         cpp17_output_iterator<char*> iter = f.put(cpp17_output_iterator<char*>(str), false, ios, '*', v);
         std::string ex(str, base(iter));
-#ifdef _AIX
+#if defined(_AIX) || defined(__APPLE__)
         assert(ex == "-" + currency_symbol + "0.01");
 #else
         assert(ex == currency_symbol + "-0.01");
@@ -142,7 +139,7 @@ int main(int, char**)
         char str[100];
         cpp17_output_iterator<char*> iter = f.put(cpp17_output_iterator<char*>(str), false, ios, '*', v);
         std::string ex(str, base(iter));
-#ifdef _AIX
+#if defined(_AIX) || defined(__APPLE__)
         assert(ex == "-" + currency_symbol + "1,234,567.89");
 #else
         assert(ex == currency_symbol + "-1,234,567.89");
@@ -156,7 +153,7 @@ int main(int, char**)
         char str[100];
         cpp17_output_iterator<char*> iter = f.put(cpp17_output_iterator<char*>(str), false, ios, ' ', v);
         std::string ex(str, base(iter));
-#ifdef _AIX
+#if defined(_AIX) || defined(__APPLE__)
         assert(ex == "-" + currency_symbol + "1,234,567.89" + currency_symbol_padding);
 #else
         assert(ex == currency_symbol + "-1,234,567.89" + currency_symbol_padding);
@@ -171,7 +168,7 @@ int main(int, char**)
         char str[100];
         cpp17_output_iterator<char*> iter = f.put(cpp17_output_iterator<char*>(str), false, ios, ' ', v);
         std::string ex(str, base(iter));
-#ifdef _AIX
+#if defined(_AIX) || defined(__APPLE__)
         assert(ex == "-" + currency_symbol + currency_symbol_padding + "1,234,567.89");
 #else
         assert(ex == currency_symbol + "-" + currency_symbol_padding + "1,234,567.89");
@@ -186,7 +183,7 @@ int main(int, char**)
         char str[100];
         cpp17_output_iterator<char*> iter = f.put(cpp17_output_iterator<char*>(str), false, ios, ' ', v);
         std::string ex(str, base(iter));
-#ifdef _AIX
+#if defined(_AIX) || defined(__APPLE__)
         assert(ex == currency_symbol_padding + "-" + currency_symbol + "1,234,567.89");
 #else
         assert(ex == currency_symbol_padding + currency_symbol + "-1,234,567.89");
@@ -239,7 +236,7 @@ int main(int, char**)
         char str[100];
         cpp17_output_iterator<char*> iter = f.put(cpp17_output_iterator<char*>(str), true, ios, '*', v);
         std::string ex(str, base(iter));
-#if defined(TEST_HAS_GLIBC) || defined(_AIX)
+#if defined(TEST_HAS_GLIBC) || defined(_AIX) || defined(__APPLE__)
         assert(ex == "-" + currency_name + "0.01");
 #else
         assert(ex == currency_name + "-0.01");
@@ -259,7 +256,7 @@ int main(int, char**)
         char str[100];
         cpp17_output_iterator<char*> iter = f.put(cpp17_output_iterator<char*>(str), true, ios, '*', v);
         std::string ex(str, base(iter));
-#if defined(TEST_HAS_GLIBC) || defined(_AIX)
+#if defined(TEST_HAS_GLIBC) || defined(_AIX) || defined(__APPLE__)
         assert(ex == "-" + currency_name + "1,234,567.89");
 #else
         assert(ex == currency_name + "-1,234,567.89");
@@ -273,7 +270,7 @@ int main(int, char**)
         char str[100];
         cpp17_output_iterator<char*> iter = f.put(cpp17_output_iterator<char*>(str), true, ios, ' ', v);
         std::string ex(str, base(iter));
-#if defined(TEST_HAS_GLIBC) || defined(_AIX)
+#if defined(TEST_HAS_GLIBC) || defined(_AIX) || defined(__APPLE__)
         assert(ex == "-" + currency_name + "1,234,567.89" + currency_name_padding);
 #else
         assert(ex == currency_name + "-1,234,567.89" + currency_name_padding);
@@ -288,7 +285,7 @@ int main(int, char**)
         char str[100];
         cpp17_output_iterator<char*> iter = f.put(cpp17_output_iterator<char*>(str), true, ios, ' ', v);
         std::string ex(str, base(iter));
-#if defined(TEST_HAS_GLIBC) || defined(_AIX)
+#if defined(TEST_HAS_GLIBC) || defined(_AIX) || defined(__APPLE__)
         assert(ex == "-" + currency_name + currency_name_padding + "1,234,567.89");
 #else
         assert(ex == currency_name + "-" + currency_name_padding + "1,234,567.89");
@@ -303,7 +300,7 @@ int main(int, char**)
         char str[100];
         cpp17_output_iterator<char*> iter = f.put(cpp17_output_iterator<char*>(str), true, ios, ' ', v);
         std::string ex(str, base(iter));
-#if defined(TEST_HAS_GLIBC) || defined(_AIX)
+#if defined(TEST_HAS_GLIBC) || defined(_AIX) || defined(__APPLE__)
         assert(ex == currency_name_padding + "-" + currency_name + "1,234,567.89");
 #else
         assert(ex == currency_name_padding + currency_name + "-1,234,567.89");
@@ -366,7 +363,7 @@ int main(int, char**)
         wchar_t str[100];
         cpp17_output_iterator<wchar_t*> iter = f.put(cpp17_output_iterator<wchar_t*>(str), false, ios, '*', v);
         std::wstring ex(str, base(iter));
-#  ifdef _AIX
+#  if defined(_AIX) || defined(__APPLE__)
         assert(ex == L"-" + currency_symbol + L"0.01");
 #  else
         assert(ex == currency_symbol + L"-0.01");
@@ -386,7 +383,7 @@ int main(int, char**)
         wchar_t str[100];
         cpp17_output_iterator<wchar_t*> iter = f.put(cpp17_output_iterator<wchar_t*>(str), false, ios, '*', v);
         std::wstring ex(str, base(iter));
-#  ifdef _AIX
+#  if defined(_AIX) || defined(__APPLE__)
         assert(ex == L"-" + currency_symbol + L"1,234,567.89");
 #  else
         assert(ex == currency_symbol + L"-1,234,567.89");
@@ -400,7 +397,7 @@ int main(int, char**)
         wchar_t str[100];
         cpp17_output_iterator<wchar_t*> iter = f.put(cpp17_output_iterator<wchar_t*>(str), false, ios, ' ', v);
         std::wstring ex(str, base(iter));
-#  ifdef _AIX
+#  if defined(_AIX) || defined(__APPLE__)
         assert(ex == L"-" + currency_symbol + L"1,234,567.89      ");
 #  else
         assert(ex == currency_symbol + L"-1,234,567.89      ");
@@ -415,7 +412,7 @@ int main(int, char**)
         wchar_t str[100];
         cpp17_output_iterator<wchar_t*> iter = f.put(cpp17_output_iterator<wchar_t*>(str), false, ios, ' ', v);
         std::wstring ex(str, base(iter));
-#  ifdef _AIX
+#  if defined(_AIX) || defined(__APPLE__)
         assert(ex == L"-" + currency_symbol + L"      1,234,567.89");
 #  else
         assert(ex == currency_symbol + L"-      1,234,567.89");
@@ -430,7 +427,7 @@ int main(int, char**)
         wchar_t str[100];
         cpp17_output_iterator<wchar_t*> iter = f.put(cpp17_output_iterator<wchar_t*>(str), false, ios, ' ', v);
         std::wstring ex(str, base(iter));
-#  ifdef _AIX
+#  if defined(_AIX) || defined(__APPLE__)
         assert(ex == L"      -" + currency_symbol + L"1,234,567.89");
 #  else
         assert(ex == L"      " + currency_symbol + L"-1,234,567.89");
@@ -483,7 +480,7 @@ int main(int, char**)
         wchar_t str[100];
         cpp17_output_iterator<wchar_t*> iter = f.put(cpp17_output_iterator<wchar_t*>(str), true, ios, '*', v);
         std::wstring ex(str, base(iter));
-#  if defined(TEST_HAS_GLIBC) || defined(_AIX)
+#  if defined(TEST_HAS_GLIBC) || defined(_AIX) || defined(__APPLE__)
         assert(ex == L"-" + currency_name + L"0.01");
 #else
         assert(ex == currency_name + L"-0.01");
@@ -503,7 +500,7 @@ int main(int, char**)
         wchar_t str[100];
         cpp17_output_iterator<wchar_t*> iter = f.put(cpp17_output_iterator<wchar_t*>(str), true, ios, '*', v);
         std::wstring ex(str, base(iter));
-#  if defined(TEST_HAS_GLIBC) || defined(_AIX)
+#  if defined(TEST_HAS_GLIBC) || defined(_AIX) || defined(__APPLE__)
         assert(ex == L"-" + currency_name + L"1,234,567.89");
 #else
         assert(ex == currency_name + L"-1,234,567.89");
@@ -517,7 +514,7 @@ int main(int, char**)
         wchar_t str[100];
         cpp17_output_iterator<wchar_t*> iter = f.put(cpp17_output_iterator<wchar_t*>(str), true, ios, ' ', v);
         std::wstring ex(str, base(iter));
-#  if defined(TEST_HAS_GLIBC) || defined(_AIX)
+#  if defined(TEST_HAS_GLIBC) || defined(_AIX) || defined(__APPLE__)
         assert(ex == L"-" + currency_name + L"1,234,567.89" + currency_name_padding);
 #else
         assert(ex == currency_name + L"-1,234,567.89" + currency_name_padding);
@@ -532,7 +529,7 @@ int main(int, char**)
         wchar_t str[100];
         cpp17_output_iterator<wchar_t*> iter = f.put(cpp17_output_iterator<wchar_t*>(str), true, ios, ' ', v);
         std::wstring ex(str, base(iter));
-#  if defined(TEST_HAS_GLIBC) || defined(_AIX)
+#  if defined(TEST_HAS_GLIBC) || defined(_AIX) || defined(__APPLE__)
         assert(ex == L"-" + currency_name + currency_name_padding + L"1,234,567.89");
 #else
         assert(ex == currency_name + L"-" + currency_name_padding + L"1,234,567.89");
@@ -547,7 +544,7 @@ int main(int, char**)
         wchar_t str[100];
         cpp17_output_iterator<wchar_t*> iter = f.put(cpp17_output_iterator<wchar_t*>(str), true, ios, ' ', v);
         std::wstring ex(str, base(iter));
-#  if defined(TEST_HAS_GLIBC) || defined(_AIX)
+#  if defined(TEST_HAS_GLIBC) || defined(_AIX) || defined(__APPLE__)
         assert(ex == currency_name_padding + L"-" + currency_name + L"1,234,567.89");
 #else
         assert(ex == currency_name_padding + currency_name + L"-1,234,567.89");
diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/curr_symbol.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/curr_symbol.pass.cpp
index 9c1253d47acd2..e7f0f29e87742 100644
--- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/curr_symbol.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/curr_symbol.pass.cpp
@@ -6,9 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// TODO(mordante) Investigate
-// UNSUPPORTED: apple-clang
-
 // NetBSD does not support LC_MONETARY at the moment
 // XFAIL: netbsd
 
@@ -117,11 +114,7 @@ int main(int, char**)
 
     {
         Fnf f(LOCALE_fr_FR_UTF_8, 1);
-#ifdef __APPLE__
-        assert(f.curr_symbol() == " Eu");
-#else
         assert(f.curr_symbol() == " \u20ac");
-#endif
     }
     {
         Fnt f(LOCALE_fr_FR_UTF_8, 1);
@@ -130,11 +123,7 @@ int main(int, char**)
 #ifndef TEST_HAS_NO_WIDE_CHARACTERS
     {
         Fwf f(LOCALE_fr_FR_UTF_8, 1);
-#ifdef __APPLE__
-        assert(f.curr_symbol() == L" Eu");
-#else
         assert(f.curr_symbol() == L" \u20ac");
-#endif
     }
     {
         Fwt f(LOCALE_fr_FR_UTF_8, 1);
@@ -164,7 +153,7 @@ int main(int, char**)
 
     {
         Fnf f(LOCALE_zh_CN_UTF_8, 1);
-#ifdef _WIN32
+#if defined(_WIN32) || defined(__APPLE__)
         assert(f.curr_symbol() == "\xC2\xA5"); // \u00A5
 #else
         assert(f.curr_symbol() == "\xEF\xBF\xA5"); // \uFFE5
@@ -177,7 +166,7 @@ int main(int, char**)
 #ifndef TEST_HAS_NO_WIDE_CHARACTERS
     {
         Fwf f(LOCALE_zh_CN_UTF_8, 1);
-#ifdef _WIN32
+#if defined(_WIN32) || defined(__APPLE__)
         assert(f.curr_symbol() == L"\u00A5");
 #else
         assert(f.curr_symbol() == L"\uFFE5");
diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/grouping.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/grouping.pass.cpp
index 630b2739c88a8..90dc6c4d7a2ab 100644
--- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/grouping.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/grouping.pass.cpp
@@ -6,11 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// TODO(mordante) Investigate
-// UNSUPPORTED: apple-clang
-
-// XFAIL: darwin
-//
 // NetBSD does not support LC_MONETARY at the moment
 // XFAIL: netbsd
 
diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/neg_format.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/neg_format.pass.cpp
index a3e3d853524b5..e9528147dfe62 100644
--- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/neg_format.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/neg_format.pass.cpp
@@ -6,9 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// TODO(mordante) Investigate
-// UNSUPPORTED: apple-clang
-
 // NetBSD does not support LC_MONETARY at the moment
 // XFAIL: netbsd
 
@@ -82,14 +79,6 @@ void assert_sign_symbol_none_value(std::money_base::pattern p)
     assert(p.field[3] == std::money_base::value);
 }
 
-void assert_value_none_symbol_sign(std::money_base::pattern p)
-{
-    assert(p.field[0] == std::money_base::value);
-    assert(p.field[1] == std::money_base::none);
-    assert(p.field[2] == std::money_base::symbol);
-    assert(p.field[3] == std::money_base::sign);
-}
-
 void assert_sign_value_none_symbol(std::money_base::pattern p)
 {
     assert(p.field[0] == std::money_base::sign);
@@ -149,39 +138,23 @@ int main(int, char**)
     {
         Fnf f(LOCALE_fr_FR_UTF_8, 1);
         std::money_base::pattern p = f.neg_format();
-#ifdef __APPLE__
-        assert_value_none_symbol_sign(p);
-#else
         assert_sign_value_none_symbol(p);
-#endif
     }
     {
         Fnt f(LOCALE_fr_FR_UTF_8, 1);
         std::money_base::pattern p = f.neg_format();
-#ifdef __APPLE__
-        assert_value_none_symbol_sign(p);
-#else
         assert_sign_value_none_symbol(p);
-#endif
     }
 #ifndef TEST_HAS_NO_WIDE_CHARACTERS
     {
         Fwf f(LOCALE_fr_FR_UTF_8, 1);
         std::money_base::pattern p = f.neg_format();
-#ifdef __APPLE__
-        assert_value_none_symbol_sign(p);
-#else
         assert_sign_value_none_symbol(p);
-#endif
     }
     {
         Fwt f(LOCALE_fr_FR_UTF_8, 1);
         std::money_base::pattern p = f.neg_format();
-#ifdef __APPLE__
-        assert_value_none_symbol_sign(p);
-#else
         assert_sign_value_none_symbol(p);
-#endif
     }
 #endif // TEST_HAS_NO_WIDE_CHARACTERS
 
@@ -211,7 +184,7 @@ int main(int, char**)
     {
         Fnf f(LOCALE_zh_CN_UTF_8, 1);
         std::money_base::pattern p = f.neg_format();
-#ifdef _AIX
+#if defined(_AIX) || defined(__APPLE__)
         assert_sign_symbol_none_value(p);
 #else
         assert_symbol_sign_none_value(p);
@@ -220,7 +193,7 @@ int main(int, char**)
     {
         Fnt f(LOCALE_zh_CN_UTF_8, 1);
         std::money_base::pattern p = f.neg_format();
-#if defined(_WIN32) || defined(__APPLE__)
+#if defined(_WIN32)
         assert_symbol_sign_none_value(p);
 #else
         assert_sign_symbol_none_value(p);
@@ -230,7 +203,7 @@ int main(int, char**)
     {
         Fwf f(LOCALE_zh_CN_UTF_8, 1);
         std::money_base::pattern p = f.neg_format();
-#ifdef _AIX
+#if defined(_AIX) || defined(__APPLE__)
         assert_sign_symbol_none_value(p);
 #else
         assert_symbol_sign_none_value(p);
@@ -239,7 +212,7 @@ int main(int, char**)
     {
         Fwt f(LOCALE_zh_CN_UTF_8, 1);
         std::money_base::pattern p = f.neg_format();
-#if defined(_WIN32) || defined(__APPLE__)
+#if defined(_WIN32)
         assert_symbol_sign_none_value(p);
 #else
         assert_sign_symbol_none_value(p);
diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/pos_format.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/pos_format.pass.cpp
index 671620a0c2f92..11832a7d89278 100644
--- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/pos_format.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/pos_format.pass.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
+
 // NetBSD does not support LC_MONETARY at the moment
 // XFAIL: netbsd
 
@@ -79,14 +79,6 @@ void assert_sign_symbol_none_value(std::money_base::pattern p)
     assert(p.field[3] == std::money_base::value);
 }
 
-void assert_value_none_symbol_sign(std::money_base::pattern p)
-{
-    assert(p.field[0] == std::money_base::value);
-    assert(p.field[1] == std::money_base::none);
-    assert(p.field[2] == std::money_base::symbol);
-    assert(p.field[3] == std::money_base::sign);
-}
-
 void assert_sign_value_none_symbol(std::money_base::pattern p)
 {
     assert(p.field[0] == std::money_base::sign);
diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_double.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_double.pass.cpp
index 612d3738a373f..a388c0b15a840 100644
--- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_double.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_double.pass.cpp
@@ -6,9 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// TODO(mordante) Investigate
-// UNSUPPORTED: apple-clang
-
 // The fix for LWG2381 (https://github.com/llvm/llvm-project/pull/77948) changed behavior of
 // FP parsing. This requires 3e15c97fa3812993bdc319827a5c6d867b765ae8 in the dylib.
 // XFAIL: using-built-library-before-llvm-19
diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_float.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_float.pass.cpp
index 58bc9e5abef87..596d81cbc8c91 100644
--- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_float.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_float.pass.cpp
@@ -6,9 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// TODO(mordante) Investigate
-// UNSUPPORTED: apple-clang
-
 // The fix for LWG2381 (https://github.com/llvm/llvm-project/pull/77948) changed behavior of
 // FP parsing. This requires 3e15c97fa3812993bdc319827a5c6d867b765ae8 in the dylib.
 // XFAIL: using-built-library-before-llvm-19
diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long_double.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long_double.pass.cpp
index bf8bb651d6bce..8a9fd41501626 100644
--- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long_double.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long_double.pass.cpp
@@ -6,9 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// TODO(mordante) Investigate
-// UNSUPPORTED: apple-clang
-
 // The fix for LWG2381 (https://github.com/llvm/llvm-project/pull/77948) changed behavior of
 // FP parsing. This requires 3e15c97fa3812993bdc319827a5c6d867b765ae8 in the dylib.
 // XFAIL: using-built-library-before-llvm-19
diff --git a/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/grouping.pass.cpp b/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/grouping.pass.cpp
index a87c5e0ace28a..11ec75469c704 100644
--- a/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/grouping.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/grouping.pass.cpp
@@ -5,10 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-
-// TODO(mordante) Investigate
-// UNSUPPORTED: apple-clang
-
+//
 // NetBSD does not support LC_NUMERIC at the moment
 // XFAIL: netbsd
 
@@ -63,7 +60,7 @@ int main(int, char**)
     }
     {
         std::locale l(LOCALE_fr_FR_UTF_8);
-#if defined(TEST_HAS_GLIBC) || defined(_WIN32) || defined(_AIX)
+#if defined(TEST_HAS_GLIBC) || defined(_WIN32) || defined(_AIX) || defined(__APPLE__)
         const char* const group = "\3";
 #else
         const char* const group = "\x7f";
diff --git a/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/thousands_sep.pass.cpp b/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/thousands_sep.pass.cpp
index ef39e8aa7b685..53f2c8554f3d7 100644
--- a/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/thousands_sep.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/thousands_sep.pass.cpp
@@ -6,9 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// TODO(mordante) Investigate
-// UNSUPPORTED: apple-clang
-
 // NetBSD does not support LC_NUMERIC at the moment
 // XFAIL: netbsd
 
@@ -69,7 +66,7 @@ int main(int, char**)
         // The below tests work around GLIBC's use of U202F as LC_NUMERIC thousands_sep.
         std::locale l(LOCALE_fr_FR_UTF_8);
         {
-#if defined(_CS_GNU_LIBC_VERSION) || defined(_WIN32) || defined(_AIX)
+#if defined(_CS_GNU_LIBC_VERSION) || defined(_WIN32) || defined(_AIX) || defined(__APPLE__)
             const char sep = ' ';
 #else
             const char sep = ',';
diff --git a/libcxx/test/std/strings/basic.string/string.capacity/deallocate_size.pass.cpp b/libcxx/test/std/strings/basic.string/string.capacity/deallocate_size.pass.cpp
index 00f9e2b846783..ecdc39701641d 100644
--- a/libcxx/test/std/strings/basic.string/string.capacity/deallocate_size.pass.cpp
+++ b/libcxx/test/std/strings/basic.string/string.capacity/deallocate_size.pass.cpp
@@ -12,12 +12,14 @@
 
 #include <string>
 #include <cassert>
+#include <cstddef>
 #include <cstdint>
 #include <type_traits>
 
 #include "test_macros.h"
 
-static int allocated_;
+static std::uint64_t allocated_;
+static std::uint64_t deallocated_;
 
 template <class T, class Sz>
 struct test_alloc {
@@ -41,12 +43,12 @@ struct test_alloc {
 
   pointer allocate(size_type n, const void* = nullptr) {
     allocated_ += n;
-    return std::allocator<value_type>().allocate(n);
+    return std::allocator<value_type>().allocate(static_cast<std::size_t>(n));
   }
 
   void deallocate(pointer p, size_type s) {
-    allocated_ -= s;
-    std::allocator<value_type>().deallocate(p, s);
+    deallocated_ += s;
+    std::allocator<value_type>().deallocate(p, static_cast<std::size_t>(s));
   }
 
   template <class U>
@@ -64,14 +66,13 @@ struct test_alloc {
 
 template <class Sz>
 void test() {
-  for (int i = 1; i < 1000; ++i) {
-    using Str = std::basic_string<char, std::char_traits<char>, test_alloc<char, Sz> >;
+  for (unsigned int i = 1; i < 1000; ++i) {
     {
-      Str s(i, 't');
-      assert(allocated_ == 0 || allocated_ >= i);
+      std::basic_string<char, std::char_traits<char>, test_alloc<char, Sz> > s(i, 't');
+      (void)s;
     }
+    assert(allocated_ == deallocated_);
   }
-  assert(allocated_ == 0);
 }
 
 int main(int, char**) {
diff --git a/libcxx/test/std/strings/basic.string/string.capacity/over_max_size.pass.cpp b/libcxx/test/std/strings/basic.string/string.capacity/over_max_size.pass.cpp
index 5eb3240699a81..8e5919539d94e 100644
--- a/libcxx/test/std/strings/basic.string/string.capacity/over_max_size.pass.cpp
+++ b/libcxx/test/std/strings/basic.string/string.capacity/over_max_size.pass.cpp
@@ -8,11 +8,13 @@
 
 // UNSUPPORTED: no-exceptions
 
-// After changing the alignment of the allocated pointer from 16 to 8, the exception
-// thrown is no longer `bad_alloc` but instead length_error on systems using new
-// headers but a dylib that doesn't contain 04ce0ba.
+// This test fails when using a built library that does not contain
+// 15860446a8c3, which changed the return value of max_size(). Without
+// that change, the built library believes the max size to be one greater
+// than it really is, and we fail to throw `length_error` from `string::resize()`,
+// which is explicitly instantiated in the built library.
 //
-// XFAIL: using-built-library-before-llvm-19
+// XFAIL: using-built-library-before-llvm-21
 
 // <string>
 
diff --git a/libcxx/test/std/time/time.duration/time.duration.nonmember/ostream.pass.cpp b/libcxx/test/std/time/time.duration/time.duration.nonmember/ostream.pass.cpp
index 4e84db9a84d78..97ac04275b0b6 100644
--- a/libcxx/test/std/time/time.duration/time.duration.nonmember/ostream.pass.cpp
+++ b/libcxx/test/std/time/time.duration/time.duration.nonmember/ostream.pass.cpp
@@ -6,9 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// TODO(mordante) Investigate
-// UNSUPPORTED: apple-clang
-
 // UNSUPPORTED: c++03, c++11, c++14, c++17
 // UNSUPPORTED: no-localization
 // UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME
@@ -83,17 +80,10 @@ static void test_values() {
   assert(stream_c_locale<CharT>(1'000.123456s) == SV("1000.1235s"));
 
   if constexpr (std::same_as<CharT, char>) {
-#if defined(__APPLE__)
-    assert(stream_fr_FR_locale<CharT>(-1'000'000s) == SV("-1000000s"));
-    assert(stream_fr_FR_locale<CharT>(1'000'000s) == SV("1000000s"));
-    assert(stream_fr_FR_locale<CharT>(-1'000.123456s) == SV("-1000,1235s"));
-    assert(stream_fr_FR_locale<CharT>(1'000.123456s) == SV("1000,1235s"));
-#else
     assert(stream_fr_FR_locale<CharT>(-1'000'000s) == SV("-1 000 000s"));
     assert(stream_fr_FR_locale<CharT>(1'000'000s) == SV("1 000 000s"));
     assert(stream_fr_FR_locale<CharT>(-1'000.123456s) == SV("-1 000,1235s"));
     assert(stream_fr_FR_locale<CharT>(1'000.123456s) == SV("1 000,1235s"));
-#endif
   } else {
 #ifndef TEST_HAS_NO_WIDE_CHARACTERS
     assert(stream_fr_FR_locale<CharT>(-1'000'000s) == L"-1" FR_THOU_SEP "000" FR_THOU_SEP "000s");
diff --git a/libcxx/test/std/time/time.syn/formatter.duration.pass.cpp b/libcxx/test/std/time/time.syn/formatter.duration.pass.cpp
index 973bce8f81d41..f1f7debed2464 100644
--- a/libcxx/test/std/time/time.syn/formatter.duration.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.duration.pass.cpp
@@ -6,9 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// TODO(mordante) Investigate
-// UNSUPPORTED: apple-clang
-
 // UNSUPPORTED: c++03, c++11, c++14, c++17
 // UNSUPPORTED: no-localization
 // UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME
@@ -408,19 +405,11 @@ static void test_valid_positive_integral_values() {
            "%OM='00'\t"
            "%S='00'\t"
            "%OS='00'\t"
-#  if defined(__APPLE__)
-           "%p='AM'\t"
-#  else
            "%p='午前'\t"
-#  endif
            "%R='00:00'\t"
            "%T='00:00:00'\t"
 #  if defined(__APPLE__) || defined(__FreeBSD__)
-#    if defined(__APPLE__)
-           "%r='12:00:00 AM'\t"
-#    else
            "%r='12:00:00 午前'\t"
-#    endif
            "%X='00時00分00秒'\t"
            "%EX='00時00分00秒'\t"
 #  elif defined(_WIN32)
@@ -448,19 +437,11 @@ static void test_valid_positive_integral_values() {
            "%OM='59'\t"
            "%S='59'\t"
            "%OS='59'\t"
-#  if defined(__APPLE__)
-           "%p='AM'\t"
-#  else
            "%p='午前'\t"
-#  endif
            "%R='11:59'\t"
            "%T='11:59:59'\t"
 #  if defined(__APPLE__) || defined(__FreeBSD__)
-#    if defined(__APPLE__)
-           "%r='11:59:59 AM'\t"
-#    else
            "%r='11:59:59 午前'\t"
-#    endif
            "%X='11時59分59秒'\t"
            "%EX='11時59分59秒'\t"
 #  elif defined(_WIN32)
@@ -488,19 +469,11 @@ static void test_valid_positive_integral_values() {
            "%OM='00'\t"
            "%S='00'\t"
            "%OS='00'\t"
-#  if defined(__APPLE__)
-           "%p='PM'\t"
-#  else
            "%p='午後'\t"
-#  endif
            "%R='12:00'\t"
            "%T='12:00:00'\t"
 #  if defined(__APPLE__) || defined(__FreeBSD__)
-#    if defined(__APPLE__)
-           "%r='12:00:00 PM'\t"
-#    else
            "%r='12:00:00 午後'\t"
-#    endif
            "%X='12時00分00秒'\t"
            "%EX='12時00分00秒'\t"
 #  else
@@ -528,19 +501,11 @@ static void test_valid_positive_integral_values() {
            "%OM='59'\t"
            "%S='59'\t"
            "%OS='59'\t"
-#  if defined(__APPLE__)
-           "%p='PM'\t"
-#  else
            "%p='午後'\t"
-#  endif
            "%R='23:59'\t"
            "%T='23:59:59'\t"
 #  if defined(__APPLE__) || defined(__FreeBSD__)
-#    if defined(__APPLE__)
-           "%r='11:59:59 PM'\t"
-#    else
            "%r='11:59:59 午後'\t"
-#    endif
            "%X='23時59分59秒'\t"
            "%EX='23時59分59秒'\t"
 #  else
@@ -568,19 +533,11 @@ static void test_valid_positive_integral_values() {
            "%OM='00'\t"
            "%S='00'\t"
            "%OS='00'\t"
-#  if defined(__APPLE__)
-           "%p='AM'\t"
-#  else
            "%p='午前'\t"
-#  endif
            "%R='00:00'\t"
            "%T='00:00:00'\t"
 #  if defined(__APPLE__) || defined(__FreeBSD__)
-#    if defined(__APPLE__)
-           "%r='12:00:00 AM'\t"
-#    else
            "%r='12:00:00 午前'\t"
-#    endif
            "%X='00時00分00秒'\t"
            "%EX='00時00分00秒'\t"
 #  elif defined(_WIN32)
@@ -835,19 +792,11 @@ static void test_valid_negative_integral_values() {
            "%OM='59'\t"
            "%S='59'\t"
            "%OS='59'\t"
-#  if defined(__APPLE__)
-           "%p='PM'\t"
-#  else
            "%p='午後'\t"
-#  endif
            "%R='23:59'\t"
            "%T='23:59:59'\t"
 #  if defined(__APPLE__) || defined(__FreeBSD__)
-#    if defined(__APPLE__)
-           "%r='11:59:59 PM'\t"
-#    else
            "%r='11:59:59 午後'\t"
-#    endif
            "%X='23時59分59秒'\t"
            "%EX='23時59分59秒'\t"
 #  elif defined(_WIN32)
diff --git a/libcxx/test/std/time/time.syn/formatter.file_time.pass.cpp b/libcxx/test/std/time/time.syn/formatter.file_time.pass.cpp
index 28a972b19dcef..e258c4161eda4 100644
--- a/libcxx/test/std/time/time.syn/formatter.file_time.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.file_time.pass.cpp
@@ -6,9 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// TODO(mordante) Investigate
-// UNSUPPORTED: apple-clang
-
 // UNSUPPORTED: c++03, c++11, c++14, c++17
 // UNSUPPORTED: no-localization
 // UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME
@@ -695,19 +692,11 @@ static void test_valid_values_time() {
            "%OM='00'\t"
            "%S='00'\t"
            "%OS='00'\t"
-#  if defined(__APPLE__)
-           "%p='AM'\t"
-#  else
            "%p='午前'\t"
-#  endif
            "%R='00:00'\t"
            "%T='00:00:00'\t"
 #  if defined(__APPLE__) || defined(__FreeBSD__)
-#    if defined(__APPLE__)
-           "%r='12:00:00 AM'\t"
-#    else
            "%r='12:00:00 午前'\t"
-#    endif
            "%X='00時00分00秒'\t"
            "%EX='00時00分00秒'\t"
 #  elif defined(_WIN32)
@@ -732,19 +721,11 @@ static void test_valid_values_time() {
            "%OM='31'\t"
            "%S='30.123'\t"
            "%OS='30.123'\t"
-#  if defined(__APPLE__)
-           "%p='PM'\t"
-#  else
            "%p='午後'\t"
-#  endif
            "%R='23:31'\t"
            "%T='23:31:30.123'\t"
 #  if defined(__APPLE__) || defined(__FreeBSD__)
-#    if defined(__APPLE__)
-           "%r='11:31:30 PM'\t"
-#    else
            "%r='11:31:30 午後'\t"
-#    endif
            "%X='23時31分30秒'\t"
            "%EX='23時31分30秒'\t"
 #  elif defined(_WIN32)
diff --git a/libcxx/test/std/time/time.syn/formatter.hh_mm_ss.pass.cpp b/libcxx/test/std/time/time.syn/formatter.hh_mm_ss.pass.cpp
index 82d9b4c7540a7..bbd9c074bef24 100644
--- a/libcxx/test/std/time/time.syn/formatter.hh_mm_ss.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.hh_mm_ss.pass.cpp
@@ -6,9 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// TODO(mordante) Investigate
-// UNSUPPORTED: apple-clang
-
 // UNSUPPORTED: c++03, c++11, c++14, c++17
 // UNSUPPORTED: no-localization
 // UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME
@@ -302,19 +299,11 @@ static void test_valid_values() {
            "%OM='00'\t"
            "%S='00'\t"
            "%OS='00'\t"
-#  if defined(__APPLE__)
-           "%p='AM'\t"
-#  else
            "%p='午前'\t"
-#  endif
            "%R='00:00'\t"
            "%T='00:00:00'\t"
 #  if defined(__APPLE__) || defined(__FreeBSD__)
-#    if defined(__APPLE__)
-           "%r='12:00:00 AM'\t"
-#    else
            "%r='12:00:00 午前'\t"
-#    endif
            "%X='00時00分00秒'\t"
            "%EX='00時00分00秒'\t"
 #  elif defined(_WIN32)
@@ -339,19 +328,11 @@ static void test_valid_values() {
            "%OM='31'\t"
            "%S='30.123'\t"
            "%OS='30.123'\t"
-#  if defined(__APPLE__)
-           "%p='PM'\t"
-#  else
            "%p='午後'\t"
-#  endif
            "%R='23:31'\t"
            "%T='23:31:30.123'\t"
 #  if defined(__APPLE__) || defined(__FreeBSD__)
-#    if defined(__APPLE__)
-           "%r='11:31:30 PM'\t"
-#    else
            "%r='11:31:30 午後'\t"
-#    endif
            "%X='23時31分30秒'\t"
            "%EX='23時31分30秒'\t"
 #  elif defined(_WIN32)
@@ -376,19 +357,11 @@ static void test_valid_values() {
            "%OM='02'\t"
            "%S='01.123456789012'\t"
            "%OS='01.123456789012'\t"
-#  if defined(__APPLE__)
-           "%p='AM'\t"
-#  else
            "%p='午前'\t"
-#  endif
            "%R='03:02'\t"
            "%T='03:02:01.123456789012'\t"
 #  if defined(__APPLE__) || defined(__FreeBSD__)
-#    if defined(__APPLE__)
-           "%r='03:02:01 AM'\t"
-#    else
            "%r='03:02:01 午前'\t"
-#    endif
            "%X='03時02分01秒'\t"
            "%EX='03時02分01秒'\t"
 #  elif defined(_WIN32)
@@ -413,19 +386,11 @@ static void test_valid_values() {
            "%OM='01'\t"
            "%S='01'\t"
            "%OS='01'\t"
-#  if defined(__APPLE__)
-           "%p='AM'\t"
-#  else
            "%p='午前'\t"
-#  endif
            "%R='01:01'\t"
            "%T='01:01:01'\t"
 #  if defined(__APPLE__) || defined(__FreeBSD__)
-#    if defined(__APPLE__)
-           "%r='01:01:01 AM'\t"
-#    else
            "%r='01:01:01 午前'\t"
-#    endif
            "%X='01時01分01秒'\t"
            "%EX='01時01分01秒'\t"
 #  elif defined(_WIN32)
diff --git a/libcxx/test/std/time/time.syn/formatter.local_time.pass.cpp b/libcxx/test/std/time/time.syn/formatter.local_time.pass.cpp
index bd23337ccb318..ce3af8ec199ae 100644
--- a/libcxx/test/std/time/time.syn/formatter.local_time.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.local_time.pass.cpp
@@ -6,9 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// TODO(mordante) Investigate
-// UNSUPPORTED: apple-clang
-
 // UNSUPPORTED: c++03, c++11, c++14, c++17
 // UNSUPPORTED: no-localization
 // UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME
@@ -694,19 +691,11 @@ static void test_valid_values_time() {
            "%OM='00'\t"
            "%S='00'\t"
            "%OS='00'\t"
-#  if defined(__APPLE__)
-           "%p='AM'\t"
-#  else
            "%p='午前'\t"
-#  endif
            "%R='00:00'\t"
            "%T='00:00:00'\t"
 #  if defined(__APPLE__) || defined(__FreeBSD__)
-#    if defined(__APPLE__)
-           "%r='12:00:00 AM'\t"
-#    else
            "%r='12:00:00 午前'\t"
-#    endif
            "%X='00時00分00秒'\t"
            "%EX='00時00分00秒'\t"
 #  elif defined(_WIN32)
@@ -731,19 +720,11 @@ static void test_valid_values_time() {
            "%OM='31'\t"
            "%S='30.123'\t"
            "%OS='30.123'\t"
-#  if defined(__APPLE__)
-           "%p='PM'\t"
-#  else
            "%p='午後'\t"
-#  endif
            "%R='23:31'\t"
            "%T='23:31:30.123'\t"
 #  if defined(__APPLE__) || defined(__FreeBSD__)
-#    if defined(__APPLE__)
-           "%r='11:31:30 PM'\t"
-#    else
            "%r='11:31:30 午後'\t"
-#    endif
            "%X='23時31分30秒'\t"
            "%EX='23時31分30秒'\t"
 #  elif defined(_WIN32)
diff --git a/libcxx/test/std/time/time.syn/formatter.sys_time.pass.cpp b/libcxx/test/std/time/time.syn/formatter.sys_time.pass.cpp
index 9c9c8e0de1e93..9238f3daf1f81 100644
--- a/libcxx/test/std/time/time.syn/formatter.sys_time.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.sys_time.pass.cpp
@@ -6,9 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// TODO(mordante) Investigate
-// UNSUPPORTED: apple-clang
-
 // UNSUPPORTED: c++03, c++11, c++14, c++17
 // UNSUPPORTED: no-localization
 // UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME
@@ -691,19 +688,11 @@ static void test_valid_values_time() {
            "%OM='00'\t"
            "%S='00'\t"
            "%OS='00'\t"
-#  if defined(__APPLE__)
-           "%p='AM'\t"
-#  else
            "%p='午前'\t"
-#  endif
            "%R='00:00'\t"
            "%T='00:00:00'\t"
 #  if defined(__APPLE__) || defined(__FreeBSD__)
-#    if defined(__APPLE__)
-           "%r='12:00:00 AM'\t"
-#    else
            "%r='12:00:00 午前'\t"
-#    endif
            "%X='00時00分00秒'\t"
            "%EX='00時00分00秒'\t"
 #  elif defined(_WIN32)
@@ -728,19 +717,11 @@ static void test_valid_values_time() {
            "%OM='31'\t"
            "%S='30.123'\t"
            "%OS='30.123'\t"
-#  if defined(__APPLE__)
-           "%p='PM'\t"
-#  else
            "%p='午後'\t"
-#  endif
            "%R='23:31'\t"
            "%T='23:31:30.123'\t"
 #  if defined(__APPLE__) || defined(__FreeBSD__)
-#    if defined(__APPLE__)
-           "%r='11:31:30 PM'\t"
-#    else
            "%r='11:31:30 午後'\t"
-#    endif
            "%X='23時31分30秒'\t"
            "%EX='23時31分30秒'\t"
 #  elif defined(_WIN32)
diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.pass.cpp
index 20e0a5ed66bd0..68fe8b6de41d6 100644
--- a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.pass.cpp
@@ -8,8 +8,6 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23
 // UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME
-// The tested functionality needs deducing this.
-// XFAIL: apple-clang
 
 // <format>
 
diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.return_type.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.return_type.pass.cpp
index 8a79dd4d50f20..4ae63e896caed 100644
--- a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.return_type.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.return_type.pass.cpp
@@ -8,8 +8,6 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23
 // UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME
-// The tested functionality needs deducing this.
-// XFAIL: apple-clang
 
 // <format>
 
diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.deprecated.verify.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.deprecated.verify.cpp
index 146ceba58872e..77df72d3c4c6c 100644
--- a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.deprecated.verify.cpp
+++ b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.deprecated.verify.cpp
@@ -8,7 +8,6 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23
 // UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME
-// XFAIL: apple-clang
 
 // <format>
 
diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.pass.cpp
index d99675a71f321..9b7c8a7f4f8b4 100644
--- a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.pass.cpp
@@ -9,6 +9,8 @@
 // UNSUPPORTED: c++03, c++11, c++14, c++17
 // UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME
 
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+
 // <format>
 
 // template<class Visitor, class Context>
@@ -25,10 +27,6 @@
 #include "make_string.h"
 #include "min_allocator.h"
 
-#if TEST_STD_VER >= 26 && defined(TEST_HAS_EXPLICIT_THIS_PARAMETER)
-TEST_CLANG_DIAGNOSTIC_IGNORED("-Wdeprecated-declarations")
-#endif
-
 template <class Context, class To, class From>
 void test(From value) {
   auto store = std::make_format_args<Context>(value);
diff --git a/libcxx/test/std/utilities/format/format.arguments/format.args/get.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.args/get.pass.cpp
index c7dd82d726b3a..cbddc4f437a53 100644
--- a/libcxx/test/std/utilities/format/format.arguments/format.args/get.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.arguments/format.args/get.pass.cpp
@@ -32,7 +32,7 @@ void test(From value) {
     else
       assert(false);
   };
-#if TEST_STD_VER >= 26 && defined(TEST_HAS_EXPLICIT_THIS_PARAMETER)
+#if TEST_STD_VER >= 26
   format_args.get(0).visit(visitor);
 #else
   std::visit_format_arg(visitor, format_args.get(0));
@@ -47,7 +47,7 @@ void test_handle(T value) {
   std::basic_format_args<Context> format_args{store};
 
   auto visitor = [](auto a) { assert((std::is_same_v<decltype(a), typename std::basic_format_arg<Context>::handle>)); };
-#if TEST_STD_VER >= 26 && defined(TEST_HAS_EXPLICIT_THIS_PARAMETER)
+#if TEST_STD_VER >= 26
   format_args.get(0).visit(visitor);
 #else
   std::visit_format_arg(visitor, format_args.get(0));
@@ -73,7 +73,7 @@ void test_string_view(From value) {
     else
       assert(false);
   };
-#if TEST_STD_VER >= 26 && defined(TEST_HAS_EXPLICIT_THIS_PARAMETER)
+#if TEST_STD_VER >= 26
   format_args.get(0).visit(visitor);
 #else
   std::visit_format_arg(visitor, format_args.get(0));
diff --git a/libcxx/test/std/utilities/meta/meta.const.eval/is_within_lifetime.compile.pass.cpp b/libcxx/test/std/utilities/meta/meta.const.eval/is_within_lifetime.compile.pass.cpp
new file mode 100644
index 0000000000000..40c2273f1f862
--- /dev/null
+++ b/libcxx/test/std/utilities/meta/meta.const.eval/is_within_lifetime.compile.pass.cpp
@@ -0,0 +1,148 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23
+// UNSUPPORTED: gcc-15, apple-clang-17
+
+// <type_traits>
+
+// template <class T>
+//   consteval bool is_within_lifetime(const T*) noexcept; // C++26
+
+#include <cassert>
+#include <type_traits>
+#include <utility>
+
+#include "test_macros.h"
+
+ASSERT_SAME_TYPE(decltype(std::is_within_lifetime(std::declval<int*>())), bool);
+ASSERT_SAME_TYPE(decltype(std::is_within_lifetime(std::declval<const int*>())), bool);
+ASSERT_SAME_TYPE(decltype(std::is_within_lifetime(std::declval<void*>())), bool);
+ASSERT_SAME_TYPE(decltype(std::is_within_lifetime(std::declval<const void*>())), bool);
+
+ASSERT_NOEXCEPT(std::is_within_lifetime(std::declval<int*>()));
+ASSERT_NOEXCEPT(std::is_within_lifetime(std::declval<const int*>()));
+ASSERT_NOEXCEPT(std::is_within_lifetime(std::declval<void*>()));
+ASSERT_NOEXCEPT(std::is_within_lifetime(std::declval<const void*>()));
+
+template <class T>
+concept is_within_lifetime_exists = requires(T t) { std::is_within_lifetime(t); };
+
+struct S {};
+
+static_assert(is_within_lifetime_exists<int*>);
+static_assert(is_within_lifetime_exists<const int*>);
+static_assert(is_within_lifetime_exists<void*>);
+static_assert(is_within_lifetime_exists<const void*>);
+static_assert(!is_within_lifetime_exists<int>);               // Not a pointer
+static_assert(!is_within_lifetime_exists<decltype(nullptr)>); // Not a pointer
+static_assert(!is_within_lifetime_exists<void() const>);      // Not a pointer
+static_assert(!is_within_lifetime_exists<int S::*>);          // Doesn't accept pointer-to-data-member
+static_assert(!is_within_lifetime_exists<void (S::*)()>);     // Doesn't accept pointer-to-member-function
+static_assert(!is_within_lifetime_exists<void (*)()>);        // Doesn't match `const T*`
+
+consteval bool f() {
+  // Test that it works with global variables whose lifetime is in a
+  // different constant expression
+  {
+    static constexpr int i = 0;
+    static_assert(std::is_within_lifetime(&i));
+    // (Even when cast to a different type)
+    static_assert(std::is_within_lifetime(const_cast<int*>(&i)));
+    static_assert(std::is_within_lifetime(static_cast<const void*>(&i)));
+    static_assert(std::is_within_lifetime(static_cast<void*>(const_cast<int*>(&i))));
+    static_assert(std::is_within_lifetime<const int>(&i));
+    static_assert(std::is_within_lifetime<int>(const_cast<int*>(&i)));
+    static_assert(std::is_within_lifetime<const void>(static_cast<const void*>(&i)));
+    static_assert(std::is_within_lifetime<void>(static_cast<void*>(const_cast<int*>(&i))));
+  }
+
+  {
+    static constexpr union {
+      int member1;
+      int member2;
+    } u{.member2 = 1};
+    static_assert(!std::is_within_lifetime(&u.member1) && std::is_within_lifetime(&u.member2));
+  }
+
+  // Test that it works for varibles inside the same constant expression
+  {
+    int i = 0;
+    assert(std::is_within_lifetime(&i));
+    // (Even when cast to a different type)
+    assert(std::is_within_lifetime(const_cast<int*>(&i)));
+    assert(std::is_within_lifetime(static_cast<const void*>(&i)));
+    assert(std::is_within_lifetime(static_cast<void*>(const_cast<int*>(&i))));
+    assert(std::is_within_lifetime<const int>(&i));
+    assert(std::is_within_lifetime<int>(const_cast<int*>(&i)));
+    assert(std::is_within_lifetime<const void>(static_cast<const void*>(&i)));
+    assert(std::is_within_lifetime<void>(static_cast<void*>(const_cast<int*>(&i))));
+  }
+  // Anonymous union
+  {
+    union {
+      int member1;
+      int member2;
+    };
+    assert(!std::is_within_lifetime(&member1) && !std::is_within_lifetime(&member2));
+    member1 = 1;
+    assert(std::is_within_lifetime(&member1) && !std::is_within_lifetime(&member2));
+    member2 = 1;
+    assert(!std::is_within_lifetime(&member1) && std::is_within_lifetime(&member2));
+  }
+  // Variant members
+  {
+    struct X {
+      union {
+        int member1;
+        int member2;
+      };
+    } x;
+    assert(!std::is_within_lifetime(&x.member1) && !std::is_within_lifetime(&x.member2));
+    x.member1 = 1;
+    assert(std::is_within_lifetime(&x.member1) && !std::is_within_lifetime(&x.member2));
+    x.member2 = 1;
+    assert(!std::is_within_lifetime(&x.member1) && std::is_within_lifetime(&x.member2));
+  }
+  // Unions
+  {
+    union X {
+      int member1;
+      int member2;
+    } x;
+    assert(!std::is_within_lifetime(&x.member1) && !std::is_within_lifetime(&x.member2));
+    x.member1 = 1;
+    assert(std::is_within_lifetime(&x.member1) && !std::is_within_lifetime(&x.member2));
+    x.member2 = 1;
+    assert(!std::is_within_lifetime(&x.member1) && std::is_within_lifetime(&x.member2));
+  }
+  {
+    S s; // uninitialised
+    assert(std::is_within_lifetime(&s));
+  }
+
+  return true;
+}
+static_assert(f());
+
+// Check that it is a consteval (and consteval-propagating) function
+// (i.e., taking the address of below will fail because it will be an immediate function)
+template <typename T>
+constexpr void does_escalate(T p) {
+  std::is_within_lifetime(p);
+}
+template <typename T, void (*)(T) = &does_escalate<T>>
+constexpr bool check_escalated(int) {
+  return false;
+}
+template <typename T>
+constexpr bool check_escalated(long) {
+  return true;
+}
+static_assert(check_escalated<int*>(0), "");
+static_assert(check_escalated<void*>(0), "");
diff --git a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR20855_tuple_ref_binding_diagnostics.pass.cpp b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR20855_tuple_ref_binding_diagnostics.pass.cpp
index d78de0eec8e53..0f6a6734264c3 100644
--- a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR20855_tuple_ref_binding_diagnostics.pass.cpp
+++ b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR20855_tuple_ref_binding_diagnostics.pass.cpp
@@ -16,17 +16,6 @@
 #include <tuple>
 #include <string>
 #include <cassert>
-#include "test_macros.h"
-
-#if TEST_HAS_BUILTIN(__reference_constructs_from_temporary)
-#  define ASSERT_REFERENCE_BINDS_TEMPORARY(...) static_assert(__reference_constructs_from_temporary(__VA_ARGS__), "")
-#  define ASSERT_NOT_REFERENCE_BINDS_TEMPORARY(...)                                                                    \
-    static_assert(!__reference_constructs_from_temporary(__VA_ARGS__), "")
-#else
-// TODO(LLVM 22): Remove this as all support compilers should have __reference_constructs_from_temporary implemented.
-#  define ASSERT_REFERENCE_BINDS_TEMPORARY(...) static_assert(__reference_binds_to_temporary(__VA_ARGS__), "")
-#  define ASSERT_NOT_REFERENCE_BINDS_TEMPORARY(...) static_assert(!__reference_binds_to_temporary(__VA_ARGS__), "")
-#endif
 
 template <class Tp>
 struct ConvertsTo {
@@ -42,17 +31,6 @@ struct ConvertsTo {
 struct Base {};
 struct Derived : Base {};
 
-
-static_assert(std::is_same<decltype("abc"), decltype(("abc"))>::value, "");
-ASSERT_REFERENCE_BINDS_TEMPORARY(std::string const&, decltype("abc"));
-ASSERT_REFERENCE_BINDS_TEMPORARY(std::string const&, decltype(("abc")));
-ASSERT_REFERENCE_BINDS_TEMPORARY(std::string const&, const char*&&);
-
-ASSERT_NOT_REFERENCE_BINDS_TEMPORARY(int&, const ConvertsTo<int&>&);
-ASSERT_NOT_REFERENCE_BINDS_TEMPORARY(const int&, ConvertsTo<int&>&);
-ASSERT_NOT_REFERENCE_BINDS_TEMPORARY(Base&, Derived&);
-
-
 static_assert(std::is_constructible<int&, std::reference_wrapper<int>>::value, "");
 static_assert(std::is_constructible<int const&, std::reference_wrapper<int>>::value, "");
 
diff --git a/libcxx/test/std/utilities/variant/variant.visit.member/robust_against_adl.pass.cpp b/libcxx/test/std/utilities/variant/variant.visit.member/robust_against_adl.pass.cpp
index 7be7c7ff9122b..38cf34a9c699c 100644
--- a/libcxx/test/std/utilities/variant/variant.visit.member/robust_against_adl.pass.cpp
+++ b/libcxx/test/std/utilities/variant/variant.visit.member/robust_against_adl.pass.cpp
@@ -7,7 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23
-// XFAIL: apple-clang
 
 // <variant>
 
diff --git a/libcxx/test/std/utilities/variant/variant.visit.member/visit.pass.cpp b/libcxx/test/std/utilities/variant/variant.visit.member/visit.pass.cpp
index f68112d30fc35..aeb1297c136ae 100644
--- a/libcxx/test/std/utilities/variant/variant.visit.member/visit.pass.cpp
+++ b/libcxx/test/std/utilities/variant/variant.visit.member/visit.pass.cpp
@@ -7,8 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23
-// The tested functionality needs deducing this.
-// XFAIL: apple-clang
 
 // <variant>
 
diff --git a/libcxx/test/std/utilities/variant/variant.visit.member/visit_return_type.pass.cpp b/libcxx/test/std/utilities/variant/variant.visit.member/visit_return_type.pass.cpp
index 90320ae518c34..7ca05908ab340 100644
--- a/libcxx/test/std/utilities/variant/variant.visit.member/visit_return_type.pass.cpp
+++ b/libcxx/test/std/utilities/variant/variant.visit.member/visit_return_type.pass.cpp
@@ -7,8 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23
-// The tested functionality needs deducing this.
-// XFAIL: apple-clang
 
 // <variant>
 
diff --git a/libcxx/test/support/locale_helpers.h b/libcxx/test/support/locale_helpers.h
index 946c2fed0f3a5..3cec7397e3d7e 100644
--- a/libcxx/test/support/locale_helpers.h
+++ b/libcxx/test/support/locale_helpers.h
@@ -73,6 +73,12 @@ MultiStringType currency_symbol_ru_RU() {
     return MKSTR("\u20BD"); // U+20BD RUBLE SIGN
 #elif defined(_WIN32) || defined(__FreeBSD__) || defined(_AIX)
   return MKSTR("\u20BD"); // U+20BD RUBLE SIGN
+#elif defined(__APPLE__)
+  if (__builtin_available(macOS 15.4, *)) {
+    return MKSTR("\u20BD"); // U+20BD RUBLE SIGN
+  } else {
+    return MKSTR("\u0440\u0443\u0431.");
+  }
 #else
   return MKSTR("\u0440\u0443\u0431.");
 #endif
@@ -81,6 +87,12 @@ MultiStringType currency_symbol_ru_RU() {
 MultiStringType currency_symbol_zh_CN() {
 #if defined(_WIN32)
   return MKSTR("\u00A5"); // U+00A5 YEN SIGN
+#elif defined(__APPLE__)
+  if (__builtin_available(macOS 15.4, *)) {
+    return MKSTR("\u00A5"); // U+00A5 YEN SIGN
+  } else {
+    return MKSTR("\uFFE5"); // U+FFE5 FULLWIDTH YEN SIGN
+  }
 #else
   return MKSTR("\uFFE5"); // U+FFE5 FULLWIDTH YEN SIGN
 #endif
diff --git a/libcxx/test/support/test_basic_format_arg.h b/libcxx/test/support/test_basic_format_arg.h
index f51f6e97cbed0..99cd558c3c5bf 100644
--- a/libcxx/test/support/test_basic_format_arg.h
+++ b/libcxx/test/support/test_basic_format_arg.h
@@ -21,7 +21,7 @@ bool test_basic_format_arg(std::basic_format_arg<Context> arg, T expected) {
     else
       return false;
   };
-#if TEST_STD_VER >= 26 && defined(TEST_HAS_EXPLICIT_THIS_PARAMETER)
+#if TEST_STD_VER >= 26
   return arg.visit(std::move(visitor));
 #else
   return std::visit_format_arg(std::move(visitor), arg);
diff --git a/libcxx/test/support/test_macros.h b/libcxx/test/support/test_macros.h
index c4e1600572456..8d88d6fad7d0b 100644
--- a/libcxx/test/support/test_macros.h
+++ b/libcxx/test/support/test_macros.h
@@ -531,13 +531,6 @@ inline Tp const& DoNotOptimize(Tp const& value) {
 #  define TEST_IF_AIX(arg_true, arg_false) arg_false
 #endif
 
-// Clang-18 has support for deducing this, but it does not set the FTM.
-#ifdef _LIBCPP_USE_FROZEN_CXX03_HEADERS
-// This is a C++20 featue, so we don't care whether the compiler could support it
-#elif defined(_LIBCPP_VERSION) && _LIBCPP_HAS_EXPLICIT_THIS_PARAMETER
-#  define TEST_HAS_EXPLICIT_THIS_PARAMETER
-#endif
-
 // Placement `operator new`/`operator new[]` are not yet constexpr in C++26
 // when using MS ABI, because they are from <vcruntime_new.h>.
 #if defined(__cpp_lib_constexpr_new) && __cpp_lib_constexpr_new >= 202406L
diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py
index f6f252751b3e3..22209f53d50d7 100644
--- a/libcxx/utils/generate_feature_test_macro_components.py
+++ b/libcxx/utils/generate_feature_test_macro_components.py
@@ -368,6 +368,16 @@ def add_version_header(tc):
             "values": {"c++20": 201907},
             "headers": ["memory"],
         },
+        {
+            "name": "__cpp_lib_constexpr_flat_map",
+            "values": {"c++26": 202502},
+            "headers": ["flat_map"],
+        },
+        {
+            "name": "__cpp_lib_constexpr_flat_set",
+            "values": {"c++26": 202502},
+            "headers": ["flat_set"],
+        },
         {
             "name": "__cpp_lib_constexpr_forward_list",
             "values": {"c++26": 202502},
@@ -863,7 +873,8 @@ def add_version_header(tc):
                 "c++26": 202306  # P2641R4 Checking if a union alternative is active
             },
             "headers": ["type_traits"],
-            "unimplemented": True,
+            "test_suite_guard": "__has_builtin(__builtin_is_within_lifetime)",
+            "libcxx_guard": "__has_builtin(__builtin_is_within_lifetime)",
         },
         {
             "name": "__cpp_lib_jthread",
diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py
index 7d6e78de343c5..5da1d9afee911 100644
--- a/libcxx/utils/libcxx/test/features.py
+++ b/libcxx/utils/libcxx/test/features.py
@@ -733,17 +733,45 @@ def check_gdb(cfg):
 # Helpers to define correspondances between LLVM versions and vendor system versions.
 # Those are used for backdeployment features below, do not use directly in tests.
 DEFAULT_FEATURES += [
+    Feature(
+        name="_target-has-llvm-22",
+        when=lambda cfg: BooleanExpression.evaluate(
+            "TBD",
+            cfg.available_features,
+        ),
+    ),
+    Feature(
+        name="_target-has-llvm-21",
+        when=lambda cfg: BooleanExpression.evaluate(
+            "TBD",
+            cfg.available_features,
+        ),
+    ),
+    Feature(
+        name="_target-has-llvm-20",
+        when=lambda cfg: BooleanExpression.evaluate(
+            "_target-has-llvm-21 || target={{.+}}-apple-macosx{{26.[0-9](.\d+)?}}",
+            cfg.available_features,
+        ),
+    ),
+    Feature(
+        name="_target-has-llvm-19",
+        when=lambda cfg: BooleanExpression.evaluate(
+            "_target-has-llvm-20 || target={{.+}}-apple-macosx{{15.[4-9](.\d+)?}}",
+            cfg.available_features,
+        ),
+    ),
     Feature(
         name="_target-has-llvm-18",
         when=lambda cfg: BooleanExpression.evaluate(
-            "target={{.+}}-apple-macosx{{15(.[0-9]+)?(.[0-9]+)?}}",
+            "_target-has-llvm-19 || target={{.+}}-apple-macosx{{15.[0-3](.\d+)?}}",
             cfg.available_features,
         ),
     ),
     Feature(
         name="_target-has-llvm-17",
         when=lambda cfg: BooleanExpression.evaluate(
-            "_target-has-llvm-18 || target={{.+}}-apple-macosx{{14.[4-9](.[0-9]+)?}} || target={{.+}}-apple-macosx{{1[5-9]([.].+)?}}",
+            "_target-has-llvm-18 || target={{.+}}-apple-macosx{{14.[4-9](.\d+)?}}",
             cfg.available_features,
         ),
     ),
@@ -821,7 +849,7 @@ def check_gdb(cfg):
 # a libc++ flavor that enables availability markup. Similarly, a test could fail when
 # run against the system library of an older version of FreeBSD, even though FreeBSD
 # doesn't provide availability markup at the time of writing this.
-for version in ("12", "13", "14", "15", "16", "17", "18", "19", "20"):
+for version in ("12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22"):
     DEFAULT_FEATURES.append(
         Feature(
             name="using-built-library-before-llvm-{}".format(version),
diff --git a/libcxxabi/src/demangle/Utility.h b/libcxxabi/src/demangle/Utility.h
index 8829f3fa13a93..76243f5d3280c 100644
--- a/libcxxabi/src/demangle/Utility.h
+++ b/libcxxabi/src/demangle/Utility.h
@@ -81,7 +81,7 @@ class OutputBuffer {
   OutputBuffer(const OutputBuffer &) = delete;
   OutputBuffer &operator=(const OutputBuffer &) = delete;
 
-  virtual ~OutputBuffer() {}
+  virtual ~OutputBuffer() = default;
 
   operator std::string_view() const {
     return std::string_view(Buffer, CurrentPosition);
diff --git a/libcxxabi/test/test_demangle.pass.cpp b/libcxxabi/test/test_demangle.pass.cpp
index 858347bedce15..6790d7074a8b7 100644
--- a/libcxxabi/test/test_demangle.pass.cpp
+++ b/libcxxabi/test/test_demangle.pass.cpp
@@ -13,6 +13,10 @@
 // dd8b266ef.
 // UNSUPPORTED: using-built-library-before-llvm-20
 
+// This test exercises support for BitInt demangling introduced in
+// 20f56d140909a01c74e9981835373eaab6021af9.
+// UNSUPPORTED: using-built-library-before-llvm-21
+
 // XFAIL: win32-broken-printf-a-precision
 
 #include "support/timer.h"
diff --git a/libcxxabi/test/uncaught_exception.pass.cpp b/libcxxabi/test/uncaught_exception.pass.cpp
index 8e8468c43240d..cace88a309d0b 100644
--- a/libcxxabi/test/uncaught_exception.pass.cpp
+++ b/libcxxabi/test/uncaught_exception.pass.cpp
@@ -6,9 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// TODO(mordante) Investigate
-// UNSUPPORTED: apple-clang
-
 // UNSUPPORTED: no-exceptions
 
 // This tests that libc++abi still provides __cxa_uncaught_exception() for
diff --git a/libunwind/src/DwarfParser.hpp b/libunwind/src/DwarfParser.hpp
index dbd7d65c354aa..2b04ae2831f9a 100644
--- a/libunwind/src/DwarfParser.hpp
+++ b/libunwind/src/DwarfParser.hpp
@@ -842,12 +842,10 @@ bool CFI_Parser<A>::parseFDEInstructions(A &addressSpace,
             results->savedRegisters[UNW_AARCH64_RA_SIGN_STATE].value ^ 0x3;
         results->setRegisterValue(UNW_AARCH64_RA_SIGN_STATE, value,
                                   initialState);
-        // When calculating the value of the PC, it is assumed that the CFI
-        // instruction is placed before the signing instruction, however it is
-        // placed after. Because of this, we need to take into account the CFI
-        // instruction is one instruction call later than expected, and reduce
-        // the PC value by 4 bytes to compensate.
-        results->ptrAuthDiversifier = fdeInfo.pcStart + codeOffset - 0x4;
+        // When using Feat_PAuthLR, the PC value needs to be captured so that
+        // during unwinding, the correct PC value is used for re-authentication.
+        // It is assumed that the CFI is placed before the signing instruction.
+        results->ptrAuthDiversifier = fdeInfo.pcStart + codeOffset;
         _LIBUNWIND_TRACE_DWARF(
             "DW_CFA_AARCH64_negate_ra_state_with_pc(pc=0x%" PRIx64 ")\n",
             static_cast<uint64_t>(results->ptrAuthDiversifier));
diff --git a/lld/COFF/DriverUtils.cpp b/lld/COFF/DriverUtils.cpp
index 96ae2f0ddef6f..10a3934d53284 100644
--- a/lld/COFF/DriverUtils.cpp
+++ b/lld/COFF/DriverUtils.cpp
@@ -440,7 +440,7 @@ std::string LinkerDriver::createDefaultXml() {
      << "<assembly xmlns=\"urn:schemas-microsoft-com:asm.v1\"\n"
      << "          manifestVersion=\"1.0\">\n";
   if (ctx.config.manifestUAC) {
-    os << "  <trustInfo>\n"
+    os << "  <trustInfo xmlns=\"urn:schemas-microsoft-com:asm.v3\">\n"
        << "    <security>\n"
        << "      <requestedPrivileges>\n"
        << "         <requestedExecutionLevel level=" << ctx.config.manifestLevel
diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp
index 91a673f13d68e..6c4290ff1e448 100644
--- a/lld/ELF/Arch/ARM.cpp
+++ b/lld/ELF/Arch/ARM.cpp
@@ -472,7 +472,7 @@ bool ARM::inBranchRange(RelType type, uint64_t src, uint64_t dst) const {
     // Bit 0 == 1 denotes Thumb state, it is not part of the range.
     dst &= ~0x1;
 
-  int64_t offset = dst - src;
+  int64_t offset = llvm::SignExtend64<32>(dst - src);
   switch (type) {
   case R_ARM_PC24:
   case R_ARM_PLT32:
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index e52d3a0e11113..8647752be31fe 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -156,23 +156,23 @@ static std::tuple<ELFKind, uint16_t, uint8_t> parseEmulation(Ctx &ctx,
 
   std::pair<ELFKind, uint16_t> ret =
       StringSwitch<std::pair<ELFKind, uint16_t>>(s)
-          .Cases("aarch64elf", "aarch64linux", {ELF64LEKind, EM_AARCH64})
-          .Cases("aarch64elfb", "aarch64linuxb", {ELF64BEKind, EM_AARCH64})
-          .Cases("armelf", "armelf_linux_eabi", {ELF32LEKind, EM_ARM})
-          .Cases("armelfb", "armelfb_linux_eabi", {ELF32BEKind, EM_ARM})
+          .Cases({"aarch64elf", "aarch64linux"}, {ELF64LEKind, EM_AARCH64})
+          .Cases({"aarch64elfb", "aarch64linuxb"}, {ELF64BEKind, EM_AARCH64})
+          .Cases({"armelf", "armelf_linux_eabi"}, {ELF32LEKind, EM_ARM})
+          .Cases({"armelfb", "armelfb_linux_eabi"}, {ELF32BEKind, EM_ARM})
           .Case("elf32_x86_64", {ELF32LEKind, EM_X86_64})
-          .Cases("elf32btsmip", "elf32btsmipn32", {ELF32BEKind, EM_MIPS})
-          .Cases("elf32ltsmip", "elf32ltsmipn32", {ELF32LEKind, EM_MIPS})
+          .Cases({"elf32btsmip", "elf32btsmipn32"}, {ELF32BEKind, EM_MIPS})
+          .Cases({"elf32ltsmip", "elf32ltsmipn32"}, {ELF32LEKind, EM_MIPS})
           .Case("elf32lriscv", {ELF32LEKind, EM_RISCV})
-          .Cases("elf32ppc", "elf32ppclinux", {ELF32BEKind, EM_PPC})
-          .Cases("elf32lppc", "elf32lppclinux", {ELF32LEKind, EM_PPC})
+          .Cases({"elf32ppc", "elf32ppclinux"}, {ELF32BEKind, EM_PPC})
+          .Cases({"elf32lppc", "elf32lppclinux"}, {ELF32LEKind, EM_PPC})
           .Case("elf32loongarch", {ELF32LEKind, EM_LOONGARCH})
           .Case("elf64btsmip", {ELF64BEKind, EM_MIPS})
           .Case("elf64ltsmip", {ELF64LEKind, EM_MIPS})
           .Case("elf64lriscv", {ELF64LEKind, EM_RISCV})
           .Case("elf64ppc", {ELF64BEKind, EM_PPC64})
           .Case("elf64lppc", {ELF64LEKind, EM_PPC64})
-          .Cases("elf_amd64", "elf_x86_64", {ELF64LEKind, EM_X86_64})
+          .Cases({"elf_amd64", "elf_x86_64"}, {ELF64LEKind, EM_X86_64})
           .Case("elf_i386", {ELF32LEKind, EM_386})
           .Case("elf_iamcu", {ELF32LEKind, EM_IAMCU})
           .Case("elf64_sparc", {ELF64BEKind, EM_SPARCV9})
diff --git a/lld/ELF/ScriptParser.cpp b/lld/ELF/ScriptParser.cpp
index 4b9c941eb9d69..b61dc647401a3 100644
--- a/lld/ELF/ScriptParser.cpp
+++ b/lld/ELF/ScriptParser.cpp
@@ -450,7 +450,7 @@ static std::pair<ELFKind, uint16_t> parseBfdName(StringRef s) {
       .Case("elf64-powerpc", {ELF64BEKind, EM_PPC64})
       .Case("elf64-powerpcle", {ELF64LEKind, EM_PPC64})
       .Case("elf64-x86-64", {ELF64LEKind, EM_X86_64})
-      .Cases("elf32-tradbigmips", "elf32-bigmips", {ELF32BEKind, EM_MIPS})
+      .Cases({"elf32-tradbigmips", "elf32-bigmips"}, {ELF32BEKind, EM_MIPS})
       .Case("elf32-ntradbigmips", {ELF32BEKind, EM_MIPS})
       .Case("elf32-tradlittlemips", {ELF32LEKind, EM_MIPS})
       .Case("elf32-ntradlittlemips", {ELF32LEKind, EM_MIPS})
@@ -463,7 +463,8 @@ static std::pair<ELFKind, uint16_t> parseBfdName(StringRef s) {
       .Case("elf32-loongarch", {ELF32LEKind, EM_LOONGARCH})
       .Case("elf64-loongarch", {ELF64LEKind, EM_LOONGARCH})
       .Case("elf64-s390", {ELF64BEKind, EM_S390})
-      .Cases("elf32-hexagon", "elf32-littlehexagon", {ELF32LEKind, EM_HEXAGON})
+      .Cases({"elf32-hexagon", "elf32-littlehexagon"},
+             {ELF32LEKind, EM_HEXAGON})
       .Default({ELFNoneKind, EM_NONE});
 }
 
@@ -745,7 +746,7 @@ StringMatcher ScriptParser::readFilePatterns() {
 SortSectionPolicy ScriptParser::peekSortKind() {
   return StringSwitch<SortSectionPolicy>(peek())
       .Case("REVERSE", SortSectionPolicy::Reverse)
-      .Cases("SORT", "SORT_BY_NAME", SortSectionPolicy::Name)
+      .Cases({"SORT", "SORT_BY_NAME"}, SortSectionPolicy::Name)
       .Case("SORT_BY_ALIGNMENT", SortSectionPolicy::Alignment)
       .Case("SORT_BY_INIT_PRIORITY", SortSectionPolicy::Priority)
       .Case("SORT_NONE", SortSectionPolicy::None)
diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp
index bbf4b29a9fda5..a4150ebfa1653 100644
--- a/lld/ELF/SyntheticSections.cpp
+++ b/lld/ELF/SyntheticSections.cpp
@@ -2749,14 +2749,13 @@ RelroPaddingSection::RelroPaddingSection(Ctx &ctx)
     : SyntheticSection(ctx, ".relro_padding", SHT_NOBITS, SHF_ALLOC | SHF_WRITE,
                        1) {}
 
-RandomizePaddingSection::RandomizePaddingSection(Ctx &ctx, uint64_t size,
-                                                 OutputSection *parent)
-    : SyntheticSection(ctx, ".randomize_padding", SHT_PROGBITS, SHF_ALLOC, 1),
+PaddingSection::PaddingSection(Ctx &ctx, uint64_t size, OutputSection *parent)
+    : SyntheticSection(ctx, ".padding", SHT_PROGBITS, SHF_ALLOC, 1),
       size(size) {
   this->parent = parent;
 }
 
-void RandomizePaddingSection::writeTo(uint8_t *buf) {
+void PaddingSection::writeTo(uint8_t *buf) {
   std::array<uint8_t, 4> filler = getParent()->getFiller(ctx);
   uint8_t *end = buf + size;
   for (; buf + 4 <= end; buf += 4)
diff --git a/lld/ELF/SyntheticSections.h b/lld/ELF/SyntheticSections.h
index ac3ec63f0a7a5..38e68110e4bc0 100644
--- a/lld/ELF/SyntheticSections.h
+++ b/lld/ELF/SyntheticSections.h
@@ -779,11 +779,11 @@ class RelroPaddingSection final : public SyntheticSection {
   void writeTo(uint8_t *buf) override {}
 };
 
-class RandomizePaddingSection final : public SyntheticSection {
+class PaddingSection final : public SyntheticSection {
   uint64_t size;
 
 public:
-  RandomizePaddingSection(Ctx &ctx, uint64_t size, OutputSection *parent);
+  PaddingSection(Ctx &ctx, uint64_t size, OutputSection *parent);
   size_t getSize() const override { return size; }
   void writeTo(uint8_t *buf) override;
 };
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index 4fa80397cbfa7..083b4fb1dbd22 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -1495,15 +1495,14 @@ static void randomizeSectionPadding(Ctx &ctx) {
       if (auto *isd = dyn_cast<InputSectionDescription>(bc)) {
         SmallVector<InputSection *, 0> tmp;
         if (os->ptLoad != curPtLoad) {
-          tmp.push_back(make<RandomizePaddingSection>(
-              ctx, g() % ctx.arg.maxPageSize, os));
+          tmp.push_back(
+              make<PaddingSection>(ctx, g() % ctx.arg.maxPageSize, os));
           curPtLoad = os->ptLoad;
         }
         for (InputSection *isec : isd->sections) {
           // Probability of inserting padding is 1 in 16.
           if (g() % 16 == 0)
-            tmp.push_back(
-                make<RandomizePaddingSection>(ctx, isec->addralign, os));
+            tmp.push_back(make<PaddingSection>(ctx, isec->addralign, os));
           tmp.push_back(isec);
         }
         isd->sections = std::move(tmp);
diff --git a/lld/MachO/Arch/X86_64.cpp b/lld/MachO/Arch/X86_64.cpp
index a7c4b452f990b..111c4d9846d28 100644
--- a/lld/MachO/Arch/X86_64.cpp
+++ b/lld/MachO/Arch/X86_64.cpp
@@ -104,7 +104,7 @@ int64_t X86_64::getEmbeddedAddend(MemoryBufferRef mb, uint64_t offset,
 void X86_64::relocateOne(uint8_t *loc, const Reloc &r, uint64_t value,
                          uint64_t relocVA) const {
   if (r.pcrel) {
-    uint64_t pc = relocVA + (1 << r.length) + pcrelOffset(r.type);
+    uint64_t pc = relocVA + (1ull << r.length) + pcrelOffset(r.type);
     value -= pc;
   }
 
diff --git a/lld/MachO/BPSectionOrderer.cpp b/lld/MachO/BPSectionOrderer.cpp
index d50abc22fc6c1..328c33e6cfb65 100644
--- a/lld/MachO/BPSectionOrderer.cpp
+++ b/lld/MachO/BPSectionOrderer.cpp
@@ -118,6 +118,10 @@ DenseMap<const InputSection *, int> lld::macho::runBalancedPartitioning(
         auto *isec = subsec.isec;
         if (!isec || isec->data.empty() || !isec->data.data())
           continue;
+        // CString section order is handled by
+        // {Deduplicated}CStringSection::finalizeContents()
+        if (isa<CStringInputSection>(isec) || isec->isFinal)
+          continue;
         // ConcatInputSections are entirely live or dead, so the offset is
         // irrelevant.
         if (isa<ConcatInputSection>(isec) && !isec->isLive(0))
diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp
index 9b67db9fa55cf..32b20993af67c 100644
--- a/lld/MachO/Driver.cpp
+++ b/lld/MachO/Driver.cpp
@@ -841,18 +841,18 @@ static PlatformVersion parsePlatformVersion(const Arg *arg) {
   // TODO(compnerd) see if we can generate this case list via XMACROS
   platformVersion.platform =
       StringSwitch<PlatformType>(lowerDash(platformStr))
-          .Cases("macos", "1", PLATFORM_MACOS)
-          .Cases("ios", "2", PLATFORM_IOS)
-          .Cases("tvos", "3", PLATFORM_TVOS)
-          .Cases("watchos", "4", PLATFORM_WATCHOS)
-          .Cases("bridgeos", "5", PLATFORM_BRIDGEOS)
-          .Cases("mac-catalyst", "6", PLATFORM_MACCATALYST)
-          .Cases("ios-simulator", "7", PLATFORM_IOSSIMULATOR)
-          .Cases("tvos-simulator", "8", PLATFORM_TVOSSIMULATOR)
-          .Cases("watchos-simulator", "9", PLATFORM_WATCHOSSIMULATOR)
-          .Cases("driverkit", "10", PLATFORM_DRIVERKIT)
-          .Cases("xros", "11", PLATFORM_XROS)
-          .Cases("xros-simulator", "12", PLATFORM_XROS_SIMULATOR)
+          .Cases({"macos", "1"}, PLATFORM_MACOS)
+          .Cases({"ios", "2"}, PLATFORM_IOS)
+          .Cases({"tvos", "3"}, PLATFORM_TVOS)
+          .Cases({"watchos", "4"}, PLATFORM_WATCHOS)
+          .Cases({"bridgeos", "5"}, PLATFORM_BRIDGEOS)
+          .Cases({"mac-catalyst", "6"}, PLATFORM_MACCATALYST)
+          .Cases({"ios-simulator", "7"}, PLATFORM_IOSSIMULATOR)
+          .Cases({"tvos-simulator", "8"}, PLATFORM_TVOSSIMULATOR)
+          .Cases({"watchos-simulator", "9"}, PLATFORM_WATCHOSSIMULATOR)
+          .Cases({"driverkit", "10"}, PLATFORM_DRIVERKIT)
+          .Cases({"xros", "11"}, PLATFORM_XROS)
+          .Cases({"xros-simulator", "12"}, PLATFORM_XROS_SIMULATOR)
           .Default(PLATFORM_UNKNOWN);
   if (platformVersion.platform == PLATFORM_UNKNOWN)
     error(Twine("malformed platform: ") + platformStr);
@@ -948,7 +948,7 @@ getUndefinedSymbolTreatment(const ArgList &args) {
   StringRef treatmentStr = args.getLastArgValue(OPT_undefined);
   auto treatment =
       StringSwitch<UndefinedSymbolTreatment>(treatmentStr)
-          .Cases("error", "", UndefinedSymbolTreatment::error)
+          .Cases({"error", ""}, UndefinedSymbolTreatment::error)
           .Case("warning", UndefinedSymbolTreatment::warning)
           .Case("suppress", UndefinedSymbolTreatment::suppress)
           .Case("dynamic_lookup", UndefinedSymbolTreatment::dynamic_lookup)
@@ -972,7 +972,7 @@ getUndefinedSymbolTreatment(const ArgList &args) {
 static ICFLevel getICFLevel(const ArgList &args) {
   StringRef icfLevelStr = args.getLastArgValue(OPT_icf_eq);
   auto icfLevel = StringSwitch<ICFLevel>(icfLevelStr)
-                      .Cases("none", "", ICFLevel::none)
+                      .Cases({"none", ""}, ICFLevel::none)
                       .Case("safe", ICFLevel::safe)
                       .Case("safe_thunks", ICFLevel::safe_thunks)
                       .Case("all", ICFLevel::all)
diff --git a/lld/MachO/InputSection.cpp b/lld/MachO/InputSection.cpp
index b173e14cc86a8..2b2d28ef63e2d 100644
--- a/lld/MachO/InputSection.cpp
+++ b/lld/MachO/InputSection.cpp
@@ -348,6 +348,9 @@ WordLiteralInputSection::WordLiteralInputSection(const Section &section,
 }
 
 uint64_t WordLiteralInputSection::getOffset(uint64_t off) const {
+  if (off >= data.size())
+    fatal(toString(this) + ": offset is outside the section");
+
   auto *osec = cast<WordLiteralSection>(parent);
   const uintptr_t buf = reinterpret_cast<uintptr_t>(data.data());
   switch (sectionType(getFlags())) {
diff --git a/lld/MachO/Sections.cpp b/lld/MachO/Sections.cpp
index a27d902c0a227..47169c7e14ed0 100644
--- a/lld/MachO/Sections.cpp
+++ b/lld/MachO/Sections.cpp
@@ -27,7 +27,7 @@ bool isCodeSection(StringRef name, StringRef segName, uint32_t flags) {
 
   if (segName == segment_names::text)
     return StringSwitch<bool>(name)
-        .Cases(section_names::textCoalNt, section_names::staticInit, true)
+        .Cases({section_names::textCoalNt, section_names::staticInit}, true)
         .Default(false);
 
   return false;
diff --git a/lld/docs/ld.lld.1 b/lld/docs/ld.lld.1
index bb1a53ad1112a..cfdde0a6c2299 100644
--- a/lld/docs/ld.lld.1
+++ b/lld/docs/ld.lld.1
@@ -719,6 +719,19 @@ a given pattern are handled as if they were given as arguments of
 Creates a separate output section for every orphan input section.
 .It Fl -unresolved-symbols Ns = Ns Ar value
 Determine how to handle unresolved symbols.
+.Ar value
+may be:
+.Pp
+.Bl -tag -width 2n -compact
+.It Cm report-all
+Report unresolved symbols (default).
+.It Cm ignore-in-object-files
+Only report unresolved symbols contained in shared libraries.
+.It Cm ignore-in-shared-libs
+Only report unresolved symbols contained in object files.
+.It Cm ignore-all
+Do not report unresolved symbols.
+.El
 .It Fl -use-android-relr-tags
 Use SHT_ANDROID_RELR / DT_ANDROID_RELR* tags instead of SHT_RELR / DT_RELR*.
 .It Fl v , Fl V
diff --git a/lld/test/COFF/Inputs/manifest-uac.test b/lld/test/COFF/Inputs/manifest-uac.test
new file mode 100644
index 0000000000000..5269339489355
--- /dev/null
+++ b/lld/test/COFF/Inputs/manifest-uac.test
@@ -0,0 +1,11 @@
+<?xml version='1.0' encoding='UTF-8' standalone='yes'?>
+<assembly xmlns='urn:schemas-microsoft-com:asm.v1' manifestVersion='1.0'>
+  <trustInfo xmlns="urn:schemas-microsoft-com:asm.v3">
+    <security>
+      <requestedPrivileges>
+        <requestedExecutionLevel level='asInvoker'
+                                 uiAccess='false'/>
+      </requestedPrivileges>
+    </security>
+  </trustInfo>
+</assembly>
diff --git a/lld/test/COFF/manifest-uac.test b/lld/test/COFF/manifest-uac.test
new file mode 100644
index 0000000000000..d3a17c7282716
--- /dev/null
+++ b/lld/test/COFF/manifest-uac.test
@@ -0,0 +1,33 @@
+# REQUIRES: libxml2
+
+# RUN: yaml2obj %p/Inputs/ret42.yaml -o %t.obj
+# RUN: lld-link /out:%t.exe /entry:main \
+# RUN:   /manifest:embed \
+# RUN:   /manifestinput:%p/Inputs/manifest-uac.test %t.obj
+# RUN: llvm-readobj --coff-resources %t.exe | FileCheck %s
+
+CHECK:      Data (
+CHECK-NEXT:   0000: 3C3F786D 6C207665 7273696F 6E3D2231  |<?xml version="1|
+CHECK-NEXT:   0010: 2E302220 656E636F 64696E67 3D225554  |.0" encoding="UT|
+CHECK-NEXT:   0020: 462D3822 3F3E0A3C 61737365 6D626C79  |F-8"?>.<assembly|
+CHECK-NEXT:   0030: 20786D6C 6E733D22 75726E3A 73636865  | xmlns="urn:sche|
+CHECK-NEXT:   0040: 6D61732D 6D696372 6F736F66 742D636F  |mas-microsoft-co|
+CHECK-NEXT:   0050: 6D3A6173 6D2E7631 22206D61 6E696665  |m:asm.v1" manife|
+CHECK-NEXT:   0060: 73745665 7273696F 6E3D2231 2E30223E  |stVersion="1.0">|
+CHECK-NEXT:   0070: 0A20203C 74727573 74496E66 6F20786D  |.  <trustInfo xm|
+CHECK-NEXT:   0080: 6C6E733D 2275726E 3A736368 656D6173  |lns="urn:schemas|
+CHECK-NEXT:   0090: 2D6D6963 726F736F 66742D63 6F6D3A61  |-microsoft-com:a|
+CHECK-NEXT:   00A0: 736D2E76 33223E0A 20202020 3C736563  |sm.v3">.    <sec|
+CHECK-NEXT:   00B0: 75726974 793E0A20 20202020 203C7265  |urity>.      <re|
+CHECK-NEXT:   00C0: 71756573 74656450 72697669 6C656765  |questedPrivilege|
+CHECK-NEXT:   00D0: 733E0A20 20202020 2020203C 72657175  |s>.        <requ|
+CHECK-NEXT:   00E0: 65737465 64457865 63757469 6F6E4C65  |estedExecutionLe|
+CHECK-NEXT:   00F0: 76656C20 6C657665 6C3D2261 73496E76  |vel level="asInv|
+CHECK-NEXT:   0100: 6F6B6572 22207569 41636365 73733D22  |oker" uiAccess="|
+CHECK-NEXT:   0110: 66616C73 65222F3E 0A202020 2020203C  |false"/>.      <|
+CHECK-NEXT:   0120: 2F726571 75657374 65645072 6976696C  |/requestedPrivil|
+CHECK-NEXT:   0130: 65676573 3E0A2020 20203C2F 73656375  |eges>.    </secu|
+CHECK-NEXT:   0140: 72697479 3E0A2020 3C2F7472 75737449  |rity>.  </trustI|
+CHECK-NEXT:   0150: 6E666F3E 0A3C2F61 7373656D 626C793E  |nfo>.</assembly>|
+CHECK-NEXT:   0160: 0A                                   |.|
+CHECK-NEXT: )
diff --git a/lld/test/COFF/manifest.test b/lld/test/COFF/manifest.test
index 4910600bd3a17..09de96e9bccfa 100644
--- a/lld/test/COFF/manifest.test
+++ b/lld/test/COFF/manifest.test
@@ -10,7 +10,7 @@
 MANIFEST: <?xml version="1.0" standalone="yes"?>
 MANIFEST: <assembly xmlns="urn:schemas-microsoft-com:asm.v1"
 MANIFEST:           manifestVersion="1.0">
-MANIFEST:   <trustInfo>
+MANIFEST:   <trustInfo xmlns="urn:schemas-microsoft-com:asm.v3">
 MANIFEST:     <security>
 MANIFEST:       <requestedPrivileges>
 MANIFEST:          <requestedExecutionLevel level='asInvoker' uiAccess='false'/>
@@ -26,7 +26,7 @@ MANIFEST: </assembly>
 UAC: <?xml version="1.0" standalone="yes"?>
 UAC: <assembly xmlns="urn:schemas-microsoft-com:asm.v1"
 UAC:           manifestVersion="1.0">
-UAC:   <trustInfo>
+UAC:   <trustInfo xmlns="urn:schemas-microsoft-com:asm.v3">
 UAC:     <security>
 UAC:       <requestedPrivileges>
 UAC:          <requestedExecutionLevel level='requireAdministrator' uiAccess='true'/>
@@ -43,7 +43,7 @@ UAC: </assembly>
 DEPENDENCY: <?xml version="1.0" standalone="yes"?>
 DEPENDENCY: <assembly xmlns="urn:schemas-microsoft-com:asm.v1"
 DEPENDENCY:           manifestVersion="1.0">
-DEPENDENCY:   <trustInfo>
+DEPENDENCY:   <trustInfo xmlns="urn:schemas-microsoft-com:asm.v3">
 DEPENDENCY:     <security>
 DEPENDENCY:       <requestedPrivileges>
 DEPENDENCY:          <requestedExecutionLevel level='asInvoker' uiAccess='false'/>
@@ -90,7 +90,7 @@ NOUACNODEP: </assembly>
 SEVERALDEPS: <?xml version="1.0" standalone="yes"?>
 SEVERALDEPS: <assembly xmlns="urn:schemas-microsoft-com:asm.v1"
 SEVERALDEPS:           manifestVersion="1.0">
-SEVERALDEPS:   <trustInfo>
+SEVERALDEPS:   <trustInfo xmlns="urn:schemas-microsoft-com:asm.v3">
 SEVERALDEPS:     <security>
 SEVERALDEPS:       <requestedPrivileges>
 SEVERALDEPS:          <requestedExecutionLevel level='asInvoker' uiAccess='false'/>
@@ -139,31 +139,34 @@ EMBED:   0040: 6D61732D 6D696372 6F736F66 742D636F  |mas-microsoft-co|
 EMBED:   0050: 6D3A6173 6D2E7631 220A2020 20202020  |m:asm.v1".      |
 EMBED:   0060: 20202020 6D616E69 66657374 56657273  |    manifestVers|
 EMBED:   0070: 696F6E3D 22312E30 223E0A20 203C7472  |ion="1.0">.  <tr|
-EMBED:   0080: 75737449 6E666F3E 0A202020 203C7365  |ustInfo>.    <se|
-EMBED:   0090: 63757269 74793E0A 20202020 20203C72  |curity>.      <r|
-EMBED:   00A0: 65717565 73746564 50726976 696C6567  |equestedPrivileg|
-EMBED:   00B0: 65733E0A 20202020 20202020 203C7265  |es>.         <re|
-EMBED:   00C0: 71756573 74656445 78656375 74696F6E  |questedExecution|
-EMBED:   00D0: 4C657665 6C206C65 76656C3D 27617349  |Level level='asI|
-EMBED:   00E0: 6E766F6B 65722720 75694163 63657373  |nvoker' uiAccess|
-EMBED:   00F0: 3D276661 6C736527 2F3E0A20 20202020  |='false'/>.     |
-EMBED:   0100: 203C2F72 65717565 73746564 50726976  | </requestedPriv|
-EMBED:   0110: 696C6567 65733E0A 20202020 3C2F7365  |ileges>.    </se|
-EMBED:   0120: 63757269 74793E0A 20203C2F 74727573  |curity>.  </trus|
-EMBED:   0130: 74496E66 6F3E0A20 203C6465 70656E64  |tInfo>.  <depend|
-EMBED:   0140: 656E6379 3E0A2020 20203C64 6570656E  |ency>.    <depen|
-EMBED:   0150: 64656E74 41737365 6D626C79 3E0A2020  |dentAssembly>.  |
-EMBED:   0160: 20202020 3C617373 656D626C 79496465  |    <assemblyIde|
-EMBED:   0170: 6E746974 7920666F 6F3D2762 61722720  |ntity foo='bar' |
-EMBED:   0180: 2F3E0A20 2020203C 2F646570 656E6465  |/>.    </depende|
-EMBED:   0190: 6E744173 73656D62 6C793E0A 20203C2F  |ntAssembly>.  </|
-EMBED:   01A0: 64657065 6E64656E 63793E0A 20203C64  |dependency>.  <d|
-EMBED:   01B0: 6570656E 64656E63 793E0A20 2020203C  |ependency>.    <|
-EMBED:   01C0: 64657065 6E64656E 74417373 656D626C  |dependentAssembl|
-EMBED:   01D0: 793E0A20 20202020 203C6173 73656D62  |y>.      <assemb|
-EMBED:   01E0: 6C794964 656E7469 74792062 617A3D27  |lyIdentity baz='|
-EMBED:   01F0: 71757578 27202F3E 0A202020 203C2F64  |quux' />.    </d|
-EMBED:   0200: 6570656E 64656E74 41737365 6D626C79  |ependentAssembly|
-EMBED:   0210: 3E0A2020 3C2F6465 70656E64 656E6379  |>.  </dependency|
-EMBED:   0220: 3E0A3C2F 61737365 6D626C79 3E0A      |>.</assembly>.|
+EMBED:   0080: 75737449 6E666F20 786D6C6E 733D2275  |ustInfo xmlns="u|
+EMBED:   0090: 726E3A73 6368656D 61732D6D 6963726F  |rn:schemas-micro|
+EMBED:   00A0: 736F6674 2D636F6D 3A61736D 2E763322  |soft-com:asm.v3"|
+EMBED:   00B0: 3E0A2020 20203C73 65637572 6974793E  |>.    <security>|
+EMBED:   00C0: 0A202020 2020203C 72657175 65737465  |.      <requeste|
+EMBED:   00D0: 64507269 76696C65 6765733E 0A202020  |dPrivileges>.   |
+EMBED:   00E0: 20202020 20203C72 65717565 73746564  |      <requested|
+EMBED:   00F0: 45786563 7574696F 6E4C6576 656C206C  |ExecutionLevel l|
+EMBED:   0100: 6576656C 3D276173 496E766F 6B657227  |evel='asInvoker'|
+EMBED:   0110: 20756941 63636573 733D2766 616C7365  | uiAccess='false|
+EMBED:   0120: 272F3E0A 20202020 20203C2F 72657175  |'/>.      </requ|
+EMBED:   0130: 65737465 64507269 76696C65 6765733E  |estedPrivileges>|
+EMBED:   0140: 0A202020 203C2F73 65637572 6974793E  |.    </security>|
+EMBED:   0150: 0A20203C 2F747275 7374496E 666F3E0A  |.  </trustInfo>.|
+EMBED:   0160: 20203C64 6570656E 64656E63 793E0A20  |  <dependency>. |
+EMBED:   0170: 2020203C 64657065 6E64656E 74417373  |   <dependentAss|
+EMBED:   0180: 656D626C 793E0A20 20202020 203C6173  |embly>.      <as|
+EMBED:   0190: 73656D62 6C794964 656E7469 74792066  |semblyIdentity f|
+EMBED:   01A0: 6F6F3D27 62617227 202F3E0A 20202020  |oo='bar' />.    |
+EMBED:   01B0: 3C2F6465 70656E64 656E7441 7373656D  |</dependentAssem|
+EMBED:   01C0: 626C793E 0A20203C 2F646570 656E6465  |bly>.  </depende|
+EMBED:   01D0: 6E63793E 0A20203C 64657065 6E64656E  |ncy>.  <dependen|
+EMBED:   01E0: 63793E0A 20202020 3C646570 656E6465  |cy>.    <depende|
+EMBED:   01F0: 6E744173 73656D62 6C793E0A 20202020  |ntAssembly>.    |
+EMBED:   0200: 20203C61 7373656D 626C7949 64656E74  |  <assemblyIdent|
+EMBED:   0210: 69747920 62617A3D 27717575 7827202F  |ity baz='quux' /|
+EMBED:   0220: 3E0A2020 20203C2F 64657065 6E64656E  |>.    </dependen|
+EMBED:   0230: 74417373 656D626C 793E0A20 203C2F64  |tAssembly>.  </d|
+EMBED:   0240: 6570656E 64656E63 793E0A3C 2F617373  |ependency>.</ass|
+EMBED:   0250: 656D626C 793E0A                      |embly>.|
 EMBED: )
diff --git a/lld/test/COFF/manifestinput.test b/lld/test/COFF/manifestinput.test
index 04af80a13312d..cbf27b1ea96b5 100644
--- a/lld/test/COFF/manifestinput.test
+++ b/lld/test/COFF/manifestinput.test
@@ -5,22 +5,21 @@
 # RUN:   /manifest:embed \
 # RUN:   /manifestuac:"level='requireAdministrator'" \
 # RUN:   /manifestinput:%p/Inputs/manifestinput.test %t.obj
-# RUN: llvm-readobj --coff-resources --file-headers %t.exe | FileCheck %s \
-# RUN:   -check-prefix TEST_EMBED
+# RUN: llvm-readobj --coff-resources --file-headers %t.exe | FileCheck %s
 
-TEST_EMBED:          ResourceTableRVA: 0x2000
-TEST_EMBED-NEXT:     ResourceTableSize: 0x2A0
-TEST_EMBED-DAG:      Resources [
-TEST_EMBED-NEXT:       Total Number of Resources: 1
-TEST_EMBED-DAG:        Number of String Entries: 0
-TEST_EMBED-NEXT:       Number of ID Entries: 1
-TEST_EMBED-NEXT:       Type: MANIFEST (ID 24) [
-TEST_EMBED-NEXT:         Table Offset: 0x18
-TEST_EMBED-NEXT:         Number of String Entries: 0
-TEST_EMBED-NEXT:         Number of ID Entries: 1
-TEST_EMBED-NEXT:         Name: (ID 1) [
-TEST_EMBED-NEXT:           Table Offset: 0x30
-TEST_EMBED-NEXT:           Number of String Entries: 0
-TEST_EMBED-NEXT:           Number of ID Entries: 1
-TEST_EMBED-NEXT:           Language: (ID 1033) [
-TEST_EMBED-NEXT:             Entry Offset: 0x48
+CHECK:          ResourceTableRVA: 0x2000
+CHECK-NEXT:     ResourceTableSize: 0x2C8
+CHECK-DAG:      Resources [
+CHECK-NEXT:       Total Number of Resources: 1
+CHECK-DAG:        Number of String Entries: 0
+CHECK-NEXT:       Number of ID Entries: 1
+CHECK-NEXT:       Type: MANIFEST (ID 24) [
+CHECK-NEXT:         Table Offset: 0x18
+CHECK-NEXT:         Number of String Entries: 0
+CHECK-NEXT:         Number of ID Entries: 1
+CHECK-NEXT:         Name: (ID 1) [
+CHECK-NEXT:           Table Offset: 0x30
+CHECK-NEXT:           Number of String Entries: 0
+CHECK-NEXT:           Number of ID Entries: 1
+CHECK-NEXT:           Language: (ID 1033) [
+CHECK-NEXT:             Entry Offset: 0x48
diff --git a/lld/test/ELF/aarch64-build-attributes.s b/lld/test/ELF/aarch64-build-attributes.s
index f2d542150897e..3d333bf6ccf2f 100644
--- a/lld/test/ELF/aarch64-build-attributes.s
+++ b/lld/test/ELF/aarch64-build-attributes.s
@@ -1,11 +1,11 @@
 // REQUIRES: aarch64
 // RUN: rm -rf %t && split-file %s %t && cd %t
 
-// RUN: llvm-mc -triple=aarch64 -filetype=obj %s -o %t1.o
-// RUN: llvm-mc -triple=aarch64 -filetype=obj pauth-bti-gcs.s -o %t2.o
-// RUN: llvm-mc -triple=aarch64 -filetype=obj pauth-bti-pac.s -o %t3.o
-// RUN: ld.lld -r %t1.o %t2.o %t3.o -o %t.merged.o
-// RUN: llvm-readelf -n %t.merged.o | FileCheck %s --check-prefix=NOTE
+// RUN: llvm-mc -triple=aarch64 -filetype=obj %s -o 1.o
+// RUN: llvm-mc -triple=aarch64 -filetype=obj pauth-bti-gcs.s -o 2.o
+// RUN: llvm-mc -triple=aarch64 -filetype=obj pauth-bti-pac.s -o 3.o
+// RUN: ld.lld -r 1.o 2.o 3.o -o merged.o
+// RUN: llvm-readelf -n merged.o | FileCheck %s --check-prefix=NOTE
 
 /// This test merges three object files with AArch64 build attributes.
 /// All contain identical PAuth ABI info (platform/version), which must be preserved.
diff --git a/lld/test/ELF/arm-wraparound-veneer.s b/lld/test/ELF/arm-wraparound-veneer.s
new file mode 100644
index 0000000000000..74dd6f29d8170
--- /dev/null
+++ b/lld/test/ELF/arm-wraparound-veneer.s
@@ -0,0 +1,102 @@
+// REQUIRES: arm
+// RUN: rm -rf %t && split-file %s %t && cd %t
+// RUN: llvm-mc -filetype=obj -triple=armv7-none-eabi code.s -o code.o
+// RUN: ld.lld -T unsigned1.ld code.o -o unsigned1.elf
+// RUN: llvm-objdump --triple=armv7 --no-show-raw-insn -d unsigned1.elf | FileCheck %s --check-prefix=UNSIGNED1
+// RUN: ld.lld -T unsigned2.ld code.o -o unsigned2.elf
+// RUN: llvm-objdump --triple=armv7 --no-show-raw-insn -d unsigned2.elf | FileCheck %s --check-prefix=UNSIGNED2
+// RUN: ld.lld -T signed1.ld code.o -o signed1.elf
+// RUN: llvm-objdump --triple=armv7 --no-show-raw-insn -d signed1.elf | FileCheck %s --check-prefix=SIGNED1
+// RUN: ld.lld -T signed2.ld code.o -o signed2.elf
+// RUN: llvm-objdump --triple=armv7 --no-show-raw-insn -d signed2.elf | FileCheck %s --check-prefix=SIGNED2
+
+/// The aim of this test is to ensure that a BL instruction near one end of the
+/// address space can reach a function at the extreme other end, directly,
+/// using a branch offset that makes the address wrap round. We check this at
+/// both the unsigned wraparound point (one address near 0 and the other near
+/// 0xFFFFFFFF) and the signed wraparound point (addresses either side of
+/// 0x80000000), crossing the boundary in both directions. In all four cases we
+/// expect a direct branch with no veneer.
+
+// UNSIGNED1: Disassembly of section .text.lowaddr:
+// UNSIGNED1: <func>:
+// UNSIGNED1:    10000:       bx      lr
+//
+// UNSIGNED1: Disassembly of section .text.highaddr:
+// UNSIGNED1: <_start>:
+// UNSIGNED1: ffff0000:       bl      0x10000
+// UNSIGNED1-NEXT:            bx      lr
+
+// UNSIGNED2: Disassembly of section .text.lowaddr:
+// UNSIGNED2: <_start>:
+// UNSIGNED2:    10000:       bl      0xffff0000
+// UNSIGNED2-NEXT:            bx      lr
+//
+// UNSIGNED2: Disassembly of section .text.highaddr:
+// UNSIGNED2: <func>:
+// UNSIGNED2: ffff0000:       bx      lr
+
+// SIGNED1:   Disassembly of section .text.posaddr:
+// SIGNED1:   <_start>:
+// SIGNED1:   7fff0000:       bl      0x80010000
+// SIGNED1-NEXT:              bx      lr
+//
+// SIGNED1:   Disassembly of section .text.negaddr:
+// SIGNED1:   <func>:
+// SIGNED1:   80010000:       bx      lr
+
+// SIGNED2:   Disassembly of section .text.posaddr:
+// SIGNED2:   <func>:
+// SIGNED2:   7fff0000:       bx      lr
+//
+// SIGNED2:   Disassembly of section .text.negaddr:
+// SIGNED2:   <_start>:
+// SIGNED2:   80010000:       bl      0x7fff0000
+// SIGNED2-NEXT:              bx      lr
+
+//--- code.s
+
+  .section .text.callee, "ax", %progbits
+  .global func
+  .type func, %function
+func:
+  bx lr
+
+  .section .text.caller, "ax", %progbits
+  .global _start
+  .type _start, %function
+_start:
+  bl func
+  bx lr
+
+//--- unsigned1.ld
+
+ENTRY(_start)
+SECTIONS {
+  .text.lowaddr  0x00010000 : AT(0x00010000) { *(.text.callee) }
+  .text.highaddr 0xffff0000 : AT(0xffff0000) { *(.text.caller) }
+}
+
+//--- unsigned2.ld
+
+ENTRY(_start)
+SECTIONS {
+  .text.lowaddr  0x00010000 : AT(0x00010000) { *(.text.caller) }
+  .text.highaddr 0xffff0000 : AT(0xffff0000) { *(.text.callee) }
+}
+
+//--- signed1.ld
+
+ENTRY(_start)
+SECTIONS {
+  .text.posaddr  0x7fff0000 : AT(0x7fff0000) { *(.text.caller) }
+  .text.negaddr  0x80010000 : AT(0x80010000) { *(.text.callee) }
+}
+
+//--- signed2.ld
+
+ENTRY(_start)
+SECTIONS {
+  .text.posaddr  0x7fff0000 : AT(0x7fff0000) { *(.text.callee) }
+  .text.negaddr  0x80010000 : AT(0x80010000) { *(.text.caller) }
+}
diff --git a/lld/test/MachO/bp-section-orderer.s b/lld/test/MachO/bp-section-orderer.s
index 90924e5797b64..d7de90d6cd7b3 100644
--- a/lld/test/MachO/bp-section-orderer.s
+++ b/lld/test/MachO/bp-section-orderer.s
@@ -106,6 +106,11 @@ r3:
 r4:
   .quad s2
 
+# cstrings are ignored by runBalancedPartitioning()
+.cstring
+cstr:
+  .asciz "this is cstr"
+
 .bss
 bss0:
   .zero 10
diff --git a/lld/test/MachO/invalid/bad-offsets.s b/lld/test/MachO/invalid/bad-offsets.s
new file mode 100644
index 0000000000000..e1244ee501960
--- /dev/null
+++ b/lld/test/MachO/invalid/bad-offsets.s
@@ -0,0 +1,45 @@
+## Test that we properly detect and report out-of-bounds offsets in literal sections.
+## We're intentionally testing fatal errors (for malformed input files), and
+## fatal errors aren't supported for testing when main is run twice.
+# XFAIL: main-run-twice
+
+# REQUIRES: x86
+# RUN: rm -rf %t; split-file %s %t
+
+## Test WordLiteralInputSection bounds checking
+# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/word-literal.s -o %t/word-literal.o
+# RUN: not %lld -dylib %t/word-literal.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=WORD
+
+## Test CStringInputSection bounds checking
+# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/cstring.s -o %t/cstring.o
+# RUN: not %lld -dylib %t/cstring.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=CSTRING
+
+# WORD: error: {{.*}}word-literal.o:(__literal4): offset is outside the section
+# CSTRING: error: {{.*}}cstring.o:(__cstring): offset is outside the section
+
+#--- word-literal.s
+.section __TEXT,__literal4,4byte_literals
+L_literal:
+  .long 0x01020304
+
+.text
+.globl _main
+_main:
+  # We use a subtractor expression to force a section relocation. Symbol relocations
+  # don't trigger the error.
+  .long L_literal - _main + 4
+
+.subsections_via_symbols
+
+#--- cstring.s
+## Create a cstring section with a reference that points past the end
+.cstring
+L_str:
+  .asciz "foo"
+
+.text
+.globl _main
+_main:
+  .long L_str - _main + 4
+
+.subsections_via_symbols
\ No newline at end of file
diff --git a/lld/test/wasm/lto/relocation-model.ll b/lld/test/wasm/lto/relocation-model.ll
index 8fe198d0c64e6..a042615b8fe1c 100644
--- a/lld/test/wasm/lto/relocation-model.ll
+++ b/lld/test/wasm/lto/relocation-model.ll
@@ -8,6 +8,11 @@
 ; RUN: wasm-ld %t.o -o %t_static.wasm -save-temps -r -mllvm -relocation-model=static
 ; RUN: llvm-readobj -r %t_static.wasm.lto.o | FileCheck %s --check-prefix=STATIC
 
+;; Linking with --unresolved-symbols=import-dynamic should also generate PIC
+;; code for external references.
+; RUN: wasm-ld %t.o -o %t_import.wasm -save-temps --experimental-pic --unresolved-symbols=import-dynamic
+; RUN: llvm-readobj -r %t_import.wasm.lto.o | FileCheck %s --check-prefix=PIC
+
 ; PIC: R_WASM_GLOBAL_INDEX_LEB foo
 ; STATIC: R_WASM_MEMORY_ADDR_LEB foo
 
diff --git a/lld/wasm/LTO.cpp b/lld/wasm/LTO.cpp
index ae85f4693214b..668cdf21ea3ed 100644
--- a/lld/wasm/LTO.cpp
+++ b/lld/wasm/LTO.cpp
@@ -63,6 +63,12 @@ static lto::Config createConfig() {
     c.RelocModel = std::nullopt;
   else if (ctx.isPic)
     c.RelocModel = Reloc::PIC_;
+  else if (ctx.arg.unresolvedSymbols == UnresolvedPolicy::ImportDynamic)
+    // With ImportDynamic we also need to use the PIC relocation model so that
+    // external symbols are references via the GOT.
+    // TODO(sbc): This should probably be Reloc::DynamicNoPIC, but the backend
+    // doesn't currently support that.
+    c.RelocModel = Reloc::PIC_;
   else
     c.RelocModel = Reloc::Static;
 
diff --git a/lldb/bindings/lua/lua-typemaps.swig b/lldb/bindings/lua/lua-typemaps.swig
index 56756936a532c..f2a7401419368 100644
--- a/lldb/bindings/lua/lua-typemaps.swig
+++ b/lldb/bindings/lua/lua-typemaps.swig
@@ -121,9 +121,27 @@ LLDB_NUMBER_TYPEMAP(enum SWIGTYPE);
   $1 = (char *)malloc($2);
 }
 
+// Disable default type checking for this method to avoid SWIG dispatch issues.
+// 
+// Problem: SBThread::GetStopDescription has two overloads:
+//   1. GetStopDescription(char* dst_or_null, size_t dst_len) 
+//   2. GetStopDescription(lldb::SBStream& stream)
+//
+// SWIG generates a dispatch function to select the correct overload based on argument types.
+// see https://www.swig.org/Doc4.0/SWIGDocumentation.html#Typemaps_overloading.
+// However, this dispatcher doesn't consider typemaps that transform function signatures.
+//
+// In lua, our typemap converts GetStopDescription(char*, size_t) to GetStopDescription(int).
+// The dispatcher still checks against the original (char*, size_t) signature instead of 
+// the transformed (int) signature, causing type matching to fail.
+// This only affects SBThread::GetStopDescription since the type check also matches 
+// the argument name, which is unique to this function.
+%typemap(typecheck, precedence=SWIG_TYPECHECK_POINTER) (char *dst_or_null, size_t dst_len) ""
+
 %typemap(argout) (char *dst_or_null, size_t dst_len) {
   lua_pop(L, 1); // Blow away the previous result
-  lua_pushlstring(L, (const char *)$1, $result);
+  llvm::StringRef ref($1);
+  lua_pushlstring(L, (const char *)$1, ref.size());
   free($1);
   // SWIG_arg was already incremented
 }
diff --git a/lldb/bindings/python/python-typemaps.swig b/lldb/bindings/python/python-typemaps.swig
index 715914fe745f8..4d3a95768f2f3 100644
--- a/lldb/bindings/python/python-typemaps.swig
+++ b/lldb/bindings/python/python-typemaps.swig
@@ -224,6 +224,24 @@ AND call SWIG_fail at the same time, because it will result in a double free.
   }
   $1 = (char *)malloc($2);
 }
+
+// Disable default type checking for this method to avoid SWIG dispatch issues.
+// 
+// Problem: SBThread::GetStopDescription has two overloads:
+//   1. GetStopDescription(char* dst_or_null, size_t dst_len) 
+//   2. GetStopDescription(lldb::SBStream& stream)
+//
+// SWIG generates a dispatch function to select the correct overload based on argument types.
+// see https://www.swig.org/Doc4.0/SWIGDocumentation.html#Typemaps_overloading.
+// However, this dispatcher doesn't consider typemaps that transform function signatures.
+//
+// In Python, our typemap converts GetStopDescription(char*, size_t) to GetStopDescription(int).
+// The dispatcher still checks against the original (char*, size_t) signature instead of 
+// the transformed (int) signature, causing type matching to fail.
+// This only affects SBThread::GetStopDescription since the type check also matches 
+// the argument name, which is unique to this function.
+%typemap(typecheck, precedence=SWIG_TYPECHECK_POINTER) (char *dst_or_null, size_t dst_len) ""
+
 %typemap(argout) (char *dst_or_null, size_t dst_len) {
   Py_XDECREF($result); /* Blow away any previous result */
   llvm::StringRef ref($1);
diff --git a/lldb/bindings/python/python-wrapper.swig b/lldb/bindings/python/python-wrapper.swig
index 64b7dc8381073..e7acba5b95d89 100644
--- a/lldb/bindings/python/python-wrapper.swig
+++ b/lldb/bindings/python/python-wrapper.swig
@@ -312,7 +312,7 @@ PyObject *lldb_private::python::SWIGBridge::LLDBSwigPython_GetChildAtIndex(PyObj
   return result.release();
 }
 
-int lldb_private::python::SWIGBridge::LLDBSwigPython_GetIndexOfChildWithName(
+uint32_t lldb_private::python::SWIGBridge::LLDBSwigPython_GetIndexOfChildWithName(
     PyObject * implementor, const char *child_name) {
   PyErr_Cleaner py_err_cleaner(true);
 
diff --git a/lldb/docs/resources/build.rst b/lldb/docs/resources/build.rst
index 0db8c92ad49d6..2eb167709dbda 100644
--- a/lldb/docs/resources/build.rst
+++ b/lldb/docs/resources/build.rst
@@ -95,37 +95,31 @@ commands below.
 Windows
 *******
 
-* Visual Studio 2019.
-* The latest Windows SDK.
-* The Active Template Library (ATL).
-* `GnuWin32 <http://gnuwin32.sourceforge.net/>`_ for CoreUtils and Make.
-* `Python 3 <https://www.python.org/downloads/windows/>`_.  Make sure to (1) get
-  the x64 variant if that's what you're targeting and (2) install the debug
-  library if you want to build a debug lldb. The standalone installer is the
-  easiest way to get the debug library.
-* `Python Tools for Visual Studio
-  <https://github.com/Microsoft/PTVS/>`_. If you plan to debug test failures
-  or even write new tests at all, PTVS is an indispensable debugging
-  extension to VS that enables full editing and debugging support for Python
-  (including mixed native/managed debugging).
-* `SWIG for Windows <http://www.swig.org/download.html>`_
-
-The steps outlined here describes how to set up your system and install the
-required dependencies such that they can be found when needed during the build
-process. They only need to be performed once.
-
-#. Install Visual Studio with the "Desktop Development with C++" workload and
-   the "Python Development" workload.
-#. Install GnuWin32, making sure ``<GnuWin32 install dir>\bin`` is added to
-   your PATH environment variable. Verify that utilities like ``dirname`` and
-   ``make`` are available from your terminal.
-#. Install SWIG for Windows, making sure ``<SWIG install dir>`` is added to
-   your PATH environment variable. Verify that ``swig`` is available from your
-   terminal.
-#. Install Python 3 from the standalone installer and include the debug libraries
-   in the install, making sure the Python install path is added to your PATH
-   environment variable.
-#. Register the Debug Interface Access DLLs with the Registry from a privileged
+The steps outlined here describe how to set up your system and install the
+required dependencies for building and testing LLDB on Windows. They only need
+to be performed once.
+
+Build Requirements
+^^^^^^^^^^^^^^^^^^
+
+Please follow the steps below if you only want to **build** lldb.
+
+1. Install `Visual Studio <https://visualstudio.microsoft.com>` with the
+   "Desktop Development with C++" workload. Make sure that the latest Windows
+   SDK and the Active Template Library (ATL) are installed.
+2. Install `Git Bash <https://git-scm.com/install/windows>`_ and add
+   ``<Git install dir>\usr\bin`` to your ``PATH``. Verify that utilities like
+   ``dirname`` are available from your terminal.
+3. Install `make <https://sourceforge.net/projects/ezwinports/files/>`_ and
+   verify that it's in your ``PATH``.
+4. Install `Python 3 <https://www.python.org/downloads/windows/>`_ from the
+   GUI installer. If you will be building LLDB in Debug mode, **include the
+   debug libraries** during the install. Make sure ``python`` is added to your
+   ``PATH``.
+5. Install `SWIG for Windows <http://www.swig.org/download.html>`_. Make sure
+   ``swig`` is added to your ``PATH`` and that ``swig -swiglib`` points to the
+   correct directory.
+6. Register the Debug Interface Access DLLs with the Registry from a privileged
    terminal.
 
 ::
@@ -139,6 +133,16 @@ Prompt for VS <https://docs.microsoft.com/en-us/visualstudio/ide/reference/comma
 corresponding to the version you wish to use or run ``vcvarsall.bat`` or
 ``VsDevCmd.bat``.
 
+Test Requirements
+^^^^^^^^^^^^^^^^^
+
+Please follow the steps above and below if you want to **test** `lldb`.
+
+* Install `Python Tools for Visual Studio <https://github.com/Microsoft/PTVS/>`_,
+  an indispensable debugging extension to Visual Studio which enables full
+  editing and debugging support for Python (including mixed native/managed
+  debugging).
+
 macOS
 *****
 
diff --git a/lldb/docs/use/tutorials/implementing-standalone-scripts.md b/lldb/docs/use/tutorials/implementing-standalone-scripts.md
index 285d2d3dea9ea..b1a3441ffe2ee 100644
--- a/lldb/docs/use/tutorials/implementing-standalone-scripts.md
+++ b/lldb/docs/use/tutorials/implementing-standalone-scripts.md
@@ -147,3 +147,20 @@ SBFunction: id = 0x0000002e, name = main, type = main
 a.out[0x714]: mov    w0, #0x0                  ; =0
 a.out[0x718]: ret
 ```
+
+### Troubleshooting
+
+You can use all the usual Python tools to debug scripts, and on top of that
+you can enable LLDB's log channels. To do this in the script shown above, add
+this line right after `debugger` has been assigned:
+
+```python
+debugger.EnableLog("lldb", ["all"])
+```
+
+`lldb` `all` enables a lot of different channels, so you will probably want
+to enable only a few channels once you know what you are interested in.
+
+This API call is the equivalent of `log enable lldb all` when using LLDB
+interactively. All channels available to `log enable` can be enabled using
+`EnableLog` too.
\ No newline at end of file
diff --git a/lldb/examples/synthetic/gnu_libstdcpp.py b/lldb/examples/synthetic/gnu_libstdcpp.py
index f42a009c21f48..8a41ddff9b679 100644
--- a/lldb/examples/synthetic/gnu_libstdcpp.py
+++ b/lldb/examples/synthetic/gnu_libstdcpp.py
@@ -63,11 +63,8 @@ def __init__(self, valobj, dict):
         self.count = None
 
     def extract_type(self):
-        type = self.valobj.GetType()
-        # The last template argument is the allocator type.
-        template_arg_num = type.GetNumberOfTemplateArguments() - 1
-        allocator_type = type.GetTemplateArgumentType(template_arg_num)
-        data_type = allocator_type.GetTemplateArgumentType(0)
+        head_type = self.head.GetType().GetCanonicalType()
+        data_type = head_type.GetTemplateArgumentType(1)
         return data_type
 
     def update(self):
diff --git a/lldb/include/lldb/API/SBThread.h b/lldb/include/lldb/API/SBThread.h
index e9fe5858d125e..2411dfd376519 100644
--- a/lldb/include/lldb/API/SBThread.h
+++ b/lldb/include/lldb/API/SBThread.h
@@ -81,6 +81,14 @@ class LLDB_API SBThread {
   SBThreadCollection
   GetStopReasonExtendedBacktraces(InstrumentationRuntimeType type);
 
+  /// Gets a human-readable description of why the thread stopped.
+  ///
+  /// \param stream Output stream to receive the stop description text
+  /// \return
+  ///   true if obtained and written to the stream,
+  //    false if there was an error retrieving the description.
+  bool GetStopDescription(lldb::SBStream &stream) const;
+
   size_t GetStopDescription(char *dst_or_null, size_t dst_len);
 
   SBValue GetStopReturnValue();
diff --git a/lldb/include/lldb/Core/SourceManager.h b/lldb/include/lldb/Core/SourceManager.h
index 1244291596b73..83dc74768733d 100644
--- a/lldb/include/lldb/Core/SourceManager.h
+++ b/lldb/include/lldb/Core/SourceManager.h
@@ -109,6 +109,8 @@ class SourceManager {
   private:
     void CommonInitializer(lldb::SupportFileSP support_file_sp,
                            lldb::TargetSP target_sp);
+    void CommonInitializerImpl(lldb::SupportFileSP support_file_sp,
+                               lldb::TargetSP target_sp);
   };
 
   typedef std::shared_ptr<File> FileSP;
diff --git a/lldb/include/lldb/Interpreter/ScriptInterpreter.h b/lldb/include/lldb/Interpreter/ScriptInterpreter.h
index 6c0054a1ec1d1..edb80dc66aca7 100644
--- a/lldb/include/lldb/Interpreter/ScriptInterpreter.h
+++ b/lldb/include/lldb/Interpreter/ScriptInterpreter.h
@@ -352,7 +352,7 @@ class ScriptInterpreter : public PluginInterface {
     return lldb::ValueObjectSP();
   }
 
-  virtual llvm::Expected<int>
+  virtual llvm::Expected<uint32_t>
   GetIndexOfChildWithName(const StructuredData::ObjectSP &implementor,
                           const char *child_name) {
     return llvm::createStringError("Type has no child named '%s'", child_name);
diff --git a/lldb/include/lldb/Symbol/CompilerType.h b/lldb/include/lldb/Symbol/CompilerType.h
index df8489a7fe582..869c5076ee0a7 100644
--- a/lldb/include/lldb/Symbol/CompilerType.h
+++ b/lldb/include/lldb/Symbol/CompilerType.h
@@ -144,7 +144,7 @@ class CompilerType {
 
   bool IsDefined() const;
 
-  bool IsFloatingPointType(uint32_t &count, bool &is_complex) const;
+  bool IsFloatingPointType(bool &is_complex) const;
 
   bool IsFunctionType() const;
 
@@ -400,7 +400,7 @@ class CompilerType {
   /// Return the size of the type in bits.
   llvm::Expected<uint64_t> GetBitSize(ExecutionContextScope *exe_scope) const;
 
-  lldb::Encoding GetEncoding(uint64_t &count) const;
+  lldb::Encoding GetEncoding() const;
 
   lldb::Format GetFormat() const;
 
diff --git a/lldb/include/lldb/Symbol/Type.h b/lldb/include/lldb/Symbol/Type.h
index e657357b942f1..02b43e300a83e 100644
--- a/lldb/include/lldb/Symbol/Type.h
+++ b/lldb/include/lldb/Symbol/Type.h
@@ -507,7 +507,7 @@ class Type : public std::enable_shared_from_this<Type>, public UserID {
 
   lldb::Format GetFormat();
 
-  lldb::Encoding GetEncoding(uint64_t &count);
+  lldb::Encoding GetEncoding();
 
   SymbolContextScope *GetSymbolContextScope() { return m_context; }
   const SymbolContextScope *GetSymbolContextScope() const { return m_context; }
diff --git a/lldb/include/lldb/Symbol/TypeSystem.h b/lldb/include/lldb/Symbol/TypeSystem.h
index 0ec3a28898329..25b208a65349b 100644
--- a/lldb/include/lldb/Symbol/TypeSystem.h
+++ b/lldb/include/lldb/Symbol/TypeSystem.h
@@ -163,7 +163,7 @@ class TypeSystem : public PluginInterface,
   virtual bool IsDefined(lldb::opaque_compiler_type_t type) = 0;
 
   virtual bool IsFloatingPointType(lldb::opaque_compiler_type_t type,
-                                   uint32_t &count, bool &is_complex) = 0;
+                                   bool &is_complex) = 0;
 
   virtual bool IsFunctionType(lldb::opaque_compiler_type_t type) = 0;
 
@@ -317,8 +317,7 @@ class TypeSystem : public PluginInterface,
   GetBitSize(lldb::opaque_compiler_type_t type,
              ExecutionContextScope *exe_scope) = 0;
 
-  virtual lldb::Encoding GetEncoding(lldb::opaque_compiler_type_t type,
-                                     uint64_t &count) = 0;
+  virtual lldb::Encoding GetEncoding(lldb::opaque_compiler_type_t type) = 0;
 
   virtual lldb::Format GetFormat(lldb::opaque_compiler_type_t type) = 0;
 
diff --git a/lldb/include/lldb/Target/Process.h b/lldb/include/lldb/Target/Process.h
index 8f5892e16cedf..c1f9785e76f90 100644
--- a/lldb/include/lldb/Target/Process.h
+++ b/lldb/include/lldb/Target/Process.h
@@ -127,10 +127,7 @@ class ProcessAttachInfo : public ProcessInstanceInfo {
 public:
   ProcessAttachInfo() = default;
 
-  ProcessAttachInfo(const ProcessLaunchInfo &launch_info)
-      : m_resume_count(0), m_wait_for_launch(false), m_ignore_existing(true),
-        m_continue_once_attached(false), m_detach_on_error(true),
-        m_async(false) {
+  ProcessAttachInfo(const ProcessLaunchInfo &launch_info) {
     ProcessInfo::operator=(launch_info);
     SetProcessPluginName(launch_info.GetProcessPluginName());
     SetResumeCount(launch_info.GetResumeCount());
diff --git a/lldb/include/lldb/Target/Target.h b/lldb/include/lldb/Target/Target.h
index c375df248154f..40f9c9bea1c12 100644
--- a/lldb/include/lldb/Target/Target.h
+++ b/lldb/include/lldb/Target/Target.h
@@ -1346,6 +1346,13 @@ class Target : public std::enable_shared_from_this<Target>,
                                const lldb_private::RegisterFlags &flags,
                                uint32_t byte_size);
 
+  /// Sends a breakpoint notification event.
+  void NotifyBreakpointChanged(Breakpoint &bp,
+                               lldb::BreakpointEventType event_kind);
+  /// Sends a breakpoint notification event.
+  void NotifyBreakpointChanged(Breakpoint &bp,
+                               const lldb::EventDataSP &breakpoint_data_sp);
+
   llvm::Expected<lldb::DisassemblerSP>
   ReadInstructions(const Address &start_addr, uint32_t count,
                    const char *flavor_string = nullptr);
diff --git a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules
index e72ffd1f030ec..28cae54776ac8 100644
--- a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules
+++ b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules
@@ -322,6 +322,17 @@ ifeq (,$(filter $(OS), Windows_NT Android Darwin))
 		LDFLAGS += -pthread
 	endif
 endif
+
+# macOS forbids injecting the ASAN runtime into system processes when
+# SIP is enabled. That includes the just-built libLTO that the
+# just-built clang injects into the system linker.  Since we don't
+# test the compiler here, just use the system (non-asanified) LTO
+# library to make ASAN tests work for most users, including the bots.
+ifeq "$(OS)" "Darwin"
+ifneq "$(ASAN_OPTIONS)" ""
+LD_FLAGS += -Wl,-lto_library -Wl,$(shell dirname $(shell xcrun -find clang))/../lib/libLTO.dylib
+endif
+endif
 OBJECTS =
 EXE ?= a.out
 
@@ -386,7 +397,9 @@ ifeq (,$(filter 1, $(USE_LIBSTDCPP) $(USE_LIBCPP) $(USE_SYSTEM_STDLIB)))
     ifneq "$(LIBCPP_INCLUDE_TARGET_DIR)" ""
       CXXFLAGS += -cxx-isystem $(LIBCPP_INCLUDE_TARGET_DIR)
     endif
-    LDFLAGS += -L$(LIBCPP_LIBRARY_DIR) -Wl,-rpath,$(LIBCPP_LIBRARY_DIR) -lc++
+
+	# If `-nostdlib++` is not passed, clang will link to the system's stdlib.
+    LDFLAGS += -nostdlib++ -L$(LIBCPP_LIBRARY_DIR) -Wl,-rpath,$(LIBCPP_LIBRARY_DIR) -lc++
   else
     USE_SYSTEM_STDLIB := 1
   endif
@@ -407,7 +420,8 @@ ifeq (1,$(USE_LIBCPP))
 		ifneq "$(LIBCPP_INCLUDE_TARGET_DIR)" ""
 				CXXFLAGS += -cxx-isystem $(LIBCPP_INCLUDE_TARGET_DIR)
 		endif
-		LDFLAGS += -L$(LIBCPP_LIBRARY_DIR) -Wl,-rpath,$(LIBCPP_LIBRARY_DIR) -lc++
+		# If `-nostdlib++` is not passed, clang will link to the system's stdlib.
+		LDFLAGS += -nostdlib++ -L$(LIBCPP_LIBRARY_DIR) -Wl,-rpath,$(LIBCPP_LIBRARY_DIR) -lc++
 	else
 		ifeq "$(OS)" "Android"
 				# Nothing to do, this is already handled in
diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py
index 29935bb8046ff..c6c4a3e2a4e1e 100644
--- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py
+++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py
@@ -223,6 +223,16 @@ def verify_stop_exception_info(self, expected_description):
                     return True
         return False
 
+    def verify_stop_on_entry(self) -> None:
+        """Waits for the process to be stopped and then verifies at least one
+        thread has the stop reason 'entry'."""
+        self.dap_server.wait_for_stopped()
+        self.assertIn(
+            "entry",
+            (t["reason"] for t in self.dap_server.thread_stop_reasons.values()),
+            "Expected at least one thread to report stop reason 'entry' in {self.dap_server.thread_stop_reasons}",
+        )
+
     def verify_commands(self, flavor: str, output: str, commands: list[str]):
         self.assertTrue(output and len(output) > 0, "expect console output")
         lines = output.splitlines()
diff --git a/lldb/source/API/SBThread.cpp b/lldb/source/API/SBThread.cpp
index 4e4aa48bc9a2e..f58a1b52afa91 100644
--- a/lldb/source/API/SBThread.cpp
+++ b/lldb/source/API/SBThread.cpp
@@ -239,11 +239,34 @@ SBThread::GetStopReasonExtendedBacktraces(InstrumentationRuntimeType type) {
   return threads;
 }
 
-size_t SBThread::GetStopDescription(char *dst, size_t dst_len) {
-  LLDB_INSTRUMENT_VA(this, dst, dst_len);
+bool SBThread::GetStopDescription(lldb::SBStream &stream) const {
+  LLDB_INSTRUMENT_VA(this, stream);
+
+  if (!m_opaque_sp)
+    return false;
+
+  llvm::Expected<StoppedExecutionContext> exe_ctx =
+      GetStoppedExecutionContext(m_opaque_sp);
+  if (!exe_ctx) {
+    LLDB_LOG_ERROR(GetLog(LLDBLog::API), exe_ctx.takeError(), "{0}");
+    return false;
+  }
+
+  if (!exe_ctx->HasThreadScope())
+    return false;
+
+  Stream &strm = stream.ref();
+  const std::string stop_desc = exe_ctx->GetThreadPtr()->GetStopDescription();
+  strm.PutCString(stop_desc);
+
+  return true;
+}
+
+size_t SBThread::GetStopDescription(char *dst_or_null, size_t dst_len) {
+  LLDB_INSTRUMENT_VA(this, dst_or_null, dst_len);
 
-  if (dst)
-    *dst = 0;
+  if (dst_or_null)
+    *dst_or_null = 0;
 
   llvm::Expected<StoppedExecutionContext> exe_ctx =
       GetStoppedExecutionContext(m_opaque_sp);
@@ -259,8 +282,8 @@ size_t SBThread::GetStopDescription(char *dst, size_t dst_len) {
   if (thread_stop_desc.empty())
     return 0;
 
-  if (dst)
-    return ::snprintf(dst, dst_len, "%s", thread_stop_desc.c_str()) + 1;
+  if (dst_or_null)
+    return ::snprintf(dst_or_null, dst_len, "%s", thread_stop_desc.c_str()) + 1;
 
   // NULL dst passed in, return the length needed to contain the
   // description.
diff --git a/lldb/source/Breakpoint/Breakpoint.cpp b/lldb/source/Breakpoint/Breakpoint.cpp
index b23d1143d60c4..201d8d20c4901 100644
--- a/lldb/source/Breakpoint/Breakpoint.cpp
+++ b/lldb/source/Breakpoint/Breakpoint.cpp
@@ -1098,14 +1098,9 @@ bool Breakpoint::EvaluatePrecondition(StoppointCallbackContext &context) {
 }
 
 void Breakpoint::SendBreakpointChangedEvent(
-    lldb::BreakpointEventType eventKind) {
-  if (!IsInternal() && GetTarget().EventTypeHasListeners(
-                           Target::eBroadcastBitBreakpointChanged)) {
-    std::shared_ptr<BreakpointEventData> data =
-        std::make_shared<BreakpointEventData>(eventKind, shared_from_this());
-
-    GetTarget().BroadcastEvent(Target::eBroadcastBitBreakpointChanged, data);
-  }
+    lldb::BreakpointEventType event_kind) {
+  if (!IsInternal())
+    GetTarget().NotifyBreakpointChanged(*this, event_kind);
 }
 
 void Breakpoint::SendBreakpointChangedEvent(
@@ -1113,10 +1108,8 @@ void Breakpoint::SendBreakpointChangedEvent(
   if (!breakpoint_data_sp)
     return;
 
-  if (!IsInternal() &&
-      GetTarget().EventTypeHasListeners(Target::eBroadcastBitBreakpointChanged))
-    GetTarget().BroadcastEvent(Target::eBroadcastBitBreakpointChanged,
-                               breakpoint_data_sp);
+  if (!IsInternal())
+    GetTarget().NotifyBreakpointChanged(*this, breakpoint_data_sp);
 }
 
 const char *Breakpoint::BreakpointEventTypeAsCString(BreakpointEventType type) {
diff --git a/lldb/source/Breakpoint/BreakpointList.cpp b/lldb/source/Breakpoint/BreakpointList.cpp
index 779490ae0316a..e3dd62bfa329d 100644
--- a/lldb/source/Breakpoint/BreakpointList.cpp
+++ b/lldb/source/Breakpoint/BreakpointList.cpp
@@ -16,13 +16,7 @@ using namespace lldb;
 using namespace lldb_private;
 
 static void NotifyChange(const BreakpointSP &bp, BreakpointEventType event) {
-  Target &target = bp->GetTarget();
-  if (target.EventTypeHasListeners(Target::eBroadcastBitBreakpointChanged)) {
-    auto event_data_sp =
-        std::make_shared<Breakpoint::BreakpointEventData>(event, bp);
-    target.BroadcastEvent(Target::eBroadcastBitBreakpointChanged,
-                          event_data_sp);
-  }
+  bp->GetTarget().NotifyBreakpointChanged(*bp, event);
 }
 
 BreakpointList::BreakpointList(bool is_internal)
diff --git a/lldb/source/Breakpoint/BreakpointLocation.cpp b/lldb/source/Breakpoint/BreakpointLocation.cpp
index 22c98acda8c59..f25209c15e007 100644
--- a/lldb/source/Breakpoint/BreakpointLocation.cpp
+++ b/lldb/source/Breakpoint/BreakpointLocation.cpp
@@ -749,13 +749,11 @@ void BreakpointLocation::Dump(Stream *s) const {
 
 void BreakpointLocation::SendBreakpointLocationChangedEvent(
     lldb::BreakpointEventType eventKind) {
-  if (!m_owner.IsInternal() && m_owner.GetTarget().EventTypeHasListeners(
-                                   Target::eBroadcastBitBreakpointChanged)) {
+  if (!m_owner.IsInternal()) {
     auto data_sp = std::make_shared<Breakpoint::BreakpointEventData>(
         eventKind, m_owner.shared_from_this());
     data_sp->GetBreakpointLocationCollection().Add(shared_from_this());
-    m_owner.GetTarget().BroadcastEvent(Target::eBroadcastBitBreakpointChanged,
-                                       data_sp);
+    m_owner.GetTarget().NotifyBreakpointChanged(m_owner, data_sp);
   }
 }
 
diff --git a/lldb/source/Core/SourceManager.cpp b/lldb/source/Core/SourceManager.cpp
index f786866a18137..097173ffe678e 100644
--- a/lldb/source/Core/SourceManager.cpp
+++ b/lldb/source/Core/SourceManager.cpp
@@ -34,6 +34,7 @@
 
 #include "llvm/ADT/Twine.h"
 
+#include <future>
 #include <memory>
 #include <optional>
 #include <utility>
@@ -54,8 +55,7 @@ using namespace lldb_private;
 static inline bool is_newline_char(char ch) { return ch == '\n' || ch == '\r'; }
 
 static void resolve_tilde(FileSpec &file_spec) {
-  if (!FileSystem::Instance().Exists(file_spec) &&
-      file_spec.GetDirectory() &&
+  if (!FileSystem::Instance().Exists(file_spec) && file_spec.GetDirectory() &&
       file_spec.GetDirectory().GetCString()[0] == '~') {
     FileSystem::Instance().Resolve(file_spec);
   }
@@ -477,6 +477,28 @@ SourceManager::File::File(SupportFileSP support_file_sp, TargetSP target_sp)
 
 void SourceManager::File::CommonInitializer(SupportFileSP support_file_sp,
                                             TargetSP target_sp) {
+  // It might take a while to read a source file, for example because it's
+  // coming from a virtual file system that's fetching the data on demand. When
+  // reading the data exceeds a certain threshold, show a progress event to let
+  // the user know what's going on.
+  static constexpr auto g_progress_delay = std::chrono::milliseconds(500);
+
+  std::future<void> future = std::async(std::launch::async, [=]() {
+    CommonInitializerImpl(support_file_sp, target_sp);
+  });
+
+  std::optional<Progress> progress;
+  if (future.wait_for(g_progress_delay) == std::future_status::timeout) {
+    Debugger *debugger = target_sp ? &target_sp->GetDebugger() : nullptr;
+    progress.emplace("Loading source file",
+                     support_file_sp->GetSpecOnly().GetFilename().GetString(),
+                     1, debugger);
+  }
+  future.wait();
+}
+
+void SourceManager::File::CommonInitializerImpl(SupportFileSP support_file_sp,
+                                                TargetSP target_sp) {
   // Set the file and update the modification time.
   SetSupportFile(support_file_sp);
 
diff --git a/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp b/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp
index e8bf04e308447..b5831f013ba62 100644
--- a/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp
+++ b/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp
@@ -149,11 +149,11 @@ ConnectionFileDescriptor::Connect(llvm::StringRef path,
         llvm::StringSwitch<ConnectionStatus (ConnectionFileDescriptor::*)(
             llvm::StringRef, socket_id_callback_type, Status *)>(scheme)
             .Case("listen", &ConnectionFileDescriptor::AcceptTCP)
-            .Cases("accept", "unix-accept",
+            .Cases({"accept", "unix-accept"},
                    &ConnectionFileDescriptor::AcceptNamedSocket)
             .Case("unix-abstract-accept",
                   &ConnectionFileDescriptor::AcceptAbstractSocket)
-            .Cases("connect", "tcp-connect",
+            .Cases({"connect", "tcp-connect"},
                    &ConnectionFileDescriptor::ConnectTCP)
             .Case("udp", &ConnectionFileDescriptor::ConnectUDP)
             .Case("unix-connect", &ConnectionFileDescriptor::ConnectNamedSocket)
diff --git a/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp b/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp
index e40d2c5fc121a..8bfb4327a5f73 100644
--- a/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp
+++ b/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp
@@ -86,9 +86,9 @@ std::string ABIAArch64::GetMCName(std::string reg) {
 uint32_t ABIAArch64::GetGenericNum(llvm::StringRef name) {
   return llvm::StringSwitch<uint32_t>(name)
       .Case("pc", LLDB_REGNUM_GENERIC_PC)
-      .Cases("lr", "x30", LLDB_REGNUM_GENERIC_RA)
-      .Cases("sp", "x31", LLDB_REGNUM_GENERIC_SP)
-      .Cases("fp", "x29", LLDB_REGNUM_GENERIC_FP)
+      .Cases({"lr", "x30"}, LLDB_REGNUM_GENERIC_RA)
+      .Cases({"sp", "x31"}, LLDB_REGNUM_GENERIC_SP)
+      .Cases({"fp", "x29"}, LLDB_REGNUM_GENERIC_FP)
       .Case("cpsr", LLDB_REGNUM_GENERIC_FLAGS)
       .Case("x0", LLDB_REGNUM_GENERIC_ARG1)
       .Case("x1", LLDB_REGNUM_GENERIC_ARG2)
diff --git a/lldb/source/Plugins/ABI/ARC/ABISysV_arc.cpp b/lldb/source/Plugins/ABI/ARC/ABISysV_arc.cpp
index f9c249d7fec1c..e41a28bd21c36 100644
--- a/lldb/source/Plugins/ABI/ARC/ABISysV_arc.cpp
+++ b/lldb/source/Plugins/ABI/ARC/ABISysV_arc.cpp
@@ -480,11 +480,10 @@ ABISysV_arc::GetReturnValueObjectSimple(Thread &thread,
   }
   // Floating point return type.
   else if (type_flags & eTypeIsFloat) {
-    uint32_t float_count = 0;
     bool is_complex = false;
 
-    if (compiler_type.IsFloatingPointType(float_count, is_complex) &&
-        1 == float_count && !is_complex) {
+    if (compiler_type.IsFloatingPointType(is_complex) &&
+        !compiler_type.IsVectorType() && !is_complex) {
       const size_t byte_size =
           llvm::expectedToOptional(compiler_type.GetByteSize(&thread))
               .value_or(0);
diff --git a/lldb/source/Plugins/ABI/ARM/ABIMacOSX_arm.cpp b/lldb/source/Plugins/ABI/ARM/ABIMacOSX_arm.cpp
index 5b5f6facc924c..8e690218843fa 100644
--- a/lldb/source/Plugins/ABI/ARM/ABIMacOSX_arm.cpp
+++ b/lldb/source/Plugins/ABI/ARM/ABIMacOSX_arm.cpp
@@ -1695,7 +1695,6 @@ Status ABIMacOSX_arm::SetReturnValueObject(lldb::StackFrameSP &frame_sp,
   Thread *thread = frame_sp->GetThread().get();
 
   bool is_signed;
-  uint32_t count;
   bool is_complex;
 
   RegisterContext *reg_ctx = thread->GetRegisterContext().get();
@@ -1767,7 +1766,7 @@ Status ABIMacOSX_arm::SetReturnValueObject(lldb::StackFrameSP &frame_sp,
           "We don't support returning longer than 64 bit "
           "integer values at present.");
     }
-  } else if (compiler_type.IsFloatingPointType(count, is_complex)) {
+  } else if (compiler_type.IsFloatingPointType(is_complex)) {
     if (is_complex)
       error = Status::FromErrorString(
           "We don't support returning complex values at present");
diff --git a/lldb/source/Plugins/ABI/ARM/ABISysV_arm.cpp b/lldb/source/Plugins/ABI/ARM/ABISysV_arm.cpp
index bb0c4ba3f1b57..7258f5cc9acb5 100644
--- a/lldb/source/Plugins/ABI/ARM/ABISysV_arm.cpp
+++ b/lldb/source/Plugins/ABI/ARM/ABISysV_arm.cpp
@@ -1550,7 +1550,6 @@ ValueObjectSP ABISysV_arm::GetReturnValueObjectImpl(
 
   bool is_signed;
   bool is_complex;
-  uint32_t float_count;
   bool is_vfp_candidate = false;
   uint8_t vfp_count = 0;
   uint8_t vfp_byte_size = 0;
@@ -1634,8 +1633,9 @@ ValueObjectSP ABISysV_arm::GetReturnValueObjectImpl(
       if (!GetReturnValuePassedInMemory(thread, reg_ctx, *byte_size, value))
         return return_valobj_sp;
     }
-  } else if (compiler_type.IsFloatingPointType(float_count, is_complex)) {
-    if (float_count == 1 && !is_complex) {
+  } else if (compiler_type.IsFloatingPointType(is_complex)) {
+    // Vector types are handled above.
+    if (!is_complex) {
       switch (*bit_width) {
       default:
         return return_valobj_sp;
@@ -1681,7 +1681,7 @@ ValueObjectSP ABISysV_arm::GetReturnValueObjectImpl(
         break;
       }
       }
-    } else if (is_complex && float_count == 2) {
+    } else if (is_complex) {
       if (IsArmHardFloat(thread)) {
         is_vfp_candidate = true;
         vfp_byte_size = *byte_size / 2;
@@ -1709,8 +1709,9 @@ ValueObjectSP ABISysV_arm::GetReturnValueObjectImpl(
             vfp_count = (*base_byte_size == 8 ? homogeneous_count
                                               : homogeneous_count * 2);
           }
-        } else if (base_type.IsFloatingPointType(float_count, is_complex)) {
-          if (float_count == 1 && !is_complex) {
+        } else if (base_type.IsFloatingPointType(is_complex)) {
+          // Vector types are handled above.
+          if (!is_complex) {
             is_vfp_candidate = true;
             if (base_byte_size)
               vfp_byte_size = *base_byte_size;
@@ -1727,10 +1728,10 @@ ValueObjectSP ABISysV_arm::GetReturnValueObjectImpl(
             base_type = compiler_type.GetFieldAtIndex(index, name, nullptr,
                                                       nullptr, nullptr);
 
-            if (base_type.IsFloatingPointType(float_count, is_complex)) {
+            if (base_type.IsFloatingPointType(is_complex)) {
               std::optional<uint64_t> base_byte_size =
                   llvm::expectedToOptional(base_type.GetByteSize(&thread));
-              if (float_count == 2 && is_complex) {
+              if (is_complex) {
                 if (index != 0 && base_byte_size &&
                     vfp_byte_size != *base_byte_size)
                   break;
@@ -1841,7 +1842,6 @@ Status ABISysV_arm::SetReturnValueObject(lldb::StackFrameSP &frame_sp,
   Thread *thread = frame_sp->GetThread().get();
 
   bool is_signed;
-  uint32_t count;
   bool is_complex;
 
   RegisterContext *reg_ctx = thread->GetRegisterContext().get();
@@ -1884,7 +1884,7 @@ Status ABISysV_arm::SetReturnValueObject(lldb::StackFrameSP &frame_sp,
           "We don't support returning longer than 64 bit "
           "integer values at present.");
     }
-  } else if (compiler_type.IsFloatingPointType(count, is_complex)) {
+  } else if (compiler_type.IsFloatingPointType(is_complex)) {
     if (is_complex)
       error = Status::FromErrorString(
           "We don't support returning complex values at present");
diff --git a/lldb/source/Plugins/ABI/LoongArch/ABISysV_loongarch.cpp b/lldb/source/Plugins/ABI/LoongArch/ABISysV_loongarch.cpp
index 7bf99ce7bddee..91b965d3b5715 100644
--- a/lldb/source/Plugins/ABI/LoongArch/ABISysV_loongarch.cpp
+++ b/lldb/source/Plugins/ABI/LoongArch/ABISysV_loongarch.cpp
@@ -510,11 +510,10 @@ ValueObjectSP ABISysV_loongarch::GetReturnValueObjectSimple(
                                           value, ConstString(""));
   }
   if (type_flags & eTypeIsFloat) {
-    uint32_t float_count = 0;
     bool is_complex = false;
 
-    if (compiler_type.IsFloatingPointType(float_count, is_complex) &&
-        float_count == 1 && !is_complex) {
+    if (compiler_type.IsFloatingPointType(is_complex) &&
+        !(type_flags & eTypeIsVector) && !is_complex) {
       return_valobj_sp =
           GetValObjFromFPRegs(thread, reg_ctx, machine, type_flags, byte_size);
       return return_valobj_sp;
@@ -623,17 +622,17 @@ void ABISysV_loongarch::Terminate() {
 static uint32_t GetGenericNum(llvm::StringRef name) {
   return llvm::StringSwitch<uint32_t>(name)
       .Case("pc", LLDB_REGNUM_GENERIC_PC)
-      .Cases("ra", "r1", LLDB_REGNUM_GENERIC_RA)
-      .Cases("sp", "r3", LLDB_REGNUM_GENERIC_SP)
-      .Cases("fp", "r22", LLDB_REGNUM_GENERIC_FP)
-      .Cases("a0", "r4", LLDB_REGNUM_GENERIC_ARG1)
-      .Cases("a1", "r5", LLDB_REGNUM_GENERIC_ARG2)
-      .Cases("a2", "r6", LLDB_REGNUM_GENERIC_ARG3)
-      .Cases("a3", "r7", LLDB_REGNUM_GENERIC_ARG4)
-      .Cases("a4", "r8", LLDB_REGNUM_GENERIC_ARG5)
-      .Cases("a5", "r9", LLDB_REGNUM_GENERIC_ARG6)
-      .Cases("a6", "r10", LLDB_REGNUM_GENERIC_ARG7)
-      .Cases("a7", "r11", LLDB_REGNUM_GENERIC_ARG8)
+      .Cases({"ra", "r1"}, LLDB_REGNUM_GENERIC_RA)
+      .Cases({"sp", "r3"}, LLDB_REGNUM_GENERIC_SP)
+      .Cases({"fp", "r22"}, LLDB_REGNUM_GENERIC_FP)
+      .Cases({"a0", "r4"}, LLDB_REGNUM_GENERIC_ARG1)
+      .Cases({"a1", "r5"}, LLDB_REGNUM_GENERIC_ARG2)
+      .Cases({"a2", "r6"}, LLDB_REGNUM_GENERIC_ARG3)
+      .Cases({"a3", "r7"}, LLDB_REGNUM_GENERIC_ARG4)
+      .Cases({"a4", "r8"}, LLDB_REGNUM_GENERIC_ARG5)
+      .Cases({"a5", "r9"}, LLDB_REGNUM_GENERIC_ARG6)
+      .Cases({"a6", "r10"}, LLDB_REGNUM_GENERIC_ARG7)
+      .Cases({"a7", "r11"}, LLDB_REGNUM_GENERIC_ARG8)
       .Default(LLDB_INVALID_REGNUM);
 }
 
diff --git a/lldb/source/Plugins/ABI/Mips/ABISysV_mips.cpp b/lldb/source/Plugins/ABI/Mips/ABISysV_mips.cpp
index dd91a05534e37..e03604467ceec 100644
--- a/lldb/source/Plugins/ABI/Mips/ABISysV_mips.cpp
+++ b/lldb/source/Plugins/ABI/Mips/ABISysV_mips.cpp
@@ -708,7 +708,6 @@ Status ABISysV_mips::SetReturnValueObject(lldb::StackFrameSP &frame_sp,
   Thread *thread = frame_sp->GetThread().get();
 
   bool is_signed;
-  uint32_t count;
   bool is_complex;
 
   RegisterContext *reg_ctx = thread->GetRegisterContext().get();
@@ -750,7 +749,7 @@ Status ABISysV_mips::SetReturnValueObject(lldb::StackFrameSP &frame_sp,
           "We don't support returning longer than 64 bit "
           "integer values at present.");
     }
-  } else if (compiler_type.IsFloatingPointType(count, is_complex)) {
+  } else if (compiler_type.IsFloatingPointType(is_complex)) {
     if (is_complex)
       error = Status::FromErrorString(
           "We don't support returning complex values at present");
@@ -797,7 +796,6 @@ ValueObjectSP ABISysV_mips::GetReturnValueObjectImpl(
 
   bool is_signed = false;
   bool is_complex = false;
-  uint32_t count = 0;
 
   // In MIPS register "r2" (v0) holds the integer function return values
   const RegisterInfo *r2_reg_info = reg_ctx->GetRegisterInfoByName("r2", 0);
@@ -860,10 +858,10 @@ ValueObjectSP ABISysV_mips::GetReturnValueObjectImpl(
     return_valobj_sp = ValueObjectMemory::Create(
         &thread, "", Address(mem_address, nullptr), return_compiler_type);
     return return_valobj_sp;
-  } else if (return_compiler_type.IsFloatingPointType(count, is_complex)) {
+  } else if (return_compiler_type.IsFloatingPointType(is_complex)) {
     if (IsSoftFloat(fp_flag)) {
       uint64_t raw_value = reg_ctx->ReadRegisterAsUnsigned(r2_reg_info, 0);
-      if (count != 1 && is_complex)
+      if (is_complex)
         return return_valobj_sp;
       switch (*bit_width) {
       default:
@@ -896,7 +894,7 @@ ValueObjectSP ABISysV_mips::GetReturnValueObjectImpl(
       f0_value.GetData(f0_data);
       lldb::offset_t offset = 0;
 
-      if (count == 1 && !is_complex) {
+      if (!return_compiler_type.IsVectorType() && !is_complex) {
         switch (*bit_width) {
         default:
           return return_valobj_sp;
diff --git a/lldb/source/Plugins/ABI/Mips/ABISysV_mips64.cpp b/lldb/source/Plugins/ABI/Mips/ABISysV_mips64.cpp
index baefbfc363d99..0dd9db0948220 100644
--- a/lldb/source/Plugins/ABI/Mips/ABISysV_mips64.cpp
+++ b/lldb/source/Plugins/ABI/Mips/ABISysV_mips64.cpp
@@ -923,7 +923,6 @@ ValueObjectSP ABISysV_mips64::GetReturnValueObjectImpl(
       bool sucess = false;
       std::string name;
       bool is_complex;
-      uint32_t count;
       const uint32_t num_children = return_compiler_type.GetNumFields();
 
       // A structure consisting of one or two FP values (and nothing else) will
@@ -937,7 +936,7 @@ ValueObjectSP ABISysV_mips64::GetReturnValueObjectImpl(
               return_compiler_type.GetFieldAtIndex(idx, name, &field_bit_offset,
                                                    nullptr, nullptr);
 
-          if (field_compiler_type.IsFloatingPointType(count, is_complex))
+          if (field_compiler_type.IsFloatingPointType(is_complex))
             use_fp_regs = true;
           else
             found_non_fp_field = true;
@@ -1044,7 +1043,7 @@ ValueObjectSP ABISysV_mips64::GetReturnValueObjectImpl(
 
         if (field_compiler_type.IsIntegerOrEnumerationType(is_signed) ||
             field_compiler_type.IsPointerType() ||
-            field_compiler_type.IsFloatingPointType(count, is_complex)) {
+            field_compiler_type.IsFloatingPointType(is_complex)) {
           padding = field_byte_offset - integer_bytes;
 
           if (integer_bytes < 8) {
diff --git a/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc.cpp b/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc.cpp
index e4bdc44c59c10..0d25faef1c659 100644
--- a/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc.cpp
+++ b/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc.cpp
@@ -426,7 +426,6 @@ Status ABISysV_ppc::SetReturnValueObject(lldb::StackFrameSP &frame_sp,
   Thread *thread = frame_sp->GetThread().get();
 
   bool is_signed;
-  uint32_t count;
   bool is_complex;
 
   RegisterContext *reg_ctx = thread->GetRegisterContext().get();
@@ -454,7 +453,7 @@ Status ABISysV_ppc::SetReturnValueObject(lldb::StackFrameSP &frame_sp,
           "We don't support returning longer than 64 bit "
           "integer values at present.");
     }
-  } else if (compiler_type.IsFloatingPointType(count, is_complex)) {
+  } else if (compiler_type.IsFloatingPointType(is_complex)) {
     if (is_complex)
       error = Status::FromErrorString(
           "We don't support returning complex values at present");
@@ -695,7 +694,6 @@ ValueObjectSP ABISysV_ppc::GetReturnValueObjectImpl(
         uint64_t field_bit_offset = 0;
         bool is_signed;
         bool is_complex;
-        uint32_t count;
 
         CompilerType field_compiler_type = return_compiler_type.GetFieldAtIndex(
             idx, name, &field_bit_offset, nullptr, nullptr);
@@ -741,7 +739,7 @@ ValueObjectSP ABISysV_ppc::GetReturnValueObjectImpl(
             // return a nullptr return value object.
             return return_valobj_sp;
           }
-        } else if (field_compiler_type.IsFloatingPointType(count, is_complex)) {
+        } else if (field_compiler_type.IsFloatingPointType(is_complex)) {
           // Structs with long doubles are always passed in memory.
           if (*field_bit_width == 128) {
             is_memory = true;
diff --git a/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp b/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp
index f5327a1f403c0..63357618774d4 100644
--- a/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp
+++ b/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp
@@ -309,7 +309,6 @@ Status ABISysV_ppc64::SetReturnValueObject(lldb::StackFrameSP &frame_sp,
   Thread *thread = frame_sp->GetThread().get();
 
   bool is_signed;
-  uint32_t count;
   bool is_complex;
 
   RegisterContext *reg_ctx = thread->GetRegisterContext().get();
@@ -339,7 +338,7 @@ Status ABISysV_ppc64::SetReturnValueObject(lldb::StackFrameSP &frame_sp,
           "We don't support returning longer than 64 bit "
           "integer values at present.");
     }
-  } else if (compiler_type.IsFloatingPointType(count, is_complex)) {
+  } else if (compiler_type.IsFloatingPointType(is_complex)) {
     if (is_complex)
       error = Status::FromErrorString(
           "We don't support returning complex values at present");
diff --git a/lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp b/lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp
index 822c93dbbec3d..ff37b48d86ca8 100644
--- a/lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp
+++ b/lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp
@@ -643,11 +643,10 @@ ABISysV_riscv::GetReturnValueObjectSimple(Thread &thread,
   }
   // Floating point return type.
   else if (type_flags & eTypeIsFloat) {
-    uint32_t float_count = 0;
     bool is_complex = false;
 
-    if (compiler_type.IsFloatingPointType(float_count, is_complex) &&
-        float_count == 1 && !is_complex) {
+    if (compiler_type.IsFloatingPointType(is_complex) &&
+        !(type_flags & eTypeIsVector) && !is_complex) {
       const uint32_t arch_fp_flags =
           arch.GetFlags() & ArchSpec::eRISCV_float_abi_mask;
       return_valobj_sp = GetValObjFromFPRegs(
@@ -816,9 +815,9 @@ void ABISysV_riscv::Terminate() {
 static uint32_t GetGenericNum(llvm::StringRef name) {
   return llvm::StringSwitch<uint32_t>(name)
       .Case("pc", LLDB_REGNUM_GENERIC_PC)
-      .Cases("ra", "x1", LLDB_REGNUM_GENERIC_RA)
-      .Cases("sp", "x2", LLDB_REGNUM_GENERIC_SP)
-      .Cases("fp", "s0", LLDB_REGNUM_GENERIC_FP)
+      .Cases({"ra", "x1"}, LLDB_REGNUM_GENERIC_RA)
+      .Cases({"sp", "x2"}, LLDB_REGNUM_GENERIC_SP)
+      .Cases({"fp", "s0"}, LLDB_REGNUM_GENERIC_FP)
       .Case("a0", LLDB_REGNUM_GENERIC_ARG1)
       .Case("a1", LLDB_REGNUM_GENERIC_ARG2)
       .Case("a2", LLDB_REGNUM_GENERIC_ARG3)
diff --git a/lldb/source/Plugins/ABI/SystemZ/ABISysV_s390x.cpp b/lldb/source/Plugins/ABI/SystemZ/ABISysV_s390x.cpp
index 5e52b6e4db499..301c3b309ffd5 100644
--- a/lldb/source/Plugins/ABI/SystemZ/ABISysV_s390x.cpp
+++ b/lldb/source/Plugins/ABI/SystemZ/ABISysV_s390x.cpp
@@ -393,7 +393,6 @@ Status ABISysV_s390x::SetReturnValueObject(lldb::StackFrameSP &frame_sp,
   Thread *thread = frame_sp->GetThread().get();
 
   bool is_signed;
-  uint32_t count;
   bool is_complex;
 
   RegisterContext *reg_ctx = thread->GetRegisterContext().get();
@@ -423,7 +422,7 @@ Status ABISysV_s390x::SetReturnValueObject(lldb::StackFrameSP &frame_sp,
           "We don't support returning longer than 64 bit "
           "integer values at present.");
     }
-  } else if (compiler_type.IsFloatingPointType(count, is_complex)) {
+  } else if (compiler_type.IsFloatingPointType(is_complex)) {
     if (is_complex)
       error = Status::FromErrorString(
           "We don't support returning complex values at present");
diff --git a/lldb/source/Plugins/ABI/X86/ABIMacOSX_i386.cpp b/lldb/source/Plugins/ABI/X86/ABIMacOSX_i386.cpp
index eaeed6c04590c..ee79abe55ead0 100644
--- a/lldb/source/Plugins/ABI/X86/ABIMacOSX_i386.cpp
+++ b/lldb/source/Plugins/ABI/X86/ABIMacOSX_i386.cpp
@@ -198,7 +198,6 @@ Status ABIMacOSX_i386::SetReturnValueObject(lldb::StackFrameSP &frame_sp,
   Thread *thread = frame_sp->GetThread().get();
 
   bool is_signed;
-  uint32_t count;
   bool is_complex;
 
   RegisterContext *reg_ctx = thread->GetRegisterContext().get();
@@ -240,7 +239,7 @@ Status ABIMacOSX_i386::SetReturnValueObject(lldb::StackFrameSP &frame_sp,
           "We don't support returning longer than 64 bit "
           "integer values at present.");
     }
-  } else if (compiler_type.IsFloatingPointType(count, is_complex)) {
+  } else if (compiler_type.IsFloatingPointType(is_complex)) {
     if (is_complex)
       error = Status::FromErrorString(
           "We don't support returning complex values at present");
diff --git a/lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp b/lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp
index effb3de8215d6..29fd9f0eceb93 100644
--- a/lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp
+++ b/lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp
@@ -307,7 +307,6 @@ Status ABISysV_x86_64::SetReturnValueObject(lldb::StackFrameSP &frame_sp,
   Thread *thread = frame_sp->GetThread().get();
 
   bool is_signed;
-  uint32_t count;
   bool is_complex;
 
   RegisterContext *reg_ctx = thread->GetRegisterContext().get();
@@ -337,7 +336,7 @@ Status ABISysV_x86_64::SetReturnValueObject(lldb::StackFrameSP &frame_sp,
           "We don't support returning longer than 64 bit "
           "integer values at present.");
     }
-  } else if (compiler_type.IsFloatingPointType(count, is_complex)) {
+  } else if (compiler_type.IsFloatingPointType(is_complex)) {
     if (is_complex)
       error = Status::FromErrorString(
           "We don't support returning complex values at present");
@@ -587,7 +586,6 @@ static bool FlattenAggregateType(
   for (uint32_t idx = 0; idx < num_children; ++idx) {
     std::string name;
     bool is_signed;
-    uint32_t count;
     bool is_complex;
 
     uint64_t field_bit_offset = 0;
@@ -606,7 +604,7 @@ static bool FlattenAggregateType(
     const uint32_t field_type_flags = field_compiler_type.GetTypeInfo();
     if (field_compiler_type.IsIntegerOrEnumerationType(is_signed) ||
         field_compiler_type.IsPointerType() ||
-        field_compiler_type.IsFloatingPointType(count, is_complex)) {
+        field_compiler_type.IsFloatingPointType(is_complex)) {
       aggregate_field_offsets.push_back(field_byte_offset);
       aggregate_compiler_types.push_back(field_compiler_type);
     } else if (field_type_flags & eTypeHasChildren) {
@@ -696,7 +694,6 @@ ValueObjectSP ABISysV_x86_64::GetReturnValueObjectImpl(
       is_memory = false;
       for (uint32_t idx = 0; idx < num_children; idx++) {
         bool is_signed;
-        uint32_t count;
         bool is_complex;
 
         CompilerType field_compiler_type = aggregate_compiler_types[idx];
@@ -736,7 +733,7 @@ ValueObjectSP ABISysV_x86_64::GetReturnValueObjectImpl(
             // return a nullptr return value object.
             return return_valobj_sp;
           }
-        } else if (field_compiler_type.IsFloatingPointType(count, is_complex)) {
+        } else if (field_compiler_type.IsFloatingPointType(is_complex)) {
           // Structs with long doubles are always passed in memory.
           if (field_bit_width == 128) {
             is_memory = true;
diff --git a/lldb/source/Plugins/ABI/X86/ABIWindows_x86_64.cpp b/lldb/source/Plugins/ABI/X86/ABIWindows_x86_64.cpp
index 339012cffb688..6520af2f643ee 100644
--- a/lldb/source/Plugins/ABI/X86/ABIWindows_x86_64.cpp
+++ b/lldb/source/Plugins/ABI/X86/ABIWindows_x86_64.cpp
@@ -312,7 +312,6 @@ Status ABIWindows_x86_64::SetReturnValueObject(lldb::StackFrameSP &frame_sp,
   Thread *thread = frame_sp->GetThread().get();
 
   bool is_signed;
-  uint32_t count;
   bool is_complex;
 
   RegisterContext *reg_ctx = thread->GetRegisterContext().get();
@@ -342,7 +341,7 @@ Status ABIWindows_x86_64::SetReturnValueObject(lldb::StackFrameSP &frame_sp,
           "We don't support returning longer than 64 bit "
           "integer values at present.");
     }
-  } else if (compiler_type.IsFloatingPointType(count, is_complex)) {
+  } else if (compiler_type.IsFloatingPointType(is_complex)) {
     if (is_complex)
       error = Status::FromErrorString(
           "We don't support returning complex values at present");
@@ -558,7 +557,6 @@ static bool FlattenAggregateType(
   for (uint32_t idx = 0; idx < num_children; ++idx) {
     std::string name;
     bool is_signed;
-    uint32_t count;
     bool is_complex;
 
     uint64_t field_bit_offset = 0;
@@ -582,7 +580,7 @@ static bool FlattenAggregateType(
     const uint32_t field_type_flags = field_compiler_type.GetTypeInfo();
     if (field_compiler_type.IsIntegerOrEnumerationType(is_signed) ||
         field_compiler_type.IsPointerType() ||
-        field_compiler_type.IsFloatingPointType(count, is_complex)) {
+        field_compiler_type.IsFloatingPointType(is_complex)) {
       aggregate_field_offsets.push_back(field_byte_offset);
       aggregate_compiler_types.push_back(field_compiler_type);
     } else if (field_type_flags & eTypeHasChildren) {
@@ -672,7 +670,6 @@ ValueObjectSP ABIWindows_x86_64::GetReturnValueObjectImpl(
     for (uint32_t idx = 0; idx < num_children; idx++) {
       bool is_signed;
       bool is_complex;
-      uint32_t count;
 
       CompilerType field_compiler_type = aggregate_compiler_types[idx];
       uint32_t field_byte_width =
@@ -691,7 +688,7 @@ ValueObjectSP ABIWindows_x86_64::GetReturnValueObjectImpl(
       uint32_t copy_from_offset = 0;
       if (field_compiler_type.IsIntegerOrEnumerationType(is_signed) ||
           field_compiler_type.IsPointerType() ||
-          field_compiler_type.IsFloatingPointType(count, is_complex)) {
+          field_compiler_type.IsFloatingPointType(is_complex)) {
         copy_from_extractor = &rax_data;
         copy_from_offset = used_bytes;
         used_bytes += field_byte_width;
diff --git a/lldb/source/Plugins/Language/ObjCPlusPlus/ObjCPlusPlusLanguage.cpp b/lldb/source/Plugins/Language/ObjCPlusPlus/ObjCPlusPlusLanguage.cpp
index 0489f4d6ada32..faa0dd0d87321 100644
--- a/lldb/source/Plugins/Language/ObjCPlusPlus/ObjCPlusPlusLanguage.cpp
+++ b/lldb/source/Plugins/Language/ObjCPlusPlus/ObjCPlusPlusLanguage.cpp
@@ -47,7 +47,7 @@ Language *ObjCPlusPlusLanguage::CreateInstance(lldb::LanguageType language) {
 std::optional<bool>
 ObjCPlusPlusLanguage::GetBooleanFromString(llvm::StringRef str) const {
   return llvm::StringSwitch<std::optional<bool>>(str)
-      .Cases("true", "YES", {true})
-      .Cases("false", "NO", {false})
+      .Cases({"true", "YES"}, {true})
+      .Cases({"false", "NO"}, {false})
       .Default({});
 }
diff --git a/lldb/source/Plugins/ObjectFile/Breakpad/BreakpadRecords.cpp b/lldb/source/Plugins/ObjectFile/Breakpad/BreakpadRecords.cpp
index d40f87b1a7b42..945b70fcb96ec 100644
--- a/lldb/source/Plugins/ObjectFile/Breakpad/BreakpadRecords.cpp
+++ b/lldb/source/Plugins/ObjectFile/Breakpad/BreakpadRecords.cpp
@@ -70,7 +70,7 @@ llvm::Triple::ArchType stringTo<llvm::Triple::ArchType>(llvm::StringRef Str) {
   using llvm::Triple;
   return llvm::StringSwitch<Triple::ArchType>(Str)
       .Case("arm", Triple::arm)
-      .Cases("arm64", "arm64e", Triple::aarch64)
+      .Cases({"arm64", "arm64e"}, Triple::aarch64)
       .Case("mips", Triple::mips)
       .Case("msp430", Triple::msp430)
       .Case("ppc", Triple::ppc)
@@ -79,7 +79,7 @@ llvm::Triple::ArchType stringTo<llvm::Triple::ArchType>(llvm::StringRef Str) {
       .Case("sparc", Triple::sparc)
       .Case("sparcv9", Triple::sparcv9)
       .Case("x86", Triple::x86)
-      .Cases("x86_64", "x86_64h", Triple::x86_64)
+      .Cases({"x86_64", "x86_64h"}, Triple::x86_64)
       .Default(Triple::UnknownArch);
 }
 
diff --git a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp
index 097c91b623e8f..49841e7307443 100644
--- a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp
+++ b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp
@@ -1678,7 +1678,7 @@ static SectionType GetSectionTypeFromName(llvm::StringRef Name) {
       .Case(".ARM.exidx", eSectionTypeARMexidx)
       .Case(".ARM.extab", eSectionTypeARMextab)
       .Case(".ctf", eSectionTypeDebug)
-      .Cases(".data", ".tdata", eSectionTypeData)
+      .Cases({".data", ".tdata"}, eSectionTypeData)
       .Case(".eh_frame", eSectionTypeEHFrame)
       .Case(".gnu_debugaltlink", eSectionTypeDWARFGNUDebugAltLink)
       .Case(".gosymtab", eSectionTypeGoSymtab)
diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
index 9cdb8467bfc60..c8e520d687f67 100644
--- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
+++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
@@ -1674,6 +1674,10 @@ void ObjectFileMachO::ProcessSegmentCommand(
   uint32_t segment_sect_idx;
   const lldb::user_id_t first_segment_sectID = context.NextSectionIdx + 1;
 
+  // 64 bit mach-o files have sections with 32 bit file offsets. If any section
+  // data end will exceed UINT32_MAX, then we need to do some bookkeeping to
+  // ensure we can access this data correctly.
+  uint64_t section_offset_adjust = 0;
   const uint32_t num_u32s = load_cmd.cmd == LC_SEGMENT ? 7 : 8;
   for (segment_sect_idx = 0; segment_sect_idx < load_cmd.nsects;
        ++segment_sect_idx) {
@@ -1697,6 +1701,16 @@ void ObjectFileMachO::ProcessSegmentCommand(
     // isn't stored in the abstracted Sections.
     m_mach_sections.push_back(sect64);
 
+    // Make sure we can load sections in mach-o files where some sections cross
+    // a 4GB boundary. llvm::MachO::section_64 have only 32 bit file offsets
+    // for the file offset of the section contents, so we need to track and
+    // sections that overflow and adjust the offsets accordingly.
+    const uint64_t section_file_offset =
+        (uint64_t)sect64.offset + section_offset_adjust;
+    const uint64_t end_section_offset = (uint64_t)sect64.offset + sect64.size;
+    if (end_section_offset >= UINT32_MAX)
+      section_offset_adjust += end_section_offset & 0xFFFFFFFF00000000ull;
+
     if (add_section) {
       ConstString section_name(
           sect64.sectname, strnlen(sect64.sectname, sizeof(sect64.sectname)));
@@ -1736,13 +1750,13 @@ void ObjectFileMachO::ProcessSegmentCommand(
           }
 
           // Grow the section size as needed.
-          if (sect64.offset) {
+          if (section_file_offset) {
             const lldb::addr_t segment_min_file_offset =
                 segment->GetFileOffset();
             const lldb::addr_t segment_max_file_offset =
                 segment_min_file_offset + segment->GetFileSize();
 
-            const lldb::addr_t section_min_file_offset = sect64.offset;
+            const lldb::addr_t section_min_file_offset = section_file_offset;
             const lldb::addr_t section_max_file_offset =
                 section_min_file_offset + sect64.size;
             const lldb::addr_t new_file_offset =
@@ -1769,10 +1783,10 @@ void ObjectFileMachO::ProcessSegmentCommand(
               // other sections.
               sect64.addr, // File VM address == addresses as they are
               // found in the object file
-              sect64.size,   // VM size in bytes of this section
-              sect64.offset, // Offset to the data for this section in
+              sect64.size,         // VM size in bytes of this section
+              section_file_offset, // Offset to the data for this section in
               // the file
-              sect64.offset ? sect64.size : 0, // Size in bytes of
+              section_file_offset ? sect64.size : 0, // Size in bytes of
               // this section as
               // found in the file
               sect64.align,
@@ -1792,14 +1806,14 @@ void ObjectFileMachO::ProcessSegmentCommand(
       SectionSP section_sp(new Section(
           segment_sp, module_sp, this, ++context.NextSectionIdx, section_name,
           sect_type, sect64.addr - segment_sp->GetFileAddress(), sect64.size,
-          sect64.offset, sect64.offset == 0 ? 0 : sect64.size, sect64.align,
-          sect64.flags));
+          section_file_offset, section_file_offset == 0 ? 0 : sect64.size,
+          sect64.align, sect64.flags));
       // Set the section to be encrypted to match the segment
 
       bool section_is_encrypted = false;
       if (!segment_is_encrypted && load_cmd.filesize != 0)
         section_is_encrypted = context.EncryptedRanges.FindEntryThatContains(
-                                   sect64.offset) != nullptr;
+                                   section_file_offset) != nullptr;
 
       section_sp->SetIsEncrypted(segment_is_encrypted || section_is_encrypted);
       section_sp->SetPermissions(segment_permissions);
diff --git a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp
index 4984445dcbab9..244489ae06d65 100644
--- a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp
+++ b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp
@@ -985,7 +985,7 @@ SectionType ObjectFilePECOFF::GetSectionType(llvm::StringRef sect_name,
           .Case(".stabstr", eSectionTypeDataCString)
           .Case(".reloc", eSectionTypeOther)
           // .eh_frame can be truncated to 8 chars.
-          .Cases(".eh_frame", ".eh_fram", eSectionTypeEHFrame)
+          .Cases({".eh_frame", ".eh_fram"}, eSectionTypeEHFrame)
           .Case(".gosymtab", eSectionTypeGoSymtab)
           .Case(".lldbsummaries", lldb::eSectionTypeLLDBTypeSummaries)
           .Case(".lldbformatters", lldb::eSectionTypeLLDBFormatters)
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h
index 7b39d29ba2b20..27f5d2ee471c0 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h
@@ -158,8 +158,9 @@ class SWIGBridge {
   static PyObject *LLDBSwigPython_GetChildAtIndex(PyObject *implementor,
                                                   uint32_t idx);
 
-  static int LLDBSwigPython_GetIndexOfChildWithName(PyObject *implementor,
-                                                    const char *child_name);
+  static uint32_t
+  LLDBSwigPython_GetIndexOfChildWithName(PyObject *implementor,
+                                         const char *child_name);
 
   static lldb::ValueObjectSP
   LLDBSWIGPython_GetValueObjectSPFromSBValue(void *data);
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp
index 73c5c72932ff1..d257a08a2c62c 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp
@@ -1939,7 +1939,7 @@ lldb::ValueObjectSP ScriptInterpreterPythonImpl::GetChildAtIndex(
   return ret_val;
 }
 
-llvm::Expected<int> ScriptInterpreterPythonImpl::GetIndexOfChildWithName(
+llvm::Expected<uint32_t> ScriptInterpreterPythonImpl::GetIndexOfChildWithName(
     const StructuredData::ObjectSP &implementor_sp, const char *child_name) {
   if (!implementor_sp)
     return llvm::createStringError("Type has no child named '%s'", child_name);
@@ -1951,7 +1951,7 @@ llvm::Expected<int> ScriptInterpreterPythonImpl::GetIndexOfChildWithName(
   if (!implementor)
     return llvm::createStringError("Type has no child named '%s'", child_name);
 
-  int ret_val = INT32_MAX;
+  uint32_t ret_val = UINT32_MAX;
 
   {
     Locker py_lock(this,
@@ -1960,7 +1960,7 @@ llvm::Expected<int> ScriptInterpreterPythonImpl::GetIndexOfChildWithName(
                                                                  child_name);
   }
 
-  if (ret_val == INT32_MAX)
+  if (ret_val == UINT32_MAX)
     return llvm::createStringError("Type has no child named '%s'", child_name);
   return ret_val;
 }
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h
index dedac280788f4..00ae59c1c4241 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h
@@ -122,7 +122,7 @@ class ScriptInterpreterPythonImpl : public ScriptInterpreterPython {
   GetChildAtIndex(const StructuredData::ObjectSP &implementor,
                   uint32_t idx) override;
 
-  llvm::Expected<int>
+  llvm::Expected<uint32_t>
   GetIndexOfChildWithName(const StructuredData::ObjectSP &implementor,
                           const char *child_name) override;
 
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
index 82e9d867c3ac0..63b2dc4ab82b0 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
@@ -450,6 +450,10 @@ ParsedDWARFTypeAttributes::ParsedDWARFTypeAttributes(const DWARFDIE &die) {
       byte_size = form_value.Unsigned();
       break;
 
+    case DW_AT_bit_size:
+      data_bit_size = form_value.Unsigned();
+      break;
+
     case DW_AT_alignment:
       alignment = form_value.Unsigned();
       break;
@@ -810,13 +814,18 @@ DWARFASTParserClang::ParseTypeModifier(const SymbolContext &sc,
     // there...
     [[fallthrough]];
 
-  case DW_TAG_base_type:
+  case DW_TAG_base_type: {
     resolve_state = Type::ResolveState::Full;
+    // If a builtin type's size isn't a multiple of a byte, DWARF producers may
+    // add a precise bit-size to the type. Use the most precise bit-size
+    // possible.
+    const uint64_t bit_size = attrs.data_bit_size
+                                  ? *attrs.data_bit_size
+                                  : attrs.byte_size.value_or(0) * 8;
     clang_type = m_ast.GetBuiltinTypeForDWARFEncodingAndBitSize(
-        attrs.name.GetStringRef(), attrs.encoding,
-        attrs.byte_size.value_or(0) * 8);
+        attrs.name.GetStringRef(), attrs.encoding, bit_size);
     break;
-
+  }
   case DW_TAG_pointer_type:
     encoding_data_type = Type::eEncodingIsPointerUID;
     break;
@@ -1901,6 +1910,17 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc,
         m_ast.CreateClassTemplateSpecializationDecl(
             containing_decl_ctx, GetOwningClangModule(die), class_template_decl,
             tag_decl_kind, template_param_infos);
+    if (!class_specialization_decl) {
+      if (log) {
+        dwarf->GetObjectFile()->GetModule()->LogMessage(
+            log,
+            "SymbolFileDWARF({0:p}) - Failed to create specialization for "
+            "clang::ClassTemplateDecl({1}, {2:p}).",
+            this, llvm::StringRef(attrs.name), class_template_decl);
+      }
+      return TypeSP();
+    }
+
     clang_type =
         m_ast.CreateClassTemplateSpecializationType(class_specialization_decl);
 
@@ -2032,11 +2052,10 @@ static std::optional<clang::APValue> MakeAPValue(const clang::ASTContext &ast,
   if (is_integral)
     return clang::APValue(apint);
 
-  uint32_t count;
   bool is_complex;
   // FIXME: we currently support a limited set of floating point types.
   // E.g., 16-bit floats are not supported.
-  if (!clang_type.IsFloatingPointType(count, is_complex))
+  if (!clang_type.IsFloatingPointType(is_complex))
     return std::nullopt;
 
   return clang::APValue(llvm::APFloat(
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h
index da58f4c146226..f5f707129d67d 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h
@@ -574,6 +574,7 @@ struct ParsedDWARFTypeAttributes {
   lldb_private::plugin::dwarf::DWARFFormValue type;
   lldb::LanguageType class_language = lldb::eLanguageTypeUnknown;
   std::optional<uint64_t> byte_size;
+  std::optional<uint64_t> data_bit_size;
   std::optional<uint64_t> alignment;
   size_t calling_convention = llvm::dwarf::DW_CC_normal;
   uint32_t bit_stride = 0;
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp b/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp
index d90108f687f84..36dee1470e0a2 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp
@@ -22,7 +22,6 @@
 #include "lldb/Utility/Stream.h"
 #include "lldb/Utility/Timer.h"
 #include "lldb/lldb-private-enumerations.h"
-#include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/ThreadPool.h"
 #include <atomic>
 #include <optional>
@@ -33,10 +32,10 @@ using namespace lldb_private::plugin::dwarf;
 using namespace llvm::dwarf;
 
 void ManualDWARFIndex::Index() {
-  if (m_indexed)
-    return;
-  m_indexed = true;
+  std::call_once(m_indexed_flag, [this]() { IndexImpl(); });
+}
 
+void ManualDWARFIndex::IndexImpl() {
   ElapsedTime elapsed(m_index_time);
   LLDB_SCOPED_TIMERF("%p", static_cast<void *>(m_dwarf));
   if (LoadFromCache()) {
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.h b/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.h
index 0b5b2f3e84309..41e0e620a4896 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.h
@@ -66,8 +66,14 @@ class ManualDWARFIndex : public DWARFIndex {
   void Dump(Stream &s) override;
 
 private:
+  /// Reads the DWARF debug info to build the index once.
+  ///
+  /// Should be called before attempting to retrieve symbols.
   void Index();
 
+  /// Call `ManualDWARFIndex::Index()` instead.
+  void IndexImpl();
+
   /// Decode a serialized version of this object from data.
   ///
   /// \param data
@@ -170,7 +176,7 @@ class ManualDWARFIndex : public DWARFIndex {
   llvm::DenseSet<uint64_t> m_type_sigs_to_avoid;
 
   IndexSet<NameToDIE> m_set;
-  bool m_indexed = false;
+  std::once_flag m_indexed_flag;
 };
 } // namespace dwarf
 } // namespace lldb_private::plugin
diff --git a/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp b/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp
index e76b7a3cf274a..aaec1600dacff 100644
--- a/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp
+++ b/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp
@@ -1130,7 +1130,35 @@ void SymbolFileNativePDB::AddSymbols(Symtab &symtab) {
   if (!section_list)
     return;
 
-  for (auto pid : m_index->publics().getPublicsTable()) {
+  PublicSym32 last_sym;
+  size_t last_sym_idx = 0;
+  lldb::SectionSP section_sp;
+
+  // To estimate the size of a symbol, we use the difference to the next symbol.
+  // If there's no next symbol or the section/segment changed, the symbol will
+  // take the remaining space. The estimate can be too high in case there's
+  // padding between symbols. This similar to the algorithm used by the DIA
+  // SDK.
+  auto finish_last_symbol = [&](const PublicSym32 *next) {
+    if (!section_sp)
+      return;
+    Symbol *last = symtab.SymbolAtIndex(last_sym_idx);
+    if (!last)
+      return;
+
+    if (next && last_sym.Segment == next->Segment) {
+      assert(last_sym.Offset <= next->Offset);
+      last->SetByteSize(next->Offset - last_sym.Offset);
+    } else {
+      // the last symbol was the last in its section
+      assert(section_sp->GetByteSize() >= last_sym.Offset);
+      assert(!next || next->Segment > last_sym.Segment);
+      last->SetByteSize(section_sp->GetByteSize() - last_sym.Offset);
+    }
+  };
+
+  // The address map is sorted by the address of a symbol.
+  for (auto pid : m_index->publics().getAddressMap()) {
     PdbGlobalSymId global{pid, true};
     CVSymbol sym = m_index->ReadSymbolRecord(global);
     auto kind = sym.kind();
@@ -1138,8 +1166,11 @@ void SymbolFileNativePDB::AddSymbols(Symtab &symtab) {
       continue;
     PublicSym32 pub =
         llvm::cantFail(SymbolDeserializer::deserializeAs<PublicSym32>(sym));
+    finish_last_symbol(&pub);
+
+    if (!section_sp || last_sym.Segment != pub.Segment)
+      section_sp = section_list->FindSectionByID(pub.Segment);
 
-    auto section_sp = section_list->FindSectionByID(pub.Segment);
     if (!section_sp)
       continue;
 
@@ -1148,20 +1179,24 @@ void SymbolFileNativePDB::AddSymbols(Symtab &symtab) {
         (pub.Flags & PublicSymFlags::Code) != PublicSymFlags::None)
       type = eSymbolTypeCode;
 
-    symtab.AddSymbol(Symbol(/*symID=*/pid,
-                            /*name=*/pub.Name,
-                            /*type=*/type,
-                            /*external=*/true,
-                            /*is_debug=*/true,
-                            /*is_trampoline=*/false,
-                            /*is_artificial=*/false,
-                            /*section_sp=*/section_sp,
-                            /*value=*/pub.Offset,
-                            /*size=*/0,
-                            /*size_is_valid=*/false,
-                            /*contains_linker_annotations=*/false,
-                            /*flags=*/0));
-  }
+    last_sym_idx =
+        symtab.AddSymbol(Symbol(/*symID=*/pid,
+                                /*name=*/pub.Name,
+                                /*type=*/type,
+                                /*external=*/true,
+                                /*is_debug=*/true,
+                                /*is_trampoline=*/false,
+                                /*is_artificial=*/false,
+                                /*section_sp=*/section_sp,
+                                /*value=*/pub.Offset,
+                                /*size=*/0,
+                                /*size_is_valid=*/false,
+                                /*contains_linker_annotations=*/false,
+                                /*flags=*/0));
+    last_sym = pub;
+  }
+
+  finish_last_symbol(nullptr);
 }
 
 size_t SymbolFileNativePDB::ParseFunctions(CompileUnit &comp_unit) {
diff --git a/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp b/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp
index 3b936c06b1072..0ccb1804bb13a 100644
--- a/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp
+++ b/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp
@@ -83,8 +83,8 @@ constexpr OptionEnumValueElement g_pdb_reader_enums[] = {
     {
         ePDBReaderDefault,
         "default",
-        "Use DIA PDB reader unless LLDB_USE_NATIVE_PDB_READER environment "
-        "variable is set",
+        "Use native PDB reader unless LLDB_USE_NATIVE_PDB_READER environment "
+        "is set to 0",
     },
     {
         ePDBReaderDIA,
@@ -109,16 +109,10 @@ enum {
 static const bool g_should_use_native_reader_by_default = [] {
   llvm::StringRef env_value = ::getenv("LLDB_USE_NATIVE_PDB_READER");
 
-#if !LLVM_ENABLE_DIA_SDK || !defined(_WIN32)
-  // if the environment value is unset, the native reader is requested
-  if (env_value.empty())
-    return true;
-#endif
-
-  return env_value.equals_insensitive("on") ||
-         env_value.equals_insensitive("yes") ||
-         env_value.equals_insensitive("1") ||
-         env_value.equals_insensitive("true");
+  return !env_value.equals_insensitive("off") &&
+         !env_value.equals_insensitive("no") &&
+         !env_value.equals_insensitive("0") &&
+         !env_value.equals_insensitive("false");
 }();
 
 class PluginProperties : public Properties {
diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
index 82dfe7e540717..51cb883748514 100644
--- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
+++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
@@ -1000,6 +1000,8 @@ CompilerType TypeSystemClang::GetBuiltinTypeForDWARFEncodingAndBitSize(
 
   case DW_ATE_signed:
     if (!type_name.empty()) {
+      if (type_name.starts_with("_BitInt"))
+        return GetType(ast.getBitIntType(/*Unsigned=*/false, bit_size));
       if (type_name == "wchar_t" &&
           QualTypeMatchesBitSize(bit_size, ast, ast.WCharTy) &&
           (getTargetInfo() &&
@@ -1056,6 +1058,8 @@ CompilerType TypeSystemClang::GetBuiltinTypeForDWARFEncodingAndBitSize(
 
   case DW_ATE_unsigned:
     if (!type_name.empty()) {
+      if (type_name.starts_with("unsigned _BitInt"))
+        return GetType(ast.getBitIntType(/*Unsigned=*/true, bit_size));
       if (type_name == "wchar_t") {
         if (QualTypeMatchesBitSize(bit_size, ast, ast.WCharTy)) {
           if (!(getTargetInfo() &&
@@ -1693,6 +1697,11 @@ TypeSystemClang::CreateClassTemplateSpecializationDecl(
   class_template_specialization_decl->setInstantiationOf(class_template_decl);
   class_template_specialization_decl->setTemplateArgs(
       TemplateArgumentList::CreateCopy(ast, args));
+  void *insert_pos = nullptr;
+  if (class_template_decl->findSpecialization(args, insert_pos))
+    return nullptr;
+  class_template_decl->AddSpecialization(class_template_specialization_decl,
+                                         insert_pos);
   class_template_specialization_decl->setDeclName(
       class_template_decl->getDeclName());
 
@@ -3483,7 +3492,7 @@ bool TypeSystemClang::IsReferenceType(lldb::opaque_compiler_type_t type,
 }
 
 bool TypeSystemClang::IsFloatingPointType(lldb::opaque_compiler_type_t type,
-                                          uint32_t &count, bool &is_complex) {
+                                          bool &is_complex) {
   if (type) {
     clang::QualType qual_type(GetCanonicalQualType(type));
 
@@ -3492,30 +3501,26 @@ bool TypeSystemClang::IsFloatingPointType(lldb::opaque_compiler_type_t type,
       clang::BuiltinType::Kind kind = BT->getKind();
       if (kind >= clang::BuiltinType::Float &&
           kind <= clang::BuiltinType::LongDouble) {
-        count = 1;
         is_complex = false;
         return true;
       }
     } else if (const clang::ComplexType *CT =
                    llvm::dyn_cast<clang::ComplexType>(
                        qual_type->getCanonicalTypeInternal())) {
-      if (IsFloatingPointType(CT->getElementType().getAsOpaquePtr(), count,
+      if (IsFloatingPointType(CT->getElementType().getAsOpaquePtr(),
                               is_complex)) {
-        count = 2;
         is_complex = true;
         return true;
       }
     } else if (const clang::VectorType *VT = llvm::dyn_cast<clang::VectorType>(
                    qual_type->getCanonicalTypeInternal())) {
-      if (IsFloatingPointType(VT->getElementType().getAsOpaquePtr(), count,
+      if (IsFloatingPointType(VT->getElementType().getAsOpaquePtr(),
                               is_complex)) {
-        count = VT->getNumElements();
         is_complex = false;
         return true;
       }
     }
   }
-  count = 0;
   is_complex = false;
   return false;
 }
@@ -3888,6 +3893,13 @@ TypeSystemClang::GetTypeInfo(lldb::opaque_compiler_type_t type,
                            ->getModifiedType()
                            .getAsOpaquePtr(),
                        pointee_or_element_clang_type);
+  case clang::Type::BitInt: {
+    uint32_t type_flags = eTypeIsScalar | eTypeIsInteger | eTypeHasValue;
+    if (qual_type->isSignedIntegerType())
+      type_flags |= eTypeIsSigned;
+
+    return type_flags;
+  }
   case clang::Type::Builtin: {
     const clang::BuiltinType *builtin_type =
         llvm::cast<clang::BuiltinType>(qual_type->getCanonicalTypeInternal());
@@ -3960,9 +3972,9 @@ TypeSystemClang::GetTypeInfo(lldb::opaque_compiler_type_t type,
     if (complex_type) {
       clang::QualType complex_element_type(complex_type->getElementType());
       if (complex_element_type->isIntegerType())
-        complex_type_flags |= eTypeIsFloat;
-      else if (complex_element_type->isFloatingType())
         complex_type_flags |= eTypeIsInteger;
+      else if (complex_element_type->isFloatingType())
+        complex_type_flags |= eTypeIsFloat;
     }
     return complex_type_flags;
   } break;
@@ -4057,12 +4069,17 @@ TypeSystemClang::GetTypeInfo(lldb::opaque_compiler_type_t type,
     uint32_t vector_type_flags = eTypeHasChildren | eTypeIsVector;
     const clang::VectorType *vector_type = llvm::dyn_cast<clang::VectorType>(
         qual_type->getCanonicalTypeInternal());
-    if (vector_type) {
-      if (vector_type->isIntegerType())
-        vector_type_flags |= eTypeIsFloat;
-      else if (vector_type->isFloatingType())
-        vector_type_flags |= eTypeIsInteger;
-    }
+    if (!vector_type)
+      return 0;
+
+    QualType element_type = vector_type->getElementType();
+    if (element_type.isNull())
+      return 0;
+
+    if (element_type->isIntegerType())
+      vector_type_flags |= eTypeIsInteger;
+    else if (element_type->isFloatingType())
+      vector_type_flags |= eTypeIsFloat;
     return vector_type_flags;
   }
   default:
@@ -4859,12 +4876,10 @@ TypeSystemClang::GetTypeBitAlign(lldb::opaque_compiler_type_t type,
   return {};
 }
 
-lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type,
-                                            uint64_t &count) {
+lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type) {
   if (!type)
     return lldb::eEncodingInvalid;
 
-  count = 1;
   clang::QualType qual_type = RemoveWrappingTypes(GetCanonicalQualType(type));
 
   switch (qual_type->getTypeClass()) {
@@ -4898,7 +4913,6 @@ lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type,
   case clang::Type::DependentVector:
   case clang::Type::ExtVector:
   case clang::Type::Vector:
-    // TODO: Set this to more than one???
     break;
 
   case clang::Type::BitInt:
@@ -5099,11 +5113,10 @@ lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type,
       const clang::ComplexType *complex_type =
           qual_type->getAsComplexIntegerType();
       if (complex_type)
-        encoding = GetType(complex_type->getElementType()).GetEncoding(count);
+        encoding = GetType(complex_type->getElementType()).GetEncoding();
       else
         encoding = lldb::eEncodingSint;
     }
-    count = 2;
     return encoding;
   }
 
@@ -5160,7 +5173,7 @@ lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type,
   case clang::Type::SubstBuiltinTemplatePack:
     break;
   }
-  count = 0;
+
   return lldb::eEncodingInvalid;
 }
 
diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h
index 9e0a54209345d..375891b3cfd2f 100644
--- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h
+++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h
@@ -651,7 +651,7 @@ class TypeSystemClang : public TypeSystem {
 
   bool IsDefined(lldb::opaque_compiler_type_t type) override;
 
-  bool IsFloatingPointType(lldb::opaque_compiler_type_t type, uint32_t &count,
+  bool IsFloatingPointType(lldb::opaque_compiler_type_t type,
                            bool &is_complex) override;
 
   unsigned GetPtrAuthKey(lldb::opaque_compiler_type_t type) override;
@@ -837,8 +837,7 @@ class TypeSystemClang : public TypeSystem {
   GetBitSize(lldb::opaque_compiler_type_t type,
              ExecutionContextScope *exe_scope) override;
 
-  lldb::Encoding GetEncoding(lldb::opaque_compiler_type_t type,
-                             uint64_t &count) override;
+  lldb::Encoding GetEncoding(lldb::opaque_compiler_type_t type) override;
 
   lldb::Format GetFormat(lldb::opaque_compiler_type_t type) override;
 
diff --git a/lldb/source/Symbol/CompilerType.cpp b/lldb/source/Symbol/CompilerType.cpp
index 62c0ddf51c012..c999ab256fc98 100644
--- a/lldb/source/Symbol/CompilerType.cpp
+++ b/lldb/source/Symbol/CompilerType.cpp
@@ -240,13 +240,11 @@ bool CompilerType::ShouldTreatScalarValueAsAddress() const {
   return false;
 }
 
-bool CompilerType::IsFloatingPointType(uint32_t &count,
-                                       bool &is_complex) const {
+bool CompilerType::IsFloatingPointType(bool &is_complex) const {
   if (IsValid()) {
     if (auto type_system_sp = GetTypeSystem())
-      return type_system_sp->IsFloatingPointType(m_type, count, is_complex);
+      return type_system_sp->IsFloatingPointType(m_type, is_complex);
   }
-  count = 0;
   is_complex = false;
   return false;
 }
@@ -331,9 +329,8 @@ bool CompilerType::IsInteger() const {
 }
 
 bool CompilerType::IsFloat() const {
-  uint32_t count = 0;
   bool is_complex = false;
-  return IsFloatingPointType(count, is_complex);
+  return IsFloatingPointType(is_complex);
 }
 
 bool CompilerType::IsEnumerationType() const {
@@ -793,10 +790,10 @@ CompilerType::GetTypeBitAlign(ExecutionContextScope *exe_scope) const {
   return {};
 }
 
-lldb::Encoding CompilerType::GetEncoding(uint64_t &count) const {
+lldb::Encoding CompilerType::GetEncoding() const {
   if (IsValid())
     if (auto type_system_sp = GetTypeSystem())
-      return type_system_sp->GetEncoding(m_type, count);
+      return type_system_sp->GetEncoding(m_type);
   return lldb::eEncodingInvalid;
 }
 
@@ -1093,10 +1090,10 @@ bool CompilerType::GetValueAsScalar(const lldb_private::DataExtractor &data,
   if (IsAggregateType()) {
     return false; // Aggregate types don't have scalar values
   } else {
-    uint64_t count = 0;
-    lldb::Encoding encoding = GetEncoding(count);
+    // FIXME: check that type is scalar instead of checking encoding?
+    lldb::Encoding encoding = GetEncoding();
 
-    if (encoding == lldb::eEncodingInvalid || count != 1)
+    if (encoding == lldb::eEncodingInvalid || (GetTypeInfo() & eTypeIsComplex))
       return false;
 
     auto byte_size_or_err = GetByteSize(exe_scope);
diff --git a/lldb/source/Symbol/ObjectFile.cpp b/lldb/source/Symbol/ObjectFile.cpp
index 9a79b3c627623..6f5348c153030 100644
--- a/lldb/source/Symbol/ObjectFile.cpp
+++ b/lldb/source/Symbol/ObjectFile.cpp
@@ -647,14 +647,14 @@ ObjectFile::GetDWARFSectionTypeFromName(llvm::StringRef name) {
       .Case("frame", eSectionTypeDWARFDebugFrame)
       .Case("info", eSectionTypeDWARFDebugInfo)
       .Case("info.dwo", eSectionTypeDWARFDebugInfoDwo)
-      .Cases("line", "line.dwo", eSectionTypeDWARFDebugLine)
-      .Cases("line_str", "line_str.dwo", eSectionTypeDWARFDebugLineStr)
+      .Cases({"line", "line.dwo"}, eSectionTypeDWARFDebugLine)
+      .Cases({"line_str", "line_str.dwo"}, eSectionTypeDWARFDebugLineStr)
       .Case("loc", eSectionTypeDWARFDebugLoc)
       .Case("loc.dwo", eSectionTypeDWARFDebugLocDwo)
       .Case("loclists", eSectionTypeDWARFDebugLocLists)
       .Case("loclists.dwo", eSectionTypeDWARFDebugLocListsDwo)
       .Case("macinfo", eSectionTypeDWARFDebugMacInfo)
-      .Cases("macro", "macro.dwo", eSectionTypeDWARFDebugMacro)
+      .Cases({"macro", "macro.dwo"}, eSectionTypeDWARFDebugMacro)
       .Case("names", eSectionTypeDWARFDebugNames)
       .Case("pubnames", eSectionTypeDWARFDebugPubNames)
       .Case("pubtypes", eSectionTypeDWARFDebugPubTypes)
@@ -663,7 +663,7 @@ ObjectFile::GetDWARFSectionTypeFromName(llvm::StringRef name) {
       .Case("rnglists.dwo", eSectionTypeDWARFDebugRngListsDwo)
       .Case("str", eSectionTypeDWARFDebugStr)
       .Case("str.dwo", eSectionTypeDWARFDebugStrDwo)
-      .Cases("str_offsets", "str_offs", eSectionTypeDWARFDebugStrOffsets)
+      .Cases({"str_offsets", "str_offs"}, eSectionTypeDWARFDebugStrOffsets)
       .Case("str_offsets.dwo", eSectionTypeDWARFDebugStrOffsetsDwo)
       .Case("tu_index", eSectionTypeDWARFDebugTuIndex)
       .Case("types", eSectionTypeDWARFDebugTypes)
diff --git a/lldb/source/Symbol/Type.cpp b/lldb/source/Symbol/Type.cpp
index 952b2bdee1886..0c3246d238701 100644
--- a/lldb/source/Symbol/Type.cpp
+++ b/lldb/source/Symbol/Type.cpp
@@ -531,9 +531,9 @@ lldb::TypeSP Type::GetTypedefType() {
 
 lldb::Format Type::GetFormat() { return GetForwardCompilerType().GetFormat(); }
 
-lldb::Encoding Type::GetEncoding(uint64_t &count) {
+lldb::Encoding Type::GetEncoding() {
   // Make sure we resolve our type if it already hasn't been.
-  return GetForwardCompilerType().GetEncoding(count);
+  return GetForwardCompilerType().GetEncoding();
 }
 
 bool Type::ReadFromMemory(ExecutionContext *exe_ctx, lldb::addr_t addr,
diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp
index d070c3d953d4a..1e43094421f0a 100644
--- a/lldb/source/Target/Target.cpp
+++ b/lldb/source/Target/Target.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "lldb/Target/Target.h"
+#include "lldb/Breakpoint/Breakpoint.h"
 #include "lldb/Breakpoint/BreakpointIDList.h"
 #include "lldb/Breakpoint/BreakpointPrecondition.h"
 #include "lldb/Breakpoint/BreakpointResolver.h"
@@ -5271,3 +5272,19 @@ void Target::ClearSectionLoadList() { GetSectionLoadList().Clear(); }
 void Target::DumpSectionLoadList(Stream &s) {
   GetSectionLoadList().Dump(s, this);
 }
+
+void Target::NotifyBreakpointChanged(Breakpoint &bp,
+                                     lldb::BreakpointEventType eventKind) {
+  if (EventTypeHasListeners(Target::eBroadcastBitBreakpointChanged)) {
+    std::shared_ptr<Breakpoint::BreakpointEventData> data_sp =
+        std::make_shared<Breakpoint::BreakpointEventData>(
+            eventKind, bp.shared_from_this());
+    BroadcastEvent(Target::eBroadcastBitBreakpointChanged, data_sp);
+  }
+}
+
+void Target::NotifyBreakpointChanged(
+    Breakpoint &bp, const lldb::EventDataSP &breakpoint_data_sp) {
+  if (EventTypeHasListeners(Target::eBroadcastBitBreakpointChanged))
+    BroadcastEvent(Target::eBroadcastBitBreakpointChanged, breakpoint_data_sp);
+}
diff --git a/lldb/source/Utility/Args.cpp b/lldb/source/Utility/Args.cpp
index 8ba40bae4d67e..7eff9cf3ed591 100644
--- a/lldb/source/Utility/Args.cpp
+++ b/lldb/source/Utility/Args.cpp
@@ -445,7 +445,7 @@ uint32_t Args::StringToGenericRegister(llvm::StringRef s) {
                         .Case("pc", LLDB_REGNUM_GENERIC_PC)
                         .Case("sp", LLDB_REGNUM_GENERIC_SP)
                         .Case("fp", LLDB_REGNUM_GENERIC_FP)
-                        .Cases("ra", "lr", LLDB_REGNUM_GENERIC_RA)
+                        .Cases({"ra", "lr"}, LLDB_REGNUM_GENERIC_RA)
                         .Case("flags", LLDB_REGNUM_GENERIC_FLAGS)
                         .Case("arg1", LLDB_REGNUM_GENERIC_ARG1)
                         .Case("arg2", LLDB_REGNUM_GENERIC_ARG2)
diff --git a/lldb/source/ValueObject/ValueObject.cpp b/lldb/source/ValueObject/ValueObject.cpp
index 38b9f77e6ddda..aeea32f19ee2c 100644
--- a/lldb/source/ValueObject/ValueObject.cpp
+++ b/lldb/source/ValueObject/ValueObject.cpp
@@ -790,8 +790,7 @@ bool ValueObject::SetData(DataExtractor &data, Status &error) {
     return false;
   }
 
-  uint64_t count = 0;
-  const Encoding encoding = GetCompilerType().GetEncoding(count);
+  const Encoding encoding = GetCompilerType().GetEncoding();
 
   const size_t byte_size = llvm::expectedToOptional(GetByteSize()).value_or(0);
 
@@ -1669,8 +1668,7 @@ bool ValueObject::SetValueFromCString(const char *value_str, Status &error) {
     return false;
   }
 
-  uint64_t count = 0;
-  const Encoding encoding = GetCompilerType().GetEncoding(count);
+  const Encoding encoding = GetCompilerType().GetEncoding();
 
   const size_t byte_size = llvm::expectedToOptional(GetByteSize()).value_or(0);
 
diff --git a/lldb/test/API/commands/register/register/aarch64_dynamic_regset/TestArm64DynamicRegsets.py b/lldb/test/API/commands/register/register/aarch64_dynamic_regset/TestArm64DynamicRegsets.py
index eb121ecbfdbaf..a985ebbced719 100644
--- a/lldb/test/API/commands/register/register/aarch64_dynamic_regset/TestArm64DynamicRegsets.py
+++ b/lldb/test/API/commands/register/register/aarch64_dynamic_regset/TestArm64DynamicRegsets.py
@@ -97,6 +97,9 @@ def setup_register_config_test(self, run_args=None):
     @skipIf(oslist=no_match(["linux"]))
     def test_aarch64_dynamic_regset_config(self):
         """Test AArch64 Dynamic Register sets configuration."""
+        if not self.isAArch64SVE():
+            self.skipTest("SVE must be present")
+
         register_sets = self.setup_register_config_test()
 
         for registerSet in register_sets:
@@ -259,6 +262,8 @@ def write_to_enable_za_test(self, has_zt0, write_za_first):
     def test_aarch64_dynamic_regset_config_sme_write_za_to_enable(self):
         """Test that ZA and ZT0 (if present) shows as 0s when disabled and
         can be enabled by writing to ZA."""
+        if not self.isAArch64SVE():
+            self.skipTest("SVE must be present.")
         if not self.isAArch64SME():
             self.skipTest("SME must be present.")
 
@@ -270,6 +275,8 @@ def test_aarch64_dynamic_regset_config_sme_write_za_to_enable(self):
     def test_aarch64_dynamic_regset_config_sme_write_zt0_to_enable(self):
         """Test that ZA and ZT0 (if present) shows as 0s when disabled and
         can be enabled by writing to ZT0."""
+        if not self.isAArch64SVE():
+            self.skipTest("SVE must be present.")
         if not self.isAArch64SME():
             self.skipTest("SME must be present.")
         if not self.isAArch64SME2():
diff --git a/lldb/test/API/driver/stdio_closed/TestDriverWithClosedSTDIO.py b/lldb/test/API/driver/stdio_closed/TestDriverWithClosedSTDIO.py
index 13437d05557bf..a73322c78d81e 100644
--- a/lldb/test/API/driver/stdio_closed/TestDriverWithClosedSTDIO.py
+++ b/lldb/test/API/driver/stdio_closed/TestDriverWithClosedSTDIO.py
@@ -24,7 +24,7 @@ class TestDriverWithClosedSTDIO(TestBase):
 
     # Windows doesn't have the fcntl module, so we can't run this
     # test there.
-    @skipIf(oslist=["windows"])
+    @skipIf(hostoslist=["windows"])
     def test_run_lldb_and_wait(self):
         """This test forks, closes the stdio channels and exec's lldb.
         Then it waits for it to exit and asserts it did that successfully"""
diff --git a/lldb/test/API/functionalities/breakpoint/same_cu_name/Makefile b/lldb/test/API/functionalities/breakpoint/same_cu_name/Makefile
index b19e7818601eb..b508da24c6828 100644
--- a/lldb/test/API/functionalities/breakpoint/same_cu_name/Makefile
+++ b/lldb/test/API/functionalities/breakpoint/same_cu_name/Makefile
@@ -4,16 +4,16 @@ LD_EXTRAS := ns1.o ns2.o ns3.o ns4.o
 a.out: main.o ns1.o ns2.o ns3.o ns4.o
 
 ns1.o: common.cpp
-	$(CC) -gdwarf -c -DNAMESPACE=ns1 -o $@ $<
+	$(CXX) -gdwarf -c -DNAMESPACE=ns1 -o $@ $<
 
 ns2.o: common.cpp
-	$(CC) -gdwarf -c -DNAMESPACE=ns2 -o $@ $<
+	$(CXX) -gdwarf -c -DNAMESPACE=ns2 -o $@ $<
 
 ns3.o: common.cpp
-	$(CC) -gdwarf -c -DNAMESPACE=ns3 -o $@ $<
+	$(CXX) -gdwarf -c -DNAMESPACE=ns3 -o $@ $<
 
 ns4.o: common.cpp
-	$(CC) -gdwarf -c -DNAMESPACE=ns4 -o $@ $<
+	$(CXX) -gdwarf -c -DNAMESPACE=ns4 -o $@ $<
 
 
 include Makefile.rules
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/optional/TestDataFormatterLibcxxOptionalSimulator.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/optional/TestDataFormatterLibcxxOptionalSimulator.py
index 3fefe87dcad97..7463f8897901f 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/optional/TestDataFormatterLibcxxOptionalSimulator.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/optional/TestDataFormatterLibcxxOptionalSimulator.py
@@ -53,6 +53,8 @@ def _run_test(self, defines):
     # causing this test to fail. This was reverted in newer version of clang
     # with commit 52a9ba7ca.
     @skipIf(compiler="clang", compiler_version=["=", "17"])
+    @skipIf(compiler="clang", compiler_version=["=", "18"])
+    @skipIf(compiler="clang", compiler_version=["=", "19"])
     @functools.wraps(LibcxxOptionalDataFormatterSimulatorTestCase._run_test)
     def test_method(self, defines=defines):
         LibcxxOptionalDataFormatterSimulatorTestCase._run_test(self, defines)
diff --git a/lldb/test/API/functionalities/multiple-slides/TestMultipleSlides.py b/lldb/test/API/functionalities/multiple-slides/TestMultipleSlides.py
index 3d6b27fe68a1b..5fd2b767a6237 100644
--- a/lldb/test/API/functionalities/multiple-slides/TestMultipleSlides.py
+++ b/lldb/test/API/functionalities/multiple-slides/TestMultipleSlides.py
@@ -29,10 +29,13 @@ def test_mulitple_slides(self):
             first_sym.GetEndAddress().GetOffset()
             - first_sym.GetStartAddress().GetOffset()
         )
+        int_size = target.FindFirstType("int").GetByteSize()
+        self.assertGreaterEqual(first_size, 2048 * int_size)
         second_size = (
             second_sym.GetEndAddress().GetOffset()
             - second_sym.GetStartAddress().GetOffset()
         )
+        self.assertGreaterEqual(second_size, 2048 * int_size)
 
         # View the first element of `first` and `second` while
         # they have no load address set.
diff --git a/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py b/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py
index d8a729b322fe4..2f942da604ff2 100644
--- a/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py
+++ b/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py
@@ -9,7 +9,7 @@ class LibCxxInternalsRecognizerTestCase(TestBase):
     NO_DEBUG_INFO_TESTCASE = True
 
     @add_test_categories(["libc++"])
-    @skipIf(compiler="clang", compiler_version=["<", "19.0"])
+    @skipIf(compiler="clang", compiler_version=["<=", "19.0"])
     def test_frame_recognizer(self):
         """Test that implementation details of libc++ are hidden"""
         self.build()
diff --git a/lldb/test/API/lang/objc/real-definition/TestRealDefinition.py b/lldb/test/API/lang/objc/real-definition/TestRealDefinition.py
index 6cbb9ddec264d..9fb2bea93e9c2 100644
--- a/lldb/test/API/lang/objc/real-definition/TestRealDefinition.py
+++ b/lldb/test/API/lang/objc/real-definition/TestRealDefinition.py
@@ -27,13 +27,11 @@ def test_frame_var_after_stop_at_interface(self):
         # Run at stop at main
         lldbutil.check_breakpoint(self, bpno=1, expected_hit_count=1)
 
-        self.runCmd("settings set target.prefer-dynamic-value no-dynamic-values")
-
         # This should display correctly.
         self.expect(
             "frame variable foo->_bar->_hidden_ivar",
             VARIABLES_DISPLAYED_CORRECTLY,
-            substrs=["(NSString *)", "foo->_bar->_hidden_ivar = 0x"],
+            substrs=["foo->_bar->_hidden_ivar = 0x"],
         )
 
     def test_frame_var_after_stop_at_implementation(self):
@@ -54,11 +52,9 @@ def test_frame_var_after_stop_at_implementation(self):
         # Run at stop at main
         lldbutil.check_breakpoint(self, bpno=1, expected_hit_count=1)
 
-        self.runCmd("settings set target.prefer-dynamic-value no-dynamic-values")
-
         # This should display correctly.
         self.expect(
             "frame variable foo->_bar->_hidden_ivar",
             VARIABLES_DISPLAYED_CORRECTLY,
-            substrs=["(NSString *)", "foo->_bar->_hidden_ivar = 0x"],
+            substrs=["foo->_bar->_hidden_ivar = 0x"],
         )
diff --git a/lldb/test/API/lua_api/TestThreadAPI.lua b/lldb/test/API/lua_api/TestThreadAPI.lua
new file mode 100644
index 0000000000000..5a38d0ba9192f
--- /dev/null
+++ b/lldb/test/API/lua_api/TestThreadAPI.lua
@@ -0,0 +1,25 @@
+_T = require('lua_lldb_test').create_test('TestThreadAPI')
+
+function _T:TestGetStopDescription()
+    local target = self:create_target()
+    local breakpoint = target:BreakpointCreateByName("main", "a.out")
+    assertTrue(breakpoint:IsValid() and breakpoint:GetNumLocations() == 1)
+
+    local process = target:LaunchSimple({ 'arg1', 'arg2' }, nil, nil)
+    local thread = get_stopped_thread(process, lldb.eStopReasonBreakpoint)
+    assertNotNil(thread)
+    assertTrue(thread:IsValid())
+
+    assertEqual("breakpoint", thread:GetStopDescription(string.len("breakpoint") + 1))
+    assertEqual("break", thread:GetStopDescription(string.len("break") + 1))
+    assertEqual("b", thread:GetStopDescription(string.len("b") + 1))
+    assertEqual("breakpoint 1.1", thread:GetStopDescription(string.len("breakpoint 1.1") + 100))
+
+    -- Test stream variation
+    local stream = lldb.SBStream()
+    assertTrue(thread:GetStopDescription(stream))
+    assertNotNil(stream)
+    assertEqual("breakpoint 1.1", stream:GetData())
+end
+
+os.exit(_T:run())
diff --git a/lldb/test/API/python_api/default-constructor/sb_thread.py b/lldb/test/API/python_api/default-constructor/sb_thread.py
index 34eb3db852c38..4252fa0321fff 100644
--- a/lldb/test/API/python_api/default-constructor/sb_thread.py
+++ b/lldb/test/API/python_api/default-constructor/sb_thread.py
@@ -10,6 +10,7 @@ def fuzz_obj(obj):
     obj.GetStopReasonDataCount()
     obj.GetStopReasonDataAtIndex(100)
     obj.GetStopDescription(256)
+    obj.GetStopDescription(lldb.SBStream())
     obj.GetThreadID()
     obj.GetIndexID()
     obj.GetName()
diff --git a/lldb/test/API/python_api/thread/TestThreadAPI.py b/lldb/test/API/python_api/thread/TestThreadAPI.py
index 5583434a742a9..acad7583eec19 100644
--- a/lldb/test/API/python_api/thread/TestThreadAPI.py
+++ b/lldb/test/API/python_api/thread/TestThreadAPI.py
@@ -138,6 +138,11 @@ def get_stop_description(self):
             "breakpoint 1.1", thread.GetStopDescription(len("breakpoint 1.1") + 100)
         )
 
+        # Test the stream variation
+        stream = lldb.SBStream()
+        self.assertTrue(thread.GetStopDescription(stream))
+        self.assertEqual("breakpoint 1.1", stream.GetData())
+
     def step_out_of_malloc_into_function_b(self, exe_name):
         """Test Python SBThread.StepOut() API to step out of a malloc call where the call site is at function b()."""
         exe = self.getBuildArtifact(exe_name)
diff --git a/lldb/test/API/tools/lldb-dap/coreFile/TestDAP_coreFile.py b/lldb/test/API/tools/lldb-dap/coreFile/TestDAP_coreFile.py
index 1143cd93a70b3..d56a8a45ebf1e 100644
--- a/lldb/test/API/tools/lldb-dap/coreFile/TestDAP_coreFile.py
+++ b/lldb/test/API/tools/lldb-dap/coreFile/TestDAP_coreFile.py
@@ -61,6 +61,21 @@ def test_core_file(self):
         self.dap_server.request_next(threadId=32259)
         self.assertEqual(self.get_stackFrames(), expected_frames)
 
+    def test_wrong_core_file(self):
+        exe_file = self.getSourcePath("linux-x86_64.out")
+        wrong_core_file = self.getSourcePath("main.c")
+
+        self.create_debug_adapter()
+        resp = self.attach(
+            program=exe_file, coreFile=wrong_core_file, expectFailure=True
+        )
+        self.assertIsNotNone(resp)
+        self.assertFalse(resp["success"], "Expected failure in response {resp!r}")
+        error_msg = resp["body"]["error"]["format"]
+
+        # attach may fail for mutilple reasons.
+        self.assertEqual(error_msg, "Failed to create the process")
+
     @skipIfLLVMTargetMissing("X86")
     def test_core_file_source_mapping_array(self):
         """Test that sourceMap property is correctly applied when loading a core"""
diff --git a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py
index 8db2316e73fc8..ca881f1d817c5 100644
--- a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py
+++ b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py
@@ -642,6 +642,7 @@ def test_stdio_redirection(self):
     @skipIfAsan
     @skipIfWindows
     @skipIf(oslist=["linux"], archs=no_match(["x86_64"]))
+    @skipIfBuildType(["debug"])
     def test_stdio_redirection_and_console(self):
         """
         Test stdio redirection and console.
diff --git a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart.py b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart.py
index 83faf276852f8..e8e07e1e86fc4 100644
--- a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart.py
+++ b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart.py
@@ -51,20 +51,8 @@ def test_stopOnEntry(self):
         self.build_and_launch(program, stopOnEntry=True)
         [bp_main] = self.set_function_breakpoints(["main"])
 
-        self.dap_server.request_configurationDone()
-        self.dap_server.wait_for_stopped()
-        # Once the "configuration done" event is sent, we should get a stopped
-        # event immediately because of stopOnEntry.
-        self.assertTrue(
-            len(self.dap_server.thread_stop_reasons) > 0,
-            "expected stopped event during launch",
-        )
-        for _, body in self.dap_server.thread_stop_reasons.items():
-            if "reason" in body:
-                reason = body["reason"]
-                self.assertNotEqual(
-                    reason, "breakpoint", 'verify stop isn\'t "main" breakpoint'
-                )
+        self.continue_to_next_stop()
+        self.verify_stop_on_entry()
 
         # Then, if we continue, we should hit the breakpoint at main.
         self.continue_to_breakpoints([bp_main])
@@ -73,17 +61,7 @@ def test_stopOnEntry(self):
         # main.
         resp = self.dap_server.request_restart()
         self.assertTrue(resp["success"])
-        stopped_events = self.dap_server.wait_for_stopped()
-        for stopped_event in stopped_events:
-            if "body" in stopped_event:
-                body = stopped_event["body"]
-                if "reason" in body:
-                    reason = body["reason"]
-                    self.assertNotEqual(
-                        reason,
-                        "breakpoint",
-                        'verify stop after restart isn\'t "main" breakpoint',
-                    )
+        self.verify_stop_on_entry()
 
     @skipIfWindows
     def test_arguments(self):
diff --git a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py
index e1ad1425a993d..7d4949907df0d 100644
--- a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py
+++ b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py
@@ -11,27 +11,6 @@
 
 @skipIfBuildType(["debug"])
 class TestDAP_restart_console(lldbdap_testcase.DAPTestCaseBase):
-    def verify_stopped_on_entry(self, stopped_events: List[Dict[str, Any]]):
-        seen_stopped_event = 0
-        for stopped_event in stopped_events:
-            body = stopped_event.get("body")
-            if body is None:
-                continue
-
-            reason = body.get("reason")
-            if reason is None:
-                continue
-
-            self.assertNotEqual(
-                reason,
-                "breakpoint",
-                'verify stop after restart isn\'t "main" breakpoint',
-            )
-            if reason == "entry":
-                seen_stopped_event += 1
-
-        self.assertEqual(seen_stopped_event, 1, "expect only one stopped entry event.")
-
     @skipIfAsan
     @skipIfWindows
     @skipIf(oslist=["linux"], archs=["arm$"])  # Always times out on buildbot
@@ -92,11 +71,8 @@ def test_stopOnEntry(self):
         self.build_and_launch(program, console="integratedTerminal", stopOnEntry=True)
         [bp_main] = self.set_function_breakpoints(["main"])
 
-        self.dap_server.request_continue()  # sends configuration done
-        stopped_events = self.dap_server.wait_for_stopped()
-        # We should be stopped at the entry point.
-        self.assertGreaterEqual(len(stopped_events), 0, "expect stopped events")
-        self.verify_stopped_on_entry(stopped_events)
+        self.dap_server.request_configurationDone()
+        self.verify_stop_on_entry()
 
         # Then, if we continue, we should hit the breakpoint at main.
         self.dap_server.request_continue()
@@ -105,8 +81,7 @@ def test_stopOnEntry(self):
         # Restart and check that we still get a stopped event before reaching
         # main.
         self.dap_server.request_restart()
-        stopped_events = self.dap_server.wait_for_stopped()
-        self.verify_stopped_on_entry(stopped_events)
+        self.verify_stop_on_entry()
 
         # continue to main
         self.dap_server.request_continue()
diff --git a/lldb/test/Shell/Breakpoint/jit-loader_jitlink_elf.test b/lldb/test/Shell/Breakpoint/jit-loader_jitlink_elf.test
index 52c86fa5530bf..9a972f1f1ece7 100644
--- a/lldb/test/Shell/Breakpoint/jit-loader_jitlink_elf.test
+++ b/lldb/test/Shell/Breakpoint/jit-loader_jitlink_elf.test
@@ -3,8 +3,8 @@
 
 # JITLink is the Orc-specific JIT linker implementation.
 #
-# RUN: %clang -g -S -emit-llvm -fPIC --target=x86_64-unknown-unknown-elf \
-# RUN:        -o %t.ll %p/Inputs/jitbp.cpp
+# RUN: %clangxx -g -S -emit-llvm -fPIC --target=x86_64-unknown-unknown-elf \
+# RUN:          -o %t.ll %p/Inputs/jitbp.cpp
 # RUN: %lldb -b -o 'settings set plugin.jit-loader.gdb.enable on' -o 'b jitbp' \
 # RUN:          -o 'run --jit-linker=jitlink %t.ll' lli | FileCheck %s
 
diff --git a/lldb/test/Shell/Breakpoint/jit-loader_rtdyld_elf.test b/lldb/test/Shell/Breakpoint/jit-loader_rtdyld_elf.test
index b34a5673936f5..ae9402a519494 100644
--- a/lldb/test/Shell/Breakpoint/jit-loader_rtdyld_elf.test
+++ b/lldb/test/Shell/Breakpoint/jit-loader_rtdyld_elf.test
@@ -3,8 +3,8 @@
 
 # RuntimeDyld can be used to link and load emitted code for both, MCJIT and Orc.
 #
-# RUN: %clang -g -S -emit-llvm --target=x86_64-unknown-unknown-elf \
-# RUN:        -o %t.ll %p/Inputs/jitbp.cpp
+# RUN: %clangxx -g -S -emit-llvm --target=x86_64-unknown-unknown-elf \
+# RUN:          -o %t.ll %p/Inputs/jitbp.cpp
 #
 # RUN: %lldb -b -o 'settings set plugin.jit-loader.gdb.enable on' -o 'b jitbp' \
 # RUN:          -o 'run --jit-kind=mcjit %t.ll' lli | FileCheck %s
diff --git a/lldb/test/Shell/Commands/command-image-dump-ast-colored.test b/lldb/test/Shell/Commands/command-image-dump-ast-colored.test
index 355ef6bb1d199..7fd70d234fbd4 100644
--- a/lldb/test/Shell/Commands/command-image-dump-ast-colored.test
+++ b/lldb/test/Shell/Commands/command-image-dump-ast-colored.test
@@ -1,7 +1,7 @@
 # Test AST dumping with and without color.
 
 # RUN: split-file %s %t
-# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out
+# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out
 # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \
 # RUN:       | FileCheck %s
 
diff --git a/lldb/test/Shell/Commands/command-image-dump-ast.test b/lldb/test/Shell/Commands/command-image-dump-ast.test
index 3204022418cb8..86fe1836a2c6c 100644
--- a/lldb/test/Shell/Commands/command-image-dump-ast.test
+++ b/lldb/test/Shell/Commands/command-image-dump-ast.test
@@ -5,7 +5,7 @@
 # UNSUPPORTED: system-windows
 
 # RUN: split-file %s %t
-# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out
+# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out
 # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \
 # RUN:       | FileCheck %s
 
diff --git a/lldb/test/Shell/Commands/list-header.test b/lldb/test/Shell/Commands/list-header.test
index 53c4b786f1810..27eaa1a4f29c2 100644
--- a/lldb/test/Shell/Commands/list-header.test
+++ b/lldb/test/Shell/Commands/list-header.test
@@ -3,11 +3,11 @@
 # XFAIL: target-windows
 
 ## Test that `list header.h:<line>` works correctly when header is available.
-## 
+##
 # RUN: split-file %s %t
 
-# RUN: %clang_host -g %t/main_with_inlined.cc %t/foo.cc -o %t/main_with_inlined.out
-# RUN: %clang_host -g %t/main_no_inlined.cc %t/foo.cc -o %t/main_no_inlined.out
+# RUN: %clangxx_host -g %t/main_with_inlined.cc %t/foo.cc -o %t/main_with_inlined.out
+# RUN: %clangxx_host -g %t/main_no_inlined.cc %t/foo.cc -o %t/main_no_inlined.out
 
 # RUN: %lldb %t/main_with_inlined.out -o "list foo.h:2" -o "exit" 2>&1 \
 # RUN:   | FileCheck %s --check-prefix=CHECK-INLINED
@@ -19,7 +19,7 @@
 
 # CHECK-INLINED: 2      extern int* ptr;
 # CHECK-INLINED: 3   	void f(int x);
-# CHECK-INLINED: 4   	
+# CHECK-INLINED: 4
 # CHECK-INLINED: 5   	inline void g(int x) {
 # CHECK-INLINED: 6   	  *ptr = x; // should crash here
 # CHECK-INLINED: 7   	}
diff --git a/lldb/test/Shell/Error/cleanup.cpp b/lldb/test/Shell/Error/cleanup.cpp
index 6abc62dc4af99..1e83478a83337 100644
--- a/lldb/test/Shell/Error/cleanup.cpp
+++ b/lldb/test/Shell/Error/cleanup.cpp
@@ -1,5 +1,5 @@
 // Test CommandObject is cleaned up even after commands fail due to not taking any argument.
-// RUN: %clang_host -g %s -o %t
+// RUN: %clangxx_host -g %s -o %t
 // RUN: %lldb -f %t -o "settings set interpreter.stop-command-source-on-error false" -s \
 // RUN:   %S/Inputs/cleanup.lldbinit
 int main() { return 0; }
diff --git a/lldb/test/Shell/Expr/TestExprLanguageNote.test b/lldb/test/Shell/Expr/TestExprLanguageNote.test
index e8e4e1399e451..e7da30816319e 100644
--- a/lldb/test/Shell/Expr/TestExprLanguageNote.test
+++ b/lldb/test/Shell/Expr/TestExprLanguageNote.test
@@ -1,5 +1,5 @@
 # RUN: split-file %s %t
-# RUN: %clang_host -g %t/main.cpp -o %t.out
+# RUN: %clangxx_host -g %t/main.cpp -o %t.out
 #
 # RUN: %lldb -x -b -o "settings set interpreter.stop-command-source-on-error false" \
 # RUN:       -s %t/no-target.input 2>&1 | FileCheck %s --check-prefix=CHECK-NO-TARGET
diff --git a/lldb/test/Shell/Expr/TestLambdaExprImport.test b/lldb/test/Shell/Expr/TestLambdaExprImport.test
index c57ce06453fe2..b49a38036e566 100644
--- a/lldb/test/Shell/Expr/TestLambdaExprImport.test
+++ b/lldb/test/Shell/Expr/TestLambdaExprImport.test
@@ -3,7 +3,7 @@
 # uses always).
 
 # RUN: split-file %s %t
-# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out
+# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out
 # RUN: %lldb -o "settings set interpreter.stop-command-source-on-error false" \
 # RUN:       -x -b -s %t/commands.input %t.out 2>&1 \
 # RUN:       | FileCheck %s
diff --git a/lldb/test/Shell/ObjectFile/ELF/elf-memory.test b/lldb/test/Shell/ObjectFile/ELF/elf-memory.test
index 75a68edd2d349..170dc7682aab0 100644
--- a/lldb/test/Shell/ObjectFile/ELF/elf-memory.test
+++ b/lldb/test/Shell/ObjectFile/ELF/elf-memory.test
@@ -11,7 +11,7 @@
 //   - verify that "image dump objfile" will dump the dynamic section of the
 //     memory elf file and find the .dynamic string table.
 
-// RUN: %clang_host %p/Inputs/memory-elf.cpp -g -O0 -o %t
+// RUN: %clangxx_host %p/Inputs/memory-elf.cpp -g -O0 -o %t
 
 // RUN: %lldb %t -b \
 // RUN:   -o "b main" \
diff --git a/lldb/test/Shell/ObjectFile/MachO/Inputs/section-overflow-binary b/lldb/test/Shell/ObjectFile/MachO/Inputs/section-overflow-binary
new file mode 100644
index 0000000000000..19dc2f4ac9ffe
Binary files /dev/null and b/lldb/test/Shell/ObjectFile/MachO/Inputs/section-overflow-binary differ
diff --git a/lldb/test/Shell/ObjectFile/MachO/section-overflow-binary.test b/lldb/test/Shell/ObjectFile/MachO/section-overflow-binary.test
new file mode 100644
index 0000000000000..76c335f65a76a
--- /dev/null
+++ b/lldb/test/Shell/ObjectFile/MachO/section-overflow-binary.test
@@ -0,0 +1,13 @@
+RUN: %lldb -b %p/Inputs/section-overflow-binary \
+RUN:   -o 'script dwarf = lldb.target.module[0].sections[0]' \
+RUN:   -o 'script section = dwarf.GetSubSectionAtIndex(0)' \
+RUN:   -o "script print(f'{section.GetName()} file_offset=0x{section.GetFileOffset():016x}')" \
+RUN:   -o 'script section = dwarf.GetSubSectionAtIndex(1)' \
+RUN:   -o "script print(f'{section.GetName()} file_offset=0x{section.GetFileOffset():016x}')" \
+RUN:   -o 'script section = dwarf.GetSubSectionAtIndex(2)' \
+RUN:   -o "script print(f'{section.GetName()} file_offset=0x{section.GetFileOffset():016x}')" \
+RUN:   | FileCheck %s
+
+CHECK: __debug_abbrev file_offset=0x00000000fffffff0
+CHECK: __debug_info file_offset=0x0000000100000010
+CHECK: __debug_line file_offset=0x0000000300000010
diff --git a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback-user-leaf.test b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback-user-leaf.test
index 5a84c163453cc..32b4095d9addd 100644
--- a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback-user-leaf.test
+++ b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback-user-leaf.test
@@ -12,7 +12,7 @@
 
 # UNSUPPORTED: system-windows
 #
-# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap-in-stl-callback-user-leaf.cpp -o %t.out
+# RUN: %clangxx_host -g -O0 %S/Inputs/verbose_trap-in-stl-callback-user-leaf.cpp -o %t.out
 # RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK
 
 run
diff --git a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback.test b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback.test
index b15bcb3a384f9..c8c433c0a819a 100644
--- a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback.test
+++ b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback.test
@@ -11,7 +11,7 @@
 
 # UNSUPPORTED: system-windows
 #
-# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap-in-stl-callback.cpp -o %t.out
+# RUN: %clangxx_host -g -O0 %S/Inputs/verbose_trap-in-stl-callback.cpp -o %t.out
 # RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK
 
 run
diff --git a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-max-depth.test b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-max-depth.test
index 2ea6594643c9c..d0789ac7dc67a 100644
--- a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-max-depth.test
+++ b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-max-depth.test
@@ -4,7 +4,7 @@
 
 # UNSUPPORTED: system-windows
 #
-# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap-in-stl-max-depth.cpp -o %t.out
+# RUN: %clangxx_host -g -O0 %S/Inputs/verbose_trap-in-stl-max-depth.cpp -o %t.out
 # RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK
 
 run
diff --git a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-nested.test b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-nested.test
index 81a492d1ed579..68a4ea612c0d1 100644
--- a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-nested.test
+++ b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-nested.test
@@ -3,7 +3,7 @@
 
 # UNSUPPORTED: system-windows
 #
-# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap-in-stl-nested.cpp -o %t.out
+# RUN: %clangxx_host -g -O0 %S/Inputs/verbose_trap-in-stl-nested.cpp -o %t.out
 # RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK
 
 run
diff --git a/lldb/test/Shell/Recognizer/verbose_trap-in-stl.test b/lldb/test/Shell/Recognizer/verbose_trap-in-stl.test
index dd08290174e3a..bd4851146b40d 100644
--- a/lldb/test/Shell/Recognizer/verbose_trap-in-stl.test
+++ b/lldb/test/Shell/Recognizer/verbose_trap-in-stl.test
@@ -3,7 +3,7 @@
 
 # UNSUPPORTED: system-windows
 #
-# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap-in-stl.cpp -o %t.out
+# RUN: %clangxx_host -g -O0 %S/Inputs/verbose_trap-in-stl.cpp -o %t.out
 # RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK
 
 run
diff --git a/lldb/test/Shell/Recognizer/verbose_trap.test b/lldb/test/Shell/Recognizer/verbose_trap.test
index dafab7bdea688..ab0df082cc032 100644
--- a/lldb/test/Shell/Recognizer/verbose_trap.test
+++ b/lldb/test/Shell/Recognizer/verbose_trap.test
@@ -1,15 +1,15 @@
 # UNSUPPORTED: system-windows
 #
-# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap.cpp -o %t.out -DVERBOSE_TRAP_TEST_CATEGORY=\"Foo\" -DVERBOSE_TRAP_TEST_MESSAGE=\"Bar\"
+# RUN: %clangxx_host -g -O0 %S/Inputs/verbose_trap.cpp -o %t.out -DVERBOSE_TRAP_TEST_CATEGORY=\"Foo\" -DVERBOSE_TRAP_TEST_MESSAGE=\"Bar\"
 # RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK,CHECK-BOTH
 #
-# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap.cpp -o %t.out -DVERBOSE_TRAP_TEST_CATEGORY=\"\" -DVERBOSE_TRAP_TEST_MESSAGE=\"Bar\"
+# RUN: %clangxx_host -g -O0 %S/Inputs/verbose_trap.cpp -o %t.out -DVERBOSE_TRAP_TEST_CATEGORY=\"\" -DVERBOSE_TRAP_TEST_MESSAGE=\"Bar\"
 # RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK,CHECK-MESSAGE_ONLY
 #
-# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap.cpp -o %t.out -DVERBOSE_TRAP_TEST_CATEGORY=\"Foo\" -DVERBOSE_TRAP_TEST_MESSAGE=\"\"
+# RUN: %clangxx_host -g -O0 %S/Inputs/verbose_trap.cpp -o %t.out -DVERBOSE_TRAP_TEST_CATEGORY=\"Foo\" -DVERBOSE_TRAP_TEST_MESSAGE=\"\"
 # RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK,CHECK-CATEGORY_ONLY
 #
-# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap.cpp -o %t.out -DVERBOSE_TRAP_TEST_CATEGORY=\"\" -DVERBOSE_TRAP_TEST_MESSAGE=\"\"
+# RUN: %clangxx_host -g -O0 %S/Inputs/verbose_trap.cpp -o %t.out -DVERBOSE_TRAP_TEST_CATEGORY=\"\" -DVERBOSE_TRAP_TEST_MESSAGE=\"\"
 # RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK,CHECK-NONE
 
 run
diff --git a/lldb/test/Shell/Register/Inputs/x86-multithread-read.cpp b/lldb/test/Shell/Register/Inputs/x86-multithread-read.cpp
index c5f571fc1d2c4..0d2869c0c577c 100644
--- a/lldb/test/Shell/Register/Inputs/x86-multithread-read.cpp
+++ b/lldb/test/Shell/Register/Inputs/x86-multithread-read.cpp
@@ -1,4 +1,5 @@
 #include <cstdint>
+#include <functional>
 #include <mutex>
 #include <thread>
 
diff --git a/lldb/test/Shell/Register/Inputs/x86-multithread-write.cpp b/lldb/test/Shell/Register/Inputs/x86-multithread-write.cpp
index 320f9e938e5bf..1f4e91acc4c03 100644
--- a/lldb/test/Shell/Register/Inputs/x86-multithread-write.cpp
+++ b/lldb/test/Shell/Register/Inputs/x86-multithread-write.cpp
@@ -1,6 +1,7 @@
 #include <cinttypes>
 #include <cstdint>
 #include <cstdio>
+#include <functional>
 #include <mutex>
 #include <thread>
 
diff --git a/lldb/test/Shell/Settings/TestChildCountTruncation.test b/lldb/test/Shell/Settings/TestChildCountTruncation.test
index da6436cb5ca20..b66d0df983069 100644
--- a/lldb/test/Shell/Settings/TestChildCountTruncation.test
+++ b/lldb/test/Shell/Settings/TestChildCountTruncation.test
@@ -2,7 +2,7 @@
 # when target.max-children-count wasn't explicitly set.
 
 # RUN: split-file %s %t
-# RUN: %clang_host -g %t/main.cpp -o %t.out
+# RUN: %clangxx_host -g %t/main.cpp -o %t.out
 # RUN: %lldb -x -b -s %t/dwim-commands.input %t.out -o exit 2>&1 \
 # RUN:       | FileCheck %s --check-prefix=DWIM
 #
diff --git a/lldb/test/Shell/Settings/TestChildDepthTruncation.test b/lldb/test/Shell/Settings/TestChildDepthTruncation.test
index 12f5661600ae7..7e4fbbef9e458 100644
--- a/lldb/test/Shell/Settings/TestChildDepthTruncation.test
+++ b/lldb/test/Shell/Settings/TestChildDepthTruncation.test
@@ -2,7 +2,7 @@
 # when target.max-children-depth wasn't explicitly set.
 
 # RUN: split-file %s %t
-# RUN: %clang_host -g %t/main.cpp -o %t.out
+# RUN: %clangxx_host -g %t/main.cpp -o %t.out
 # RUN: %lldb -x -b -s %t/dwim-commands.input %t.out -o exit 2>&1 \
 # RUN:       | FileCheck %s --check-prefix=DWIM
 #
diff --git a/lldb/test/Shell/Settings/TestCxxFrameFormat.test b/lldb/test/Shell/Settings/TestCxxFrameFormat.test
index d70db582e9750..3ee92d53492fb 100644
--- a/lldb/test/Shell/Settings/TestCxxFrameFormat.test
+++ b/lldb/test/Shell/Settings/TestCxxFrameFormat.test
@@ -3,7 +3,7 @@
 # Test the plugin.cplusplus.display.function-name-format setting.
 
 # RUN: split-file %s %t
-# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out
+# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out
 # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \
 # RUN:       | FileCheck %s
 
diff --git a/lldb/test/Shell/Settings/TestCxxFrameFormatEmpty.test b/lldb/test/Shell/Settings/TestCxxFrameFormatEmpty.test
index 0a6d2723ded34..a0550b733d781 100644
--- a/lldb/test/Shell/Settings/TestCxxFrameFormatEmpty.test
+++ b/lldb/test/Shell/Settings/TestCxxFrameFormatEmpty.test
@@ -5,7 +5,7 @@
 # ${function.name-with-args}.
 
 # RUN: split-file %s %t
-# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out
+# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out
 # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \
 # RUN:       | FileCheck %s
 
diff --git a/lldb/test/Shell/Settings/TestCxxFrameFormatMixedLanguages.test b/lldb/test/Shell/Settings/TestCxxFrameFormatMixedLanguages.test
index bafd36f5ae177..679d6e4d5abe4 100644
--- a/lldb/test/Shell/Settings/TestCxxFrameFormatMixedLanguages.test
+++ b/lldb/test/Shell/Settings/TestCxxFrameFormatMixedLanguages.test
@@ -4,9 +4,9 @@
 # when interoperating multiple languages.
 
 # RUN: split-file %s %t
-# RUN: %clangxx_host -x c -c -g %t/lib.c -o %t.clib.o
+# RUN: %clang_host -x c -c -g %t/lib.c -o %t.clib.o
 # RUN: %clangxx_host -c -g %t/lib.cpp -o %t.cxxlib.o
-# RUN: %clangxx_host %t/main.m %t.cxxlib.o %t.clib.o -o %t.out
+# RUN: %clang_host %t/main.m %t.cxxlib.o %t.clib.o -o %t.out
 # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 | FileCheck %s
 
 #--- lib.c
@@ -47,7 +47,7 @@ break set -n method
 run
 bt
 
-# CHECK: custom-frame 'this affects C++ only' 
-# CHECK: custom-frame 'this affects C++ only' 
-# CHECK: custom-frame 'func' 
-# CHECK: custom-frame 'main' 
+# CHECK: custom-frame 'this affects C++ only'
+# CHECK: custom-frame 'this affects C++ only'
+# CHECK: custom-frame 'func'
+# CHECK: custom-frame 'main'
diff --git a/lldb/test/Shell/Settings/TestCxxFrameFormatPartialFailure.test b/lldb/test/Shell/Settings/TestCxxFrameFormatPartialFailure.test
index e914ff7a010dd..f279f07afcda2 100644
--- a/lldb/test/Shell/Settings/TestCxxFrameFormatPartialFailure.test
+++ b/lldb/test/Shell/Settings/TestCxxFrameFormatPartialFailure.test
@@ -5,7 +5,7 @@
 # were successful.
 
 # RUN: split-file %s %t
-# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out
+# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out
 # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \
 # RUN:       | FileCheck %s
 
diff --git a/lldb/test/Shell/Settings/TestFrameFormatFunctionBasename.test b/lldb/test/Shell/Settings/TestFrameFormatFunctionBasename.test
index c0008e50927b1..56ec09e2f951d 100644
--- a/lldb/test/Shell/Settings/TestFrameFormatFunctionBasename.test
+++ b/lldb/test/Shell/Settings/TestFrameFormatFunctionBasename.test
@@ -3,11 +3,11 @@
 # Test the ${function.basename} frame-format variable.
 
 # RUN: split-file %s %t
-# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out
+# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out
 # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \
 # RUN:       | FileCheck %s
 #
-# RUN: %clang_host -O0 %t/main.cpp -o %t-nodebug.out
+# RUN: %clangxx_host -O0 %t/main.cpp -o %t-nodebug.out
 # RUN: %lldb -x -b -s %t/commands.input %t-nodebug.out -o exit 2>&1 \
 # RUN:       | FileCheck %s
 
diff --git a/lldb/test/Shell/Settings/TestFrameFormatFunctionFormattedArguments.test b/lldb/test/Shell/Settings/TestFrameFormatFunctionFormattedArguments.test
index 04f51701a2a2d..f20fc8ca77aeb 100644
--- a/lldb/test/Shell/Settings/TestFrameFormatFunctionFormattedArguments.test
+++ b/lldb/test/Shell/Settings/TestFrameFormatFunctionFormattedArguments.test
@@ -3,11 +3,11 @@
 # Test the ${function.formatted-arguments} frame-format variable.
 
 # RUN: split-file %s %t
-# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out
+# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out
 # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \
 # RUN:       | FileCheck %s
 #
-# RUN: %clang_host -O0 %t/main.cpp -o %t-nodebug.out
+# RUN: %clangxx_host -O0 %t/main.cpp -o %t-nodebug.out
 # RUN: %lldb -x -b -s %t/commands.input %t-nodebug.out -o exit 2>&1 \
 # RUN:       | FileCheck %s --check-prefix=CHECK-NODEBUG
 
diff --git a/lldb/test/Shell/Settings/TestFrameFormatFunctionQualifiers.test b/lldb/test/Shell/Settings/TestFrameFormatFunctionQualifiers.test
index b1dfe834c1deb..d05e60b0e8d10 100644
--- a/lldb/test/Shell/Settings/TestFrameFormatFunctionQualifiers.test
+++ b/lldb/test/Shell/Settings/TestFrameFormatFunctionQualifiers.test
@@ -3,11 +3,11 @@
 # Test the ${function.qualifiers} frame-format variable.
 
 # RUN: split-file %s %t
-# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out
+# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out
 # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \
 # RUN:       | FileCheck %s
 #
-# RUN: %clang_host -O0 %t/main.cpp -o %t-nodebug.out
+# RUN: %clangxx_host -O0 %t/main.cpp -o %t-nodebug.out
 # RUN: %lldb -x -b -s %t/commands.input %t-nodebug.out -o exit 2>&1 \
 # RUN:       | FileCheck %s
 
diff --git a/lldb/test/Shell/Settings/TestFrameFormatFunctionReturn.test b/lldb/test/Shell/Settings/TestFrameFormatFunctionReturn.test
index f913162a1aa66..bb78258aba753 100644
--- a/lldb/test/Shell/Settings/TestFrameFormatFunctionReturn.test
+++ b/lldb/test/Shell/Settings/TestFrameFormatFunctionReturn.test
@@ -4,11 +4,11 @@
 # frame-format variables.
 
 # RUN: split-file %s %t
-# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out
+# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out
 # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \
 # RUN:       | FileCheck %s
 #
-# RUN: %clang_host -O0 %t/main.cpp -o %t-nodebug.out
+# RUN: %clangxx_host -O0 %t/main.cpp -o %t-nodebug.out
 # RUN: %lldb -x -b -s %t/commands.input %t-nodebug.out -o exit 2>&1 \
 # RUN:       | FileCheck %s
 
diff --git a/lldb/test/Shell/Settings/TestFrameFormatFunctionScope.test b/lldb/test/Shell/Settings/TestFrameFormatFunctionScope.test
index a28c16f95a9e2..f4a17661c3602 100644
--- a/lldb/test/Shell/Settings/TestFrameFormatFunctionScope.test
+++ b/lldb/test/Shell/Settings/TestFrameFormatFunctionScope.test
@@ -3,11 +3,11 @@
 # Test the ${function.scope} frame-format variable.
 
 # RUN: split-file %s %t
-# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out
+# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out
 # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \
 # RUN:       | FileCheck %s
 #
-# RUN: %clang_host -O0 %t/main.cpp -o %t-nodebug.out
+# RUN: %clangxx_host -O0 %t/main.cpp -o %t-nodebug.out
 # RUN: %lldb -x -b -s %t/commands.input %t-nodebug.out -o exit 2>&1 \
 # RUN:       | FileCheck %s
 
diff --git a/lldb/test/Shell/Settings/TestFrameFormatFunctionSuffix.test b/lldb/test/Shell/Settings/TestFrameFormatFunctionSuffix.test
index 4609a0412a0ab..5883c722f3336 100644
--- a/lldb/test/Shell/Settings/TestFrameFormatFunctionSuffix.test
+++ b/lldb/test/Shell/Settings/TestFrameFormatFunctionSuffix.test
@@ -3,7 +3,7 @@
 # Test the ${function.suffix} frame-format variable.
 
 # RUN: split-file %s %t
-# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out
+# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out
 # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \
 # RUN:       | FileCheck %s
 
diff --git a/lldb/test/Shell/Settings/TestFrameFormatFunctionTemplateArguments.test b/lldb/test/Shell/Settings/TestFrameFormatFunctionTemplateArguments.test
index ac8a32820c888..a09a9610f48db 100644
--- a/lldb/test/Shell/Settings/TestFrameFormatFunctionTemplateArguments.test
+++ b/lldb/test/Shell/Settings/TestFrameFormatFunctionTemplateArguments.test
@@ -3,11 +3,11 @@
 # Test the ${function.template-arguments} frame-format variable.
 
 # RUN: split-file %s %t
-# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out
+# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out
 # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \
 # RUN:       | FileCheck %s
 #
-# RUN: %clang_host -O0 %t/main.cpp -o %t-nodebug.out
+# RUN: %clangxx_host -O0 %t/main.cpp -o %t-nodebug.out
 # RUN: %lldb -x -b -s %t/commands.input %t-nodebug.out -o exit 2>&1 \
 # RUN:       | FileCheck %s
 
diff --git a/lldb/test/Shell/Settings/TestFrameFunctionInlined.test b/lldb/test/Shell/Settings/TestFrameFunctionInlined.test
index 5db34b4160850..1bb7ab486bcf5 100644
--- a/lldb/test/Shell/Settings/TestFrameFunctionInlined.test
+++ b/lldb/test/Shell/Settings/TestFrameFunctionInlined.test
@@ -6,7 +6,7 @@
 # REQUIRES: (system-windows && lld) || !system-windows
 
 # RUN: split-file %s %t
-# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out %if system-windows %{-fuse-ld=lld%}
+# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out %if system-windows %{-fuse-ld=lld%}
 # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \
 # RUN:       | FileCheck %s
 
diff --git a/lldb/test/Shell/SymbolFile/DWARF/split-dwarf-expression-eval-bug.cpp b/lldb/test/Shell/SymbolFile/DWARF/split-dwarf-expression-eval-bug.cpp
index 4a8004ddd287f..b02eea6bbc4f8 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/split-dwarf-expression-eval-bug.cpp
+++ b/lldb/test/Shell/SymbolFile/DWARF/split-dwarf-expression-eval-bug.cpp
@@ -7,10 +7,10 @@
 
 // UNSUPPORTED: system-darwin, system-windows
 
-// RUN: %clang_host -c -gsplit-dwarf -g %s -o %t1.o -DONE
-// RUN: %clang_host -c -gsplit-dwarf -g %s -o %t2.o -DTWO
-// RUN: %clang_host -c -gsplit-dwarf -g %s -o %t3.o -DTHREE
-// RUN: %clang_host %t1.o %t2.o %t3.o -o %t
+// RUN: %clangxx_host -c -gsplit-dwarf -g %s -o %t1.o -DONE
+// RUN: %clangxx_host -c -gsplit-dwarf -g %s -o %t2.o -DTWO
+// RUN: %clangxx_host -c -gsplit-dwarf -g %s -o %t3.o -DTHREE
+// RUN: %clangxx_host %t1.o %t2.o %t3.o -o %t
 // RUN: %lldb %t -o "br set -n foo" -o run -o "expression bool_in_first_cu" -o exit \
 // RUN:   | FileCheck %s
 
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/apple-index-is-used.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/apple-index-is-used.cpp
index 5bcb2cbcbbe29..8ef2e56ba3d4d 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/x86/apple-index-is-used.cpp
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/apple-index-is-used.cpp
@@ -1,5 +1,5 @@
 // Test that we use the apple indexes.
-// RUN: %clang %s -g -c -o %t --target=x86_64-apple-macosx -gdwarf-4
+// RUN: %clangxx %s -g -c -o %t --target=x86_64-apple-macosx -gdwarf-4
 // RUN: lldb-test symbols %t | FileCheck %s
 
 // CHECK: .apple_names index present
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/debug-names-compressed.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/debug-names-compressed.cpp
index 4dcbb47152203..53c3d3daa40c5 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/x86/debug-names-compressed.cpp
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/debug-names-compressed.cpp
@@ -3,7 +3,7 @@
 
 // REQUIRES: lld, zlib
 
-// RUN: %clang -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames %s
+// RUN: %clangxx -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames %s
 // RUN: ld.lld %t.o -o %t --compress-debug-sections=zlib
 // RUN: llvm-readobj --sections %t | FileCheck %s --check-prefix NAMES
 // RUN: lldb-test symbols --find=variable --name=foo %t | FileCheck %s
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-debug-names.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-debug-names.cpp
index 2b7a928c89a8f..acc34dd41688b 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-debug-names.cpp
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-debug-names.cpp
@@ -6,7 +6,7 @@
 
 // REQUIRES: lld
 
-// RUN: %clang %s -target x86_64-pc-linux -gdwarf-5 -fdebug-types-section \
+// RUN: %clangxx %s -target x86_64-pc-linux -gdwarf-5 -fdebug-types-section \
 // RUN:   -gpubnames -fno-limit-debug-info -c -o %t.o
 // RUN: ld.lld %t.o -o %t
 // RUN: %lldb %t -o "type lookup stype" -b | FileCheck %s --check-prefix=BASE
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-dwo-cross-reference.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-dwo-cross-reference.cpp
index 0e29cb3e7f16e..bc863fb64a9cc 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-dwo-cross-reference.cpp
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-dwo-cross-reference.cpp
@@ -3,9 +3,9 @@
 
 // REQUIRES: lld
 
-// RUN: %clang %s -target x86_64-pc-linux -fno-standalone-debug -g \
+// RUN: %clangxx %s -target x86_64-pc-linux -fno-standalone-debug -g \
 // RUN:   -fdebug-types-section -gsplit-dwarf -c -o %t1.o -DONE
-// RUN: %clang %s -target x86_64-pc-linux -fno-standalone-debug -g \
+// RUN: %clangxx %s -target x86_64-pc-linux -fno-standalone-debug -g \
 // RUN:   -fdebug-types-section -gsplit-dwarf -c -o %t2.o -DTWO
 // RUN: llvm-dwarfdump %t1.dwo -debug-types -debug-info | FileCheck --check-prefix=ONEUNIT %s
 // RUN: llvm-dwarfdump %t2.dwo -debug-types -debug-info | FileCheck --check-prefix=ONEUNIT %s
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/dwarf5-index-is-used.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/dwarf5-index-is-used.cpp
index d6ac23716f6ce..2fdb1d8d7ca7d 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/x86/dwarf5-index-is-used.cpp
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/dwarf5-index-is-used.cpp
@@ -2,7 +2,7 @@
 
 // REQUIRES: lld
 
-// RUN: %clang %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames
+// RUN: %clangxx %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames
 // RUN: ld.lld %t.o -o %t
 // RUN: lldb-test symbols %t | FileCheck %s
 
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/dwarf5-partial-index.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/dwarf5-partial-index.cpp
index ab84415f61b27..a739dfde48aaf 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/x86/dwarf5-partial-index.cpp
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/dwarf5-partial-index.cpp
@@ -3,9 +3,9 @@
 
 // REQUIRES: lld
 
-// RUN: %clang %s -c -o %t-1.o --target=x86_64-pc-linux -DONE -gdwarf-5 -gpubnames
+// RUN: %clangxx %s -c -o %t-1.o --target=x86_64-pc-linux -DONE -gdwarf-5 -gpubnames
 // RUN: llvm-readobj --sections %t-1.o | FileCheck %s --check-prefix NAMES
-// RUN: %clang %s -c -o %t-2.o --target=x86_64-pc-linux -DTWO -gdwarf-5 -gno-pubnames
+// RUN: %clangxx %s -c -o %t-2.o --target=x86_64-pc-linux -DTWO -gdwarf-5 -gno-pubnames
 // RUN: ld.lld %t-1.o %t-2.o -o %t
 // RUN: lldb-test symbols --find=variable --name=foo  %t | FileCheck %s
 
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/dwo-not-found-warning.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/dwo-not-found-warning.cpp
index 929e11f80e34e..36eb299f06630 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/x86/dwo-not-found-warning.cpp
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/dwo-not-found-warning.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang --target=x86_64-pc-linux -g -gsplit-dwarf -c %s -o %t.o
+// RUN: %clangxx --target=x86_64-pc-linux -g -gsplit-dwarf -c %s -o %t.o
 // RUN: rm %t.dwo
 // RUN: %lldb %t.o -o "br set -n main" -o exit 2>&1 | FileCheck %s
 
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-foreign-type-units.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-foreign-type-units.cpp
index 9251930d7d13c..7fbc4f98e7976 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-foreign-type-units.cpp
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-foreign-type-units.cpp
@@ -16,9 +16,9 @@
 // type unit comes from by looking at the DW_AT_dwo_name attribute in the
 // DW_TAG_type_unit.
 
-// RUN: %clang -target x86_64-pc-linux -gdwarf-5 -gsplit-dwarf \
+// RUN: %clangxx -target x86_64-pc-linux -gdwarf-5 -gsplit-dwarf \
 // RUN:   -fdebug-types-section -gpubnames -c %s -o %t.main.o
-// RUN: %clang -target x86_64-pc-linux -gdwarf-5 -gsplit-dwarf -DVARIANT \
+// RUN: %clangxx -target x86_64-pc-linux -gdwarf-5 -gsplit-dwarf -DVARIANT \
 // RUN:   -fdebug-types-section -gpubnames -c %s -o %t.foo.o
 // RUN: ld.lld %t.main.o %t.foo.o -o %t
 
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-index-cache.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-index-cache.cpp
index 3e97c3fb1ebc2..3edcd8f180a15 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-index-cache.cpp
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-index-cache.cpp
@@ -14,8 +14,8 @@
 // complete DWARF index.
 
 // Test that if we don't have .debug_names, that we save a full DWARF index.
-// RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -DMAIN=1 -c %s -o %t.main.o
-// RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -DMAIN=0 -c %s -o %t.foo.o
+// RUN: %clangxx -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -DMAIN=1 -c %s -o %t.main.o
+// RUN: %clangxx -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -DMAIN=0 -c %s -o %t.foo.o
 // RUN: ld.lld %t.main.o %t.foo.o -o %t.nonames
 // RUN: llvm-dwp %t.main.dwo %t.foo.dwo -o %t.nonames.dwp
 // RUN: rm %t.main.dwo %t.foo.dwo
@@ -35,8 +35,8 @@
 
 // Test that if we have one .o file with .debug_names and one without, that we
 // save a partial DWARF index.
-// RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -DMAIN=1 -c %s -o %t.main.o -gpubnames
-// RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -DMAIN=0 -c %s -o %t.foo.o
+// RUN: %clangxx -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -DMAIN=1 -c %s -o %t.main.o -gpubnames
+// RUN: %clangxx -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -DMAIN=0 -c %s -o %t.foo.o
 // RUN: ld.lld %t.main.o %t.foo.o -o %t.somenames
 // RUN: llvm-dwp %t.main.dwo %t.foo.dwo -o %t.somenames.dwp
 // RUN: rm %t.main.dwo %t.foo.dwo
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-separate-debug-file.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-separate-debug-file.cpp
index 888e96bbb10af..f625fda2087db 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-separate-debug-file.cpp
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-separate-debug-file.cpp
@@ -1,7 +1,7 @@
 // REQUIRES: lld, python
 
 // Now test with DWARF5
-// RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -c %s -o %t.dwarf5.o
+// RUN: %clangxx -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -c %s -o %t.dwarf5.o
 // RUN: ld.lld %t.dwarf5.o -o %t.dwarf5
 // RUN: llvm-dwp %t.dwarf5.dwo -o %t.dwarf5.dwp
 // RUN: rm %t.dwarf5.dwo
@@ -64,7 +64,7 @@
 // RUN:   -b %t.dwarf5.debug 2>&1 | FileCheck %s -check-prefix=NODWP
 
 // Now test with DWARF4
-// RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-4 -c %s -o %t.dwarf4.o
+// RUN: %clangxx -target x86_64-pc-linux -gsplit-dwarf -gdwarf-4 -c %s -o %t.dwarf4.o
 // RUN: ld.lld %t.dwarf4.o -o %t.dwarf4
 // RUN: llvm-dwp %t.dwarf4.dwo -o %t.dwarf4.dwp
 // RUN: rm %t.dwarf4.dwo
@@ -128,7 +128,7 @@
 
 // Test if we have a GNU build ID in our main executable and in our debug file,
 // and we have a .dwp file that doesn't, that we can still load our .dwp file.
-// RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -c %s -o %t.o
+// RUN: %clangxx -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -c %s -o %t.o
 // RUN: ld.lld %t.o --build-id=md5 -o %t
 // RUN: llvm-dwp %t.dwo -o %t.dwp
 // RUN: rm %t.dwo
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-function.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-function.cpp
index c42f9fe0b8b52..a00b2bd9506ef 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-function.cpp
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-function.cpp
@@ -1,6 +1,6 @@
 // REQUIRES: lld
 
-// RUN: %clang %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames
+// RUN: %clangxx %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames
 // RUN: ld.lld %t.o -o %t
 // RUN: lldb-test symbols --name=foo --find=function --function-flags=base %t | \
 // RUN:   FileCheck --check-prefix=BASE %s
@@ -19,7 +19,7 @@
 // RUN: lldb-test symbols --name=not_there --find=function %t | \
 // RUN:   FileCheck --check-prefix=EMPTY %s
 //
-// RUN: %clang %s -g -c -o %t --target=x86_64-apple-macosx
+// RUN: %clangxx %s -g -c -o %t --target=x86_64-apple-macosx
 // RUN: lldb-test symbols --name=foo --find=function --function-flags=base %t | \
 // RUN:   FileCheck --check-prefix=BASE %s
 // RUN: lldb-test symbols --name=foo --find=function --function-flags=method %t | \
@@ -39,7 +39,7 @@
 // RUN: lldb-test symbols --name=not_there --find=function %t | \
 // RUN:   FileCheck --check-prefix=EMPTY %s
 
-// RUN: %clang %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames
+// RUN: %clangxx %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames
 // RUN: ld.lld %t.o -o %t
 // RUN: llvm-readobj --sections %t | FileCheck %s --check-prefix NAMES
 // RUN: lldb-test symbols --name=foo --find=function --function-flags=base %t | \
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-namespace.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-namespace.cpp
index 13d50af7ef601..14c73c3e82efb 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-namespace.cpp
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-namespace.cpp
@@ -1,6 +1,6 @@
 // REQUIRES: lld
 
-// RUN: %clang %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames
+// RUN: %clangxx %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames
 // RUN: ld.lld %t.o -o %t
 // RUN: lldb-test symbols --name=foo --find=namespace %t | \
 // RUN:   FileCheck --check-prefix=FOO %s
@@ -9,7 +9,7 @@
 // RUN: lldb-test symbols --name=not_there --find=namespace %t | \
 // RUN:   FileCheck --check-prefix=EMPTY %s
 //
-// RUN: %clang %s -g -c -o %t --target=x86_64-apple-macosx
+// RUN: %clangxx %s -g -c -o %t --target=x86_64-apple-macosx
 // RUN: lldb-test symbols --name=foo --find=namespace %t | \
 // RUN:   FileCheck --check-prefix=FOO %s
 // RUN: lldb-test symbols --name=foo --find=namespace --context=context %t | \
@@ -17,7 +17,7 @@
 // RUN: lldb-test symbols --name=not_there --find=namespace %t | \
 // RUN:   FileCheck --check-prefix=EMPTY %s
 
-// RUN: %clang %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames
+// RUN: %clangxx %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames
 // RUN: ld.lld %t.o -o %t
 // RUN: llvm-readobj --sections %t | FileCheck %s --check-prefix NAMES
 // RUN: lldb-test symbols --name=foo --find=namespace %t | \
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-type.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-type.cpp
index af49206608723..315fab344dfee 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-type.cpp
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-type.cpp
@@ -1,6 +1,6 @@
 // REQUIRES: lld
 
-// RUN: %clang %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames
+// RUN: %clangxx %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames
 // RUN: ld.lld %t.o -o %t
 // RUN: lldb-test symbols --name=foo --find=type %t | \
 // RUN:   FileCheck --check-prefix=NAME %s
@@ -11,7 +11,7 @@
 // RUN: lldb-test symbols --name=not_there --find=type %t | \
 // RUN:   FileCheck --check-prefix=EMPTY %s
 //
-// RUN: %clang %s -g -c -o %t --target=x86_64-apple-macosx
+// RUN: %clangxx %s -g -c -o %t --target=x86_64-apple-macosx
 // RUN: lldb-test symbols --name=foo --find=type %t | \
 // RUN:   FileCheck --check-prefix=NAME %s
 // RUN: lldb-test symbols --name=::foo --find=type %t | \
@@ -21,7 +21,7 @@
 // RUN: lldb-test symbols --name=not_there --find=type %t | \
 // RUN:   FileCheck --check-prefix=EMPTY %s
 
-// RUN: %clang %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames
+// RUN: %clangxx %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames
 // RUN: ld.lld %t.o -o %t
 // RUN: llvm-readobj --sections %t | FileCheck %s --check-prefix NAMES
 // RUN: lldb-test symbols --name=foo --find=type %t | \
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-variable.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-variable.cpp
index e46fa14489d32..b6e2252c28402 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-variable.cpp
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-variable.cpp
@@ -1,6 +1,6 @@
 // REQUIRES: lld
 
-// RUN: %clang %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames
+// RUN: %clangxx %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames
 // RUN: ld.lld %t.o -o %t
 // RUN: lldb-test symbols --name=foo --find=variable --context=context %t | \
 // RUN:   FileCheck --check-prefix=CONTEXT %s
@@ -11,7 +11,7 @@
 // RUN: lldb-test symbols --name=not_there --find=variable %t | \
 // RUN:   FileCheck --check-prefix=EMPTY %s
 //
-// RUN: %clang %s -g -c -o %t --target=x86_64-apple-macosx
+// RUN: %clangxx %s -g -c -o %t --target=x86_64-apple-macosx
 // RUN: lldb-test symbols --name=foo --find=variable --context=context %t | \
 // RUN:   FileCheck --check-prefix=CONTEXT %s
 // RUN: lldb-test symbols --name=foo --find=variable %t | \
@@ -21,7 +21,7 @@
 // RUN: lldb-test symbols --name=not_there --find=variable %t | \
 // RUN:   FileCheck --check-prefix=EMPTY %s
 //
-// RUN: %clang %s -g -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames
+// RUN: %clangxx %s -g -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames
 // RUN: ld.lld %t.o -o %t
 // RUN: llvm-readobj --sections %t | FileCheck %s --check-prefix NAMES
 // RUN: lldb-test symbols --name=foo --find=variable --context=context %t | \
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-function-regex.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-function-regex.cpp
index be267596fb372..5c7ad844f6603 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-function-regex.cpp
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-function-regex.cpp
@@ -1,13 +1,13 @@
 // REQUIRES: lld
 
-// RUN: %clang %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames
+// RUN: %clangxx %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames
 // RUN: ld.lld %t.o -o %t
 // RUN: lldb-test symbols --name=f.o --regex --find=function %t | FileCheck %s
 //
-// RUN: %clang %s -g -c -o %t --target=x86_64-apple-macosx
+// RUN: %clangxx %s -g -c -o %t --target=x86_64-apple-macosx
 // RUN: lldb-test symbols --name=f.o --regex --find=function %t | FileCheck %s
 
-// RUN: %clang %s -g -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames
+// RUN: %clangxx %s -g -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames
 // RUN: ld.lld %t.o -o %t
 // RUN: llvm-readobj --sections %t | FileCheck %s --check-prefix NAMES
 // RUN: lldb-test symbols --name=f.o --regex --find=function %t | FileCheck %s
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-method-local-struct.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-method-local-struct.cpp
index 3da4a4a23f8a8..46553a83081e4 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-method-local-struct.cpp
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-method-local-struct.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang %s -g -c -o %t --target=x86_64-apple-macosx
+// RUN: %clangxx %s -g -c -o %t --target=x86_64-apple-macosx
 // RUN: lldb-test symbols --name=foo --find=function --function-flags=method %t | \
 // RUN:   FileCheck %s
 
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-method.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-method.cpp
index 9f8b3df2f31a7..26faf8907b4a9 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-method.cpp
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-method.cpp
@@ -1,15 +1,15 @@
 // REQUIRES: lld
 
-// RUN: %clang %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames
+// RUN: %clangxx %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames
 // RUN: ld.lld %t.o -o %t
 // RUN: lldb-test symbols --name=foo --find=function --function-flags=method %t | \
 // RUN:   FileCheck %s
 //
-// RUN: %clang %s -g -c -o %t --target=x86_64-apple-macosx
+// RUN: %clangxx %s -g -c -o %t --target=x86_64-apple-macosx
 // RUN: lldb-test symbols --name=foo --find=function --function-flags=method %t | \
 // RUN:   FileCheck %s
 
-// RUN: %clang %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames
+// RUN: %clangxx %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames
 // RUN: ld.lld %t.o -o %t
 // RUN: llvm-readobj --sections %t | FileCheck %s --check-prefix NAMES
 // RUN: lldb-test symbols --name=foo --find=function --function-flags=method %t | \
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-qualified-variable.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-qualified-variable.cpp
index 1ad3e7fbadf51..e3f9ce308b75c 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-qualified-variable.cpp
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-qualified-variable.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang %s -g -c -o %t --target=x86_64-apple-macosx
+// RUN: %clangxx %s -g -c -o %t --target=x86_64-apple-macosx
 // RUN: lldb-test symbols --name=A::foo --find=variable %t | FileCheck %s
 
 // CHECK: Found 1 variables:
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-variable-dwo.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-variable-dwo.cpp
index b5d35e4f7883f..250b34377acda 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-variable-dwo.cpp
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-variable-dwo.cpp
@@ -1,9 +1,9 @@
 // REQUIRES: lld
 
-// RUN: %clang %s -gdwarf-5 -gpubnames -gsplit-dwarf -c -emit-llvm -o - --target=x86_64-pc-linux -DONE | \
+// RUN: %clangxx %s -gdwarf-5 -gpubnames -gsplit-dwarf -c -emit-llvm -o - --target=x86_64-pc-linux -DONE | \
 // RUN:   llc -filetype=obj -split-dwarf-file=%t-1.dwo -o %t-1.o
 // RUN: llvm-objcopy --split-dwo=%t-1.dwo %t-1.o
-// RUN: %clang %s -gdwarf-5 -gpubnames -gsplit-dwarf -c -emit-llvm -o - --target=x86_64-pc-linux -DTWO | \
+// RUN: %clangxx %s -gdwarf-5 -gpubnames -gsplit-dwarf -c -emit-llvm -o - --target=x86_64-pc-linux -DTWO | \
 // RUN:   llc -filetype=obj -split-dwarf-file=%t-2.dwo -o %t-2.o
 // RUN: llvm-objcopy --split-dwo=%t-2.dwo %t-2.o
 // RUN: ld.lld %t-1.o %t-2.o -o %t
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-variable-file.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-variable-file.cpp
index f1a9a4eb12d07..3a8cf89ac367b 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-variable-file.cpp
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-variable-file.cpp
@@ -1,7 +1,7 @@
 // REQUIRES: lld
 
-// RUN: %clang -g -c -o %t-1.o --target=x86_64-pc-linux -gno-pubnames %s
-// RUN: %clang -g -c -o %t-2.o --target=x86_64-pc-linux -gno-pubnames %S/Inputs/find-variable-file-2.cpp
+// RUN: %clangxx -g -c -o %t-1.o --target=x86_64-pc-linux -gno-pubnames %s
+// RUN: %clangxx -g -c -o %t-2.o --target=x86_64-pc-linux -gno-pubnames %S/Inputs/find-variable-file-2.cpp
 // RUN: ld.lld %t-1.o %t-2.o -o %t
 // RUN: lldb-test symbols --file=find-variable-file.cpp --find=variable %t | \
 // RUN:   FileCheck --check-prefix=ONE %s
@@ -10,16 +10,16 @@
 
 // Run the same test with split-dwarf. This is interesting because the two
 // split compile units will have the same offset (0).
-// RUN: %clang -g -c -o %t-1.o --target=x86_64-pc-linux -gsplit-dwarf %s
-// RUN: %clang -g -c -o %t-2.o --target=x86_64-pc-linux -gsplit-dwarf %S/Inputs/find-variable-file-2.cpp
+// RUN: %clangxx -g -c -o %t-1.o --target=x86_64-pc-linux -gsplit-dwarf %s
+// RUN: %clangxx -g -c -o %t-2.o --target=x86_64-pc-linux -gsplit-dwarf %S/Inputs/find-variable-file-2.cpp
 // RUN: ld.lld %t-1.o %t-2.o -o %t
 // RUN: lldb-test symbols --file=find-variable-file.cpp --find=variable %t | \
 // RUN:   FileCheck --check-prefix=ONE %s
 // RUN: lldb-test symbols --file=find-variable-file-2.cpp --find=variable %t | \
 // RUN:   FileCheck --check-prefix=TWO %s
 
-// RUN: %clang -c -o %t-1.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames %s
-// RUN: %clang -c -o %t-2.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames %S/Inputs/find-variable-file-2.cpp
+// RUN: %clangxx -c -o %t-1.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames %s
+// RUN: %clangxx -c -o %t-2.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames %S/Inputs/find-variable-file-2.cpp
 // RUN: ld.lld %t-1.o %t-2.o -o %t
 // RUN: llvm-readobj --sections %t | FileCheck %s --check-prefix NAMES
 // RUN: lldb-test symbols --file=find-variable-file.cpp --find=variable %t | \
@@ -29,9 +29,9 @@
 
 // Run the same test with split dwarf and pubnames to check whether we can find
 // the compile unit using the name index if it is split.
-// RUN: %clang -c -o %t-1.o --target=x86_64-pc-linux -gdwarf-5 -gsplit-dwarf -gpubnames %s
-// RUN: %clang -c -o %t-2.o --target=x86_64-pc-linux -gdwarf-5 -gsplit-dwarf -gpubnames %S/Inputs/find-variable-file-2.cpp
-// RUN: %clang -c -o %t-3.o --target=x86_64-pc-linux -gdwarf-5 -gsplit-dwarf -gpubnames %S/Inputs/find-variable-file-3.cpp
+// RUN: %clangxx -c -o %t-1.o --target=x86_64-pc-linux -gdwarf-5 -gsplit-dwarf -gpubnames %s
+// RUN: %clangxx -c -o %t-2.o --target=x86_64-pc-linux -gdwarf-5 -gsplit-dwarf -gpubnames %S/Inputs/find-variable-file-2.cpp
+// RUN: %clangxx -c -o %t-3.o --target=x86_64-pc-linux -gdwarf-5 -gsplit-dwarf -gpubnames %S/Inputs/find-variable-file-3.cpp
 // RUN: ld.lld %t-1.o %t-2.o %t-3.o -o %t
 // RUN: llvm-readobj --sections %t | FileCheck %s --check-prefix NAMES
 // RUN: lldb-test symbols --file=find-variable-file.cpp --find=variable %t | \
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/member-pointers.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/member-pointers.cpp
index a12892305798a..00805770af11e 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/x86/member-pointers.cpp
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/member-pointers.cpp
@@ -1,7 +1,7 @@
 // REQUIRES: lld
 
 // Itanium ABI:
-// RUN: %clang --target=x86_64-pc-linux -gdwarf -c -o %t_linux.o %s
+// RUN: %clangxx --target=x86_64-pc-linux -gdwarf -c -o %t_linux.o %s
 // RUN: %lldb -f %t_linux.o -b -o "target variable s1 s2 m1 m2 v1 v2 v3 v4" | FileCheck --check-prefix=CHECK-GNU %s
 //
 // CHECK-GNU: (void (Single1::*)()) s1 = 0x00000000000000000000000000000000
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/module-ownership.mm b/lldb/test/Shell/SymbolFile/DWARF/x86/module-ownership.mm
index 2dec109a781ca..27aa1365ab54c 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/x86/module-ownership.mm
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/module-ownership.mm
@@ -1,5 +1,5 @@
 // RUN: rm -rf %t.cache
-// RUN: %clang --target=x86_64-apple-macosx -g -gmodules -Wno-objc-root-class \
+// RUN: %clangxx --target=x86_64-apple-macosx -g -gmodules -Wno-objc-root-class \
 // RUN:    -fmodules -fmodules-cache-path=%t.cache \
 // RUN:    -c -o %t.o %s -I%S/Inputs
 // RUN: lldb-test symbols -dump-clang-ast %t.o | FileCheck --check-prefix CHECK-ANON-S1 %s
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-with-bitfields.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-with-bitfields.cpp
index 297fb82caee5f..8f530c803a40c 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-with-bitfields.cpp
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-with-bitfields.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang --target=x86_64-apple-macosx -c -gdwarf -o %t %s
+// RUN: %clangxx --target=x86_64-apple-macosx -c -gdwarf -o %t %s
 // RUN: %lldb %t \
 // RUN:   -o "target var global" \
 // RUN:   -o "target var global2" \
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/type-definition-search.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/type-definition-search.cpp
index 5a40a6e0fbc27..5ab45eefd2211 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/x86/type-definition-search.cpp
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/type-definition-search.cpp
@@ -4,18 +4,18 @@
 
 // REQUIRES: lld
 
-// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-n-a.o -g -gsimple-template-names -DFILE_A
-// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-n-b.o -g -gsimple-template-names -DFILE_B
+// RUN: %clangxx --target=x86_64-pc-linux -c %s -o %t-n-a.o -g -gsimple-template-names -DFILE_A
+// RUN: %clangxx --target=x86_64-pc-linux -c %s -o %t-n-b.o -g -gsimple-template-names -DFILE_B
 // RUN: ld.lld %t-n-a.o %t-n-b.o -o %t-n
 // RUN: %lldb %t-n -o "target variable --ptr-depth 1 --show-types both_a both_b" -o exit | FileCheck %s
 
-// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-t-a.o -g -fdebug-types-section -DFILE_A
-// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-t-b.o -g -fdebug-types-section -DFILE_B
+// RUN: %clangxx --target=x86_64-pc-linux -c %s -o %t-t-a.o -g -fdebug-types-section -DFILE_A
+// RUN: %clangxx --target=x86_64-pc-linux -c %s -o %t-t-b.o -g -fdebug-types-section -DFILE_B
 // RUN: ld.lld %t-t-a.o %t-t-b.o -o %t-t
 // RUN: %lldb %t-t -o "target variable --ptr-depth 1 --show-types both_a both_b" -o exit | FileCheck %s
 
-// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-tn-a.o -g -fdebug-types-section -gsimple-template-names -DFILE_A
-// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-tn-b.o -g -fdebug-types-section -gsimple-template-names -DFILE_B
+// RUN: %clangxx --target=x86_64-pc-linux -c %s -o %t-tn-a.o -g -fdebug-types-section -gsimple-template-names -DFILE_A
+// RUN: %clangxx --target=x86_64-pc-linux -c %s -o %t-tn-b.o -g -fdebug-types-section -gsimple-template-names -DFILE_B
 // RUN: ld.lld %t-tn-a.o %t-tn-b.o -o %t-tn
 // RUN: %lldb %t-tn -o "target variable --ptr-depth 1 --show-types both_a both_b" -o exit | FileCheck %s
 
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/type-unit-same-basename.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/type-unit-same-basename.cpp
index f7f5a30aaba9e..f9fd5b5e52250 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/x86/type-unit-same-basename.cpp
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/type-unit-same-basename.cpp
@@ -5,8 +5,8 @@
 
 // REQUIRES: lld
 
-// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-a.o -g -fdebug-types-section -flimit-debug-info -DFILE_A
-// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-b.o -g -fdebug-types-section -flimit-debug-info -DFILE_B
+// RUN: %clangxx --target=x86_64-pc-linux -c %s -o %t-a.o -g -fdebug-types-section -flimit-debug-info -DFILE_A
+// RUN: %clangxx --target=x86_64-pc-linux -c %s -o %t-b.o -g -fdebug-types-section -flimit-debug-info -DFILE_B
 // RUN: ld.lld -z undefs %t-a.o %t-b.o -o %t
 // RUN: %lldb %t -o "target variable x" -o exit | FileCheck %s
 
diff --git a/lldb/test/Shell/SymbolFile/NativePDB/native-setting.cpp b/lldb/test/Shell/SymbolFile/NativePDB/native-setting.cpp
index dc26ec8d30cb4..91f451fd0dadc 100644
--- a/lldb/test/Shell/SymbolFile/NativePDB/native-setting.cpp
+++ b/lldb/test/Shell/SymbolFile/NativePDB/native-setting.cpp
@@ -8,9 +8,9 @@
 // RUN: env LLDB_USE_NATIVE_PDB_READER=0 %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV0 %s
 // RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV1 %s
 
-// RUN: env LLDB_USE_NATIVE_PDB_READER=foo %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV0 %s
-// RUN: env LLDB_USE_NATIVE_PDB_READER=42 %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV0 %s
-// RUN: env LLDB_USE_NATIVE_PDB_READER=-1 %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV0 %s
+// RUN: env LLDB_USE_NATIVE_PDB_READER=foo %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV1 %s
+// RUN: env LLDB_USE_NATIVE_PDB_READER=42 %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV1 %s
+// RUN: env LLDB_USE_NATIVE_PDB_READER=-1 %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV1 %s
 
 // RUN: env LLDB_USE_NATIVE_PDB_READER=0 %lldb \
 // RUN:     -o 'settings set plugin.symbol-file.pdb.reader dia' \
diff --git a/lldb/test/Shell/SymbolFile/NativePDB/symtab.cpp b/lldb/test/Shell/SymbolFile/NativePDB/symtab.cpp
index beb5ae2f90256..75c59c560fad9 100644
--- a/lldb/test/Shell/SymbolFile/NativePDB/symtab.cpp
+++ b/lldb/test/Shell/SymbolFile/NativePDB/symtab.cpp
@@ -42,18 +42,18 @@ int main(int argc, char **argv) {
   return ns::a_function() + b.b_func();
 }
 
-// CHECK-DAG: Code {{.*}} main
-// CHECK-DAG: Code {{.*}} ?b_func@?$B@F@ns@@QEBAHXZ
-// CHECK-DAG: Code {{.*}} ?something@A@@QEAAXXZ
-// CHECK-DAG: Code {{.*}} ??_GDyn@ns@@UEAAPEAXI@Z
-// CHECK-DAG: Code {{.*}} ??2@YAPEAX_K@Z
-// CHECK-DAG: Code {{.*}} ??3@YAXPEAX_K@Z
-// CHECK-DAG: Code {{.*}} ?static_fn@C@?$B@H@ns@@SAHXZ
-// CHECK-DAG: Code {{.*}} ?a_function@ns@@YAHXZ
-// CHECK-DAG: Code {{.*}} ?static_fn@C@?$B@_N@ns@@SAHXZ
-// CHECK-DAG: Code {{.*}} ??1Dyn@ns@@UEAA@XZ
-// CHECK-DAG: Code {{.*}} ??0Dyn@ns@@QEAA@XZ
-// CHECK-DAG: Data {{.*}} ?global_int@@3HA
-// CHECK-DAG: Data {{.*}} ??_7Dyn@ns@@6B@
-// CHECK-DAG: Data {{.*}} ?global_a@@3UA@@A
-// CHECK-DAG: Data {{.*}} ?global_c@@3UC@?$B@_J@ns@@A
+// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 main
+// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ?b_func@?$B@F@ns@@QEBAHXZ
+// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ?something@A@@QEAAXXZ
+// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ??_GDyn@ns@@UEAAPEAXI@Z
+// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ??2@YAPEAX_K@Z
+// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ??3@YAXPEAX_K@Z
+// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ?static_fn@C@?$B@H@ns@@SAHXZ
+// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ?a_function@ns@@YAHXZ
+// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ?static_fn@C@?$B@_N@ns@@SAHXZ
+// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ??1Dyn@ns@@UEAA@XZ
+// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ??0Dyn@ns@@QEAA@XZ
+// CHECK-DAG: Data 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ?global_int@@3HA
+// CHECK-DAG: Data 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ??_7Dyn@ns@@6B@
+// CHECK-DAG: Data 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ?global_a@@3UA@@A
+// CHECK-DAG: Data 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ?global_c@@3UC@?$B@_J@ns@@A
diff --git a/lldb/test/Shell/SymbolFile/PDB/function-nested-block.test b/lldb/test/Shell/SymbolFile/PDB/function-nested-block.test
index 4a2355bf23c9a..a18955b18151f 100644
--- a/lldb/test/Shell/SymbolFile/PDB/function-nested-block.test
+++ b/lldb/test/Shell/SymbolFile/PDB/function-nested-block.test
@@ -1,7 +1,9 @@
 REQUIRES: system-windows, lld
 RUN: %build --compiler=clang-cl --nodefaultlib --output=%t.exe %S/Inputs/FunctionNestedBlockTest.cpp
-RUN: lldb-test symbols -find=function -file FunctionNestedBlockTest.cpp -line 4 %t.exe | FileCheck --check-prefix=CHECK-FUNCTION %s
-RUN: lldb-test symbols -find=block -file FunctionNestedBlockTest.cpp -line 4 %t.exe | FileCheck --check-prefix=CHECK-BLOCK %s
+RUN: env LLDB_USE_NATIVE_PDB_READER=0 lldb-test symbols -find=function -file FunctionNestedBlockTest.cpp -line 4 %t.exe | FileCheck --check-prefix=CHECK-FUNCTION %s
+RUN: env LLDB_USE_NATIVE_PDB_READER=0 lldb-test symbols -find=block -file FunctionNestedBlockTest.cpp -line 4 %t.exe | FileCheck --check-prefix=CHECK-BLOCK %s
+RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols -find=function -file FunctionNestedBlockTest.cpp -line 4 %t.exe | FileCheck --check-prefix=CHECK-FUNCTION %s
+RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols -find=block -file FunctionNestedBlockTest.cpp -line 4 %t.exe | FileCheck --check-prefix=CHECK-BLOCK %s
 
 CHECK-FUNCTION: Found 1 functions:
 CHECK-FUNCTION: name = "main"
diff --git a/lldb/test/Shell/SymbolFile/PDB/native-setting.cpp b/lldb/test/Shell/SymbolFile/PDB/native-setting.cpp
index f5e54592b0b31..54b7f28a71259 100644
--- a/lldb/test/Shell/SymbolFile/PDB/native-setting.cpp
+++ b/lldb/test/Shell/SymbolFile/PDB/native-setting.cpp
@@ -8,9 +8,9 @@
 // RUN: env LLDB_USE_NATIVE_PDB_READER=0 %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV0 %s
 // RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV1 %s
 
-// RUN: env LLDB_USE_NATIVE_PDB_READER=foo %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV0 %s
-// RUN: env LLDB_USE_NATIVE_PDB_READER=42 %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV0 %s
-// RUN: env LLDB_USE_NATIVE_PDB_READER=-1 %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV0 %s
+// RUN: env LLDB_USE_NATIVE_PDB_READER=foo %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV1 %s
+// RUN: env LLDB_USE_NATIVE_PDB_READER=42 %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV1 %s
+// RUN: env LLDB_USE_NATIVE_PDB_READER=-1 %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV1 %s
 
 // RUN: env LLDB_USE_NATIVE_PDB_READER=0 %lldb \
 // RUN:     -o 'settings set plugin.symbol-file.pdb.reader dia' \
@@ -36,7 +36,7 @@
 // NO-ENV-NOT: warning:
 // NO-ENV: (lldb) target modules dump symfile
 // NO-ENV: Dumping debug symbols for 1 modules.
-// NO-ENV: SymbolFile pdb
+// NO-ENV: SymbolFile native-pdb
 
 // ENV0-NOT: warning:
 // ENV0: (lldb) target modules dump symfile
diff --git a/lldb/tools/lldb-dap/CMakeLists.txt b/lldb/tools/lldb-dap/CMakeLists.txt
index 7db334ca56bcf..dd1bbbdddfc59 100644
--- a/lldb/tools/lldb-dap/CMakeLists.txt
+++ b/lldb/tools/lldb-dap/CMakeLists.txt
@@ -1,9 +1,6 @@
 # We need to include the llvm components we depend on manually, as liblldb does
 # not re-export those.
 set(LLVM_LINK_COMPONENTS Support)
-set(LLVM_TARGET_DEFINITIONS Options.td)
-tablegen(LLVM Options.inc -gen-opt-parser-defs)
-add_public_tablegen_target(LLDBDAPOptionsTableGen)
 
 add_lldb_library(lldbDAP
   Breakpoint.cpp
diff --git a/lldb/tools/lldb-dap/EventHelper.cpp b/lldb/tools/lldb-dap/EventHelper.cpp
index c5d5f2bb59b42..12d9e21c52ab3 100644
--- a/lldb/tools/lldb-dap/EventHelper.cpp
+++ b/lldb/tools/lldb-dap/EventHelper.cpp
@@ -176,7 +176,7 @@ llvm::Error SendThreadStoppedEvent(DAP &dap, bool on_entry) {
 
   llvm::DenseSet<lldb::tid_t> old_thread_ids;
   old_thread_ids.swap(dap.thread_ids);
-  uint32_t stop_id = process.GetStopID();
+  uint32_t stop_id = on_entry ? 0 : process.GetStopID();
   const uint32_t num_threads = process.GetNumThreads();
 
   // First make a pass through the threads to see if the focused thread
diff --git a/lldb/tools/lldb-dap/Handler/AttachRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/AttachRequestHandler.cpp
index 371349a26866e..490513fe8a0b8 100644
--- a/lldb/tools/lldb-dap/Handler/AttachRequestHandler.cpp
+++ b/lldb/tools/lldb-dap/Handler/AttachRequestHandler.cpp
@@ -124,6 +124,8 @@ Error AttachRequestHandler::Run(const AttachRequestArguments &args) const {
       attach_info.SetWaitForLaunch(args.waitFor, /*async=*/false);
       dap.target.Attach(attach_info, error);
     }
+    if (error.Fail())
+      return ToError(error);
   }
 
   // Make sure the process is attached and stopped.
diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp
index 2780a5b7748e8..1a3a6701b194d 100644
--- a/lldb/tools/lldb-dap/JSONUtils.cpp
+++ b/lldb/tools/lldb-dap/JSONUtils.cpp
@@ -711,7 +711,7 @@ llvm::json::Value CreateThreadStopped(DAP &dap, lldb::SBThread &thread,
     break;
   }
   if (stop_id == 0)
-    body.try_emplace("reason", "entry");
+    body["reason"] = "entry";
   const lldb::tid_t tid = thread.GetThreadID();
   body.try_emplace("threadId", (int64_t)tid);
   // If no description has been set, then set it to the default thread stopped
diff --git a/lldb/tools/lldb-dap/tool/CMakeLists.txt b/lldb/tools/lldb-dap/tool/CMakeLists.txt
index b39a4ed9c40e7..5335d25c5d450 100644
--- a/lldb/tools/lldb-dap/tool/CMakeLists.txt
+++ b/lldb/tools/lldb-dap/tool/CMakeLists.txt
@@ -1,3 +1,7 @@
+set(LLVM_TARGET_DEFINITIONS Options.td)
+tablegen(LLVM Options.inc -gen-opt-parser-defs)
+add_public_tablegen_target(LLDBDAPOptionsTableGen)
+
 add_lldb_tool(lldb-dap
   lldb-dap.cpp
 
diff --git a/lldb/tools/lldb-dap/Options.td b/lldb/tools/lldb-dap/tool/Options.td
similarity index 100%
rename from lldb/tools/lldb-dap/Options.td
rename to lldb/tools/lldb-dap/tool/Options.td
diff --git a/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp b/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp
index 6f5d9fd97ee28..3d0e2d8a62482 100644
--- a/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp
+++ b/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp
@@ -90,7 +90,8 @@ PyObject *lldb_private::python::SWIGBridge::LLDBSwigPython_GetChildAtIndex(
   return nullptr;
 }
 
-int lldb_private::python::SWIGBridge::LLDBSwigPython_GetIndexOfChildWithName(
+uint32_t
+lldb_private::python::SWIGBridge::LLDBSwigPython_GetIndexOfChildWithName(
     PyObject *implementor, const char *child_name) {
   return 0;
 }
diff --git a/lldb/unittests/Symbol/TestTypeSystemClang.cpp b/lldb/unittests/Symbol/TestTypeSystemClang.cpp
index 1981e912fa4fa..155fc743934c2 100644
--- a/lldb/unittests/Symbol/TestTypeSystemClang.cpp
+++ b/lldb/unittests/Symbol/TestTypeSystemClang.cpp
@@ -52,6 +52,12 @@ class TestTypeSystemClang : public testing::Test {
     return ClangUtil::GetQualType(
         m_ast->GetBuiltinTypeByName(ConstString(name)));
   }
+
+  CompilerType GetBuiltinTypeForDWARFEncodingAndBitSize(
+      llvm::StringRef type_name, uint32_t encoding, uint32_t bit_size) const {
+    return m_ast->GetBuiltinTypeForDWARFEncodingAndBitSize(type_name, encoding,
+                                                           bit_size);
+  }
 };
 
 TEST_F(TestTypeSystemClang, TestGetBasicTypeFromEnum) {
@@ -238,6 +244,91 @@ TEST_F(TestTypeSystemClang, TestBuiltinTypeForEncodingAndBitSize) {
   VerifyEncodingAndBitSize(*m_ast, eEncodingIEEE754, 64);
 }
 
+TEST_F(TestTypeSystemClang, TestGetBuiltinTypeForDWARFEncodingAndBitSize) {
+  EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize(
+                   "_BitIn", llvm::dwarf::DW_ATE_signed, 2)
+                   .IsValid());
+  EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize(
+                   "BitInt", llvm::dwarf::DW_ATE_signed, 2)
+                   .IsValid());
+  EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize(
+                   "_BitInt(2)", llvm::dwarf::DW_ATE_signed_char, 2)
+                   .IsValid());
+  EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize(
+                   "_BitInt", llvm::dwarf::DW_ATE_signed_char, 2)
+                   .IsValid());
+  EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize(
+                   "_BitInt(2)", llvm::dwarf::DW_ATE_unsigned, 2)
+                   .IsValid());
+  EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize(
+                   "_BitInt", llvm::dwarf::DW_ATE_unsigned, 2)
+                   .IsValid());
+
+  EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize(
+                "_BitInt(2)", llvm::dwarf::DW_ATE_signed, 2)
+                .GetTypeName(),
+            "_BitInt(2)");
+  EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize(
+                "_BitInt", llvm::dwarf::DW_ATE_signed, 2)
+                .GetTypeName(),
+            "_BitInt(2)");
+  EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize(
+                "_BitInt(129)", llvm::dwarf::DW_ATE_signed, 129)
+                .GetTypeName(),
+            "_BitInt(129)");
+  EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize(
+                "_BitInt", llvm::dwarf::DW_ATE_signed, 129)
+                .GetTypeName(),
+            "_BitInt(129)");
+
+  EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize(
+                   "unsigned _BitIn", llvm::dwarf::DW_ATE_unsigned, 2)
+                   .IsValid());
+  EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize(
+                   "unsigned BitInt", llvm::dwarf::DW_ATE_unsigned, 2)
+                   .IsValid());
+  EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize(
+                   "unsigned _BitInt(2)", llvm::dwarf::DW_ATE_unsigned_char, 2)
+                   .IsValid());
+  EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize(
+                   "unsigned _BitInt", llvm::dwarf::DW_ATE_unsigned_char, 2)
+                   .IsValid());
+  EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize(
+                   "unsigned _BitInt(2)", llvm::dwarf::DW_ATE_signed, 2)
+                   .IsValid());
+  EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize(
+                   "unsigned _BitInt", llvm::dwarf::DW_ATE_signed, 2)
+                   .IsValid());
+
+  EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize(
+                "unsigned _BitInt(2)", llvm::dwarf::DW_ATE_unsigned, 2)
+                .GetTypeName(),
+            "unsigned _BitInt(2)");
+  EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize(
+                "unsigned _BitInt", llvm::dwarf::DW_ATE_unsigned, 2)
+                .GetTypeName(),
+            "unsigned _BitInt(2)");
+  EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize(
+                "unsigned _BitInt(129)", llvm::dwarf::DW_ATE_unsigned, 129)
+                .GetTypeName(),
+            "unsigned _BitInt(129)");
+  EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize(
+                "unsigned _BitInt", llvm::dwarf::DW_ATE_unsigned, 129)
+                .GetTypeName(),
+            "unsigned _BitInt(129)");
+}
+
+TEST_F(TestTypeSystemClang, TestBitIntTypeInfo) {
+  EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize(
+                "_BitInt", llvm::dwarf::DW_ATE_signed, 2)
+                .GetTypeInfo(),
+            eTypeIsSigned | eTypeIsScalar | eTypeHasValue | eTypeIsInteger);
+  EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize(
+                "unsigned _BitInt", llvm::dwarf::DW_ATE_unsigned, 2)
+                .GetTypeInfo(),
+            eTypeIsScalar | eTypeHasValue | eTypeIsInteger);
+}
+
 TEST_F(TestTypeSystemClang, TestBuiltinTypeForEmptyTriple) {
   // Test that we can access type-info of builtin Clang AST
   // types without crashing even when the target triple is
@@ -1123,6 +1214,30 @@ TEST_F(TestTypeSystemClang, AddMethodToCXXRecordType_ParmVarDecls) {
   EXPECT_EQ(method_it->getParamDecl(1)->getDeclContext(), *method_it);
 }
 
+TEST_F(TestTypeSystemClang, TestGetTypeInfo) {
+  // Tests TypeSystemClang::GetTypeInfo
+
+  const ASTContext &ast = m_ast->getASTContext();
+
+  CompilerType complex_int = m_ast->GetType(ast.getComplexType(ast.IntTy));
+  EXPECT_EQ(complex_int.GetTypeInfo(),
+            (eTypeIsInteger | eTypeIsComplex | eTypeIsBuiltIn | eTypeHasValue));
+
+  CompilerType complex_float = m_ast->GetType(ast.getComplexType(ast.FloatTy));
+  EXPECT_EQ(complex_float.GetTypeInfo(),
+            (eTypeIsFloat | eTypeIsComplex | eTypeIsBuiltIn | eTypeHasValue));
+
+  CompilerType vector_of_int =
+      m_ast->GetType(ast.getVectorType(ast.IntTy, 1, VectorKind::Generic));
+  EXPECT_EQ(vector_of_int.GetTypeInfo(),
+            (eTypeIsInteger | eTypeIsVector | eTypeHasChildren));
+
+  CompilerType vector_of_float =
+      m_ast->GetType(ast.getVectorType(ast.FloatTy, 1, VectorKind::Generic));
+  EXPECT_EQ(vector_of_float.GetTypeInfo(),
+            (eTypeIsFloat | eTypeIsVector | eTypeHasChildren));
+}
+
 TEST_F(TestTypeSystemClang, AsmLabel_CtorDtor) {
   // Tests TypeSystemClang::DeclGetMangledName for constructors/destructors
   // with and without AsmLabels.
diff --git a/lldb/unittests/SymbolFile/DWARF/CMakeLists.txt b/lldb/unittests/SymbolFile/DWARF/CMakeLists.txt
index eb2e00adba64b..88492188e794b 100644
--- a/lldb/unittests/SymbolFile/DWARF/CMakeLists.txt
+++ b/lldb/unittests/SymbolFile/DWARF/CMakeLists.txt
@@ -27,6 +27,7 @@ add_lldb_unittest(SymbolFileDWARFTests
 
 set(test_inputs
    test-dwarf.exe
-   DW_AT_default_value-test.yaml)
+   DW_AT_default_value-test.yaml
+   DW_AT_spec_decl_exists-test.yaml)
 
 add_unittest_inputs(SymbolFileDWARFTests "${test_inputs}")
diff --git a/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp b/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp
index 0cae01de2902a..cef3a25a4a960 100644
--- a/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp
+++ b/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp
@@ -599,6 +599,40 @@ TEST_F(DWARFASTParserClangTests, TestDefaultTemplateParamParsing) {
   }
 }
 
+TEST_F(DWARFASTParserClangTests, TestSpecDeclExistsError) {
+  // Tests that parsing a ClassTemplateSpecializationDecl that already exists
+  // is handled gracefully.
+  auto BufferOrError = llvm::MemoryBuffer::getFile(
+      GetInputFilePath("DW_AT_spec_decl_exists-test.yaml"), /*IsText=*/true);
+  ASSERT_TRUE(BufferOrError);
+  YAMLModuleTester t(BufferOrError.get()->getBuffer());
+
+  DWARFUnit *unit = t.GetDwarfUnit();
+  ASSERT_NE(unit, nullptr);
+  const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE();
+  ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit);
+  DWARFDIE cu_die(unit, cu_entry);
+
+  auto holder = std::make_unique<clang_utils::TypeSystemClangHolder>("ast");
+  auto &ast_ctx = *holder->GetAST();
+  DWARFASTParserClangStub ast_parser(ast_ctx);
+
+  llvm::SmallVector<lldb::TypeSP, 2> specializations;
+  for (DWARFDIE die : cu_die.children()) {
+    SymbolContext sc;
+    bool new_type = false;
+    auto type = ast_parser.ParseTypeFromDWARF(sc, die, &new_type);
+    llvm::StringRef die_name = llvm::StringRef(die.GetName());
+    if (die_name.starts_with("_Optional_payload")) {
+      specializations.push_back(std::move(type));
+    }
+  }
+
+  ASSERT_EQ(specializations.size(), 2U);
+  ASSERT_NE(specializations[0], nullptr);
+  ASSERT_EQ(specializations[1], nullptr);
+}
+
 TEST_F(DWARFASTParserClangTests, TestUniqueDWARFASTTypeMap_CppInsertMapFind) {
   // This tests the behaviour of UniqueDWARFASTTypeMap under
   // following scenario:
@@ -1617,3 +1651,305 @@ TEST_F(DWARFASTParserClangTests, TestObjectPointer_IndexEncoding) {
     EXPECT_EQ(param_die, ast_parser.GetObjectParameter(sub2, context_die));
   }
 }
+
+TEST_F(DWARFASTParserClangTests, TestTypeBitSize) {
+  // Tests that we correctly parse DW_AT_bit_size of a DW_AT_base_type.
+
+  const char *yamldata = R"(
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_EXEC
+  Machine: EM_AARCH64
+DWARF:
+  debug_str:
+    - _BitInt(2)
+  debug_abbrev:
+    - ID:              0
+      Table:
+        - Code:            0x1
+          Tag:             DW_TAG_compile_unit
+          Children:        DW_CHILDREN_yes
+          Attributes:
+            - Attribute:       DW_AT_language
+              Form:            DW_FORM_data2
+        - Code:            0x2
+          Tag:             DW_TAG_base_type
+          Children:        DW_CHILDREN_no
+          Attributes:
+            - Attribute: DW_AT_name
+              Form:      DW_FORM_strp
+            - Attribute: DW_AT_encoding
+              Form:      DW_FORM_data1
+            - Attribute: DW_AT_byte_size
+              Form:      DW_FORM_data1
+            - Attribute: DW_AT_bit_size
+              Form:      DW_FORM_data1
+
+  debug_info:
+     - Version:  5
+       UnitType: DW_UT_compile
+       AddrSize: 8
+       Entries:
+
+# DW_TAG_compile_unit
+#   DW_AT_language [DW_FORM_data2]    (DW_LANG_C_plus_plus)
+
+        - AbbrCode: 0x1
+          Values:
+            - Value: 0x04
+
+#   DW_TAG_base_type
+#     DW_AT_name [DW_FORM_strp] ('_BitInt(2)')
+
+        - AbbrCode: 0x2
+          Values:
+            - Value: 0x0
+            - Value: 0x05
+            - Value: 0x01
+            - Value: 0x02
+...
+)";
+
+  YAMLModuleTester t(yamldata);
+
+  DWARFUnit *unit = t.GetDwarfUnit();
+  ASSERT_NE(unit, nullptr);
+  const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE();
+  ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit);
+  ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus);
+  DWARFDIE cu_die(unit, cu_entry);
+
+  auto holder = std::make_unique<clang_utils::TypeSystemClangHolder>("ast");
+  auto &ast_ctx = *holder->GetAST();
+  DWARFASTParserClangStub ast_parser(ast_ctx);
+
+  auto type_die = cu_die.GetFirstChild();
+  ASSERT_TRUE(type_die.IsValid());
+  ASSERT_EQ(type_die.Tag(), DW_TAG_base_type);
+
+  ParsedDWARFTypeAttributes attrs(type_die);
+  EXPECT_EQ(attrs.byte_size.value_or(0), 1U);
+  EXPECT_EQ(attrs.data_bit_size.value_or(0), 2U);
+
+  SymbolContext sc;
+  auto type_sp =
+      ast_parser.ParseTypeFromDWARF(sc, type_die, /*type_is_new_ptr=*/nullptr);
+  ASSERT_NE(type_sp, nullptr);
+
+  EXPECT_EQ(llvm::expectedToOptional(type_sp->GetByteSize(nullptr)).value_or(0),
+            1U);
+}
+
+TEST_F(DWARFASTParserClangTests, TestBitIntParsing) {
+  // Tests that we correctly parse the DW_AT_base_type for a _BitInt.
+  // Older versions of Clang only emit the `_BitInt` string into the
+  // DW_AT_name (not including the bitsize). Make sure we understand
+  // those too.
+
+  const char *yamldata = R"(
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_EXEC
+  Machine: EM_AARCH64
+DWARF:
+  debug_str:
+    - _BitInt(2)
+    - _BitInt
+    - unsigned _BitInt(2)
+    - unsigned _BitInt
+  debug_abbrev:
+    - ID:              0
+      Table:
+        - Code:            0x1
+          Tag:             DW_TAG_compile_unit
+          Children:        DW_CHILDREN_yes
+          Attributes:
+            - Attribute:       DW_AT_language
+              Form:            DW_FORM_data2
+        - Code:            0x2
+          Tag:             DW_TAG_base_type
+          Children:        DW_CHILDREN_no
+          Attributes:
+            - Attribute: DW_AT_name
+              Form:      DW_FORM_strp
+            - Attribute: DW_AT_encoding
+              Form:      DW_FORM_data1
+            - Attribute: DW_AT_byte_size
+              Form:      DW_FORM_data1
+            - Attribute: DW_AT_bit_size
+              Form:      DW_FORM_data1
+        - Code:            0x3
+          Tag:             DW_TAG_base_type
+          Children:        DW_CHILDREN_no
+          Attributes:
+            - Attribute: DW_AT_name
+              Form:      DW_FORM_strp
+            - Attribute: DW_AT_encoding
+              Form:      DW_FORM_data1
+            - Attribute: DW_AT_byte_size
+              Form:      DW_FORM_data1
+
+  debug_info:
+     - Version:  5
+       UnitType: DW_UT_compile
+       AddrSize: 8
+       Entries:
+
+# DW_TAG_compile_unit
+#   DW_AT_language [DW_FORM_data2]    (DW_LANG_C_plus_plus)
+
+        - AbbrCode: 0x1
+          Values:
+            - Value: 0x04
+
+#   DW_TAG_base_type
+#     DW_AT_name [DW_FORM_strp] ('_BitInt(2)')
+
+        - AbbrCode: 0x2
+          Values:
+            - Value: 0x0
+            - Value: 0x05
+            - Value: 0x01
+            - Value: 0x02
+
+#   DW_TAG_base_type
+#     DW_AT_name [DW_FORM_strp] ('_BitInt')
+
+        - AbbrCode: 0x2
+          Values:
+            - Value: 0x0b
+            - Value: 0x05
+            - Value: 0x08
+            - Value: 0x34
+
+#   DW_TAG_base_type
+#     DW_AT_name [DW_FORM_strp] ('unsigned _BitInt(2)')
+
+        - AbbrCode: 0x2
+          Values:
+            - Value: 0x13
+            - Value: 0x07
+            - Value: 0x01
+            - Value: 0x02
+
+#   DW_TAG_base_type
+#     DW_AT_name [DW_FORM_strp] ('unsigned _BitInt')
+
+        - AbbrCode: 0x2
+          Values:
+            - Value: 0x27
+            - Value: 0x07
+            - Value: 0x08
+            - Value: 0x34
+
+#   DW_TAG_base_type
+#     DW_AT_name [DW_FORM_strp] ('_BitInt')
+
+        - AbbrCode: 0x3
+          Values:
+            - Value: 0x0b
+            - Value: 0x05
+            - Value: 0x08
+...
+
+)";
+
+  YAMLModuleTester t(yamldata);
+
+  DWARFUnit *unit = t.GetDwarfUnit();
+  ASSERT_NE(unit, nullptr);
+  const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE();
+  ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit);
+  ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus);
+  DWARFDIE cu_die(unit, cu_entry);
+
+  auto holder = std::make_unique<clang_utils::TypeSystemClangHolder>("ast");
+  auto &ast_ctx = *holder->GetAST();
+  DWARFASTParserClangStub ast_parser(ast_ctx);
+
+  auto type_die = cu_die.GetFirstChild();
+  ASSERT_TRUE(type_die.IsValid());
+
+  {
+    SymbolContext sc;
+    auto type_sp = ast_parser.ParseTypeFromDWARF(sc, type_die,
+                                                 /*type_is_new_ptr=*/nullptr);
+    ASSERT_NE(type_sp, nullptr);
+
+    EXPECT_EQ(
+        llvm::expectedToOptional(type_sp->GetByteSize(nullptr)).value_or(0),
+        1U);
+    EXPECT_EQ(type_sp->GetEncoding(), lldb::eEncodingSint);
+    EXPECT_EQ(type_sp->GetName(), "_BitInt(2)");
+    EXPECT_EQ(type_sp->GetForwardCompilerType().GetTypeName(), "_BitInt(2)");
+  }
+
+  {
+    type_die = type_die.GetSibling();
+    SymbolContext sc;
+    auto type_sp = ast_parser.ParseTypeFromDWARF(sc, type_die,
+                                                 /*type_is_new_ptr=*/nullptr);
+    ASSERT_NE(type_sp, nullptr);
+
+    EXPECT_EQ(
+        llvm::expectedToOptional(type_sp->GetByteSize(nullptr)).value_or(0),
+        8U);
+    EXPECT_EQ(type_sp->GetEncoding(), lldb::eEncodingSint);
+    EXPECT_EQ(type_sp->GetName(), "_BitInt");
+    EXPECT_EQ(type_sp->GetForwardCompilerType().GetTypeName(), "_BitInt(52)");
+  }
+
+  {
+    type_die = type_die.GetSibling();
+    SymbolContext sc;
+    auto type_sp = ast_parser.ParseTypeFromDWARF(sc, type_die,
+                                                 /*type_is_new_ptr=*/nullptr);
+    ASSERT_NE(type_sp, nullptr);
+
+    EXPECT_EQ(
+        llvm::expectedToOptional(type_sp->GetByteSize(nullptr)).value_or(0),
+        1U);
+    EXPECT_EQ(type_sp->GetEncoding(), lldb::eEncodingUint);
+    EXPECT_EQ(type_sp->GetName(), "unsigned _BitInt(2)");
+    EXPECT_EQ(type_sp->GetForwardCompilerType().GetTypeName(),
+              "unsigned _BitInt(2)");
+  }
+
+  {
+    type_die = type_die.GetSibling();
+    SymbolContext sc;
+    auto type_sp = ast_parser.ParseTypeFromDWARF(sc, type_die,
+                                                 /*type_is_new_ptr=*/nullptr);
+    ASSERT_NE(type_sp, nullptr);
+
+    EXPECT_EQ(
+        llvm::expectedToOptional(type_sp->GetByteSize(nullptr)).value_or(0),
+        8U);
+    EXPECT_EQ(type_sp->GetEncoding(), lldb::eEncodingUint);
+    EXPECT_EQ(type_sp->GetName(), "unsigned _BitInt");
+    EXPECT_EQ(type_sp->GetForwardCompilerType().GetTypeName(),
+              "unsigned _BitInt(52)");
+  }
+
+  {
+    type_die = type_die.GetSibling();
+    SymbolContext sc;
+    auto type_sp = ast_parser.ParseTypeFromDWARF(sc, type_die,
+                                                 /*type_is_new_ptr=*/nullptr);
+    ASSERT_NE(type_sp, nullptr);
+
+    EXPECT_EQ(
+        llvm::expectedToOptional(type_sp->GetByteSize(nullptr)).value_or(0),
+        8U);
+    EXPECT_EQ(type_sp->GetEncoding(), lldb::eEncodingSint);
+    EXPECT_EQ(type_sp->GetName(), "_BitInt");
+
+    // Older versions of Clang didn't emit a DW_AT_bit_size for _BitInt. In
+    // those cases we would format the CompilerType name using the byte-size.
+    EXPECT_EQ(type_sp->GetForwardCompilerType().GetTypeName(), "_BitInt(64)");
+  }
+}
diff --git a/lldb/unittests/SymbolFile/DWARF/Inputs/DW_AT_spec_decl_exists-test.yaml b/lldb/unittests/SymbolFile/DWARF/Inputs/DW_AT_spec_decl_exists-test.yaml
new file mode 100644
index 0000000000000..91245f09abbbf
--- /dev/null
+++ b/lldb/unittests/SymbolFile/DWARF/Inputs/DW_AT_spec_decl_exists-test.yaml
@@ -0,0 +1,677 @@
+# struct Type {};
+#
+# template <typename _Tp, bool, bool, bool> struct _Optional_payload;
+#
+# template <typename _Tp> struct _Optional_payload<_Tp, true, false, false> {};
+#
+# template <typename _Tp, bool _Copy, bool _Move>
+# struct _Optional_payload<_Tp, false, _Copy, _Move>
+#     : _Optional_payload<_Tp, true, false, false> {};
+#
+# int main() {
+#   _Optional_payload<Type, false, false, true> X;
+# }
+#
+# YAML generated on Linux using obj2yaml on the above program compiled with
+# G++. This is malformed DWARF that is missing DW_TAG_template_value_parameter
+# entries, which is important for the test because that makes the two
+# specializations look like identical structure definitions.
+--- !ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_DYN
+  Machine:         EM_X86_64
+  Entry:           0x1040
+ProgramHeaders:
+  - Type:            PT_PHDR
+    Flags:           [ PF_R ]
+    VAddr:           0x40
+    Align:           0x8
+    Offset:          0x40
+  - Type:            PT_INTERP
+    Flags:           [ PF_R ]
+    FirstSec:        .interp
+    LastSec:         .interp
+    VAddr:           0x318
+    Offset:          0x318
+  - Type:            PT_LOAD
+    Flags:           [ PF_R ]
+    FirstSec:        .interp
+    LastSec:         .rela.dyn
+    Align:           0x1000
+    Offset:          0x0
+  - Type:            PT_LOAD
+    Flags:           [ PF_X, PF_R ]
+    FirstSec:        .init
+    LastSec:         .fini
+    VAddr:           0x1000
+    Align:           0x1000
+    Offset:          0x1000
+  - Type:            PT_LOAD
+    Flags:           [ PF_R ]
+    FirstSec:        .rodata
+    LastSec:         .eh_frame
+    VAddr:           0x2000
+    Align:           0x1000
+    Offset:          0x2000
+  - Type:            PT_LOAD
+    Flags:           [ PF_W, PF_R ]
+    FirstSec:        .init_array
+    LastSec:         .bss
+    VAddr:           0x3DF0
+    Align:           0x1000
+    Offset:          0x2DF0
+  - Type:            PT_DYNAMIC
+    Flags:           [ PF_W, PF_R ]
+    FirstSec:        .dynamic
+    LastSec:         .dynamic
+    VAddr:           0x3E00
+    Align:           0x8
+    Offset:          0x2E00
+  - Type:            PT_NOTE
+    Flags:           [ PF_R ]
+    FirstSec:        .note.gnu.property
+    LastSec:         .note.gnu.property
+    VAddr:           0x338
+    Align:           0x8
+    Offset:          0x338
+  - Type:            PT_NOTE
+    Flags:           [ PF_R ]
+    FirstSec:        .note.gnu.build-id
+    LastSec:         .note.ABI-tag
+    VAddr:           0x358
+    Align:           0x4
+    Offset:          0x358
+  - Type:            PT_GNU_PROPERTY
+    Flags:           [ PF_R ]
+    FirstSec:        .note.gnu.property
+    LastSec:         .note.gnu.property
+    VAddr:           0x338
+    Align:           0x8
+    Offset:          0x338
+  - Type:            PT_GNU_EH_FRAME
+    Flags:           [ PF_R ]
+    FirstSec:        .eh_frame_hdr
+    LastSec:         .eh_frame_hdr
+    VAddr:           0x2004
+    Align:           0x4
+    Offset:          0x2004
+  - Type:            PT_GNU_STACK
+    Flags:           [ PF_W, PF_R ]
+    Align:           0x10
+    Offset:          0x0
+  - Type:            PT_GNU_RELRO
+    Flags:           [ PF_R ]
+    FirstSec:        .init_array
+    LastSec:         .got
+    VAddr:           0x3DF0
+    Offset:          0x2DF0
+Sections:
+  - Name:            .interp
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x318
+    AddressAlign:    0x1
+    Content:         2F6C696236342F6C642D6C696E75782D7838362D36342E736F2E3200
+  - Name:            .note.gnu.property
+    Type:            SHT_NOTE
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x338
+    AddressAlign:    0x8
+    Notes:
+      - Name:            GNU
+        Desc:            020000C0040000000300000000000000
+        Type:            NT_GNU_PROPERTY_TYPE_0
+  - Name:            .note.gnu.build-id
+    Type:            SHT_NOTE
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x358
+    AddressAlign:    0x4
+    Notes:
+      - Name:            GNU
+        Desc:            AF3A83002F03E80537DCB46B3E56062984AD2629
+        Type:            NT_PRPSINFO
+  - Name:            .note.ABI-tag
+    Type:            SHT_NOTE
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x37C
+    AddressAlign:    0x4
+    Notes:
+      - Name:            GNU
+        Desc:            '00000000030000000200000000000000'
+        Type:            NT_VERSION
+  - Name:            .gnu.hash
+    Type:            SHT_GNU_HASH
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x3A0
+    Link:            .dynsym
+    AddressAlign:    0x8
+    Header:
+      SymNdx:          0x5
+      Shift2:          0x6
+    BloomFilter:     [ 0x810000 ]
+    HashBuckets:     [ 0x5, 0x0 ]
+    HashValues:      [ 0x6DCE65D1 ]
+  - Name:            .dynsym
+    Type:            SHT_DYNSYM
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x3C8
+    Link:            .dynstr
+    AddressAlign:    0x8
+  - Name:            .dynstr
+    Type:            SHT_STRTAB
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x458
+    AddressAlign:    0x1
+  - Name:            .gnu.version
+    Type:            SHT_GNU_versym
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x4D6
+    Link:            .dynsym
+    AddressAlign:    0x2
+    Entries:         [ 0, 0, 2, 0, 0, 2 ]
+  - Name:            .gnu.version_r
+    Type:            SHT_GNU_verneed
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x4E8
+    Link:            .dynstr
+    AddressAlign:    0x8
+    Dependencies:
+      - Version:         1
+        File:            libc.so.6
+        Entries:
+          - Name:            GLIBC_2.2.5
+            Hash:            157882997
+            Flags:           0
+            Other:           2
+  - Name:            .rela.dyn
+    Type:            SHT_RELA
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x508
+    Link:            .dynsym
+    AddressAlign:    0x8
+    Relocations:
+      - Offset:          0x3DF0
+        Type:            R_X86_64_RELATIVE
+        Addend:          4384
+      - Offset:          0x3DF8
+        Type:            R_X86_64_RELATIVE
+        Addend:          4320
+      - Offset:          0x4008
+        Type:            R_X86_64_RELATIVE
+        Addend:          16392
+      - Offset:          0x3FD8
+        Symbol:          _ITM_deregisterTMCloneTable
+        Type:            R_X86_64_GLOB_DAT
+      - Offset:          0x3FE0
+        Symbol:          __libc_start_main
+        Type:            R_X86_64_GLOB_DAT
+      - Offset:          0x3FE8
+        Symbol:          __gmon_start__
+        Type:            R_X86_64_GLOB_DAT
+      - Offset:          0x3FF0
+        Symbol:          _ITM_registerTMCloneTable
+        Type:            R_X86_64_GLOB_DAT
+      - Offset:          0x3FF8
+        Symbol:          __cxa_finalize
+        Type:            R_X86_64_GLOB_DAT
+  - Name:            .init
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x1000
+    AddressAlign:    0x4
+    Offset:          0x1000
+    Content:         F30F1EFA4883EC08488B05D92F00004885C07402FFD04883C408C3
+  - Name:            .plt
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x1020
+    AddressAlign:    0x10
+    EntSize:         0x10
+    Content:         FF35A22F0000F2FF25A32F00000F1F00
+  - Name:            .plt.got
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x1030
+    AddressAlign:    0x10
+    EntSize:         0x10
+    Content:         F30F1EFAF2FF25BD2F00000F1F440000
+  - Name:            .text
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x1040
+    AddressAlign:    0x10
+    Content:         F30F1EFA31ED4989D15E4889E24883E4F050544C8D0556010000488D0DDF000000488D3DC1000000FF15722F0000F490488D3D992F0000488D05922F00004839F87415488B054E2F00004885C07409FFE00F1F8000000000C30F1F8000000000488D3D692F0000488D35622F00004829FE4889F048C1EE3F48C1F8034801C648D1FE7414488B05252F00004885C07408FFE0660F1F440000C30F1F8000000000F30F1EFA803D252F000000752B5548833D022F0000004889E5740C488B3D062F0000E829FFFFFFE864FFFFFFC605FD2E0000015DC30F1F00C30F1F8000000000F30F1EFAE977FFFFFFF30F1EFA554889E5B8000000005DC30F1F840000000000F30F1EFA41574C8D3DA32C000041564989D641554989F541544189FC55488D2D942C0000534C29FD4883EC08E88FFEFFFF48C1FD03741F31DB0F1F80000000004C89F24C89EE4489E741FF14DF4883C3014839DD75EA4883C4085B5D415C415D415E415FC366662E0F1F840000000000F30F1EFAC3
+  - Name:            .fini
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x11B8
+    AddressAlign:    0x4
+    Content:         F30F1EFA4883EC084883C408C3
+  - Name:            .rodata
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_MERGE ]
+    Address:         0x2000
+    AddressAlign:    0x4
+    EntSize:         0x4
+    Offset:          0x2000
+    Content:         '01000200'
+  - Name:            .eh_frame_hdr
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x2004
+    AddressAlign:    0x4
+    Content:         011B033B38000000060000001CF0FFFF6C0000002CF0FFFF940000003CF0FFFF5400000025F1FFFFAC0000003CF1FFFFCC000000ACF1FFFF14010000
+  - Name:            .eh_frame
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x2040
+    AddressAlign:    0x8
+    Content:         1400000000000000017A5200017810011B0C070890010000140000001C000000E0EFFFFF2F00000000440710000000002400000034000000A8EFFFFF10000000000E10460E184A0F0B770880003F1A3A2A33242200000000140000005C00000090EFFFFF1000000000000000000000001C0000007400000071F0FFFF0F00000000450E108602430D06460C0708000000440000009400000068F0FFFF6500000000460E108F02490E188E03450E208D04450E288C05440E308606480E388307470E406E0E38410E30410E28420E20420E18420E10420E080010000000DC00000090F0FFFF050000000000000000000000
+  - Name:            .init_array
+    Type:            SHT_INIT_ARRAY
+    Flags:           [ SHF_WRITE, SHF_ALLOC ]
+    Address:         0x3DF0
+    AddressAlign:    0x8
+    EntSize:         0x8
+    Offset:          0x2DF0
+    Content:         '2011000000000000'
+  - Name:            .fini_array
+    Type:            SHT_FINI_ARRAY
+    Flags:           [ SHF_WRITE, SHF_ALLOC ]
+    Address:         0x3DF8
+    AddressAlign:    0x8
+    EntSize:         0x8
+    Content:         E010000000000000
+  - Name:            .dynamic
+    Type:            SHT_DYNAMIC
+    Flags:           [ SHF_WRITE, SHF_ALLOC ]
+    Address:         0x3E00
+    Link:            .dynstr
+    AddressAlign:    0x8
+    Entries:
+      - Tag:             DT_NEEDED
+        Value:           0x1
+      - Tag:             DT_INIT
+        Value:           0x1000
+      - Tag:             DT_FINI
+        Value:           0x11B8
+      - Tag:             DT_INIT_ARRAY
+        Value:           0x3DF0
+      - Tag:             DT_INIT_ARRAYSZ
+        Value:           0x8
+      - Tag:             DT_FINI_ARRAY
+        Value:           0x3DF8
+      - Tag:             DT_FINI_ARRAYSZ
+        Value:           0x8
+      - Tag:             DT_GNU_HASH
+        Value:           0x3A0
+      - Tag:             DT_STRTAB
+        Value:           0x458
+      - Tag:             DT_SYMTAB
+        Value:           0x3C8
+      - Tag:             DT_STRSZ
+        Value:           0x7D
+      - Tag:             DT_SYMENT
+        Value:           0x18
+      - Tag:             DT_DEBUG
+        Value:           0x0
+      - Tag:             DT_PLTGOT
+        Value:           0x3FC0
+      - Tag:             DT_RELA
+        Value:           0x508
+      - Tag:             DT_RELASZ
+        Value:           0xC0
+      - Tag:             DT_RELAENT
+        Value:           0x18
+      - Tag:             DT_FLAGS
+        Value:           0x8
+      - Tag:             DT_FLAGS_1
+        Value:           0x8000001
+      - Tag:             DT_VERNEED
+        Value:           0x4E8
+      - Tag:             DT_VERNEEDNUM
+        Value:           0x1
+      - Tag:             DT_VERSYM
+        Value:           0x4D6
+      - Tag:             DT_RELACOUNT
+        Value:           0x3
+      - Tag:             DT_NULL
+        Value:           0x0
+      - Tag:             DT_NULL
+        Value:           0x0
+      - Tag:             DT_NULL
+        Value:           0x0
+      - Tag:             DT_NULL
+        Value:           0x0
+      - Tag:             DT_NULL
+        Value:           0x0
+  - Name:            .got
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_WRITE, SHF_ALLOC ]
+    Address:         0x3FC0
+    AddressAlign:    0x8
+    EntSize:         0x8
+    Content:         '003E0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000'
+  - Name:            .data
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_WRITE, SHF_ALLOC ]
+    Address:         0x4000
+    AddressAlign:    0x8
+    Content:         '00000000000000000840000000000000'
+  - Name:            .bss
+    Type:            SHT_NOBITS
+    Flags:           [ SHF_WRITE, SHF_ALLOC ]
+    Address:         0x4010
+    AddressAlign:    0x1
+    Size:            0x8
+  - Name:            .comment
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_MERGE, SHF_STRINGS ]
+    AddressAlign:    0x1
+    EntSize:         0x1
+    Content:         4743433A20285562756E747520392E342E302D317562756E7475317E32302E30342E322920392E342E3000
+  - Name:            .debug_info
+    Type:            SHT_PROGBITS
+    AddressAlign:    0x1
+    Content:         9E00000004000000000008013A0000000431000000FE00000029110000000000000F000000000000000000000002000000000101010803D2000000010105204D000000045F5470002D000000000305000000010108086A000000053600000000045F5470002D0000000006CD000000010B059A00000029110000000000000F00000000000000019C9A000000075800010C2F4D00000002916F00080405696E740000
+  - Name:            .debug_abbrev
+    Type:            SHT_PROGBITS
+    AddressAlign:    0x1
+    Content:         011101250E130B030E1B0E1101120710170000021300030E0B0B3A0B3B0B390B0000031301030E0B0B3A0B3B0B390B01130000042F00030849130000051C004913380B0000062E013F19030E3A0B3B0B390B49131101120740189742190113000007340003083A0B3B0B390B4913021800000824000B0B3E0B0308000000
+  - Name:            .debug_line
+    Type:            SHT_PROGBITS
+    AddressAlign:    0x1
+    Content:         3D00000003001F0000000101FB0E0D000101010100000001000001006D61696E2E6370700000000000050C0009022911000000000000030A010501840207000101
+Symbols:
+  - Name:            .interp
+    Type:            STT_SECTION
+    Section:         .interp
+    Value:           0x318
+  - Name:            .note.gnu.property
+    Type:            STT_SECTION
+    Section:         .note.gnu.property
+    Value:           0x338
+  - Name:            .note.gnu.build-id
+    Type:            STT_SECTION
+    Section:         .note.gnu.build-id
+    Value:           0x358
+  - Name:            .note.ABI-tag
+    Type:            STT_SECTION
+    Section:         .note.ABI-tag
+    Value:           0x37C
+  - Name:            .gnu.hash
+    Type:            STT_SECTION
+    Section:         .gnu.hash
+    Value:           0x3A0
+  - Name:            .dynsym
+    Type:            STT_SECTION
+    Section:         .dynsym
+    Value:           0x3C8
+  - Name:            .dynstr
+    Type:            STT_SECTION
+    Section:         .dynstr
+    Value:           0x458
+  - Name:            .gnu.version
+    Type:            STT_SECTION
+    Section:         .gnu.version
+    Value:           0x4D6
+  - Name:            .gnu.version_r
+    Type:            STT_SECTION
+    Section:         .gnu.version_r
+    Value:           0x4E8
+  - Name:            .rela.dyn
+    Type:            STT_SECTION
+    Section:         .rela.dyn
+    Value:           0x508
+  - Name:            .init
+    Type:            STT_SECTION
+    Section:         .init
+    Value:           0x1000
+  - Name:            .plt
+    Type:            STT_SECTION
+    Section:         .plt
+    Value:           0x1020
+  - Name:            .plt.got
+    Type:            STT_SECTION
+    Section:         .plt.got
+    Value:           0x1030
+  - Name:            .text
+    Type:            STT_SECTION
+    Section:         .text
+    Value:           0x1040
+  - Name:            .fini
+    Type:            STT_SECTION
+    Section:         .fini
+    Value:           0x11B8
+  - Name:            .rodata
+    Type:            STT_SECTION
+    Section:         .rodata
+    Value:           0x2000
+  - Name:            .eh_frame_hdr
+    Type:            STT_SECTION
+    Section:         .eh_frame_hdr
+    Value:           0x2004
+  - Name:            .eh_frame
+    Type:            STT_SECTION
+    Section:         .eh_frame
+    Value:           0x2040
+  - Name:            .init_array
+    Type:            STT_SECTION
+    Section:         .init_array
+    Value:           0x3DF0
+  - Name:            .fini_array
+    Type:            STT_SECTION
+    Section:         .fini_array
+    Value:           0x3DF8
+  - Name:            .dynamic
+    Type:            STT_SECTION
+    Section:         .dynamic
+    Value:           0x3E00
+  - Name:            .got
+    Type:            STT_SECTION
+    Section:         .got
+    Value:           0x3FC0
+  - Name:            .data
+    Type:            STT_SECTION
+    Section:         .data
+    Value:           0x4000
+  - Name:            .bss
+    Type:            STT_SECTION
+    Section:         .bss
+    Value:           0x4010
+  - Name:            .comment
+    Type:            STT_SECTION
+    Section:         .comment
+  - Name:            .debug_aranges
+    Type:            STT_SECTION
+    Section:         .debug_aranges
+  - Name:            .debug_info
+    Type:            STT_SECTION
+    Section:         .debug_info
+  - Name:            .debug_abbrev
+    Type:            STT_SECTION
+    Section:         .debug_abbrev
+  - Name:            .debug_line
+    Type:            STT_SECTION
+    Section:         .debug_line
+  - Name:            .debug_str
+    Type:            STT_SECTION
+    Section:         .debug_str
+  - Name:            crtstuff.c
+    Type:            STT_FILE
+    Index:           SHN_ABS
+  - Name:            deregister_tm_clones
+    Type:            STT_FUNC
+    Section:         .text
+    Value:           0x1070
+  - Name:            register_tm_clones
+    Type:            STT_FUNC
+    Section:         .text
+    Value:           0x10A0
+  - Name:            __do_global_dtors_aux
+    Type:            STT_FUNC
+    Section:         .text
+    Value:           0x10E0
+  - Name:            completed.8061
+    Type:            STT_OBJECT
+    Section:         .bss
+    Value:           0x4010
+    Size:            0x1
+  - Name:            __do_global_dtors_aux_fini_array_entry
+    Type:            STT_OBJECT
+    Section:         .fini_array
+    Value:           0x3DF8
+  - Name:            frame_dummy
+    Type:            STT_FUNC
+    Section:         .text
+    Value:           0x1120
+  - Name:            __frame_dummy_init_array_entry
+    Type:            STT_OBJECT
+    Section:         .init_array
+    Value:           0x3DF0
+  - Name:            main.cpp
+    Type:            STT_FILE
+    Index:           SHN_ABS
+  - Name:            'crtstuff.c (1)'
+    Type:            STT_FILE
+    Index:           SHN_ABS
+  - Name:            __FRAME_END__
+    Type:            STT_OBJECT
+    Section:         .eh_frame
+    Value:           0x212C
+  - Type:            STT_FILE
+    Index:           SHN_ABS
+  - Name:            __init_array_end
+    Section:         .init_array
+    Value:           0x3DF8
+  - Name:            _DYNAMIC
+    Type:            STT_OBJECT
+    Section:         .dynamic
+    Value:           0x3E00
+  - Name:            __init_array_start
+    Section:         .init_array
+    Value:           0x3DF0
+  - Name:            __GNU_EH_FRAME_HDR
+    Section:         .eh_frame_hdr
+    Value:           0x2004
+  - Name:            _GLOBAL_OFFSET_TABLE_
+    Type:            STT_OBJECT
+    Section:         .got
+    Value:           0x3FC0
+  - Name:            _init
+    Type:            STT_FUNC
+    Section:         .init
+    Value:           0x1000
+  - Name:            __libc_csu_fini
+    Type:            STT_FUNC
+    Section:         .text
+    Binding:         STB_GLOBAL
+    Value:           0x11B0
+    Size:            0x5
+  - Name:            _ITM_deregisterTMCloneTable
+    Binding:         STB_WEAK
+  - Name:            data_start
+    Section:         .data
+    Binding:         STB_WEAK
+    Value:           0x4000
+  - Name:            _edata
+    Section:         .data
+    Binding:         STB_GLOBAL
+    Value:           0x4010
+  - Name:            _fini
+    Type:            STT_FUNC
+    Section:         .fini
+    Binding:         STB_GLOBAL
+    Value:           0x11B8
+    Other:           [ STV_HIDDEN ]
+  - Name:            '__libc_start_main@@GLIBC_2.2.5'
+    Type:            STT_FUNC
+    Binding:         STB_GLOBAL
+  - Name:            __data_start
+    Section:         .data
+    Binding:         STB_GLOBAL
+    Value:           0x4000
+  - Name:            __gmon_start__
+    Binding:         STB_WEAK
+  - Name:            __dso_handle
+    Type:            STT_OBJECT
+    Section:         .data
+    Binding:         STB_GLOBAL
+    Value:           0x4008
+    Other:           [ STV_HIDDEN ]
+  - Name:            _IO_stdin_used
+    Type:            STT_OBJECT
+    Section:         .rodata
+    Binding:         STB_GLOBAL
+    Value:           0x2000
+    Size:            0x4
+  - Name:            __libc_csu_init
+    Type:            STT_FUNC
+    Section:         .text
+    Binding:         STB_GLOBAL
+    Value:           0x1140
+    Size:            0x65
+  - Name:            _end
+    Section:         .bss
+    Binding:         STB_GLOBAL
+    Value:           0x4018
+  - Name:            _start
+    Type:            STT_FUNC
+    Section:         .text
+    Binding:         STB_GLOBAL
+    Value:           0x1040
+    Size:            0x2F
+  - Name:            __bss_start
+    Section:         .bss
+    Binding:         STB_GLOBAL
+    Value:           0x4010
+  - Name:            main
+    Type:            STT_FUNC
+    Section:         .text
+    Binding:         STB_GLOBAL
+    Value:           0x1129
+    Size:            0xF
+  - Name:            __TMC_END__
+    Type:            STT_OBJECT
+    Section:         .data
+    Binding:         STB_GLOBAL
+    Value:           0x4010
+    Other:           [ STV_HIDDEN ]
+  - Name:            _ITM_registerTMCloneTable
+    Binding:         STB_WEAK
+  - Name:            '__cxa_finalize@@GLIBC_2.2.5'
+    Type:            STT_FUNC
+    Binding:         STB_WEAK
+DynamicSymbols:
+  - Name:            _ITM_deregisterTMCloneTable
+    Binding:         STB_WEAK
+  - Name:            __libc_start_main
+    Type:            STT_FUNC
+    Binding:         STB_GLOBAL
+  - Name:            __gmon_start__
+    Binding:         STB_WEAK
+  - Name:            _ITM_registerTMCloneTable
+    Binding:         STB_WEAK
+  - Name:            __cxa_finalize
+    Type:            STT_FUNC
+    Binding:         STB_WEAK
+DWARF:
+  debug_str:
+    - Type
+    - '_Optional_payload<Type, false, false, true>'
+    - main.cpp
+    - 'GNU C++14 9.4.0 -mtune=generic -march=x86-64 -g -O0 -fasynchronous-unwind-tables -fstack-protector-strong -fstack-clash-protection -fcf-protection'
+    - main
+    - '_Optional_payload<Type, true, false, false>'
+    - '/root/os-llvm/llvm-project'
+  debug_aranges:
+    - Length:          0x2C
+      Version:         2
+      CuOffset:        0x0
+      AddressSize:     0x8
+      Descriptors:
+        - Address:         0x1129
+          Length:          0xF
+...
diff --git a/lldb/unittests/SymbolFile/PDB/CMakeLists.txt b/lldb/unittests/SymbolFile/PDB/CMakeLists.txt
index 8edb352e5a3e1..0bd90fe90d88b 100644
--- a/lldb/unittests/SymbolFile/PDB/CMakeLists.txt
+++ b/lldb/unittests/SymbolFile/PDB/CMakeLists.txt
@@ -9,6 +9,7 @@ add_lldb_unittest(SymbolFilePDBTests
     lldbHost
     lldbSymbol
     lldbPluginObjectFilePECOFF
+    lldbPluginPlatformWindows
     lldbPluginSymbolFileDWARF
     lldbPluginSymbolFilePDB
     lldbPluginTypeSystemClang
diff --git a/lldb/unittests/SymbolFile/PDB/SymbolFilePDBTests.cpp b/lldb/unittests/SymbolFile/PDB/SymbolFilePDBTests.cpp
index 858aecd1b9798..90cd4d568f524 100644
--- a/lldb/unittests/SymbolFile/PDB/SymbolFilePDBTests.cpp
+++ b/lldb/unittests/SymbolFile/PDB/SymbolFilePDBTests.cpp
@@ -16,11 +16,13 @@
 #include "llvm/Testing/Support/Error.h"
 
 #include "Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.h"
+#include "Plugins/Platform/Windows/PlatformWindows.h"
 #include "Plugins/SymbolFile/DWARF/SymbolFileDWARF.h"
 #include "Plugins/SymbolFile/PDB/SymbolFilePDB.h"
 #include "Plugins/TypeSystem/Clang/TypeSystemClang.h"
 #include "TestingSupport/TestUtilities.h"
 #include "lldb/Core/Address.h"
+#include "lldb/Core/Debugger.h"
 #include "lldb/Core/Module.h"
 #include "lldb/Core/ModuleSpec.h"
 #include "lldb/Host/FileSystem.h"
@@ -59,6 +61,13 @@ class SymbolFilePDBTests : public testing::Test {
 
     m_pdb_test_exe = GetInputFilePath("test-pdb.exe");
     m_types_test_exe = GetInputFilePath("test-pdb-types.exe");
+
+    ArchSpec arch("x86_64-pc-windows-msvc");
+    Platform::SetHostPlatform(PlatformWindows::CreateInstance(true, &arch));
+    m_debugger_sp = Debugger::CreateInstance();
+    m_debugger_sp->SetPropertyValue(nullptr,
+                                    lldb_private::eVarSetOperationAssign,
+                                    "plugin.symbol-file.pdb.reader", "dia");
   }
 
   void TearDown() override {
@@ -77,6 +86,7 @@ class SymbolFilePDBTests : public testing::Test {
 protected:
   std::string m_pdb_test_exe;
   std::string m_types_test_exe;
+  lldb::DebuggerSP m_debugger_sp;
 
   bool FileSpecMatchesAsBaseOrFull(const FileSpec &left,
                                    const FileSpec &right) const {
diff --git a/llvm/Maintainers.md b/llvm/Maintainers.md
index e52259236fc19..1eba955f9d6ed 100644
--- a/llvm/Maintainers.md
+++ b/llvm/Maintainers.md
@@ -197,7 +197,7 @@ david.green@arm.com (email), [davemgreen](https://github.com/davemgreen) (GitHub
 Amara Emerson (esp. AArch64 GlobalISel) \
 amara@apple.com (email), [aemerson](https://github.com/aemerson) (GitHub) \
 Eli Friedman (esp. ARM64EC) \
-efriedma@quicinc.com (email), [efriedma-quic](https://github.com/efriedma-quic) (GitHub) \
+efriedma@qti.qualcomm.com (email), [efriedma-quic](https://github.com/efriedma-quic) (GitHub) \
 Sjoerd Meijer \
 smeijer@nvidia.com (email), [sjoerdmeijer](https://github.com/sjoerdmeijer) (GitHub) \
 Nashe Mncube \
@@ -246,7 +246,7 @@ mail@justinbogner.com (email), [bogner](https://github.com/bogner) (GitHub)
 #### Hexagon backend
 
 Sundeep Kushwaha \
-sundeepk@quicinc.com (email), [SundeepKushwaha](https://github.com/SundeepKushwaha) (GitHub)
+sundeepk@qti.qualcomm.com (email), [SundeepKushwaha](https://github.com/SundeepKushwaha) (GitHub)
 
 #### Lanai backend
 
diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake
index 80e59a4df2433..7d40d309d538e 100644
--- a/llvm/cmake/modules/AddLLVM.cmake
+++ b/llvm/cmake/modules/AddLLVM.cmake
@@ -1747,6 +1747,31 @@ function(add_llvm_implicit_projects)
   llvm_add_implicit_projects(LLVM)
 endfunction(add_llvm_implicit_projects)
 
+function(set_unittest_link_flags target_name)
+  # The runtime benefits of LTO don't outweight the compile time costs for
+  # tests.
+  if(LLVM_ENABLE_LTO)
+    if((UNIX OR MINGW) AND LINKER_IS_LLD)
+      if(LLVM_ENABLE_FATLTO AND NOT APPLE)
+        # When using FatLTO, just use relocatable linking.
+        set_property(TARGET ${target_name} APPEND_STRING PROPERTY
+                      LINK_FLAGS " -Wl,--no-fat-lto-objects")
+      else()
+        set_property(TARGET ${target_name} APPEND_STRING PROPERTY
+                      LINK_FLAGS " -Wl,--lto-O0")
+      endif()
+    elseif(LINKER_IS_LLD_LINK)
+      set_property(TARGET ${target_name} APPEND_STRING PROPERTY
+                    LINK_FLAGS " /opt:lldlto=0")
+    elseif(APPLE AND NOT uppercase_LLVM_ENABLE_LTO STREQUAL "THIN")
+      set_property(TARGET ${target_name} APPEND_STRING PROPERTY
+                    LINK_FLAGS " -Wl,-mllvm,-O0")
+    endif()
+  endif()
+
+  target_link_options(${target_name} PRIVATE "${LLVM_UNITTEST_LINK_FLAGS}")
+endfunction(set_unittest_link_flags)
+
 # Generic support for adding a unittest.
 function(add_unittest test_suite test_name)
   if( NOT LLVM_BUILD_TESTS )
@@ -1770,27 +1795,7 @@ function(add_unittest test_suite test_name)
   get_subproject_title(subproject_title)
   set_target_properties(${test_name} PROPERTIES FOLDER "${subproject_title}/Tests/Unit")
 
-  # The runtime benefits of LTO don't outweight the compile time costs for tests.
-  if(LLVM_ENABLE_LTO)
-    if((UNIX OR MINGW) AND LINKER_IS_LLD)
-      if(LLVM_ENABLE_FATLTO AND NOT APPLE)
-        # When using FatLTO, just use relocatable linking.
-        set_property(TARGET ${test_name} APPEND_STRING PROPERTY
-                      LINK_FLAGS " -Wl,--no-fat-lto-objects")
-      else()
-        set_property(TARGET ${test_name} APPEND_STRING PROPERTY
-                      LINK_FLAGS " -Wl,--lto-O0")
-      endif()
-    elseif(LINKER_IS_LLD_LINK)
-      set_property(TARGET ${test_name} APPEND_STRING PROPERTY
-                    LINK_FLAGS " /opt:lldlto=0")
-    elseif(APPLE AND NOT uppercase_LLVM_ENABLE_LTO STREQUAL "THIN")
-      set_property(TARGET ${target_name} APPEND_STRING PROPERTY
-                    LINK_FLAGS " -Wl,-mllvm,-O0")
-    endif()
-  endif()
-
-  target_link_options(${test_name} PRIVATE "${LLVM_UNITTEST_LINK_FLAGS}")
+  set_unittest_link_flags(${test_name})
 
   set(outdir ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR})
   set_output_directory(${test_name} BINARY_DIR ${outdir} LIBRARY_DIR ${outdir})
diff --git a/llvm/docs/CodeGenerator.rst b/llvm/docs/CodeGenerator.rst
index fc704a3cdd51f..a74f16d7e9477 100644
--- a/llvm/docs/CodeGenerator.rst
+++ b/llvm/docs/CodeGenerator.rst
@@ -498,7 +498,7 @@ The ``MachineBasicBlock`` class
 The ``MachineBasicBlock`` class contains a list of machine instructions
 (:raw-html:`<tt>` `MachineInstr`_ :raw-html:`</tt>` instances).  It roughly
 corresponds to the LLVM code input to the instruction selector, but there can be
-a one-to-many mapping (i.e. one LLVM basic block can map to multiple machine
+a one-to-many mapping (i.e., one LLVM basic block can map to multiple machine
 basic blocks). The ``MachineBasicBlock`` class has a "``getBasicBlock``" method,
 which returns the LLVM basic block that it comes from.
 
@@ -522,7 +522,7 @@ LLVM code generator can model sequences of instructions as MachineInstr
 bundles. A MI bundle can model a VLIW group / pack which contains an arbitrary
 number of parallel instructions. It can also be used to model a sequential list
 of instructions (potentially with data dependencies) that cannot be legally
-separated (e.g. ARM Thumb2 IT blocks).
+separated (e.g., ARM Thumb2 IT blocks).
 
 Conceptually a MI bundle is a MI with a number of other MIs nested within:
 
@@ -583,8 +583,8 @@ Packing / bundling of MachineInstrs for VLIW architectures should
 generally be done as part of the register allocation super-pass. More
 specifically, the pass which determines what MIs should be bundled
 together should be done after code generator exits SSA form
-(i.e. after two-address pass, PHI elimination, and copy coalescing).
-Such bundles should be finalized (i.e. adding BUNDLE MIs and input and
+(i.e., after two-address pass, PHI elimination, and copy coalescing).
+Such bundles should be finalized (i.e., adding BUNDLE MIs and input and
 output register MachineOperands) after virtual registers have been
 rewritten into physical registers. This eliminates the need to add
 virtual register operands to BUNDLE instructions which would
@@ -615,7 +615,7 @@ The ``MCStreamer`` API
 ----------------------
 
 MCStreamer is best thought of as an assembler API.  It is an abstract API which
-is *implemented* in different ways (e.g. to output a ``.s`` file, output an ELF ``.o``
+is *implemented* in different ways (e.g., to output a ``.s`` file, output an ELF ``.o``
 file, etc) but whose API corresponds directly to what you see in a ``.s`` file.
 MCStreamer has one method per directive, such as EmitLabel, EmitSymbolAttribute,
 switchSection, emitValue (for .byte, .word), etc, which directly correspond to
@@ -631,7 +631,7 @@ directives through MCStreamer.
 On the implementation side of MCStreamer, there are two major implementations:
 one for writing out a ``.s`` file (MCAsmStreamer), and one for writing out a ``.o``
 file (MCObjectStreamer).  MCAsmStreamer is a straightforward implementation
-that prints out a directive for each method (e.g. ``EmitValue -> .byte``), but
+that prints out a directive for each method (e.g., ``EmitValue -> .byte``), but
 MCObjectStreamer implements a full assembler.
 
 For target-specific directives, the MCStreamer has a MCTargetStreamer instance.
@@ -681,7 +681,7 @@ The ``MCSection`` class
 -----------------------
 
 The ``MCSection`` class represents an object-file specific section. It is
-subclassed by object file specific implementations (e.g. ``MCSectionMachO``,
+subclassed by object file specific implementations (e.g., ``MCSectionMachO``,
 ``MCSectionCOFF``, ``MCSectionELF``) and these are created and uniqued by
 MCContext.  The MCStreamer has a notion of the current section, which can be
 changed with the SwitchToSection method (which corresponds to a ".section"
@@ -696,7 +696,7 @@ The ``MCInst`` class is a target-independent representation of an instruction.
 It is a simple class (much more so than `MachineInstr`_) that holds a
 target-specific opcode and a vector of MCOperands.  MCOperand, in turn, is a
 simple discriminated union of three cases: 1) a simple immediate, 2) a target
-register ID, 3) a symbolic expression (e.g. "``Lfoo-Lbar+42``") as an MCExpr.
+register ID, 3) a symbolic expression (e.g., "``Lfoo-Lbar+42``") as an MCExpr.
 
 MCInst is the common currency used to represent machine instructions at the MC
 layer.  It is the type used by the instruction encoder, the instruction printer,
@@ -711,9 +711,9 @@ The MC layer's object writers support a variety of object formats. Because of
 target-specific aspects of object formats each target only supports a subset of
 the formats supported by the MC layer. Most targets support emitting ELF
 objects. Other vendor-specific objects are generally supported only on targets
-that are supported by that vendor (i.e. MachO is only supported on targets
+that are supported by that vendor (i.e., MachO is only supported on targets
 supported by Darwin, and XCOFF is only supported on targets that support AIX).
-Additionally some targets have their own object formats (i.e. DirectX, SPIR-V
+Additionally some targets have their own object formats (i.e., DirectX, SPIR-V
 and WebAssembly).
 
 The table below captures a snapshot of object file support in LLVM:
@@ -769,7 +769,7 @@ Introduction to SelectionDAGs
 
 The SelectionDAG provides an abstraction for code representation in a way that
 is amenable to instruction selection using automatic techniques
-(e.g. dynamic-programming based optimal pattern matching selectors). It is also
+(e.g., dynamic-programming based optimal pattern matching selectors). It is also
 well-suited to other phases of code generation; in particular, instruction
 scheduling (SelectionDAG's are very close to scheduling DAGs post-selection).
 Additionally, the SelectionDAG provides a host representation where a large
@@ -898,7 +898,7 @@ Initial SelectionDAG Construction
 The initial SelectionDAG is na\ :raw-html:`&iuml;`\ vely peephole expanded from
 the LLVM input by the ``SelectionDAGBuilder`` class.  The intent of this pass
 is to expose as much low-level, target-specific details to the SelectionDAG as
-possible.  This pass is mostly hard-coded (e.g. an LLVM ``add`` turns into an
+possible.  This pass is mostly hard-coded (e.g., an LLVM ``add`` turns into an
 ``SDNode add`` while a ``getelementptr`` is expanded into the obvious
 arithmetic). This pass requires target-specific hooks to lower calls, returns,
 varargs, etc.  For these features, the :raw-html:`<tt>` `TargetLowering`_
@@ -944,7 +944,7 @@ The Legalize phase is in charge of converting a DAG to only use the operations
 that are natively supported by the target.
 
 Targets often have weird constraints, such as not supporting every operation on
-every supported data type (e.g. X86 does not support byte conditional moves and
+every supported data type (e.g., X86 does not support byte conditional moves and
 PowerPC does not support sign-extending loads from a 16-bit memory location).
 Legalize takes care of this by open-coding another sequence of operations to
 emulate the operation ("expansion"), by promoting one type to a larger type that
@@ -995,7 +995,7 @@ SelectionDAG Optimization Phase: the DAG Combiner
 
 The SelectionDAG optimization phase is run multiple times for code generation,
 immediately after the DAG is built and once after each legalization.  The first
-run of the pass allows the initial code to be cleaned up (e.g. performing
+run of the pass allows the initial code to be cleaned up (e.g., performing
 optimizations that depend on knowing that the operators have restricted type
 inputs).  Subsequent runs of the pass clean up the messy code generated by the
 Legalize passes, which allows Legalize to be very simple (it can focus on making
@@ -1120,10 +1120,10 @@ for your target.  It has the following strengths:
   16-bits of the immediate).
 
 * When using the 'Pat' class to map a pattern to an instruction that has one
-  or more complex operands (like e.g. `X86 addressing mode`_), the pattern may
+  or more complex operands (like e.g., `X86 addressing mode`_), the pattern may
   either specify the operand as a whole using a ``ComplexPattern``, or else it
   may specify the components of the complex operand separately.  The latter is
-  done e.g. for pre-increment instructions by the PowerPC back end:
+  done e.g., for pre-increment instructions by the PowerPC back end:
 
   ::
 
@@ -1145,13 +1145,13 @@ While it has many strengths, the system currently has some limitations,
 primarily because it is a work in progress and is not yet finished:
 
 * Overall, there is no way to define or match SelectionDAG nodes that define
-  multiple values (e.g. ``SMUL_LOHI``, ``LOAD``, ``CALL``, etc).  This is the
+  multiple values (e.g., ``SMUL_LOHI``, ``LOAD``, ``CALL``, etc).  This is the
   biggest reason that you currently still *have to* write custom C++ code
   for your instruction selector.
 
 * There is no great way to support matching complex addressing modes yet.  In
   the future, we will extend pattern fragments to allow them to define multiple
-  values (e.g. the four operands of the `X86 addressing mode`_, which are
+  values (e.g., the four operands of the `X86 addressing mode`_, which are
   currently matched with custom C++ code).  In addition, we'll extend fragments
   so that a fragment can match multiple different patterns.
 
@@ -1175,7 +1175,7 @@ SelectionDAG Scheduling and Formation Phase
 
 The scheduling phase takes the DAG of target instructions from the selection
 phase and assigns an order.  The scheduler can pick an order depending on
-various constraints of the machines (i.e. order for minimal register pressure or
+various constraints of the machines (i.e., order for minimal register pressure or
 try to cover instruction latencies).  Once an order is established, the DAG is
 converted to a list of :raw-html:`<tt>` `MachineInstr`_\s :raw-html:`</tt>` and
 the SelectionDAG is destroyed.
@@ -1615,7 +1615,7 @@ Since the MC layer works at the level of abstraction of object files, it doesn't
 have a notion of functions, global variables etc.  Instead, it thinks about
 labels, directives, and instructions.  A key class used at this time is the
 MCStreamer class.  This is an abstract API that is implemented in different ways
-(e.g. to output a ``.s`` file, output an ELF ``.o`` file, etc) that is effectively an
+(e.g., to output a ``.s`` file, output an ELF ``.o`` file, etc) that is effectively an
 "assembler API".  MCStreamer has one method per directive, such as EmitLabel,
 EmitSymbolAttribute, switchSection, etc, which directly correspond to assembly
 level directives.
diff --git a/llvm/docs/CommandGuide/llvm-config.rst b/llvm/docs/CommandGuide/llvm-config.rst
index 63658d0d90452..1c5c9c7447902 100644
--- a/llvm/docs/CommandGuide/llvm-config.rst
+++ b/llvm/docs/CommandGuide/llvm-config.rst
@@ -126,6 +126,11 @@ OPTIONS
 
  Print the installation prefix for LLVM.
 
+**--quote-paths**
+
+ Quote and escape paths when needed, most notably when a quote, space, backslash
+ or dollar sign characters are present in the path.
+
 **--shared-mode**
 
  Print how the provided components can be collectively linked (`shared` or `static`).
diff --git a/llvm/docs/CommandGuide/llvm-cxxfilt.rst b/llvm/docs/CommandGuide/llvm-cxxfilt.rst
index 8c61cedd9b70b..8e509cec6ce02 100644
--- a/llvm/docs/CommandGuide/llvm-cxxfilt.rst
+++ b/llvm/docs/CommandGuide/llvm-cxxfilt.rst
@@ -54,8 +54,7 @@ OPTIONS
 
 .. option:: --no-strip-underscore, -n
 
-  Do not strip a leading underscore. This is the default for all platforms
-  except Mach-O based hosts.
+  Do not strip a leading underscore. This is the default for all platforms.
 
 .. option:: --quote
 
@@ -64,7 +63,7 @@ OPTIONS
 .. option:: --strip-underscore, -_
 
   Strip a single leading underscore, if present, from each input name before
-  demangling. On by default on Mach-O based platforms.
+  demangling.
 
 .. option:: --types, -t
 
diff --git a/llvm/docs/DeveloperPolicy.rst b/llvm/docs/DeveloperPolicy.rst
index 45f2df20984e6..9135406c2e2a1 100644
--- a/llvm/docs/DeveloperPolicy.rst
+++ b/llvm/docs/DeveloperPolicy.rst
@@ -1189,6 +1189,55 @@ Suggested disclaimer for the project README and the main project web page:
    necessarily a reflection of the completeness or stability of the code, it
    does indicate that the project is not yet endorsed as a component of LLVM.
 
+Adding or enabling a new LLVM pass
+----------------------------------
+
+The guidelines here are primarily targeted at the enablement of new major
+passes in the target-independent optimization pipeline. Small additions, or
+backend-specific passes, require a lesser degree of care. Before creating a new
+pass, consider whether the functionality can be integrated into an existing
+pass first. This is often both faster and more powerful.
+
+When adding a new pass, the goal should be to enable it as part of the default
+optimization pipeline as early as possible and then continue development
+incrementally. (This does not apply to passes that are only relevant for
+specific uses of LLVM, such as GC support passes.)
+
+The recommended workflow is:
+
+1. Implement a basic version of the pass and add it to the pass pipeline behind
+   a flag that is disabled by default. The initial version should focus on
+   handling simple cases correctly and efficiently.
+2. Enable the pass by default. Separating this step allows easily disabling the
+   pass if issues are encountered, without having to revert the entire
+   implementation.
+3. Incrementally extend the pass with new functionality. As the pass is already
+   enabled, it becomes easier to identify the specific change that has caused a
+   regression in correctness, optimization quality or compile-time.
+
+When enabling a pass, certain requirements must be met (in no particular order):
+
+ * **Maintenance:** The pass (and any analyses it depends on) must have at
+   least one maintainer.
+ * **Usefulness:** There should be evidence that the pass improves performance
+   (or whatever metric it optimizes for) on real-world workloads. Improvements
+   seen only on synthetic benchmarks may be insufficient.
+ * **Compile-Time:** The pass should not have a large impact on compile-time,
+   where the evaluation of what "large" means is up to reviewer discretion, and
+   may differ based on the value the pass provides. In any case, it is expected
+   that a concerted effort has been made to mitigate the compile-time impact,
+   both for the average case, and for pathological cases.
+ * **Correctness:** The pass should have no known correctness issues (except
+   global correctness issues that affect all of LLVM). If an old pass is being
+   enabled (rather than implementing a new one incrementally), additional due
+   diligence is required. The pass should be fully reviewed to ensure that it
+   still complies with current quality standards. Fuzzing with disabled
+   profitability checks may help gain additional confidence in the
+   implementation.
+
+If non-trivial issues are found in a newly enabled pass, it may be temporarily
+disabled again, until the issues have been resolved.
+
 .. _copyright-license-patents:
 
 Copyright, License, and Patents
diff --git a/llvm/docs/GettingInvolved.rst b/llvm/docs/GettingInvolved.rst
index 4b4b09ad87aba..0dba9412564d4 100644
--- a/llvm/docs/GettingInvolved.rst
+++ b/llvm/docs/GettingInvolved.rst
@@ -223,6 +223,10 @@ what to add to your calendar invite.
      - `ics <https://calendar.google.com/calendar/ical/c_673c6cd64474c0aff173bf8fa609559f93d654e0984d9d91d71abd32d28c0486%40group.calendar.google.com/public/basic.ics>`__
        `gcal <https://calendar.google.com/calendar/embed?src=c_673c6cd64474c0aff173bf8fa609559f93d654e0984d9d91d71abd32d28c0486%40group.calendar.google.com&ctz=America%2FLos_Angeles>`__
      -
+   * - GlobalISel
+     - Every 2nd Tuesday of the month
+     - `gcal <https://calendar.google.com/calendar/u/0?cid=YWZjNzhmMzE4MDNlNTAyNGY1NmE1MDIyODY0YTYwZmJmYzRjYTEwNTE1NmUxODA2NzBkYTliY2ZhYTVkNjk0NUBncm91cC5jYWxlbmRhci5nb29nbGUuY29t>`__
+     - `Meeting details/agenda <https://docs.google.com/document/d/1Ry8O4-Tm5BFj9AMjr8qTQFU80z-ptiNQ62687NaIvLs/edit?usp=sharing>`__
 
 
 For event owners, our Discord bot also supports sending automated announcements
@@ -254,10 +258,6 @@ the future.
      - `ics <https://calendar.google.com/calendar/ical/c_1mincouiltpa24ac14of14lhi4%40group.calendar.google.com/public/basic.ics>`__
        `gcal <https://calendar.google.com/calendar/embed?src=c_1mincouiltpa24ac14of14lhi4%40group.calendar.google.com>`__
      - `Minutes/docs <https://docs.google.com/document/d/1-uEEZfmRdPThZlctOq9eXlmUaSSAAi8oKxhrPY_lpjk/edit#>`__
-   * - GlobalISel
-     - Every 2nd Tuesday of the month
-     - `gcal <https://calendar.google.com/calendar/u/0?cid=ZDcyMjc0ZjZiZjNhMzFlYmE3NTNkMWM2MGM2NjM5ZWU3ZDE2MjM4MGFlZDc2ZjViY2UyYzMwNzVhZjk4MzQ4ZEBncm91cC5jYWxlbmRhci5nb29nbGUuY29t>`__
-     - `Meeting details/agenda <https://docs.google.com/document/d/1Ry8O4-Tm5BFj9AMjr8qTQFU80z-ptiNQ62687NaIvLs/edit?usp=sharing>`__
    * - Vector Predication
      - Every 2 weeks on Tuesdays, 3pm UTC
      -
diff --git a/llvm/docs/HowToCrossCompileBuiltinsOnArm.rst b/llvm/docs/HowToCrossCompileBuiltinsOnArm.rst
index d7759ad8edd06..58599404d5cd4 100644
--- a/llvm/docs/HowToCrossCompileBuiltinsOnArm.rst
+++ b/llvm/docs/HowToCrossCompileBuiltinsOnArm.rst
@@ -8,18 +8,18 @@ Introduction
 This document contains information about building and testing the builtins part
 of compiler-rt for an Arm target, from an x86_64 Linux machine.
 
-While this document concentrates on Arm and Linux the general principles should
+While this document concentrates on Arm and Linux, the general principles should
 apply to other targets supported by compiler-rt. Further contributions for other
 targets are welcome.
 
 The instructions in this document depend on libraries and programs external to
-LLVM, there are many ways to install and configure these dependencies so you
+LLVM. There are many ways to install and configure these dependencies, so you
 may need to adapt the instructions here to fit your own situation.
 
 Prerequisites
 =============
 
-In this use case we will be using cmake on a Debian-based Linux system,
+In this use case, we will be using cmake on a Debian-based Linux system,
 cross-compiling from an x86_64 host to a hard-float Armv7-A target. We will be
 using as many of the LLVM tools as we can, but it is possible to use GNU
 equivalents.
@@ -35,7 +35,7 @@ You will need:
   An existing sysroot is required because some of the builtins include C library
   headers and a sysroot is the easiest way to get those.
 
-In this example we will be using ``ninja`` as the build tool.
+In this example, we will be using ``ninja`` as the build tool.
 
 See https://compiler-rt.llvm.org/ for information about the dependencies
 on clang and LLVM.
@@ -46,7 +46,7 @@ the source for LLVM and compiler-rt.
 ``qemu-arm`` should be available as a package for your Linux distribution.
 
 The most complicated of the prerequisites to satisfy is the ``arm-linux-gnueabihf``
-sysroot. In theory it is possible to use the Linux distributions multiarch
+sysroot. In theory, it is possible to use the Linux distributions multiarch
 support to fulfill the dependencies for building but unfortunately due to
 ``/usr/local/include`` being added some host includes are selected.
 
@@ -153,7 +153,7 @@ The cmake try compile stage fails
 At an early stage cmake will attempt to compile and link a simple C program to
 test if the toolchain is working.
 
-This stage can often fail at link time if the ``--sysroot=``, ``--target`` or
+This stage can often fail at link time if the ``--sysroot=``, ``--target``, or
 ``--gcc-toolchain=`` options are not passed to the compiler. Check the
 ``CMAKE_<LANGUAGE>_FLAGS`` and ``CMAKE_<LANGAUGE>_COMPILER_TARGET`` flags along
 with any of the specific CMake sysroot and toolchain options.
@@ -165,7 +165,7 @@ to make sure it is working. For example::
 
 Clang uses the host header files
 --------------------------------
-On debian based systems it is possible to install multiarch support for
+On Debian-based systems, it is possible to install multiarch support for
 ``arm-linux-gnueabi`` and ``arm-linux-gnueabihf``. In many cases clang can successfully
 use this multiarch support when ``--gcc-toolchain=`` and ``--sysroot=`` are not supplied.
 Unfortunately clang adds ``/usr/local/include`` before
@@ -177,8 +177,8 @@ use a separate ``arm-linux-gnueabihf`` toolchain.
 
 No target passed to clang
 -------------------------
-If clang is not given a target it will typically use the host target, this will
-not understand the Arm assembly language files resulting in error messages such
+If clang is not given a target, it will typically use the host target. This will
+not understand the Arm assembly language files, resulting in error messages such
 as ``error: unknown directive .syntax unified``.
 
 You can check the clang invocation in the error message to see if there is no
@@ -217,7 +217,7 @@ target to use is:
 
 * ``-DCMAKE_C_COMPILER_TARGET=arm-linux-gnueabi``
 
-Depending on whether you want to use floating point instructions or not you
+Depending on whether you want to use floating point instructions or not, you
 may need extra c-flags such as ``-mfloat-abi=softfp`` for use of floating-point
 instructions, and ``-mfloat-abi=soft -mfpu=none`` for software floating-point
 emulation.
@@ -241,7 +241,7 @@ To build and test the libraries using a similar method to Armv7-A is possible
 but more difficult. The main problems are:
 
 * There is not a ``qemu-arm`` user-mode emulator for bare-metal systems.
-  ``qemu-system-arm`` can be used but this is significantly more difficult
+  ``qemu-system-arm`` can be used, but this is significantly more difficult
   to setup. This document does not explain how to do this.
 * The targets to compile compiler-rt have the suffix ``-none-eabi``. This uses
   the BareMetal driver in clang and by default will not find the libraries
@@ -252,8 +252,8 @@ that are supported on Armv7-A we can still get most of the value of running the
 tests using the same ``qemu-arm`` that we used for Armv7-A by building and
 running the test cases for Armv7-A but using the builtins compiled for
 Armv6-M, Armv7-M or Armv7E-M. This will test that the builtins can be linked
-into a binary and execute the tests correctly but it will not catch if the
-builtins use instructions that are supported on Armv7-A but not Armv6-M,
+into a binary and execute the tests correctly, but it will not catch if the
+builtins use instructions that are supported on Armv7-A but not on Armv6-M,
 Armv7-M and Armv7E-M.
 
 This requires a second ``arm-none-eabi`` toolchain for building the builtins.
@@ -321,9 +321,9 @@ command for Armv7-A build and test::
 
 The Armv6-M builtins will use the soft-float ABI. When compiling the tests for
 Armv7-A we must include ``"-mthumb -mfloat-abi=soft -mfpu=none"`` in the
-test-c-flags. We must use an Armv7-A soft-float abi sysroot for ``qemu-arm``.
+test-c-flags. We must use an Armv7-A soft-float ABI sysroot for ``qemu-arm``.
 
-Depending on the linker used for the test cases you may encounter BuildAttribute
+Depending on the linker used for the test cases, you may encounter BuildAttribute
 mismatches between the M-profile objects from compiler-rt and the A-profile
 objects from the test. The lld linker does not check the profile
 BuildAttribute so it can be used to link the tests by adding ``-fuse-ld=lld`` to the
diff --git a/llvm/docs/HowToReleaseLLVM.rst b/llvm/docs/HowToReleaseLLVM.rst
index 171bf889256cd..c269cc4c54bcc 100644
--- a/llvm/docs/HowToReleaseLLVM.rst
+++ b/llvm/docs/HowToReleaseLLVM.rst
@@ -311,10 +311,10 @@ This section describes how to triage bug reports:
    to backport.  You should also review the bug yourself to ensure that it
    meets the requirements for committing to the release branch.
 
-#. Once a bug has been reviewed, add the release:reviewed label and update the
-   issue's status to "Needs Merge".  Check the pull request associated with the
-   issue.  If all the tests pass, then the pull request can be merged.  If not,
-   then add a comment on the issue asking someone to take a look at the failures.
+#. Once a bug has been reviewed, update the status to "Needs Merge". Check the
+   pull request associated with the issue. If all the tests pass, then the pull
+   request can be merged. If not, then add a comment on the issue asking
+   someone to take a look at the failures.
 
 
 Release Patch Rules
diff --git a/llvm/docs/HowToSubmitABug.rst b/llvm/docs/HowToSubmitABug.rst
index 002087cc55e0a..d62391b5da745 100644
--- a/llvm/docs/HowToSubmitABug.rst
+++ b/llvm/docs/HowToSubmitABug.rst
@@ -6,26 +6,26 @@ Introduction - Got bugs?
 ========================
 
 
-If you're working with LLVM and run into a bug, we definitely want to know
+If you're working with LLVM and encounter a bug, we definitely want to know
 about it.  This document describes what you can do to increase the odds of
 getting it fixed quickly.
 
 🔒 If you believe that the bug is security related, please follow :ref:`report-security-issue`. 🔒
 
-Basically you have to do two things at a minimum. First, decide whether the
+Basically, you have to do two things at a minimum. First, decide whether the
 bug `crashes the compiler`_ or if the compiler is `miscompiling`_ the program
 (i.e., the compiler successfully produces an executable, but it doesn't run
 right). Based on what type of bug it is, follow the instructions in the
 linked section to narrow down the bug so that the person who fixes it will be
 able to find the problem more easily.
 
-Once you have a reduced test-case, go to `the LLVM Bug Tracking System
+Once you have a reduced test case, go to `the LLVM Bug Tracking System
 <https://github.com/llvm/llvm-project/issues>`_ and fill out the form with the
 necessary details (note that you don't need to pick a label, just use if you're
 not sure).  The bug description should contain the following information:
 
 * All information necessary to reproduce the problem.
-* The reduced test-case that triggers the bug.
+* The reduced test case that triggers the bug.
 * The location where you obtained LLVM (if not from our Git
   repository).
 
@@ -39,10 +39,10 @@ Crashing Bugs
 More often than not, bugs in the compiler cause it to crash---often due to
 an assertion failure of some sort. The most important piece of the puzzle
 is to figure out if it is crashing in the Clang front-end or if it is one of
-the LLVM libraries (e.g. the optimizer or code generator) that has
+the LLVM libraries (e.g., the optimizer or code generator) that has
 problems.
 
-To figure out which component is crashing (the front-end, middle-end
+To identify the crashing component (the front-end, middle-end
 optimizer, or backend code generator), run the ``clang`` command line as you
 were when the crash occurred, but with the following extra command line
 options:
@@ -53,7 +53,7 @@ options:
   <frontend-crash>`.
 
 * ``-emit-llvm``: If ``clang`` crashes with this option (which disables
-  the code generator), you found a middle-end optimizer bug. Jump ahead to
+  the code generator), you've found a middle-end optimizer bug. Jump ahead to
   :ref:`middle-end bugs <middleend-crash>`.
 
 * Otherwise, you have a backend code generator crash. Jump ahead to :ref:`code
@@ -102,19 +102,19 @@ functions. Then run:
 If this doesn't crash, please follow the instructions for a :ref:`front-end
 bug <frontend-crash>`.
 
-If this does crash, then you should be able to debug this with the following
+If this does crash, then you can debug this with the following
 :doc:`bugpoint <Bugpoint>` command:
 
 .. code-block:: bash
 
    bugpoint foo.bc -O3
 
-Run this, then file a bug with the instructions and reduced .bc
+Run this, then file a bug with the instructions and reduced ``.bc``
 files that bugpoint emits.
 
 If bugpoint doesn't reproduce the crash,
 :doc:`llvm-reduce <CommandGuide/llvm-reduce>` is an alternative way to reduce
-LLVM IR. Create a script that repros the crash and run:
+LLVM IR. Create a script that reproduces the crash and run:
 
 .. code-block:: bash
 
@@ -137,16 +137,16 @@ Backend code generator bugs
 ---------------------------
 
 If you find a bug that crashes clang in the code generator, compile your
-source file to a .bc file by passing "``-emit-llvm -c -o foo.bc``" to
-clang (in addition to the options you already pass).  Once your have
-foo.bc, one of the following commands should fail:
+source file to a ``.bc`` file by passing "``-emit-llvm -c -o foo.bc``" to
+clang (in addition to the options you already pass).  Once you have
+``foo.bc``, one of the following commands should fail:
 
 #. ``llc foo.bc``
 #. ``llc foo.bc -relocation-model=pic``
 #. ``llc foo.bc -relocation-model=static``
 
 If none of these crash, please follow the instructions for a :ref:`front-end
-bug<frontend-crash>`. If one of these do crash, you should be able to reduce
+bug<frontend-crash>`. If one of these crashes, you should be able to reduce
 this with one of the following :doc:`bugpoint <Bugpoint>` command lines (use
 the one corresponding to the command above that failed):
 
@@ -154,9 +154,9 @@ the one corresponding to the command above that failed):
 #. ``bugpoint -run-llc foo.bc --tool-args -relocation-model=pic``
 #. ``bugpoint -run-llc foo.bc --tool-args -relocation-model=static``
 
-Please run this, then file a bug with the instructions and reduced .bc file
+Please run this, then file a bug with the instructions and reduced ``.bc`` file
 that bugpoint emits.  If something goes wrong with bugpoint, please submit
-the "foo.bc" file and the option that llc crashes with.
+the ``foo.bc`` file and the option that llc crashes with.
 
 LTO bugs
 ---------------------------
@@ -174,7 +174,7 @@ in addition to your existing compilation options:
 These options enable LTO and save temporary files generated during compilation
 for later analysis.
 
-On Windows, you should be using lld-link as the linker. Adjust your compilation 
+On Windows, use lld-link as the linker. Adjust your compilation 
 flags as follows:
 * Add ``/lldsavetemps`` to the linker flags.
 * When linking from the compiler driver, add ``/link /lldsavetemps`` in order to forward that flag to the linker.
@@ -199,7 +199,7 @@ command line (use the bc file corresponding to the command above that failed):
 
    llvm-reduce --test reduce.sh a.out.0.2.internalize.bc
 
-Example of reduce.sh script
+Example of ``reduce.sh`` script
 
 .. code-block:: bash
 
@@ -209,9 +209,9 @@ Example of reduce.sh script
    path/to/not --crash path/to/opt "-passes=lto<O3>" $1 -o temp.bc  2> err.log
    grep -q "It->second == &Insn" err.log
 
-Here we have grepped the failed assert message.
+Here we have grepped for the failed assert message.
 
-Please run this, then file a bug with the instructions and reduced .bc file
+Please run this, then file a bug with the instructions and reduced ``.bc`` file
 that llvm-reduce emits.
 
 .. _miscompiling:
@@ -221,16 +221,16 @@ Miscompilations
 
 If clang successfully produces an executable, but that executable doesn't run
 right, this is either a bug in the code or a bug in the compiler. The first
-thing to check is to make sure it is not using undefined behavior (e.g.
+thing to check is to make sure it is not using undefined behavior (e.g.,
 reading a variable before it is defined). In particular, check to see if the
 program is clean under various `sanitizers
-<https://github.com/google/sanitizers>`_ (e.g. ``clang
+<https://github.com/google/sanitizers>`_ (e.g., ``clang
 -fsanitize=undefined,address``) and `valgrind <http://valgrind.org/>`_. Many
 "LLVM bugs" that we have chased down ended up being bugs in the program being
 compiled, not LLVM.
 
 Once you determine that the program itself is not buggy, you should choose
-which code generator you wish to compile the program with (e.g. LLC or the JIT)
+which code generator you wish to compile the program with (e.g., LLC or the JIT)
 and optionally a series of LLVM passes to run.  For example:
 
 .. code-block:: bash
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 1c6823be44dcb..3c089b5a0ba79 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -159,7 +159,7 @@ There are two kinds of escapes.
 * ``\\`` represents a single ``\`` character.
 
 * ``\`` followed by two hexadecimal characters (0-9, a-f, or A-F)
-  represents the byte with the given value (e.g. ``\00`` represents a
+  represents the byte with the given value (e.g., ``\00`` represents a
   null byte).
 
 To represent a ``"`` character, use ``\22``. (``\"`` will end the string
@@ -168,7 +168,7 @@ with a trailing ``\``.)
 Newlines do not terminate string constants; strings can span multiple
 lines.
 
-The interpretation of string constants (e.g. their character encoding)
+The interpretation of string constants (e.g., their character encoding)
 depends on context.
 
 
@@ -330,7 +330,7 @@ added in the future:
     the function (as does normal C).
 "``fastcc``" - The fast calling convention
     This calling convention attempts to make calls as fast as possible
-    (e.g. by passing things in registers). This calling convention
+    (e.g., by passing things in registers). This calling convention
     allows the target to use whatever tricks it wants to produce fast
     code for the target, without having to conform to an externally
     specified ABI (Application Binary Interface). `Tail calls can only
@@ -465,7 +465,7 @@ added in the future:
     This calling convention doesn't preserve any general registers. So all
     general registers are caller saved registers. It also uses all general
     registers to pass arguments. This attribute doesn't impact non-general
-    purpose registers (e.g. floating point registers, on X86 XMMs/YMMs).
+    purpose registers (e.g., floating point registers, on X86 XMMs/YMMs).
     Non-general purpose registers still follow the standard C calling
     convention. Currently it is for x86_64 and AArch64 only.
 "``cxx_fast_tlscc``" - The `CXX_FAST_TLS` calling convention for access functions
@@ -668,7 +668,7 @@ representation is not just an integer address are called "non-integral".
 Non-integral pointers have at least one of the following three properties:
 
 * the pointer representation contains non-address bits
-* the pointer representation is unstable (may changed at any time in a
+* the pointer representation is unstable (may change at any time in a
   target-specific way)
 * the pointer representation has external state
 
@@ -700,7 +700,7 @@ Unstable pointer representation
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Pointers in this address space have an *unspecified* bitwise representation
-(i.e. not backed by a fixed integer). The bitwise pattern of such pointers is
+(i.e., not backed by a fixed integer). The bitwise pattern of such pointers is
 allowed to change in a target-specific way. For example, this could be a pointer
 type used with copying garbage collection where the garbage collector could
 update the pointer at any time in the collection sweep.
@@ -757,7 +757,7 @@ The following restrictions apply to IR level optimization passes:
 
 The ``inttoptr`` instruction does not recreate the external state and therefore
 it is target dependent whether it can be used to create a dereferenceable
-pointer. In general passes should assume that the result of such an inttoptr
+pointer. In general passes should assume that the result of such an ``inttoptr``
 is not dereferenceable. For example, on CHERI targets an ``inttoptr`` will
 yield a capability with the external state (the validity tag bit) set to zero,
 which will cause any dereference to trap.
@@ -784,7 +784,7 @@ be performed as loads and stores of the correct type since stores of other
 types may not propagate the external data.
 Therefore it is not legal to convert an existing load/store (or a
 ``llvm.memcpy`` / ``llvm.memmove`` intrinsic) of pointer types with external
-state to a load/store of an integer type with same bitwidth, as that may drop
+state to a load/store of an integer type with the same bitwidth, as that may drop
 the external state.
 
 
@@ -806,7 +806,7 @@ Global variables can optionally specify a :ref:`linkage type <linkage>`.
 Either global variable definitions or declarations may have an explicit section
 to be placed in and may have an optional explicit alignment specified. If there
 is a mismatch between the explicit or inferred section information for the
-variable declaration and its definition the resulting behavior is undefined.
+variable declaration and its definition, the resulting behavior is undefined.
 
 A variable may be defined as a global ``constant``, which indicates that
 the contents of the variable will **never** be modified (enabling better
@@ -903,7 +903,7 @@ size is unknown at compile time. They are allowed in structs to facilitate
 intrinsics returning multiple values. Generally, structs containing scalable
 vectors are not considered "sized" and cannot be used in loads, stores, allocas,
 or GEPs. The only exception to this rule is for structs that contain scalable
-vectors of the same type (e.g. ``{<vscale x 2 x i32>, <vscale x 2 x i32>}``
+vectors of the same type (e.g., ``{<vscale x 2 x i32>, <vscale x 2 x i32>}``
 contains the same type while ``{<vscale x 2 x i32>, <vscale x 2 x i64>}``
 doesn't). These kinds of structs (we may call them homogeneous scalable vector
 structs) are considered sized and can be used in loads, stores, allocas, but
@@ -1221,7 +1221,7 @@ sections.
 Note that certain IR constructs like global variables and functions may
 create COMDATs in the object file in addition to any which are specified using
 COMDAT IR. This arises when the code generator is configured to emit globals
-in individual sections (e.g. when `-data-sections` or `-function-sections`
+in individual sections (e.g., when `-data-sections` or `-function-sections`
 is supplied to `llc`).
 
 .. _namedmetadatastructure:
@@ -1334,7 +1334,7 @@ Currently, only the following parameter attributes are defined:
     The byval type argument indicates the in-memory value type.
 
     The byval attribute also supports specifying an alignment with the
-    align attribute. It indicates the alignment of the stack slot to
+    ``align`` attribute. It indicates the alignment of the stack slot to
     form and the known alignment of the pointer specified to the call
     site. If the alignment is not specified, then the code generator
     makes a target-specific assumption.
@@ -1355,7 +1355,7 @@ Currently, only the following parameter attributes are defined:
 
     This is not a valid attribute for return values.
 
-    The alignment for an ``byref`` parameter can be explicitly
+    The alignment for a ``byref`` parameter can be explicitly
     specified by combining it with the ``align`` attribute, similar to
     ``byval``. If the alignment is not specified, then the code generator
     makes a target-specific assumption.
@@ -1382,7 +1382,7 @@ Currently, only the following parameter attributes are defined:
     The preallocated attribute requires a type argument.
 
     The preallocated attribute also supports specifying an alignment with the
-    align attribute. It indicates the alignment of the stack slot to
+    ``align`` attribute. It indicates the alignment of the stack slot to
     form and the known alignment of the pointer specified to the call
     site. If the alignment is not specified, then the code generator
     makes a target-specific assumption.
@@ -1550,7 +1550,7 @@ Currently, only the following parameter attributes are defined:
 
 ``nonnull``
     This indicates that the parameter or return pointer is not null. This
-    attribute may only be applied to pointer typed parameters. This is not
+    attribute may only be applied to pointer-typed parameters. This is not
     checked or enforced by LLVM; if the parameter or return pointer is null,
     :ref:`poison value <poisonvalues>` is returned or passed instead.
     The ``nonnull`` attribute should be combined with the ``noundef`` attribute
@@ -1558,7 +1558,7 @@ Currently, only the following parameter attributes are defined:
 
 ``dereferenceable(<n>)``
     This indicates that the parameter or return pointer is dereferenceable. This
-    attribute may only be applied to pointer typed parameters. A pointer that
+    attribute may only be applied to pointer-typed parameters. A pointer that
     is dereferenceable can be loaded from speculatively without a risk of
     trapping. The number of bytes known to be dereferenceable must be provided
     in parentheses. It is legal for the number of bytes to be less than the
@@ -1584,7 +1584,7 @@ Currently, only the following parameter attributes are defined:
     implies that a pointer is at least one of ``dereferenceable(<n>)``
     or ``null`` (i.e., it may be both ``null`` and
     ``dereferenceable(<n>)``). This attribute may only be applied to
-    pointer typed parameters.
+    pointer-typed parameters.
 
 ``swiftself``
     This indicates that the parameter is the self/context parameter. This is not
@@ -1601,7 +1601,7 @@ Currently, only the following parameter attributes are defined:
 
 ``swifterror``
     This attribute is motivated to model and optimize Swift error handling. It
-    can be applied to a parameter with pointer to pointer type or a
+    can be applied to a parameter with pointer-to-pointer type or a
     pointer-sized alloca. At the call site, the actual argument that corresponds
     to a ``swifterror`` parameter has to come from a ``swifterror`` alloca or
     the ``swifterror`` parameter of the caller. A ``swifterror`` value (either
@@ -1722,7 +1722,7 @@ Currently, only the following parameter attributes are defined:
     The function parameter marked with this attribute is the alignment in bytes of the
     newly allocated block returned by this function. The returned value must either have
     the specified alignment or be the null pointer. The return value MAY be more aligned
-    than the requested alignment, but not less aligned.  Invalid (e.g. non-power-of-2)
+    than the requested alignment, but not less aligned.  Invalid (e.g., non-power-of-2)
     alignments are permitted for the allocalign parameter, so long as the returned pointer
     is null. This attribute may only be applied to integer parameters.
 
@@ -1989,7 +1989,7 @@ functions will use the same set of attributes. In the degenerate case of a
 group will capture the important command line flags used to build that file.
 
 An attribute group is a module-level object. To use an attribute group, an
-object references the attribute group's ID (e.g. ``#37``). An object may refer
+object references the attribute group's ID (e.g., ``#37``). An object may refer
 to more than one attribute group. In that situation, the attributes from the
 different groups are merged.
 
@@ -2222,7 +2222,7 @@ For example:
     - ``errnomem``: This refers to accesses to the ``errno`` variable.
     - The default access kind (specified without a location prefix) applies to
       all locations that haven't been specified explicitly, including those that
-      don't currently have a dedicated location kind (e.g. accesses to globals
+      don't currently have a dedicated location kind (e.g., accesses to globals
       or captured pointers).
 
     If the ``memory`` attribute is not specified, then ``memory(readwrite)``
@@ -2713,7 +2713,7 @@ For example:
 
 ``mustprogress``
     This attribute indicates that the function is required to return, unwind,
-    or interact with the environment in an observable way e.g. via a volatile
+    or interact with the environment in an observable way e.g., via a volatile
     memory access, I/O, or other synchronization.  The ``mustprogress``
     attribute is intended to model the requirements of the first section of
     [intro.progress] of the C++ Standard. As a consequence, a loop in a
@@ -2851,7 +2851,7 @@ are grouped into a single :ref:`attribute group <attrgrp>`.
     with `__attribute__((no_sanitize("memtag")))`,
     `__attribute__((disable_sanitizer_instrumentation))`, or included in the
     `-fsanitize-ignorelist` file. The AArch64 Globals Tagging pass may remove
-    this attribute when it's not possible to tag the global (e.g. it's a TLS
+    this attribute when it's not possible to tag the global (e.g., it's a TLS
     variable).
 ``sanitize_address_dyninit``
     This attribute indicates that the global variable, when instrumented with
@@ -3076,7 +3076,7 @@ the behavior is undefined, unless one of the following exceptions applies:
 
 * ``dereferenceable(<n>)`` operand bundles only guarantee the pointer is
   dereferenceable at the point of the assumption. The pointer may not be
-  dereferenceable at later pointers, e.g. because it could have been freed.
+  dereferenceable at later pointers, e.g., because it could have been freed.
 
 In addition to allowing operand bundles encoding function and parameter
 attributes, an assume operand bundle may also encode a ``separate_storage``
@@ -3270,7 +3270,7 @@ as follows:
     address space 0.
     Note: variable declarations without an address space are always created in
     address space 0, this property only affects the default value to be used
-    when creating globals without additional contextual information (e.g. in
+    when creating globals without additional contextual information (e.g., in
     LLVM passes).
 
 .. _alloca_addrspace:
@@ -3282,7 +3282,7 @@ as follows:
     This specifies the properties of a pointer in address space ``as``.
     The ``<size>`` parameter specifies the size of the bitwise representation.
     For :ref:`non-integral pointers <nointptrtype>` the representation size may
-    be larger than the address width of the underlying address space (e.g. to
+    be larger than the address width of the underlying address space (e.g., to
     accommodate additional metadata).
     The alignment requirements are specified via the ``<abi>`` and
     ``<pref>``\erred alignments parameters.
@@ -3478,7 +3478,7 @@ variables) may *not* change their size. (``realloc``-style operations do not
 change the size of an existing allocated object; instead, they create a new
 allocated object. Even if the object is at the same location as the old one, old
 pointers cannot be used to access this new object.) However, allocated objects
-can also be created by means not recognized by LLVM, e.g. by directly calling
+can also be created by means not recognized by LLVM, e.g., by directly calling
 ``mmap``. Those allocated objects are allowed to grow to the right (i.e.,
 keeping the same base address, but increasing their size) while maintaining the
 validity of existing pointers, as long as they always satisfy the properties
@@ -3632,7 +3632,7 @@ through the return value only:
     }
 
 However, we always consider direct inspection of the pointer address
-(e.g. using ``ptrtoint``) to be location-independent. The following example
+(e.g., using ``ptrtoint``) to be location-independent. The following example
 is *not* considered a return-only capture, even though the ``ptrtoint``
 ultimately only contributes to the return value:
 
@@ -4145,7 +4145,7 @@ output, given the original flags.
    ``a * (c / b)`` can be rewritten into ``a / (b / c)``.
 
 ``contract``
-   Allow floating-point contraction (e.g. fusing a multiply followed by an
+   Allow floating-point contraction (e.g., fusing a multiply followed by an
    addition into a fused multiply-and-add). This does not enable reassociation
    to form arbitrary contractions. For example, ``(a*b) + (c*d) + e`` can not
    be transformed into ``(a*b) + ((c*d) + e)`` to create two fma operations.
@@ -4440,7 +4440,7 @@ the default globals address space and ``addrspace("P")`` the program address
 space.
 
 The representation of pointers can be different for each address space and does
-not necessarily need to be a plain integer address (e.g. for
+not necessarily need to be a plain integer address (e.g., for
 :ref:`non-integral pointers <nointptrtype>`). In addition to a representation
 bits size, pointers in each address space also have an index size which defines
 the bitwidth of indexing operations as well as the size of `integer addresses`
@@ -4750,7 +4750,7 @@ is inserted as defined by the DataLayout string in the module, which is
 required to match what the underlying code generator expects.
 
 Structures can either be "literal" or "identified". A literal structure
-is defined inline with other types (e.g. ``[2 x {i32, i32}]``) whereas
+is defined inline with other types (e.g., ``[2 x {i32, i32}]``) whereas
 identified types are always defined at the top level with a name.
 Literal types are uniqued by their contents and can never be recursive
 or opaque since there is no way to write one. Identified types can be
@@ -4791,7 +4791,7 @@ Simple Constants
     Standard integers (such as '4') are constants of the :ref:`integer
     <t_integer>` type. They can be either decimal or
     hexadecimal. Decimal integers can be prefixed with - to represent
-    negative integers, e.g. '``-1234``'. Hexadecimal integers must be
+    negative integers, e.g., '``-1234``'. Hexadecimal integers must be
     prefixed with either u or s to indicate whether they are unsigned
     or signed respectively. e.g '``u0x8000``' gives 32768, whilst
     '``s0x8000``' gives -32768.
@@ -4801,7 +4801,7 @@ Simple Constants
     zeros. So '``s0x0001``' of type '``i16``' will be -1, not 1.
 **Floating-point constants**
     Floating-point constants use standard decimal notation (e.g.
-    123.421), exponential notation (e.g. 1.23421e+2), or a more precise
+    123.421), exponential notation (e.g., 1.23421e+2), or a more precise
     hexadecimal notation (see below). The assembler requires the exact
     decimal value of a floating-point constant. For example, the
     assembler accepts 1.25 but rejects 1.3 because 1.3 is a repeating
@@ -4883,7 +4883,7 @@ constants and smaller complex constants.
     The string '``zeroinitializer``' can be used to zero initialize a
     value to zero of *any* type, including scalar and
     :ref:`aggregate <t_aggregate>` types. This is often used to avoid
-    having to print large zero initializers (e.g. for large arrays) and
+    having to print large zero initializers (e.g., for large arrays) and
     is always exactly equivalent to using explicit zero initializers.
 **Metadata node**
     A metadata node is a constant tuple without types. For example:
@@ -5286,7 +5286,7 @@ Constant Expressions
 Constant expressions are used to allow expressions involving other
 constants to be used as constants. Constant expressions may be of any
 :ref:`first class <t_firstclass>` type and may involve any LLVM operation
-that does not have side effects (e.g. load and call are not supported).
+that does not have side effects (e.g., load and call are not supported).
 The following is the syntax for constant expressions:
 
 ``trunc (CST to TYPE)``
@@ -5472,7 +5472,7 @@ There are also three different categories of constraint codes:
 Output constraints
 """"""""""""""""""
 
-Output constraints are specified by an "``=``" prefix (e.g. "``=r``"). This
+Output constraints are specified by an "``=``" prefix (e.g., "``=r``"). This
 indicates that the assembly will write to this operand, and the operand will
 then be made available as a return value of the ``asm`` expression. Output
 constraints do not consume an argument from the call instruction. (Except, see
@@ -5480,10 +5480,10 @@ below about indirect outputs).
 
 Normally, it is expected that no output locations are written to by the assembly
 expression until *all* of the inputs have been read. As such, LLVM may assign
-the same register to an output and an input. If this is not safe (e.g. if the
+the same register to an output and an input. If this is not safe (e.g., if the
 assembly contains two instructions, where the first writes to one output, and
 the second reads an input and writes to a second output), then the "``&``"
-modifier must be used (e.g. "``=&r``") to specify that the output is an
+modifier must be used (e.g., "``=&r``") to specify that the output is an
 "early-clobber" output. Marking an output as "early-clobber" ensures that LLVM
 will not use the same register for any inputs (other than an input tied to this
 output).
@@ -5523,17 +5523,17 @@ However, this feature is often not as useful as you might think.
 
 Firstly, the registers are *not* guaranteed to be consecutive. So, on those
 architectures that have instructions which operate on multiple consecutive
-instructions, this is not an appropriate way to support them. (e.g. the 32-bit
+instructions, this is not an appropriate way to support them. (e.g., the 32-bit
 SparcV8 has a 64-bit load, which instruction takes a single 32-bit register. The
 hardware then loads into both the named register, and the next register. This
 feature of inline asm would not be useful to support that.)
 
 A few of the targets provide a template string modifier allowing explicit access
-to the second register of a two-register operand (e.g. MIPS ``L``, ``M``, and
+to the second register of a two-register operand (e.g., MIPS ``L``, ``M``, and
 ``D``). On such an architecture, you can actually access the second allocated
 register (yet, still, not any subsequent ones). But, in that case, you're still
 probably better off simply splitting the value into two separate operands, for
-clarity. (e.g. see the description of the ``A`` constraint on X86, which,
+clarity. (e.g., see the description of the ``A`` constraint on X86, which,
 despite existing only for use with this feature, is not really a good idea to
 use)
 
@@ -5549,11 +5549,11 @@ rather than producing a return value. An indirect output constraint is an
 "output" only in that the asm is expected to write to the contents of the input
 memory location, instead of just read from it).
 
-This is most typically used for memory constraint, e.g. "``=*m``", to pass the
+This is most typically used for memory constraint, e.g., "``=*m``", to pass the
 address of a variable as a value.
 
 It is also possible to use an indirect *register* constraint, but only on output
-(e.g. "``=*r``"). This will cause LLVM to allocate a register for an output
+(e.g., "``=*r``"). This will cause LLVM to allocate a register for an output
 value normally, and then, separately emit a store to the address provided as
 input, after the provided inline asm. (It's not clear what value this
 functionality provides, compared to writing the store explicitly after the asm
@@ -5570,7 +5570,7 @@ Clobber constraints
 A clobber constraint is indicated by a "``~``" prefix. A clobber does not
 consume an input operand, nor generate an output. Clobbers cannot use any of the
 general constraint code letters -- they may use only explicit register
-constraints, e.g. "``~{eax}``". The one exception is that a clobber string of
+constraints, e.g., "``~{eax}``". The one exception is that a clobber string of
 "``~{memory}``" indicates that the assembly writes to arbitrary undeclared
 memory locations -- not only the memory pointed to by a declared indirect
 output.
@@ -5594,9 +5594,9 @@ Constraint Codes
 """"""""""""""""
 After a potential prefix comes constraint code, or codes.
 
-A Constraint Code is either a single letter (e.g. "``r``"), a "``^``" character
-followed by two letters (e.g. "``^wc``"), or "``{``" register-name "``}``"
-(e.g. "``{eax}``").
+A Constraint Code is either a single letter (e.g., "``r``"), a "``^``" character
+followed by two letters (e.g., "``^wc``"), or "``{``" register-name "``}``"
+(e.g., "``{eax}``").
 
 The one and two letter constraint codes are typically chosen to be the same as
 GCC's constraint codes.
@@ -5973,11 +5973,11 @@ Target-independent:
 
 - ``a``: Print a memory reference. Targets might customize the output.
 - ``c``: Print an immediate integer constant unadorned, without
-  the target-specific immediate punctuation (e.g. no ``$`` prefix).
+  the target-specific immediate punctuation (e.g., no ``$`` prefix).
 - ``n``: Negate and print immediate integer constant unadorned, without the
-  target-specific immediate punctuation (e.g. no ``$`` prefix).
+  target-specific immediate punctuation (e.g., no ``$`` prefix).
 - ``l``: Print as an unadorned label, without the target-specific label
-  punctuation (e.g. no ``$`` prefix).
+  punctuation (e.g., no ``$`` prefix).
 
 AArch64:
 
@@ -5998,7 +5998,7 @@ ARM:
   register).
 - ``P``: No effect.
 - ``q``: No effect.
-- ``y``: Print a VFP single-precision register as an indexed double (e.g. print
+- ``y``: Print a VFP single-precision register as an indexed double (e.g., print
   as ``d4[1]`` instead of ``s9``)
 - ``B``: Bitwise invert and print an immediate integer constant without ``#``
   prefix.
@@ -6114,18 +6114,18 @@ X86:
 - ``c``: Print an unadorned integer or symbol name. (The latter is
   target-specific behavior for this typically target-independent modifier).
 - ``A``: Print a register name with a '``*``' before it.
-- ``b``: Print an 8-bit register name (e.g. ``al``); do nothing on a memory
+- ``b``: Print an 8-bit register name (e.g., ``al``); do nothing on a memory
   operand.
-- ``h``: Print the upper 8-bit register name (e.g. ``ah``); do nothing on a
+- ``h``: Print the upper 8-bit register name (e.g., ``ah``); do nothing on a
   memory operand.
-- ``w``: Print the 16-bit register name (e.g. ``ax``); do nothing on a memory
+- ``w``: Print the 16-bit register name (e.g., ``ax``); do nothing on a memory
   operand.
-- ``k``: Print the 32-bit register name (e.g. ``eax``); do nothing on a memory
+- ``k``: Print the 32-bit register name (e.g., ``eax``); do nothing on a memory
   operand.
-- ``q``: Print the 64-bit register name (e.g. ``rax``), if 64-bit registers are
+- ``q``: Print the 64-bit register name (e.g., ``rax``), if 64-bit registers are
   available, otherwise the 32-bit register name; do nothing on a memory operand.
 - ``n``: Negate and print an unadorned integer, or, for operands other than an
-  immediate integer (e.g. a relocatable symbol expression), print a '-' before
+  immediate integer (e.g., a relocatable symbol expression), print a '-' before
   the operand. (The behavior for relocatable symbol expressions is a
   target-specific behavior for this typically target-independent modifier)
 - ``H``: Print a memory reference with additional offset +8.
@@ -6883,7 +6883,7 @@ See :ref:`diexpression` for details.
 .. note::
 
    ``DIExpression``\s are always printed and parsed inline; they can never be
-   referenced by an ID (e.g. ``!1``).
+   referenced by an ID (e.g., ``!1``).
 
 Some examples of expressions:
 
@@ -8469,8 +8469,8 @@ that was typically cold and one allocating memory that was typically not cold.
 The format of the metadata describing a context specific profile (e.g.
 ``!1`` and ``!3`` above) requires a first operand that is a metadata node
 describing the context, followed by a list of string metadata tags describing
-the profile behavior (e.g. ``cold`` and ``notcold``) above. The metadata nodes
-describing the context (e.g. ``!2`` and ``!4`` above) are unique ids
+the profile behavior (e.g., ``cold`` and ``notcold``) above. The metadata nodes
+describing the context (e.g., ``!2`` and ``!4`` above) are unique ids
 corresponding to callsites, which can be matched to associated IR calls via
 :ref:`callsite metadata<md_callsite>`. In practice these ids are formed via
 a hash of the callsite's debug info, and the associated call may be in a
@@ -8946,7 +8946,7 @@ in syntax by a caret ('``^``').
 
 The summary is parsed into a bitcode output, along with the Module
 IR, via the "``llvm-as``" tool. Tools that parse the Module IR for the purposes
-of optimization (e.g. "``clang -x ir``" and "``opt``"), will ignore the
+of optimization (e.g., "``clang -x ir``" and "``opt``"), will ignore the
 summary entries (just as they currently ignore summary entries in a bitcode
 input file).
 
@@ -9176,7 +9176,7 @@ The optional ``Refs`` field looks like:
     refs: ((Ref)[, (Ref)]*)
 
 where each ``Ref`` contains a reference to the summary id of the referenced
-value (e.g. ``^1``).
+value (e.g., ``^1``).
 
 .. _typeidinfo_summary:
 
@@ -10385,7 +10385,7 @@ bit width of the result.
 Because LLVM integers use a two's complement representation, and the
 result is the same width as the operands, this instruction returns the
 correct result for both signed and unsigned integers. If a full product
-(e.g. ``i32`` * ``i32`` -> ``i64``) is needed, the operands should be
+(e.g., ``i32`` * ``i32`` -> ``i64``) is needed, the operands should be
 sign-extended or zero-extended as appropriate to the width of the full
 product.
 
@@ -11378,7 +11378,7 @@ allocation on any convenient boundary compatible with the type.
 '``type``' may be any sized type.
 
 Structs containing scalable vectors cannot be used in allocas unless all
-fields are the same scalable vector type (e.g. ``{<vscale x 2 x i32>,
+fields are the same scalable vector type (e.g., ``{<vscale x 2 x i32>,
 <vscale x 2 x i32>}`` contains the same type while ``{<vscale x 2 x i32>,
 <vscale x 2 x i64>}`` doesn't).
 
@@ -12766,7 +12766,7 @@ pointer then a truncation is done. If ``value`` is smaller than the size
 of a pointer then a zero extension is done. If they are the same size,
 nothing is done (*no-op cast*).
 The behavior is equivalent to a ``bitcast``, however, the resulting value is not
-guaranteed to be dereferenceable (e.g. if the result type is a
+guaranteed to be dereferenceable (e.g., if the result type is a
 :ref:`non-integral pointers <nointptrtype>`).
 
 Example:
@@ -14697,7 +14697,7 @@ C++ object with a non-trivial destructor.  ``llvm.seh.scope.begin`` is used to m
 the start of the region; it is always called with ``invoke``, with the unwind block
 being the desired unwind destination for any potentially-throwing instructions
 within the region.  `llvm.seh.scope.end` is used to mark when the scope ends
-and the EH cleanup is no longer required (e.g. because the destructor is being
+and the EH cleanup is no longer required (e.g., because the destructor is being
 called).
 
 .. _int_read_register:
@@ -14737,7 +14737,7 @@ return the current value of the register, where possible. The
 where possible.
 
 A call to '``llvm.read_volatile_register``' is assumed to have side-effects
-and possibly return a different value each time (e.g. for a timer register).
+and possibly return a different value each time (e.g., for a timer register).
 
 This is useful to implement named register global variables that need
 to always be mapped to a specific register, as is common practice on
@@ -15008,9 +15008,9 @@ flushes the instruction cache.
 Semantics:
 """"""""""
 
-On platforms with coherent instruction and data caches (e.g. x86), this
+On platforms with coherent instruction and data caches (e.g., x86), this
 intrinsic is a nop. On platforms with non-coherent instruction and data
-cache (e.g. ARM, MIPS), the intrinsic is lowered either to appropriate
+cache (e.g., ARM, MIPS), the intrinsic is lowered either to appropriate
 instructions or a system call, if cache flushing requires special
 privileges.
 
@@ -15462,7 +15462,7 @@ A call to '``llvm.call.preallocated.arg``' must have a call site
 ``preallocated`` attribute. The type of the ``preallocated`` attribute must
 match the type used by the ``preallocated`` attribute of the corresponding
 argument at the preallocated call. The type is used in the case that an
-``llvm.call.preallocated.setup`` does not have a corresponding call (e.g. due
+``llvm.call.preallocated.setup`` does not have a corresponding call (e.g., due
 to DCE), where otherwise we cannot know how large the arguments are.
 
 It is undefined behavior if this is called with a token from an
@@ -16656,7 +16656,7 @@ for large input values.
 .. note::
 
   Currently, the default lowering of this intrinsic relies on the ``sincospi[f|l]``
-  functions being available in the target's runtime (e.g. libc).
+  functions being available in the target's runtime (e.g., libc).
 
 When specified with the fast-math-flag 'afn', the result may be approximated
 using a less accurate calculation.
@@ -19719,7 +19719,7 @@ Arguments:
 """"""""""
 
 The integer operand is the loop trip count of the hardware-loop, and thus
-not e.g. the loop back-edge taken count.
+not e.g., the loop back-edge taken count.
 
 Semantics:
 """"""""""
@@ -19758,7 +19758,7 @@ Arguments:
 """"""""""
 
 The integer operand is the loop trip count of the hardware-loop, and thus
-not e.g. the loop back-edge taken count.
+not e.g., the loop back-edge taken count.
 
 Semantics:
 """"""""""
@@ -19794,7 +19794,7 @@ Arguments:
 """"""""""
 
 The integer operand is the loop trip count of the hardware-loop, and thus
-not e.g. the loop back-edge taken count.
+not e.g., the loop back-edge taken count.
 
 Semantics:
 """"""""""
@@ -19832,7 +19832,7 @@ Arguments:
 """"""""""
 
 The integer operand is the loop trip count of the hardware-loop, and thus
-not e.g. the loop back-edge taken count.
+not e.g., the loop back-edge taken count.
 
 Semantics:
 """"""""""
@@ -20768,7 +20768,7 @@ of the result's type, while maintaining the same element type.
 Semantics:
 """"""""""
 
-Other than the reduction operator (e.g. add) the way in which the concatenated
+Other than the reduction operator (e.g., add) the way in which the concatenated
 arguments is reduced is entirely unspecified. By their nature these intrinsics
 are not expected to be useful in isolation but instead implement the first phase
 of an overall reduction operation.
@@ -24286,7 +24286,7 @@ The arguments are scalar types to accommodate scalable vector types, for which
 it is unknown what the type of the step vector needs to be that enumerate its
 lanes without overflow.
 
-This mask ``%m`` can e.g. be used in masked load/store instructions. These
+This mask ``%m`` can e.g., be used in masked load/store instructions. These
 intrinsics provide a hint to the backend. I.e., for a vector loop, the
 back-edge taken count of the original scalar loop is explicit as the second
 argument.
@@ -27966,7 +27966,7 @@ The quiet comparison operation performed by
 if either argument is a SNAN.  The signaling comparison operation
 performed by '``llvm.experimental.constrained.fcmps``' will raise an
 exception if either argument is a NAN (QNAN or SNAN). Such an exception
-does not preclude a result being produced (e.g. exception might only
+does not preclude a result being produced (e.g., exception might only
 set a flag), therefore the distinction between ordered and unordered
 comparisons is also relevant for the
 '``llvm.experimental.constrained.fcmps``' intrinsic.
@@ -29983,7 +29983,7 @@ Semantics:
 
 On some platforms, the value returned by this intrinsic remains unchanged
 between loads in the same thread. On other platforms, it returns the same
-global variable value, if any, e.g. ``@__stack_chk_guard``.
+global variable value, if any, e.g., ``@__stack_chk_guard``.
 
 Currently some platforms have IR-level customized stack guard loading (e.g.
 X86 Linux) that is not handled by ``llvm.stackguard()``, while they should be
diff --git a/llvm/docs/ProgrammersManual.rst b/llvm/docs/ProgrammersManual.rst
index d99b5843c2133..270a635e0d153 100644
--- a/llvm/docs/ProgrammersManual.rst
+++ b/llvm/docs/ProgrammersManual.rst
@@ -113,7 +113,7 @@ rarely have to include this file directly).
 
 ``isa<>``:
   The ``isa<>`` operator works exactly like the Java "``instanceof``" operator.
-  It returns true or false depending on whether a reference or pointer points to
+  It returns ``true`` or ``false`` depending on whether a reference or pointer points to
   an instance of the specified class.  This can be very useful for constraint
   checking of various sorts (example below).
 
@@ -167,7 +167,7 @@ rarely have to include this file directly).
 ``isa_and_present<>``:
   The ``isa_and_present<>`` operator works just like the ``isa<>`` operator,
   except that it allows for a null pointer as an argument (which it then
-  returns false).  This can sometimes be useful, allowing you to combine several
+  returns ``false``).  This can sometimes be useful, allowing you to combine several
   null checks into one.
 
 ``cast_if_present<>``:
@@ -402,7 +402,7 @@ doxygen documentation or by looking at the unit test suite.
 Error handling
 --------------
 
-Proper error handling helps us identify bugs in our code, and helps end-users
+Proper error handling helps us identify bugs in our code, and helps end users
 understand errors in their tool usage. Errors fall into two broad categories:
 *programmatic* and *recoverable*, with different strategies for handling and
 reporting.
@@ -449,10 +449,10 @@ violations even in builds that do not enable assertions:
 Recoverable Errors
 ^^^^^^^^^^^^^^^^^^
 
-Recoverable errors represent an error in the program's environment, for example
+Recoverable errors represent an error in the program's environment, for example,
 a resource failure (a missing file, a dropped network connection, etc.), or
 malformed input. These errors should be detected and communicated to a level of
-the program where they can be handled appropriately. Handling the error may be
+the program that can handle them appropriately. Handling the error may be
 as simple as reporting the issue to the user, or it may involve attempts at
 recovery.
 
@@ -668,7 +668,7 @@ Since the list of handlers passed to ``handleErrors`` may not cover every error
 type that can occur, the ``handleErrors`` function also returns an Error value
 that must be checked or propagated. If the error value that is passed to
 ``handleErrors`` does not match any of the handlers it will be returned from
-handleErrors. Idiomatic use of ``handleErrors`` thus looks like:
+``handleErrors``. Idiomatic use of ``handleErrors`` thus looks like:
 
 .. code-block:: c++
 
@@ -683,18 +683,18 @@ handleErrors. Idiomatic use of ``handleErrors`` thus looks like:
           }))
     return Err;
 
-In cases where you truly know that the handler list is exhaustive the
+In cases where you truly know that the handler list is exhaustive, the
 ``handleAllErrors`` function can be used instead. This is identical to
 ``handleErrors`` except that it will terminate the program if an unhandled
 error is passed in, and can therefore return void. The ``handleAllErrors``
 function should generally be avoided: the introduction of a new error type
 elsewhere in the program can easily turn a formerly exhaustive list of errors
 into a non-exhaustive list, risking unexpected program termination. Where
-possible, use handleErrors and propagate unknown errors up the stack instead.
+possible, use ``handleErrors`` and propagate unknown errors up the stack instead.
 
 For tool code, where errors can be handled by printing an error message then
 exiting with an error code, the :ref:`ExitOnError <err_exitonerr>` utility
-may be a better choice than handleErrors, as it simplifies control flow when
+may be a better choice than ``handleErrors``, as it simplifies control flow when
 calling fallible functions.
 
 In situations where it is known that a particular call to a fallible function
@@ -706,9 +706,9 @@ simplifying control flow.
 StringError
 """""""""""
 
-Many kinds of errors have no recovery strategy, the only action that can be
+Many kinds of errors have no recovery strategy; the only action that can be
 taken is to report them to the user so that the user can attempt to fix the
-environment. In this case representing the error as a string makes perfect
+environment. In this case, representing the error as a string makes perfect
 sense. LLVM provides the ``StringError`` class for this purpose. It takes two
 arguments: A string error message, and an equivalent ``std::error_code`` for
 interoperability. It also provides a ``createStringError`` function to simplify
@@ -721,7 +721,7 @@ common usage of this class:
   createStringError(errc::executable_format_error, "Bad executable");
 
 If you're certain that the error you're building will never need to be converted
-to a ``std::error_code`` you can use the ``inconvertibleErrorCode()`` function:
+to a ``std::error_code``, you can use the ``inconvertibleErrorCode()`` function:
 
 .. code-block:: c++
 
@@ -791,18 +791,18 @@ actually recognises three different forms of handler signature:
   Error(std::unique_ptr<UserDefinedError> E);
 
 Any error returned from a handler will be returned from the ``handleErrors``
-function so that it can be handled itself, or propagated up the stack.
+function so that it can be handled itself or propagated up the stack.
 
 .. _err_exitonerr:
 
 Using ExitOnError to simplify tool code
 """""""""""""""""""""""""""""""""""""""
 
-Library code should never call ``exit`` for a recoverable error, however in tool
+Library code should never call ``exit`` for a recoverable error; however, in tool
 code (especially command line tools) this can be a reasonable approach. Calling
 ``exit`` upon encountering an error dramatically simplifies control flow as the
 error no longer needs to be propagated up the stack. This allows code to be
-written in straight-line style, as long as each fallible call is wrapped in a
+written in a straight-line style, as long as each fallible call is wrapped in a
 check and call to exit. The ``ExitOnError`` class supports this pattern by
 providing call operators that inspect ``Error`` values, stripping the error away
 in the success case and logging to ``stderr`` then exiting in the failure case.
@@ -827,7 +827,7 @@ turning them into non-failing calls:
   }
 
 On failure, the error's log message will be written to ``stderr``, optionally
-preceded by a string "banner" that can be set by calling the setBanner method. A
+preceded by a string "banner" that can be set by calling the ``setBanner`` method. A
 mapping can also be supplied from ``Error`` values to exit codes using the
 ``setExitCodeMapper`` method:
 
@@ -854,8 +854,8 @@ Some functions may only fail for a subset of their inputs, so calls using known
 safe inputs can be assumed to succeed.
 
 The cantFail functions encapsulate this by wrapping an assertion that their
-argument is a success value and, in the case of Expected<T>, unwrapping the
-T value:
+argument is a success value and, in the case of ``Expected<T>``, unwrapping the
+``T`` value:
 
 .. code-block:: c++
 
@@ -868,16 +868,16 @@ T value:
     ...
   }
 
-Like the ExitOnError utility, cantFail simplifies control flow. Their treatment
+Like the ExitOnError utility, ``cantFail`` simplifies control flow. Their treatment
 of error cases is very different, however: Where ExitOnError is guaranteed to
-terminate the program on an error input, cantFail simply asserts that the result
+terminate the program on an error input, ``cantFail`` simply asserts that the result
 is success. In debug builds this will result in an assertion failure if an error
-is encountered. In release builds, the behavior of cantFail for failure values is
-undefined. As such, care must be taken in the use of cantFail: clients must be
-certain that a cantFail wrapped call really can not fail with the given
+is encountered. In release builds, the behavior of ``cantFail`` for failure values is
+undefined. As such, care must be taken in the use of ``cantFail``: clients must be
+certain that a ``cantFail`` wrapped call really can not fail with the given
 arguments.
 
-Use of the cantFail functions should be rare in library code, but they are
+Use of the ``cantFail`` functions should be rare in library code, but they are
 likely to be of more use in tool and unit-test code where inputs and/or
 mocked-up classes or functions may be known to be safe.
 
@@ -979,7 +979,7 @@ completing the walk over the archive they could use the ``joinErrors`` utility:
   }
 
 The ``joinErrors`` routine builds a special error type called ``ErrorList``,
-which holds a list of user defined errors. The ``handleErrors`` routine
+which holds a list of user-defined errors. The ``handleErrors`` routine
 recognizes this type and will attempt to handle each of the contained errors in
 order. If all contained errors can be handled, ``handleErrors`` will return
 ``Error::success()``; otherwise, ``handleErrors`` will concatenate the remaining
@@ -1043,7 +1043,7 @@ compared to ``end`` and found to be unequal (in particular, this marks the
 error as checked throughout the body of a range-based for loop), enabling early
 exit from the loop without redundant error checking.
 
-Instances of the fallible iterator interface (e.g. FallibleChildIterator above)
+Instances of the fallible iterator interface (e.g., FallibleChildIterator above)
 are wrapped using the ``make_fallible_itr`` and ``make_fallible_end``
 functions. E.g.:
 
@@ -1146,7 +1146,7 @@ be passed by value.
 The ``LDBG`` and ``LLVM_DEBUG()`` macros and ``-debug`` option
 --------------------------------------------------------------
 
-Often when working on your pass you will put a bunch of debugging printouts and
+Often, when working on your pass, you will put a bunch of debugging printouts and
 other code into your pass.  After you get it working, you want to remove it, but
 you may need it again in the future (to work out new bugs that you run across).
 
@@ -1183,7 +1183,7 @@ The debug output can be enabled by passing the ``-debug`` command line argument.
   $ opt < a.bc > /dev/null -mypass -debug
   [my-pass MyPass.cpp:123 2] I am here!
 
-While `LDBG()` is useful to add debug output to your code, there are cases
+While ``LDBG()`` is useful to add debug output to your code, there are cases
 where you may need to guard a block of code with a debug check. The
 ``llvm/Support/Debug.h`` (`doxygen
 <https://llvm.org/doxygen/Debug_8h_source.html>`__) file provides a macro named
@@ -1220,7 +1220,7 @@ with ``-debug``.
 Fine grained debug info with ``DEBUG_TYPE`` and the ``-debug-only`` option
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-Sometimes you may find yourself in a situation where enabling ``-debug`` just
+Sometimes, you may find yourself in a situation where enabling ``-debug`` just
 turns on **too much** information (such as when working on the code generator).
 If you want to enable debug information with more fine-grained control, you
 can control the debug type and level with associate with each logging statement
@@ -1389,7 +1389,7 @@ maintainable and useful.
 Adding debug counters to aid in debugging your code
 ---------------------------------------------------
 
-Sometimes, when writing new passes, or trying to track down bugs, it
+Sometimes, when writing new passes or trying to track down bugs, it
 is useful to be able to control whether certain things in your pass
 happen or not.  For example, there are times the minimization tooling
 can only easily give you large testcases.  You would like to narrow
@@ -1640,7 +1640,7 @@ dynamically smaller than N, no malloc is performed.  This can be a big win in
 cases where the malloc/free call is far more expensive than the code that
 fiddles around with the elements.
 
-This is good for vectors that are "usually small" (e.g. the number of
+This is good for vectors that are "usually small" (e.g., the number of
 predecessors/successors of a block is usually less than 8).  On the other hand,
 this makes the size of the ``SmallVector`` itself large, so you don't want to
 allocate lots of them (doing so will waste a lot of space).  As such,
@@ -1684,7 +1684,7 @@ to keep ``sizeof(SmallVector<T>)`` around 64 bytes).
 
    .. code-block:: c++
 
-      // DISCOURAGED: Clients cannot pass e.g. raw arrays.
+      // DISCOURAGED: Clients cannot pass e.g., raw arrays.
       hardcodedContiguousStorage(const SmallVectorImpl<Foo> &In);
       // ENCOURAGED: Clients can pass any contiguous storage of Foo.
       allowsAnyContiguousStorage(ArrayRef<Foo> In);
@@ -1695,7 +1695,7 @@ to keep ``sizeof(SmallVector<T>)`` around 64 bytes).
         allowsAnyContiguousStorage(Vec); // Works.
       }
 
-      // DISCOURAGED: Clients cannot pass e.g. SmallVector<Foo, 8>.
+      // DISCOURAGED: Clients cannot pass e.g., SmallVector<Foo, 8>.
       hardcodedSmallSize(SmallVector<Foo, 2> &Out);
       // ENCOURAGED: Clients can pass any SmallVector<Foo, N>.
       allowsAnySmallSize(SmallVectorImpl<Foo> &Out);
@@ -1729,17 +1729,17 @@ page and one extra indirection when accessing elements with their positional
 index.
 
 In order to minimise the memory footprint of this container, it's important to
-balance the ``PageSize`` so that it's not too small (otherwise the overhead of the
-pointer per page might become too high) and not too big (otherwise the memory
+balance the ``PageSize`` so that it's not too small (otherwise, the overhead of the
+pointer per page might become too high) and not too big (otherwise, the memory
 is wasted if the page is not fully used).
 
 Moreover, while retaining the order of the elements based on their insertion
 index, like a vector, iterating over the elements via ``begin()`` and ``end()``
-is not provided in the API, due to the fact accessing the elements in order
+is not provided in the API, due to the fact that accessing the elements in order
 would allocate all the iterated pages, defeating memory savings and the purpose
 of the ``PagedVector``.
 
-Finally a ``materialized_begin()`` and ``materialized_end`` iterators are
+Finally, ``materialized_begin()`` and ``materialized_end`` iterators are
 provided to access the elements associated to the accessed pages, which could
 speed up operations that need to iterate over initialized elements in a
 non-ordered manner.
@@ -1782,9 +1782,9 @@ loop.
 ^^^^^^^
 
 ``std::deque`` is, in some senses, a generalized version of ``std::vector``.
-Like ``std::vector``, it provides constant time random access and other similar
+Like ``std::vector``, it provides constant-time random access and other similar
 properties, but it also provides efficient access to the front of the list.  It
-does not guarantee continuity of elements within memory.
+does not guarantee the continuity of elements within memory.
 
 In exchange for this extra flexibility, ``std::deque`` has significantly higher
 constant factor costs than ``std::vector``.  If possible, use ``std::vector`` or
@@ -1843,7 +1843,7 @@ Related classes of interest are explained in the following subsections:
 llvm/ADT/PackedVector.h
 ^^^^^^^^^^^^^^^^^^^^^^^
 
-Useful for storing a vector of values using only a few number of bits for each
+Useful for storing a vector of values using only a few bits for each
 value.  Apart from the standard operations of a vector-like container, it can
 also perform an 'or' set operation.
 
@@ -1901,13 +1901,13 @@ non-empty ``ilist``\ s.
 
 The only sensible solution to this problem is to allocate a so-called *sentinel*
 along with the intrusive list, which serves as the ``end`` iterator, providing
-the back-link to the last element.  However conforming to the C++ convention it
+the back-link to the last element.  However, conforming to the C++ convention it
 is illegal to ``operator++`` beyond the sentinel and it also must not be
 dereferenced.
 
 These constraints allow for some implementation freedom to the ``ilist`` how to
 allocate and store the sentinel.  The corresponding policy is dictated by
-``ilist_traits<T>``.  By default a ``T`` gets heap-allocated whenever the need
+``ilist_traits<T>``.  By default, a ``T`` gets heap-allocated whenever the need
 for a sentinel arises.
 
 While the default policy is sufficient in most cases, it may break down when
@@ -1941,7 +1941,7 @@ String-like containers
 
 There are a variety of ways to pass around and use strings in C and C++, and
 LLVM adds a few new options to choose from.  Pick the first option on this list
-that will do what you need, they are ordered according to their relative cost.
+that will do what you need; they are ordered according to their relative cost.
 
 Note that it is generally preferred to *not* pass strings around as ``const
 char*``'s.  These have a number of problems, including the fact that they
@@ -1973,12 +1973,12 @@ either because they are C string literals, ``std::string``, a C array, or a
 ``StringRef`` has a few major limitations which make more powerful string containers
 useful:
 
-#. You cannot directly convert a ``StringRef`` to a 'const char*' because there is
+#. You cannot directly convert a ``StringRef`` to a ``const char*`` because there is
    no way to add a trailing nul (unlike the ``.c_str()`` method on various stronger
    classes).
 
 #. ``StringRef`` doesn't own or keep alive the underlying string bytes.
-   As such it can easily lead to dangling pointers, and is not suitable for
+   As such, it can easily lead to dangling pointers, and is not suitable for
    embedding in datastructures in most cases (instead, use an ``std::string`` or
    something like that).
 
@@ -2064,7 +2064,7 @@ so it can be embedded into heap data structures and returned by-value.  On the
 other hand, ``std::string`` is highly inefficient for inline editing (e.g.
 concatenating a bunch of stuff together) and because it is provided by the
 standard library, its performance characteristics depend a lot of the host
-standard library (e.g. libc++ and MSVC provide a highly optimized string class,
+standard library (e.g., libc++ and MSVC provide a highly optimized string class,
 GCC contains a really slow implementation).
 
 The major disadvantage of ``std::string`` is that almost every operation that makes
@@ -2198,7 +2198,7 @@ physical registers, virtual registers, or numbered basic blocks.
 ``SparseMultiSet`` is useful for algorithms that need very fast
 clear/find/insert/erase of the entire collection, and iteration over sets of
 elements sharing a key. It is often a more efficient choice than using composite
-data structures (e.g. vector-of-vectors, map-of-vectors). It is not intended for
+data structures (e.g., vector-of-vectors, map-of-vectors). It is not intended for
 building composite data structures.
 
 .. _dss_FoldingSet:
@@ -2268,7 +2268,7 @@ iteration.
 The difference between ``SetVector`` and other sets is that the order of iteration
 is guaranteed to match the order of insertion into the ``SetVector``.  This property
 is really important for things like sets of pointers.  Because pointer values
-are non-deterministic (e.g. vary across runs of the program on different
+are non-deterministic (e.g., vary across runs of the program on different
 machines), iterating over the pointers in the set will not be in a well-defined
 order.
 
@@ -2473,7 +2473,7 @@ pair in the map, etc.
 
 ``std::map`` is most useful when your keys or values are very large, if you need to
 iterate over the collection in sorted order, or if you need stable iterators
-into the map (i.e. they don't get invalidated if an insertion or deletion of
+into the map (i.e., they don't get invalidated if an insertion or deletion of
 another element takes place).
 
 .. _dss_mapvector:
@@ -2542,7 +2542,7 @@ There are several bit storage containers, and choosing when to use each is
 relatively straightforward.
 
 One additional option is ``std::vector<bool>``: we discourage its use for two
-reasons 1) the implementation in many common compilers (e.g.  commonly
+reasons 1) the implementation in many common compilers (e.g.,  commonly
 available versions of GCC) is extremely inefficient and 2) the C++ standards
 committee is likely to deprecate this container and/or change it significantly
 somehow.  In any case, please don't use it.
@@ -2557,7 +2557,7 @@ It supports individual bit setting/testing, as well as set operations.  The set
 operations take time O(size of bitvector), but operations are performed one word
 at a time, instead of one bit at a time.  This makes the ``BitVector`` very fast for
 set operations compared to other containers.  Use the ``BitVector`` when you expect
-the number of set bits to be high (i.e. a dense set).
+the number of set bits to be high (i.e., a dense set).
 
 .. _dss_smallbitvector:
 
@@ -3305,7 +3305,7 @@ naming value definitions.  The symbol table can provide a name for any Value_.
 Note that the ``SymbolTable`` class should not be directly accessed by most
 clients.  It should only be used when iteration over the symbol table names
 themselves are required, which is very special purpose.  Note that not all LLVM
-Value_\ s have names, and those without names (i.e. they have an empty name) do
+Value_\ s have names, and those without names (i.e., they have an empty name) do
 not exist in the symbol table.
 
 Symbol tables support iteration over the values in the symbol table with
@@ -3871,7 +3871,7 @@ Important Public Members of the ``Instruction`` class
 
 * ``bool mayWriteToMemory()``
 
-  Returns true if the instruction writes to memory, i.e. it is a ``call``,
+  Returns true if the instruction writes to memory, i.e., it is a ``call``,
   ``free``, ``invoke``, or ``store``.
 
 * ``unsigned getOpcode()``
@@ -3881,7 +3881,7 @@ Important Public Members of the ``Instruction`` class
 * ``Instruction *clone() const``
 
   Returns another instance of the specified instruction, identical in all ways
-  to the original except that the instruction has no parent (i.e. it's not
+  to the original except that the instruction has no parent (i.e., it's not
   embedded into a BasicBlock_), and it has no name.
 
 .. _Constant:
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index 36383b12788f9..bfe68274eae3f 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -180,6 +180,10 @@ Changes to the LLVM tools
 * Some code paths for supporting Python 2.7 in `llvm-lit` have been removed.
 * Support for `%T` in lit has been removed.
 
+* `llvm-config` gained a new flag `--quote-paths` which quotes and escapes paths
+  emitted on stdout, to account for spaces or other special characters in path.
+  (`#97305 <https://github.com/llvm/llvm-project/pull/97305>`_).
+
 Changes to LLDB
 ---------------------------------
 
@@ -191,6 +195,10 @@ Changes to LLDB
 * The `show-progress` setting, which became a NOOP with the introduction of the
   statusline, now defaults to off and controls using OSC escape codes to show a
   native progress bar in supporting terminals like Ghostty and ConEmu.
+* The default PDB reader on Windows was changed from DIA to native, which uses 
+  LLVM's PDB and CodeView support. You can switch back to the DIA reader with
+  `settings set plugin.symbol-file.pdb.reader dia`. Note that support for the
+  DIA reader will be removed in a future version of LLDB.
 
 Changes to BOLT
 ---------------------------------
diff --git a/llvm/examples/Kaleidoscope/Chapter9/toy.cpp b/llvm/examples/Kaleidoscope/Chapter9/toy.cpp
index 51457a3c22ade..14081fb3c3b10 100644
--- a/llvm/examples/Kaleidoscope/Chapter9/toy.cpp
+++ b/llvm/examples/Kaleidoscope/Chapter9/toy.cpp
@@ -203,7 +203,7 @@ class ExprAST {
 
 public:
   ExprAST(SourceLocation Loc = CurLoc) : Loc(Loc) {}
-  virtual ~ExprAST() {}
+  virtual ~ExprAST() = default;
   virtual Value *codegen() = 0;
   int getLine() const { return Loc.Line; }
   int getCol() const { return Loc.Col; }
diff --git a/llvm/examples/OptSubcommand/llvm-hello-sub.cpp b/llvm/examples/OptSubcommand/llvm-hello-sub.cpp
index 8071f56cb3685..8c0363f93803c 100644
--- a/llvm/examples/OptSubcommand/llvm-hello-sub.cpp
+++ b/llvm/examples/OptSubcommand/llvm-hello-sub.cpp
@@ -46,7 +46,7 @@ class HelloSubOptTable : public GenericOptTable {
   HelloSubOptTable()
       : GenericOptTable(OptionStrTable, OptionPrefixesTable, InfoTable,
                         /*IgnoreCase=*/false, OptionSubCommands,
-                        OptionSubCommandIDsTable) {}
+                        OptionSubCommandIDsTable) = default;
 };
 } // namespace
 
diff --git a/llvm/include/llvm/ADT/AddressRanges.h b/llvm/include/llvm/ADT/AddressRanges.h
index 79ba5d5a3eddb..6ea097d544011 100644
--- a/llvm/include/llvm/ADT/AddressRanges.h
+++ b/llvm/include/llvm/ADT/AddressRanges.h
@@ -21,7 +21,7 @@ namespace llvm {
 /// a start and an end address: [Start, End).
 class AddressRange {
 public:
-  AddressRange() {}
+  AddressRange() = default;
   AddressRange(uint64_t S, uint64_t E) : Start(S), End(E) {
     assert(Start <= End);
   }
diff --git a/llvm/include/llvm/ADT/ArrayRef.h b/llvm/include/llvm/ADT/ArrayRef.h
index 448d10013d371..450f4d04c97fc 100644
--- a/llvm/include/llvm/ADT/ArrayRef.h
+++ b/llvm/include/llvm/ADT/ArrayRef.h
@@ -66,10 +66,6 @@ namespace llvm {
     /// Construct an empty ArrayRef.
     /*implicit*/ ArrayRef() = default;
 
-    /// Construct an empty ArrayRef from std::nullopt.
-    /*implicit*/ LLVM_DEPRECATED("Use {} or ArrayRef<T>() instead", "{}")
-    ArrayRef(std::nullopt_t) {}
-
     /// Construct an ArrayRef from a single element.
     /*implicit*/ ArrayRef(const T &OneElt LLVM_LIFETIME_BOUND)
         : Data(&OneElt), Length(1) {}
diff --git a/llvm/include/llvm/ADT/FloatingPointMode.h b/llvm/include/llvm/ADT/FloatingPointMode.h
index 0314b4cb1c38a..a9702c65e631f 100644
--- a/llvm/include/llvm/ADT/FloatingPointMode.h
+++ b/llvm/include/llvm/ADT/FloatingPointMode.h
@@ -191,7 +191,7 @@ inline DenormalMode::DenormalModeKind
 parseDenormalFPAttributeComponent(StringRef Str) {
   // Assume ieee on unspecified attribute.
   return StringSwitch<DenormalMode::DenormalModeKind>(Str)
-      .Cases("", "ieee", DenormalMode::IEEE)
+      .Cases({"", "ieee"}, DenormalMode::IEEE)
       .Case("preserve-sign", DenormalMode::PreserveSign)
       .Case("positive-zero", DenormalMode::PositiveZero)
       .Case("dynamic", DenormalMode::Dynamic)
diff --git a/llvm/include/llvm/ADT/GenericCycleImpl.h b/llvm/include/llvm/ADT/GenericCycleImpl.h
index 40390789e2deb..00f85ca819f3f 100644
--- a/llvm/include/llvm/ADT/GenericCycleImpl.h
+++ b/llvm/include/llvm/ADT/GenericCycleImpl.h
@@ -561,6 +561,17 @@ auto GenericCycleInfo<ContextT>::getSmallestCommonCycle(CycleT *A,
   return A;
 }
 
+/// \brief Find the innermost cycle containing both given blocks.
+///
+/// \returns the innermost cycle containing both \p A and \p B
+///          or nullptr if there is no such cycle.
+template <typename ContextT>
+auto GenericCycleInfo<ContextT>::getSmallestCommonCycle(BlockT *A,
+                                                        BlockT *B) const
+    -> CycleT * {
+  return getSmallestCommonCycle(getCycle(A), getCycle(B));
+}
+
 /// \brief get the depth for the cycle which containing a given block.
 ///
 /// \returns the depth for the innermost cycle containing \p Block or 0 if it is
diff --git a/llvm/include/llvm/ADT/GenericCycleInfo.h b/llvm/include/llvm/ADT/GenericCycleInfo.h
index b8b6e3e9967a4..c31bab3c178ca 100644
--- a/llvm/include/llvm/ADT/GenericCycleInfo.h
+++ b/llvm/include/llvm/ADT/GenericCycleInfo.h
@@ -298,6 +298,7 @@ template <typename ContextT> class GenericCycleInfo {
 
   CycleT *getCycle(const BlockT *Block) const;
   CycleT *getSmallestCommonCycle(CycleT *A, CycleT *B) const;
+  CycleT *getSmallestCommonCycle(BlockT *A, BlockT *B) const;
   unsigned getCycleDepth(const BlockT *Block) const;
   CycleT *getTopLevelParentCycle(BlockT *Block);
 
diff --git a/llvm/include/llvm/ADT/StringMap.h b/llvm/include/llvm/ADT/StringMap.h
index 01cbf2d3fff71..7901365daa462 100644
--- a/llvm/include/llvm/ADT/StringMap.h
+++ b/llvm/include/llvm/ADT/StringMap.h
@@ -302,7 +302,7 @@ class LLVM_ALLOCATORHOLDER_EMPTYBASE StringMap
       if (FindInRHS == RHS.end())
         return false;
 
-      if constexpr (!std::is_same_v<ValueTy, std::nullopt_t>) {
+      if constexpr (!std::is_same_v<ValueTy, EmptyStringSetTag>) {
         if (!(KeyValue.getValue() == FindInRHS->getValue()))
           return false;
       }
diff --git a/llvm/include/llvm/ADT/StringMapEntry.h b/llvm/include/llvm/ADT/StringMapEntry.h
index 21be5ec343059..b0a3c8cd68abc 100644
--- a/llvm/include/llvm/ADT/StringMapEntry.h
+++ b/llvm/include/llvm/ADT/StringMapEntry.h
@@ -21,6 +21,9 @@
 
 namespace llvm {
 
+/// The "value type" of StringSet represented as an empty struct.
+struct EmptyStringSetTag {};
+
 /// StringMapEntryBase - Shared base class of StringMapEntry instances.
 class StringMapEntryBase {
   size_t keyLength;
@@ -85,14 +88,13 @@ class StringMapEntryStorage : public StringMapEntryBase {
 };
 
 template <>
-class StringMapEntryStorage<std::nullopt_t> : public StringMapEntryBase {
+class StringMapEntryStorage<EmptyStringSetTag> : public StringMapEntryBase {
 public:
-  explicit StringMapEntryStorage(size_t keyLength,
-                                 std::nullopt_t = std::nullopt)
+  explicit StringMapEntryStorage(size_t keyLength, EmptyStringSetTag = {})
       : StringMapEntryBase(keyLength) {}
   StringMapEntryStorage(StringMapEntryStorage &entry) = delete;
 
-  std::nullopt_t getValue() const { return std::nullopt; }
+  EmptyStringSetTag getValue() const { return {}; }
 };
 
 /// StringMapEntry - This is used to represent one value that is inserted into
diff --git a/llvm/include/llvm/ADT/StringSet.h b/llvm/include/llvm/ADT/StringSet.h
index c8be3f2a503e4..dc154af073f2f 100644
--- a/llvm/include/llvm/ADT/StringSet.h
+++ b/llvm/include/llvm/ADT/StringSet.h
@@ -22,8 +22,8 @@ namespace llvm {
 
 /// StringSet - A wrapper for StringMap that provides set-like functionality.
 template <class AllocatorTy = MallocAllocator>
-class StringSet : public StringMap<std::nullopt_t, AllocatorTy> {
-  using Base = StringMap<std::nullopt_t, AllocatorTy>;
+class StringSet : public StringMap<EmptyStringSetTag, AllocatorTy> {
+  using Base = StringMap<EmptyStringSetTag, AllocatorTy>;
 
 public:
   StringSet() = default;
diff --git a/llvm/include/llvm/ADT/StringSwitch.h b/llvm/include/llvm/ADT/StringSwitch.h
index 98685de8573fa..8c8d31bd4f055 100644
--- a/llvm/include/llvm/ADT/StringSwitch.h
+++ b/llvm/include/llvm/ADT/StringSwitch.h
@@ -89,6 +89,7 @@ class StringSwitch {
     return CasesImpl(CaseStrings, Value);
   }
 
+  [[deprecated("Pass cases in std::initializer_list instead")]]
   StringSwitch &Cases(StringLiteral S0, StringLiteral S1, T Value) {
     return CasesImpl({S0, S1}, Value);
   }
@@ -173,6 +174,7 @@ class StringSwitch {
     return CasesLowerImpl(CaseStrings, Value);
   }
 
+  [[deprecated("Pass cases in std::initializer_list instead")]]
   StringSwitch &CasesLower(StringLiteral S0, StringLiteral S1, T Value) {
     return CasesLowerImpl({S0, S1}, Value);
   }
diff --git a/llvm/include/llvm/ADT/TypeSwitch.h b/llvm/include/llvm/ADT/TypeSwitch.h
index 5657303b0a1f2..50ca1d5a6b5b6 100644
--- a/llvm/include/llvm/ADT/TypeSwitch.h
+++ b/llvm/include/llvm/ADT/TypeSwitch.h
@@ -111,6 +111,7 @@ class TypeSwitch : public detail::TypeSwitchBase<TypeSwitch<T, ResultT>, T> {
       return std::move(*result);
     return defaultFn(this->value);
   }
+
   /// As a default, return the given value.
   [[nodiscard]] ResultT Default(ResultT defaultResult) {
     if (result)
@@ -118,6 +119,22 @@ class TypeSwitch : public detail::TypeSwitchBase<TypeSwitch<T, ResultT>, T> {
     return defaultResult;
   }
 
+  /// Default for pointer-like results types that accept `nullptr`.
+  template <typename ArgT = ResultT,
+            typename =
+                std::enable_if_t<std::is_constructible_v<ArgT, std::nullptr_t>>>
+  [[nodiscard]] ResultT Default(std::nullptr_t) {
+    return Default(ResultT(nullptr));
+  }
+
+  /// Default for optional results types that accept `std::nullopt`.
+  template <typename ArgT = ResultT,
+            typename =
+                std::enable_if_t<std::is_constructible_v<ArgT, std::nullopt_t>>>
+  [[nodiscard]] ResultT Default(std::nullopt_t) {
+    return Default(ResultT(std::nullopt));
+  }
+
   /// Declare default as unreachable, making sure that all cases were handled.
   [[nodiscard]] ResultT DefaultUnreachable(
       const char *message = "Fell off the end of a type-switch") {
diff --git a/llvm/include/llvm/Analysis/AliasAnalysis.h b/llvm/include/llvm/Analysis/AliasAnalysis.h
index 1681079054b8b..878b7e7a1fb3b 100644
--- a/llvm/include/llvm/Analysis/AliasAnalysis.h
+++ b/llvm/include/llvm/Analysis/AliasAnalysis.h
@@ -861,7 +861,7 @@ class AAResultBase {
 
   // Provide all the copy and move constructors so that derived types aren't
   // constrained.
-  AAResultBase(const AAResultBase &Arg) {}
+  AAResultBase(const AAResultBase &Arg) = default;
   AAResultBase(AAResultBase &&Arg) {}
 
 public:
diff --git a/llvm/include/llvm/Analysis/ConstantFolding.h b/llvm/include/llvm/Analysis/ConstantFolding.h
index 5f91f9747bb97..ea22ed48ab763 100644
--- a/llvm/include/llvm/Analysis/ConstantFolding.h
+++ b/llvm/include/llvm/Analysis/ConstantFolding.h
@@ -119,12 +119,6 @@ ConstantFoldFPInstOperands(unsigned Opcode, Constant *LHS, Constant *RHS,
 LLVM_ABI Constant *FlushFPConstant(Constant *Operand, const Instruction *I,
                                    bool IsOutput);
 
-/// Attempt to constant fold a select instruction with the specified
-/// operands. The constant result is returned if successful; if not, null is
-/// returned.
-LLVM_ABI Constant *ConstantFoldSelectInstruction(Constant *Cond, Constant *V1,
-                                                 Constant *V2);
-
 /// Attempt to constant fold a cast with the specified operand.  If it
 /// fails, it returns a constant expression of the specified operand.
 LLVM_ABI Constant *ConstantFoldCastOperand(unsigned Opcode, Constant *C,
@@ -135,40 +129,6 @@ LLVM_ABI Constant *ConstantFoldCastOperand(unsigned Opcode, Constant *C,
 LLVM_ABI Constant *ConstantFoldIntegerCast(Constant *C, Type *DestTy,
                                            bool IsSigned, const DataLayout &DL);
 
-/// ConstantFoldInsertValueInstruction - Attempt to constant fold an insertvalue
-/// instruction with the specified operands and indices.  The constant result is
-/// returned if successful; if not, null is returned.
-LLVM_ABI Constant *ConstantFoldInsertValueInstruction(Constant *Agg,
-                                                      Constant *Val,
-                                                      ArrayRef<unsigned> Idxs);
-
-/// Attempt to constant fold an extractvalue instruction with the
-/// specified operands and indices.  The constant result is returned if
-/// successful; if not, null is returned.
-LLVM_ABI Constant *ConstantFoldExtractValueInstruction(Constant *Agg,
-                                                       ArrayRef<unsigned> Idxs);
-
-/// Attempt to constant fold an insertelement instruction with the
-/// specified operands and indices.  The constant result is returned if
-/// successful; if not, null is returned.
-LLVM_ABI Constant *ConstantFoldInsertElementInstruction(Constant *Val,
-                                                        Constant *Elt,
-                                                        Constant *Idx);
-
-/// Attempt to constant fold an extractelement instruction with the
-/// specified operands and indices.  The constant result is returned if
-/// successful; if not, null is returned.
-LLVM_ABI Constant *ConstantFoldExtractElementInstruction(Constant *Val,
-                                                         Constant *Idx);
-
-/// Attempt to constant fold a shufflevector instruction with the
-/// specified operands and mask.  See class ShuffleVectorInst for a description
-/// of the mask representation. The constant result is returned if successful;
-/// if not, null is returned.
-LLVM_ABI Constant *ConstantFoldShuffleVectorInstruction(Constant *V1,
-                                                        Constant *V2,
-                                                        ArrayRef<int> Mask);
-
 /// Extract value of C at the given Offset reinterpreted as Ty. If bits past
 /// the end of C are accessed, they are assumed to be poison.
 LLVM_ABI Constant *ConstantFoldLoadFromConst(Constant *C, Type *Ty,
diff --git a/llvm/include/llvm/Analysis/ConstraintSystem.h b/llvm/include/llvm/Analysis/ConstraintSystem.h
index 307ad50e81fec..1d9ac49a54745 100644
--- a/llvm/include/llvm/Analysis/ConstraintSystem.h
+++ b/llvm/include/llvm/Analysis/ConstraintSystem.h
@@ -64,7 +64,7 @@ class ConstraintSystem {
   SmallVector<std::string> getVarNamesList() const;
 
 public:
-  ConstraintSystem() {}
+  ConstraintSystem() = default;
   ConstraintSystem(ArrayRef<Value *> FunctionArgs) {
     NumVariables += FunctionArgs.size();
     for (auto *Arg : FunctionArgs) {
diff --git a/llvm/include/llvm/Analysis/DDG.h b/llvm/include/llvm/Analysis/DDG.h
index 1c5329181ddb1..120bb46330a79 100644
--- a/llvm/include/llvm/Analysis/DDG.h
+++ b/llvm/include/llvm/Analysis/DDG.h
@@ -60,11 +60,7 @@ class LLVM_ABI DDGNode : public DDGNodeBase {
   DDGNode(DDGNode &&N) : DDGNodeBase(std::move(N)), Kind(N.Kind) {}
   virtual ~DDGNode() = 0;
 
-  DDGNode &operator=(const DDGNode &N) {
-    DGNode::operator=(N);
-    Kind = N.Kind;
-    return *this;
-  }
+  DDGNode &operator=(const DDGNode &N) = default;
 
   DDGNode &operator=(DDGNode &&N) {
     DGNode::operator=(std::move(N));
diff --git a/llvm/include/llvm/Analysis/DOTGraphTraitsPass.h b/llvm/include/llvm/Analysis/DOTGraphTraitsPass.h
index ba5ee1d7db487..19a202f78c6ce 100644
--- a/llvm/include/llvm/Analysis/DOTGraphTraitsPass.h
+++ b/llvm/include/llvm/Analysis/DOTGraphTraitsPass.h
@@ -80,7 +80,7 @@ struct DOTGraphTraitsViewer
   /// virtual destructor needed. Making this dtor protected stops accidental
   /// invocation when the derived class destructor should have been called.
   /// Those derived classes sould be marked final to avoid the warning.
-  ~DOTGraphTraitsViewer() {}
+  ~DOTGraphTraitsViewer() = default;
 
 private:
   StringRef Name;
@@ -161,7 +161,7 @@ struct DOTGraphTraitsPrinter
   /// virtual destructor needed. Making this dtor protected stops accidental
   /// invocation when the derived class destructor should have been called.
   /// Those derived classes sould be marked final to avoid the warning.
-  ~DOTGraphTraitsPrinter() {}
+  ~DOTGraphTraitsPrinter() = default;
 
 private:
   StringRef Name;
diff --git a/llvm/include/llvm/Analysis/IR2Vec.h b/llvm/include/llvm/Analysis/IR2Vec.h
index 5ad62880a779c..7a68773a2643a 100644
--- a/llvm/include/llvm/Analysis/IR2Vec.h
+++ b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -72,7 +72,7 @@ enum class IR2VecKind { Symbolic, FlowAware };
 
 namespace ir2vec {
 
-extern llvm::cl::OptionCategory IR2VecCategory;
+LLVM_ABI extern llvm::cl::OptionCategory IR2VecCategory;
 LLVM_ABI extern cl::opt<float> OpcWeight;
 LLVM_ABI extern cl::opt<float> TypeWeight;
 LLVM_ABI extern cl::opt<float> ArgWeight;
@@ -110,8 +110,8 @@ struct Embedding {
     return Data[Itr];
   }
 
-  using iterator = typename std::vector<double>::iterator;
-  using const_iterator = typename std::vector<double>::const_iterator;
+  using iterator = std::vector<double>::iterator;
+  using const_iterator = std::vector<double>::const_iterator;
 
   iterator begin() { return Data.begin(); }
   iterator end() { return Data.end(); }
@@ -161,7 +161,7 @@ class VocabStorage {
 
 public:
   /// Default constructor creates empty storage (invalid state)
-  VocabStorage() : Sections(), TotalSize(0), Dimension(0) {}
+  VocabStorage() = default;
 
   /// Create a VocabStorage with pre-organized section data
   VocabStorage(std::vector<std::vector<Embedding>> &&SectionData);
diff --git a/llvm/include/llvm/Analysis/LoopIterator.h b/llvm/include/llvm/Analysis/LoopIterator.h
index 523d2a21825d0..1ac8e68bfa2f1 100644
--- a/llvm/include/llvm/Analysis/LoopIterator.h
+++ b/llvm/include/llvm/Analysis/LoopIterator.h
@@ -45,12 +45,12 @@ struct LoopBodyTraits {
   class WrappedSuccIterator
       : public iterator_adaptor_base<
             WrappedSuccIterator, succ_iterator,
-            typename std::iterator_traits<succ_iterator>::iterator_category,
-            NodeRef, std::ptrdiff_t, NodeRef *, NodeRef> {
+            std::iterator_traits<succ_iterator>::iterator_category, NodeRef,
+            std::ptrdiff_t, NodeRef *, NodeRef> {
     using BaseT = iterator_adaptor_base<
         WrappedSuccIterator, succ_iterator,
-        typename std::iterator_traits<succ_iterator>::iterator_category,
-        NodeRef, std::ptrdiff_t, NodeRef *, NodeRef>;
+        std::iterator_traits<succ_iterator>::iterator_category, NodeRef,
+        std::ptrdiff_t, NodeRef *, NodeRef>;
 
     const Loop *L;
 
diff --git a/llvm/include/llvm/Analysis/MemorySSA.h b/llvm/include/llvm/Analysis/MemorySSA.h
index cbb942f022244..07d39ab3e10a9 100644
--- a/llvm/include/llvm/Analysis/MemorySSA.h
+++ b/llvm/include/llvm/Analysis/MemorySSA.h
@@ -1247,7 +1247,7 @@ class upward_defs_iterator
     return DefIterator == Other.DefIterator;
   }
 
-  typename std::iterator_traits<BaseT>::reference operator*() const {
+  std::iterator_traits<BaseT>::reference operator*() const {
     assert(DefIterator != OriginalAccess->defs_end() &&
            "Tried to access past the end of our iterator");
     return CurrentPair;
diff --git a/llvm/include/llvm/Analysis/TargetFolder.h b/llvm/include/llvm/Analysis/TargetFolder.h
index d27455cf3505d..cbce482ef47ab 100644
--- a/llvm/include/llvm/Analysis/TargetFolder.h
+++ b/llvm/include/llvm/Analysis/TargetFolder.h
@@ -20,6 +20,7 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/IR/ConstantFold.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/IRBuilderFolder.h"
 #include "llvm/IR/Operator.h"
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 7b7dc1b46dd80..0f17312b03827 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1764,7 +1764,7 @@ class TargetTransformInfo {
   /// \param Types List of types to check.
   LLVM_ABI bool areTypesABICompatible(const Function *Caller,
                                       const Function *Callee,
-                                      const ArrayRef<Type *> &Types) const;
+                                      ArrayRef<Type *> Types) const;
 
   /// The type of load/store indexing.
   enum MemIndexedMode {
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 4cd607c0d0c8d..aacb88d2f9684 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1028,7 +1028,7 @@ class TargetTransformInfoImplBase {
 
   virtual bool areTypesABICompatible(const Function *Caller,
                                      const Function *Callee,
-                                     const ArrayRef<Type *> &Types) const {
+                                     ArrayRef<Type *> Types) const {
     return (Caller->getFnAttribute("target-cpu") ==
             Callee->getFnAttribute("target-cpu")) &&
            (Caller->getFnAttribute("target-features") ==
diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h
index 6ee6b666c1735..39e9611c7190e 100644
--- a/llvm/include/llvm/BinaryFormat/ELF.h
+++ b/llvm/include/llvm/BinaryFormat/ELF.h
@@ -1125,6 +1125,8 @@ struct Elf64_Shdr {
   Elf64_Xword sh_entsize;
 };
 
+enum { PN_XNUM = 0xffff };
+
 // Special section indices.
 enum {
   SHN_UNDEF = 0,          // Undefined, missing, irrelevant, or meaningless
diff --git a/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h b/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h
index 48650a6df22ff..7b1a5f5019589 100644
--- a/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h
+++ b/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h
@@ -54,6 +54,10 @@ struct FunctionPathAndClusterInfo {
   DenseMap<UniqueBBID, uint64_t> NodeCounts;
   // Edge counts for each edge, stored as a nested map.
   DenseMap<UniqueBBID, DenseMap<UniqueBBID, uint64_t>> EdgeCounts;
+  // Hash for each basic block. The Hashes are stored for every original block
+  // (not cloned blocks), hence the map key being unsigned instead of
+  // UniqueBBID.
+  DenseMap<unsigned, uint64_t> BBHashes;
 };
 
 class BasicBlockSectionsProfileReader {
@@ -62,7 +66,7 @@ class BasicBlockSectionsProfileReader {
   BasicBlockSectionsProfileReader(const MemoryBuffer *Buf)
       : MBuf(Buf), LineIt(*Buf, /*SkipBlanks=*/true, /*CommentMarker=*/'#'){};
 
-  BasicBlockSectionsProfileReader(){};
+  BasicBlockSectionsProfileReader() = default;
 
   // Returns true if basic block sections profile exist for function \p
   // FuncName.
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 76b6c8ec68c72..e8dbc964a943e 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -594,12 +594,13 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
 
     // Check if suitable for a bit test
     if (N <= DL.getIndexSizeInBits(0u)) {
-      SmallPtrSet<const BasicBlock *, 4> Dests;
-      for (auto I : SI.cases())
-        Dests.insert(I.getCaseSuccessor());
+      DenseMap<const BasicBlock *, unsigned int> DestMap;
+      for (auto I : SI.cases()) {
+        const BasicBlock *BB = I.getCaseSuccessor();
+        ++DestMap[BB];
+      }
 
-      if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
-                                     DL))
+      if (TLI->isSuitableForBitTests(DestMap, MinCaseVal, MaxCaseVal, DL))
         return 1;
     }
 
diff --git a/llvm/include/llvm/CodeGen/DIE.h b/llvm/include/llvm/CodeGen/DIE.h
index 32f46517677f2..92265fd86ebb9 100644
--- a/llvm/include/llvm/CodeGen/DIE.h
+++ b/llvm/include/llvm/CodeGen/DIE.h
@@ -653,7 +653,7 @@ template <class T> class IntrusiveBackList : IntrusiveBackListBase {
   public:
     const_iterator() = default;
     // Placate MSVC by explicitly scoping 'iterator'.
-    const_iterator(typename IntrusiveBackList<T>::iterator X) : N(X.N) {}
+    const_iterator(IntrusiveBackList<T>::iterator X) : N(X.N) {}
     explicit const_iterator(const T *N) : N(N) {}
 
     const_iterator &operator++() {
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index b0601eb72ba3f..96cb7cdf2d531 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -293,7 +293,7 @@ class CombinerHelper {
                                  SmallVectorImpl<Register> &Ops) const;
   /// Replace \p MI with a concat_vectors with \p Ops.
   void applyCombineShuffleVector(MachineInstr &MI,
-                                 const ArrayRef<Register> Ops) const;
+                                 ArrayRef<Register> Ops) const;
 
   /// Optimize memcpy intrinsics et al, e.g. constant len calls.
   /// /p MaxLen if non-zero specifies the max length of a mem libcall to inline.
@@ -640,7 +640,8 @@ class CombinerHelper {
   /// This variant does not erase \p MI after calling the build function.
   void applyBuildFnNoErase(MachineInstr &MI, BuildFnTy &MatchInfo) const;
 
-  bool matchOrShiftToFunnelShift(MachineInstr &MI, BuildFnTy &MatchInfo) const;
+  bool matchOrShiftToFunnelShift(MachineInstr &MI, bool AllowScalarConstants,
+                                 BuildFnTy &MatchInfo) const;
   bool matchFunnelShiftToRotate(MachineInstr &MI) const;
   void applyFunnelShiftToRotate(MachineInstr &MI) const;
   bool matchRotateOutOfRange(MachineInstr &MI) const;
diff --git a/llvm/include/llvm/CodeGen/GlobalMergeFunctions.h b/llvm/include/llvm/CodeGen/GlobalMergeFunctions.h
index caea5b62851ea..54ea68a418846 100644
--- a/llvm/include/llvm/CodeGen/GlobalMergeFunctions.h
+++ b/llvm/include/llvm/CodeGen/GlobalMergeFunctions.h
@@ -58,7 +58,7 @@ class GlobalMergeFunc {
   /// The suffix used to identify the merged function that parameterizes
   /// the constant values. Note that the original function, without this suffix,
   /// becomes a thunk supplying contexts to the merged function via parameters.
-  static constexpr const char MergingInstanceSuffix[] = ".Tgm";
+  static constexpr char MergingInstanceSuffix[] = ".Tgm";
 
   GlobalMergeFunc(const ModuleSummaryIndex *Index) : Index(Index) {};
 
diff --git a/llvm/include/llvm/CodeGen/MIR2Vec.h b/llvm/include/llvm/CodeGen/MIR2Vec.h
index 44f009cd7790e..18b12901c1862 100644
--- a/llvm/include/llvm/CodeGen/MIR2Vec.h
+++ b/llvm/include/llvm/CodeGen/MIR2Vec.h
@@ -73,7 +73,7 @@ namespace mir2vec {
 class MIREmbedder;
 class SymbolicMIREmbedder;
 
-extern llvm::cl::OptionCategory MIR2VecCategory;
+LLVM_ABI extern llvm::cl::OptionCategory MIR2VecCategory;
 extern cl::opt<float> OpcWeight, CommonOperandWeight, RegOperandWeight;
 
 using Embedding = ir2vec::Embedding;
@@ -154,14 +154,14 @@ class MIRVocabulary {
   void buildRegisterOperandMapping();
 
   /// Get canonical index for a machine opcode
-  unsigned getCanonicalOpcodeIndex(unsigned Opcode) const;
+  LLVM_ABI unsigned getCanonicalOpcodeIndex(unsigned Opcode) const;
 
   /// Get index for a common (non-register) machine operand
   unsigned
   getCommonOperandIndex(MachineOperand::MachineOperandType OperandType) const;
 
   /// Get index for a register machine operand
-  unsigned getRegisterOperandIndex(Register Reg) const;
+  LLVM_ABI unsigned getRegisterOperandIndex(Register Reg) const;
 
   // Accessors for operand types
   const Embedding &
@@ -192,7 +192,7 @@ class MIRVocabulary {
 
   /// Get entity ID (flat index) for a common operand type
   /// This is used for triplet generation
-  unsigned getEntityIDForCommonOperand(
+  LLVM_ABI unsigned getEntityIDForCommonOperand(
       MachineOperand::MachineOperandType OperandType) const {
     return Layout.CommonOperandBase + getCommonOperandIndex(OperandType);
   }
@@ -221,7 +221,7 @@ class MIRVocabulary {
                                              bool IsPhysical = true) const;
 
   /// Get the string key for a vocabulary entry at the given position
-  std::string getStringKey(unsigned Pos) const;
+  LLVM_ABI std::string getStringKey(unsigned Pos) const;
 
   unsigned getDimension() const { return Storage.getDimension(); }
 
@@ -268,7 +268,7 @@ class MIRVocabulary {
          const TargetRegisterInfo &TRI, const MachineRegisterInfo &MRI);
 
   /// Create a dummy vocabulary for testing purposes.
-  static Expected<MIRVocabulary>
+  LLVM_ABI static Expected<MIRVocabulary>
   createDummyVocabForTest(const TargetInstrInfo &TII,
                           const TargetRegisterInfo &TRI,
                           const MachineRegisterInfo &MRI, unsigned Dim = 1);
@@ -302,10 +302,10 @@ class MIREmbedder {
         RegOperandWeight(mir2vec::RegOperandWeight) {}
 
   /// Function to compute embeddings.
-  Embedding computeEmbeddings() const;
+  LLVM_ABI Embedding computeEmbeddings() const;
 
   /// Function to compute the embedding for a given machine basic block.
-  Embedding computeEmbeddings(const MachineBasicBlock &MBB) const;
+  LLVM_ABI Embedding computeEmbeddings(const MachineBasicBlock &MBB) const;
 
   /// Function to compute the embedding for a given machine instruction.
   /// Specific to the kind of embeddings being computed.
@@ -316,9 +316,9 @@ class MIREmbedder {
 
   /// Factory method to create an Embedder object of the specified kind
   /// Returns nullptr if the requested kind is not supported.
-  static std::unique_ptr<MIREmbedder> create(MIR2VecKind Mode,
-                                             const MachineFunction &MF,
-                                             const MIRVocabulary &Vocab);
+  LLVM_ABI static std::unique_ptr<MIREmbedder>
+  create(MIR2VecKind Mode, const MachineFunction &MF,
+         const MIRVocabulary &Vocab);
 
   /// Computes and returns the embedding for a given machine instruction MI in
   /// the machine function MF.
@@ -369,7 +369,7 @@ class MIR2VecVocabProvider {
 public:
   MIR2VecVocabProvider(const MachineModuleInfo &MMI) : MMI(MMI) {}
 
-  Expected<mir2vec::MIRVocabulary> getVocabulary(const Module &M);
+  LLVM_ABI Expected<mir2vec::MIRVocabulary> getVocabulary(const Module &M);
 
 private:
   Error readVocabulary(VocabMap &OpcVocab, VocabMap &CommonOperandVocab,
@@ -454,7 +454,7 @@ class MIR2VecPrinterLegacyPass : public MachineFunctionPass {
 };
 
 /// Create a machine pass that prints MIR2Vec embeddings
-MachineFunctionPass *createMIR2VecPrinterLegacyPass(raw_ostream &OS);
+LLVM_ABI MachineFunctionPass *createMIR2VecPrinterLegacyPass(raw_ostream &OS);
 
 } // namespace llvm
 
diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h
index 5a2aee2fa7643..6c5c27c9662e4 100644
--- a/llvm/include/llvm/CodeGen/MachineScheduler.h
+++ b/llvm/include/llvm/CodeGen/MachineScheduler.h
@@ -829,7 +829,7 @@ class ResourceSegments {
 
 public:
   // constructor for empty set
-  explicit ResourceSegments(){};
+  explicit ResourceSegments() = default;
   bool empty() const { return _Intervals.empty(); }
   explicit ResourceSegments(const std::list<IntervalTy> &Intervals)
       : _Intervals(Intervals) {
diff --git a/llvm/include/llvm/CodeGen/RDFRegisters.h b/llvm/include/llvm/CodeGen/RDFRegisters.h
index 82027cad53bdb..3b7454e1e552f 100644
--- a/llvm/include/llvm/CodeGen/RDFRegisters.h
+++ b/llvm/include/llvm/CodeGen/RDFRegisters.h
@@ -294,7 +294,7 @@ struct RegisterAggr {
   ref_iterator ref_begin() const { return ref_iterator(*this, false); }
   ref_iterator ref_end() const { return ref_iterator(*this, true); }
 
-  using unit_iterator = typename BitVector::const_set_bits_iterator;
+  using unit_iterator = BitVector::const_set_bits_iterator;
   unit_iterator unit_begin() const { return Units.set_bits_begin(); }
   unit_iterator unit_end() const { return Units.set_bits_end(); }
 
diff --git a/llvm/include/llvm/CodeGen/RegAllocRegistry.h b/llvm/include/llvm/CodeGen/RegAllocRegistry.h
index cd81e084a859b..db6264085b8a1 100644
--- a/llvm/include/llvm/CodeGen/RegAllocRegistry.h
+++ b/llvm/include/llvm/CodeGen/RegAllocRegistry.h
@@ -67,7 +67,7 @@ class RegisterRegAlloc : public RegisterRegAllocBase<RegisterRegAlloc> {
 /// RegisterRegAlloc's global Registry tracks allocator registration.
 template <class T>
 MachinePassRegistry<typename RegisterRegAllocBase<T>::FunctionPassCtor>
-RegisterRegAllocBase<T>::Registry;
+    RegisterRegAllocBase<T>::Registry;
 
 } // end namespace llvm
 
diff --git a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h
index a9e53bae897ad..f980d3dc255ca 100644
--- a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h
+++ b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h
@@ -84,6 +84,10 @@ LLVM_ABI Libcall getSINCOS(EVT RetVT);
 /// UNKNOWN_LIBCALL if there is none.
 LLVM_ABI Libcall getSINCOSPI(EVT RetVT);
 
+/// Return the SINCOS_STRET_ value for the given types, or UNKNOWN_LIBCALL if
+/// there is none.
+LLVM_ABI Libcall getSINCOS_STRET(EVT RetVT);
+
 /// getMODF - Return the MODF_* value for the given types, or
 /// UNKNOWN_LIBCALL if there is none.
 LLVM_ABI Libcall getMODF(EVT RetVT);
diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h
index 0dcf400962393..511cb56f73dcb 100644
--- a/llvm/include/llvm/CodeGen/SDPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h
@@ -583,6 +583,18 @@ m_InsertSubvector(const LHS &Base, const RHS &Sub, const IDX &Idx) {
   return TernaryOpc_match<LHS, RHS, IDX>(ISD::INSERT_SUBVECTOR, Base, Sub, Idx);
 }
 
+template <typename T0_P, typename T1_P, typename T2_P>
+inline TernaryOpc_match<T0_P, T1_P, T2_P>
+m_TernaryOp(unsigned Opc, const T0_P &Op0, const T1_P &Op1, const T2_P &Op2) {
+  return TernaryOpc_match<T0_P, T1_P, T2_P>(Opc, Op0, Op1, Op2);
+}
+
+template <typename T0_P, typename T1_P, typename T2_P>
+inline TernaryOpc_match<T0_P, T1_P, T2_P, true>
+m_c_TernaryOp(unsigned Opc, const T0_P &Op0, const T1_P &Op1, const T2_P &Op2) {
+  return TernaryOpc_match<T0_P, T1_P, T2_P, true>(Opc, Op0, Op1, Op2);
+}
+
 template <typename LTy, typename RTy, typename TTy, typename FTy, typename CCTy>
 inline auto m_SelectCC(const LTy &L, const RTy &R, const TTy &T, const FTy &F,
                        const CCTy &CC) {
@@ -1299,7 +1311,7 @@ template <typename... PatternTs> struct ReassociatableOpc_match {
   }
 
   [[nodiscard]] inline bool
-  reassociatableMatchHelper(const ArrayRef<SmallBitVector> Matches,
+  reassociatableMatchHelper(ArrayRef<SmallBitVector> Matches,
                             SmallBitVector &Used, size_t Curr = 0) {
     if (Curr == Matches.size())
       return true;
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index df6ce0fe1b037..1a5ffb38f2568 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1113,7 +1113,8 @@ class SelectionDAG {
                                       SDValue Mask, SDValue EVL);
 
   /// Returns sum of the base pointer and offset.
-  /// Unlike getObjectPtrOffset this does not set NoUnsignedWrap by default.
+  /// Unlike getObjectPtrOffset this does not set NoUnsignedWrap and InBounds by
+  /// default.
   LLVM_ABI SDValue
   getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL,
                        const SDNodeFlags Flags = SDNodeFlags());
@@ -1123,15 +1124,18 @@ class SelectionDAG {
 
   /// Create an add instruction with appropriate flags when used for
   /// addressing some offset of an object. i.e. if a load is split into multiple
-  /// components, create an add nuw from the base pointer to the offset.
+  /// components, create an add nuw (or ptradd nuw inbounds) from the base
+  /// pointer to the offset.
   SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset) {
-    return getMemBasePlusOffset(Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap);
+    return getMemBasePlusOffset(
+        Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap | SDNodeFlags::InBounds);
   }
 
   SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, SDValue Offset) {
     // The object itself can't wrap around the address space, so it shouldn't be
     // possible for the adds of the offsets to the split parts to overflow.
-    return getMemBasePlusOffset(Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap);
+    return getMemBasePlusOffset(
+        Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap | SDNodeFlags::InBounds);
   }
 
   /// Return a new CALLSEQ_START node, that starts new call frame, in which
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index d6ed3a8f739b3..78f63b4406eb0 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1433,9 +1433,9 @@ class LLVM_ABI TargetLoweringBase {
   /// \p High as its lowest and highest case values, and expects \p NumCmps
   /// case value comparisons. Check if the number of destinations, comparison
   /// metric, and range are all suitable.
-  bool isSuitableForBitTests(unsigned NumDests, unsigned NumCmps,
-                             const APInt &Low, const APInt &High,
-                             const DataLayout &DL) const {
+  bool isSuitableForBitTests(
+      const DenseMap<const BasicBlock *, unsigned int> &DestCmps,
+      const APInt &Low, const APInt &High, const DataLayout &DL) const {
     // FIXME: I don't think NumCmps is the correct metric: a single case and a
     // range of cases both require only one branch to lower. Just looking at the
     // number of clusters and destinations should be enough to decide whether to
@@ -1446,6 +1446,20 @@ class LLVM_ABI TargetLoweringBase {
     if (!rangeFitsInWord(Low, High, DL))
       return false;
 
+    unsigned NumDests = DestCmps.size();
+    unsigned NumCmps = 0;
+    unsigned int MaxBitTestEntry = 0;
+    for (auto &DestCmp : DestCmps) {
+      NumCmps += DestCmp.second;
+      if (DestCmp.second > MaxBitTestEntry)
+        MaxBitTestEntry = DestCmp.second;
+    }
+
+    // Comparisons might be cheaper for small number of comparisons, which can
+    // be Arch Target specific.
+    if (MaxBitTestEntry < getMinimumBitTestCmps())
+      return false;
+
     // Decide whether it's profitable to lower this range with bit tests. Each
     // destination requires a bit test and branch, and there is an overall range
     // check branch. For a small number of clusters, separate comparisons might
@@ -2055,6 +2069,9 @@ class LLVM_ABI TargetLoweringBase {
 
   virtual bool isJumpTableRelative() const;
 
+  /// Retuen the minimum of largest number of comparisons in BitTest.
+  unsigned getMinimumBitTestCmps() const;
+
   /// If a physical register, this specifies the register that
   /// llvm.savestack/llvm.restorestack should save and restore.
   Register getStackPointerRegisterToSaveRestore() const {
@@ -2577,6 +2594,9 @@ class LLVM_ABI TargetLoweringBase {
   /// Set to zero to generate unlimited jump tables.
   void setMaximumJumpTableSize(unsigned);
 
+  /// Set the minimum of largest of number of comparisons to generate BitTest.
+  void setMinimumBitTestCmps(unsigned Val);
+
   /// If set to a physical register, this specifies the register that
   /// llvm.savestack/llvm.restorestack should save and restore.
   void setStackPointerRegisterToSaveRestore(Register R) {
@@ -3719,6 +3739,9 @@ class LLVM_ABI TargetLoweringBase {
   /// backend supports.
   unsigned MinCmpXchgSizeInBits;
 
+  /// The minimum of largest number of comparisons to use bit test for switch.
+  unsigned MinimumBitTestCmps;
+
   /// This indicates if the target supports unaligned atomic operations.
   bool SupportsUnalignedAtomics;
 
@@ -3738,7 +3761,7 @@ class LLVM_ABI TargetLoweringBase {
   /// register class is the largest legal super-reg register class of the
   /// register class of the specified type. e.g. On x86, i8, i16, and i32's
   /// representative class would be GR32.
-  const TargetRegisterClass *RepRegClassForVT[MVT::VALUETYPE_SIZE] = {0};
+  const TargetRegisterClass *RepRegClassForVT[MVT::VALUETYPE_SIZE] = {nullptr};
 
   /// This indicates the "cost" of the "representative" register class for each
   /// ValueType. The cost is used by the scheduler to approximate register
@@ -5626,17 +5649,35 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
   /// Get a pointer to vector element \p Idx located in memory for a vector of
   /// type \p VecVT starting at a base address of \p VecPtr. If \p Idx is out of
   /// bounds the returned pointer is unspecified, but will be within the vector
-  /// bounds.
-  SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT,
-                                  SDValue Index) const;
+  /// bounds. \p PtrArithFlags can be used to mark that arithmetic within the
+  /// vector in memory is known to not wrap or to be inbounds.
+  SDValue getVectorElementPointer(
+      SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, SDValue Index,
+      const SDNodeFlags PtrArithFlags = SDNodeFlags()) const;
+
+  /// Get a pointer to vector element \p Idx located in memory for a vector of
+  /// type \p VecVT starting at a base address of \p VecPtr. If \p Idx is out of
+  /// bounds the returned pointer is unspecified, but will be within the vector
+  /// bounds. \p VecPtr is guaranteed to point to the beginning of a memory
+  /// location large enough for the vector.
+  SDValue getInboundsVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr,
+                                          EVT VecVT, SDValue Index) const {
+    return getVectorElementPointer(DAG, VecPtr, VecVT, Index,
+                                   SDNodeFlags::NoUnsignedWrap |
+                                       SDNodeFlags::InBounds);
+  }
 
   /// Get a pointer to a sub-vector of type \p SubVecVT at index \p Idx located
   /// in memory for a vector of type \p VecVT starting at a base address of
   /// \p VecPtr. If \p Idx plus the size of \p SubVecVT is out of bounds the
   /// returned pointer is unspecified, but the value returned will be such that
-  /// the entire subvector would be within the vector bounds.
-  SDValue getVectorSubVecPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT,
-                                 EVT SubVecVT, SDValue Index) const;
+  /// the entire subvector would be within the vector bounds. \p PtrArithFlags
+  /// can be used to mark that arithmetic within the vector in memory is known
+  /// to not wrap or to be inbounds.
+  SDValue
+  getVectorSubVecPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT,
+                         EVT SubVecVT, SDValue Index,
+                         const SDNodeFlags PtrArithFlags = SDNodeFlags()) const;
 
   /// Method for building the DAG expansion of ISD::[US][MIN|MAX]. This
   /// method accepts integers as its arguments.
diff --git a/llvm/include/llvm/CodeGen/TileShapeInfo.h b/llvm/include/llvm/CodeGen/TileShapeInfo.h
index 9cea327819895..24d9de842645a 100644
--- a/llvm/include/llvm/CodeGen/TileShapeInfo.h
+++ b/llvm/include/llvm/CodeGen/TileShapeInfo.h
@@ -34,30 +34,9 @@ class ShapeT {
     if (MRI)
       deduceImm(MRI);
   }
-  // When ShapeT has multiple shapes, we only use Shapes (never use Row and Col)
-  // and ImmShapes. Due to the most case is only one shape (just simply use
-  // Shape.Row or Shape.Col), so here we don't merge Row and Col into vector
-  // Shapes to keep the speed and code simplicity.
-  // TODO: The upper solution is a temporary way to minimize current tile
-  // register allocation code changes. It can not handle both Reg shape and
-  // Imm shape for different shapes (e.g. shape 1 is reg shape while shape 2
-  // is imm shape). Refine me when we have more multi-tile shape instructions!
-  ShapeT(ArrayRef<MachineOperand *> ShapesOperands,
-         const MachineRegisterInfo *MRI = nullptr)
-      : Row(nullptr), Col(nullptr), RowImm(InvalidImmShape),
-        ColImm(InvalidImmShape) {
-    assert(ShapesOperands.size() % 2 == 0 && "Miss row or col!");
-
-    llvm::append_range(Shapes, ShapesOperands);
-
-    if (MRI)
-      deduceImm(MRI);
-  }
   ShapeT()
       : Row(nullptr), Col(nullptr), RowImm(InvalidImmShape),
         ColImm(InvalidImmShape) {}
-  // TODO: We need to extern cmp operator for multi-shapes if
-  // we have requirement in the future.
   bool operator==(const ShapeT &Shape) const {
     MachineOperand *R = Shape.Row;
     MachineOperand *C = Shape.Col;
@@ -74,40 +53,11 @@ class ShapeT {
 
   bool operator!=(const ShapeT &Shape) const { return !(*this == Shape); }
 
-  MachineOperand *getRow(unsigned I = 0) const {
-    if (Shapes.empty())
-      return Row;
-    assert(Shapes.size() / 2 >= I && "Get invalid row from id!");
-    return Shapes[I * 2];
-  }
-
-  MachineOperand *getCol(unsigned I = 0) const {
-    if (Shapes.empty())
-      return Col;
-    assert(Shapes.size() / 2 >= I && "Get invalid col from id!");
-    return Shapes[I * 2 + 1];
-  }
-
-  int64_t getRowImm(unsigned I = 0) const {
-    if (ImmShapes.empty())
-      return RowImm;
-    assert(ImmShapes.size() / 2 >= I && "Get invalid imm row from id!");
-    return ImmShapes[I * 2];
-  }
-
-  int64_t getColImm(unsigned I = 0) const {
-    if (ImmShapes.empty())
-      return ColImm;
-    assert(ImmShapes.size() / 2 >= I && "Get invalid imm col from id!");
-    return ImmShapes[I * 2 + 1];
-  }
+  MachineOperand *getRow() const { return Row; }
+  MachineOperand *getCol() const { return Col; }
 
-  unsigned getShapeNum() {
-    if (Shapes.empty())
-      return isValid() ? 1 : 0;
-    else
-      return Shapes.size() / 2;
-  }
+  int64_t getRowImm() const { return RowImm; }
+  int64_t getColImm() const { return ColImm; }
 
   bool isValid() { return (Row != nullptr) && (Col != nullptr); }
 
@@ -120,35 +70,14 @@ class ShapeT {
       for (const MachineOperand &DefMO : MRI->def_operands(Reg)) {
         const auto *MI = DefMO.getParent();
         if (MI->isMoveImmediate()) {
-          assert(MI->getNumOperands() == 2 &&
-                 "Unsupported number of operands in instruction for setting "
-                 "row/column.");
-          if (MI->getOperand(1).isImm()) {
-            Imm = MI->getOperand(1).getImm();
-          } else {
-            assert(MI->getOperand(1).isImplicit() &&
-                   "Operand 1 is assumed to be implicit.");
-            Imm = 0;
-          }
+          Imm = MI->getOperand(1).getImm();
           break;
         }
       }
       return Imm;
     };
-    if (Shapes.empty()) { // Single Shape
-      RowImm = GetImm(Row->getReg());
-      ColImm = GetImm(Col->getReg());
-      // The number of rows of 2nd destination buffer is assigned by the one of
-      // 1st destination buffer. If the column size is equal to zero, the row
-      // size should be reset to zero too.
-      if (ColImm == 0)
-        Row = Col;
-    } else { // Multiple Shapes
-      for (auto *Shape : Shapes) {
-        int64_t ImmShape = GetImm(Shape->getReg());
-        ImmShapes.push_back(ImmShape);
-      }
-    }
+    RowImm = GetImm(Row->getReg());
+    ColImm = GetImm(Col->getReg());
   }
 
 private:
@@ -157,9 +86,6 @@ class ShapeT {
   MachineOperand *Col;
   int64_t RowImm = -1;
   int64_t ColImm = -1;
-  // Multiple Shapes
-  SmallVector<MachineOperand *, 0> Shapes;
-  SmallVector<int64_t, 0> ImmShapes;
 };
 
 } // namespace llvm
diff --git a/llvm/include/llvm/CodeGen/WindowScheduler.h b/llvm/include/llvm/CodeGen/WindowScheduler.h
index 476d5ada27876..97776de353e3f 100644
--- a/llvm/include/llvm/CodeGen/WindowScheduler.h
+++ b/llvm/include/llvm/CodeGen/WindowScheduler.h
@@ -105,7 +105,7 @@ class WindowScheduler {
 
 public:
   WindowScheduler(MachineSchedContext *C, MachineLoop &ML);
-  virtual ~WindowScheduler() {}
+  virtual ~WindowScheduler() = default;
 
   bool run();
 
diff --git a/llvm/include/llvm/CodeGenTypes/LowLevelType.h b/llvm/include/llvm/CodeGenTypes/LowLevelType.h
index 4c1fe13790011..472a3f3e23b3f 100644
--- a/llvm/include/llvm/CodeGenTypes/LowLevelType.h
+++ b/llvm/include/llvm/CodeGenTypes/LowLevelType.h
@@ -340,18 +340,18 @@ class LLT {
   ///   valid encodings, SizeInBits/SizeOfElement must be larger than 0.
   /// * Non-pointer scalar (isPointer == 0 && isVector == 0):
   ///   SizeInBits: 32;
-  static const constexpr BitFieldInfo ScalarSizeFieldInfo{32, 29};
+  static constexpr BitFieldInfo ScalarSizeFieldInfo{32, 29};
   /// * Pointer (isPointer == 1 && isVector == 0):
   ///   SizeInBits: 16;
   ///   AddressSpace: 24;
-  static const constexpr BitFieldInfo PointerSizeFieldInfo{16, 45};
-  static const constexpr BitFieldInfo PointerAddressSpaceFieldInfo{24, 21};
+  static constexpr BitFieldInfo PointerSizeFieldInfo{16, 45};
+  static constexpr BitFieldInfo PointerAddressSpaceFieldInfo{24, 21};
   /// * Vector-of-non-pointer (isPointer == 0 && isVector == 1):
   ///   NumElements: 16;
   ///   SizeOfElement: 32;
   ///   Scalable: 1;
-  static const constexpr BitFieldInfo VectorElementsFieldInfo{16, 5};
-  static const constexpr BitFieldInfo VectorScalableFieldInfo{1, 0};
+  static constexpr BitFieldInfo VectorElementsFieldInfo{16, 5};
+  static constexpr BitFieldInfo VectorScalableFieldInfo{1, 0};
   /// * Vector-of-pointer (isPointer == 1 && isVector == 1):
   ///   NumElements: 16;
   ///   SizeOfElement: 16;
diff --git a/llvm/include/llvm/DWARFLinker/StringPool.h b/llvm/include/llvm/DWARFLinker/StringPool.h
index d0f4e211fac3e..7838e3b8d6f20 100644
--- a/llvm/include/llvm/DWARFLinker/StringPool.h
+++ b/llvm/include/llvm/DWARFLinker/StringPool.h
@@ -20,7 +20,7 @@ namespace dwarf_linker {
 
 /// StringEntry keeps data of the string: the length, external offset
 /// and a string body which is placed right after StringEntry.
-using StringEntry = StringMapEntry<std::nullopt_t>;
+using StringEntry = StringMapEntry<EmptyStringSetTag>;
 
 class StringPoolEntryInfo {
 public:
diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h
index be78647cf9fea..b7d6e725faeeb 100644
--- a/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h
+++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h
@@ -136,8 +136,8 @@ class DWARFUnitVector final : public SmallVector<std::unique_ptr<DWARFUnit>, 1>
 
 public:
   using UnitVector = SmallVectorImpl<std::unique_ptr<DWARFUnit>>;
-  using iterator = typename UnitVector::iterator;
-  using iterator_range = llvm::iterator_range<typename UnitVector::iterator>;
+  using iterator = UnitVector::iterator;
+  using iterator_range = llvm::iterator_range<UnitVector::iterator>;
 
   using compile_unit_range =
       decltype(make_filter_range(std::declval<iterator_range>(), isCompileUnit));
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleList.h b/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleList.h
index 8992faead73bb..bbed56b517093 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleList.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleList.h
@@ -32,7 +32,7 @@ struct FileInfoSubstreamHeader;
 class DbiModuleSourceFilesIterator
     : public iterator_facade_base<DbiModuleSourceFilesIterator,
                                   std::random_access_iterator_tag, StringRef> {
-  using BaseType = typename DbiModuleSourceFilesIterator::iterator_facade_base;
+  using BaseType = DbiModuleSourceFilesIterator::iterator_facade_base;
 
 public:
   LLVM_ABI DbiModuleSourceFilesIterator(const DbiModuleList &Modules,
diff --git a/llvm/include/llvm/Demangle/Utility.h b/llvm/include/llvm/Demangle/Utility.h
index 002a1f55467d6..6e6203d716e7a 100644
--- a/llvm/include/llvm/Demangle/Utility.h
+++ b/llvm/include/llvm/Demangle/Utility.h
@@ -81,7 +81,7 @@ class OutputBuffer {
   OutputBuffer(const OutputBuffer &) = delete;
   OutputBuffer &operator=(const OutputBuffer &) = delete;
 
-  virtual ~OutputBuffer() {}
+  virtual ~OutputBuffer() = default;
 
   operator std::string_view() const {
     return std::string_view(Buffer, CurrentPosition);
diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h b/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h
index 98170f60f6e49..9479c107447d5 100644
--- a/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h
+++ b/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h
@@ -175,7 +175,7 @@ struct HalfWords {
 /// FixupInfo base class is required for dynamic lookups.
 struct FixupInfoBase {
   LLVM_ABI static const FixupInfoBase *getDynFixupInfo(Edge::Kind K);
-  virtual ~FixupInfoBase() {}
+  virtual ~FixupInfoBase() = default;
 };
 
 /// FixupInfo checks for Arm edge kinds work on 32-bit words
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericMemoryAccess.h b/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericMemoryAccess.h
index c69b6f736651e..86207265021c5 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericMemoryAccess.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericMemoryAccess.h
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// Implements ExecutorProcessControl::MemoryAccess by making calls to
+// Implements the MemoryAccess interface by making calls to
 // ExecutorProcessControl::callWrapperAsync.
 //
 // This simplifies the implementaton of new ExecutorProcessControl instances,
@@ -19,6 +19,7 @@
 #define LLVM_EXECUTIONENGINE_ORC_EPCGENERICMEMORYACCESS_H
 
 #include "llvm/ExecutionEngine/Orc/Core.h"
+#include "llvm/ExecutionEngine/Orc/MemoryAccess.h"
 
 namespace llvm {
 namespace orc {
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h b/llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h
index dd4102599bdb5..1296e24fa4162 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h
@@ -36,7 +36,7 @@ size_t writeMachOStruct(MutableArrayRef<char> Buf, size_t Offset, MachOStruct S,
 
 /// Base type for MachOBuilder load command wrappers.
 struct MachOBuilderLoadCommandBase {
-  virtual ~MachOBuilderLoadCommandBase() {}
+  virtual ~MachOBuilderLoadCommandBase() = default;
   virtual size_t size() const = 0;
   virtual size_t write(MutableArrayRef<char> Buf, size_t Offset,
                        bool SwapStruct) = 0;
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/SymbolFilter.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/SymbolFilter.h
new file mode 100644
index 0000000000000..517089341978a
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/SymbolFilter.h
@@ -0,0 +1,173 @@
+//===- SymbolFilter.h - Utilities for Symbol Filtering ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_SHARED_SYMBOLFILTER_H
+#define LLVM_EXECUTIONENGINE_ORC_SHARED_SYMBOLFILTER_H
+
+#include "llvm/ExecutionEngine/Orc/Shared/SimplePackedSerialization.h"
+
+#include <cmath>
+#include <type_traits>
+#include <vector>
+
+namespace llvm {
+namespace orc {
+
+namespace shared {
+using SPSBloomFilter =
+    SPSTuple<bool, uint32_t, uint32_t, uint32_t, SPSSequence<uint64_t>>;
+}
+
+class BloomFilter {
+public:
+  using HashFunc = std::function<uint32_t(StringRef)>;
+
+  BloomFilter() = default;
+  BloomFilter(BloomFilter &&) noexcept = default;
+  BloomFilter &operator=(BloomFilter &&) noexcept = default;
+  BloomFilter(const BloomFilter &) = delete;
+  BloomFilter &operator=(const BloomFilter &) = delete;
+
+  BloomFilter(uint32_t SymbolCount, float FalsePositiveRate, HashFunc hashFn)
+      : HashFn(std::move(hashFn)) {
+    initialize(SymbolCount, FalsePositiveRate);
+  }
+  bool isInitialized() const { return Initialized; }
+
+  void add(StringRef Sym) {
+    assert(Initialized);
+    addHash(HashFn(Sym));
+  }
+
+  bool mayContain(StringRef Sym) const {
+    return !isEmpty() && testHash(HashFn(Sym));
+  }
+
+  bool isEmpty() const { return SymbolCount == 0; }
+
+private:
+  friend class shared::SPSSerializationTraits<shared::SPSBloomFilter,
+                                              BloomFilter>;
+  static constexpr uint32_t BitsPerEntry = 64;
+
+  bool Initialized = false;
+  uint32_t SymbolCount = 0;
+  uint32_t BloomSize = 0;
+  uint32_t BloomShift = 0;
+  std::vector<uint64_t> BloomTable;
+  HashFunc HashFn;
+
+  void initialize(uint32_t SymCount, float FalsePositiveRate) {
+    assert(SymCount > 0);
+    SymbolCount = SymCount;
+    Initialized = true;
+
+    float ln2 = std::log(2.0f);
+    float M = -1.0f * SymbolCount * std::log(FalsePositiveRate) / (ln2 * ln2);
+    BloomSize = static_cast<uint32_t>(std::ceil(M / BitsPerEntry));
+    BloomShift = std::min(6u, log2ceil(SymbolCount));
+    BloomTable.resize(BloomSize, 0);
+  }
+
+  void addHash(uint32_t Hash) {
+    uint32_t Hash2 = Hash >> BloomShift;
+    uint32_t N = (Hash / BitsPerEntry) % BloomSize;
+    uint64_t Mask =
+        (1ULL << (Hash % BitsPerEntry)) | (1ULL << (Hash2 % BitsPerEntry));
+    BloomTable[N] |= Mask;
+  }
+
+  bool testHash(uint32_t Hash) const {
+    uint32_t Hash2 = Hash >> BloomShift;
+    uint32_t N = (Hash / BitsPerEntry) % BloomSize;
+    uint64_t Mask =
+        (1ULL << (Hash % BitsPerEntry)) | (1ULL << (Hash2 % BitsPerEntry));
+    return (BloomTable[N] & Mask) == Mask;
+  }
+
+  static constexpr uint32_t log2ceil(uint32_t V) {
+    return V <= 1 ? 0 : 32 - countl_zero(V - 1);
+  }
+};
+
+class BloomFilterBuilder {
+public:
+  using HashFunc = BloomFilter::HashFunc;
+
+  BloomFilterBuilder() = default;
+
+  BloomFilterBuilder &setFalsePositiveRate(float Rate) {
+    assert(Rate > 0.0f && Rate < 1.0f);
+    FalsePositiveRate = Rate;
+    return *this;
+  }
+
+  BloomFilterBuilder &setHashFunction(HashFunc Fn) {
+    HashFn = std::move(Fn);
+    return *this;
+  }
+
+  BloomFilter build(ArrayRef<StringRef> Symbols) const {
+    assert(!Symbols.empty() && "Cannot build filter from empty symbol list.");
+    BloomFilter F(static_cast<uint32_t>(Symbols.size()), FalsePositiveRate,
+                  HashFn);
+    for (const auto &Sym : Symbols)
+      F.add(Sym);
+
+    return F;
+  }
+
+private:
+  float FalsePositiveRate = 0.02f;
+  HashFunc HashFn = [](StringRef S) -> uint32_t {
+    uint32_t H = 5381;
+    for (char C : S)
+      H = ((H << 5) + H) + static_cast<uint8_t>(C); // H * 33 + C
+    return H;
+  };
+};
+
+namespace shared {
+
+template <> class SPSSerializationTraits<SPSBloomFilter, BloomFilter> {
+public:
+  static size_t size(const BloomFilter &Filter) {
+    return SPSBloomFilter::AsArgList::size(
+        Filter.Initialized, Filter.SymbolCount, Filter.BloomSize,
+        Filter.BloomShift, Filter.BloomTable);
+  }
+
+  static bool serialize(SPSOutputBuffer &OB, const BloomFilter &Filter) {
+    return SPSBloomFilter::AsArgList::serialize(
+        OB, Filter.Initialized, Filter.SymbolCount, Filter.BloomSize,
+        Filter.BloomShift, Filter.BloomTable);
+  }
+
+  static bool deserialize(SPSInputBuffer &IB, BloomFilter &Filter) {
+    bool IsInitialized;
+    uint32_t SymbolCount = 0, BloomSize = 0, BloomShift = 0;
+    std::vector<uint64_t> BloomTable;
+
+    if (!SPSBloomFilter::AsArgList::deserialize(
+            IB, IsInitialized, SymbolCount, BloomSize, BloomShift, BloomTable))
+      return false;
+
+    Filter.Initialized = IsInitialized;
+    Filter.SymbolCount = SymbolCount;
+    Filter.BloomSize = BloomSize;
+    Filter.BloomShift = BloomShift;
+    Filter.BloomTable = std::move(BloomTable);
+
+    return true;
+  }
+};
+
+} // end namespace shared
+} // end namespace orc
+} // end namespace llvm
+#endif // LLVM_EXECUTIONENGINE_ORC_SHARED_SYMBOLFILTER_H
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/ExecutorSharedMemoryMapperService.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/ExecutorSharedMemoryMapperService.h
index 2c385de48ddf6..8f876504eaf53 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/ExecutorSharedMemoryMapperService.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/ExecutorSharedMemoryMapperService.h
@@ -29,7 +29,7 @@ namespace rt_bootstrap {
 class LLVM_ABI ExecutorSharedMemoryMapperService final
     : public ExecutorBootstrapService {
 public:
-  ~ExecutorSharedMemoryMapperService() override {};
+  ~ExecutorSharedMemoryMapperService() override = default;
 
   Expected<std::pair<ExecutorAddr, std::string>> reserve(uint64_t Size);
   Expected<ExecutorAddr> initialize(ExecutorAddr Reservation,
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h
new file mode 100644
index 0000000000000..7cc78d4be2792
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h
@@ -0,0 +1,511 @@
+//===- LibraryResolver.h - Automatic Library Symbol Resolution -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides support for automatically searching symbols across
+// dynamic libraries that have not yet been loaded.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_LIBRARYRESOLVER_H
+#define LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_LIBRARYRESOLVER_H
+
+#include "llvm/ADT/FunctionExtras.h"
+#include "llvm/ExecutionEngine/Orc/Shared/SymbolFilter.h"
+#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h"
+#include "llvm/Support/Path.h"
+
+#include <atomic>
+#include <shared_mutex>
+#include <unordered_map>
+
+namespace llvm {
+namespace orc {
+
+/// Manages library metadata and state for symbol resolution.
+///
+/// Tracks libraries by load state and kind (user/system), and stores
+/// associated Bloom filters and hash maps to speed up symbol lookups.
+/// Thread-safe for concurrent access.
+class LibraryManager {
+public:
+  enum class LibState : uint8_t { Unloaded = 0, Loaded = 1, Queried = 2 };
+
+  class LibraryInfo {
+  public:
+    LibraryInfo(const LibraryInfo &) = delete;
+    LibraryInfo &operator=(const LibraryInfo &) = delete;
+
+    LibraryInfo(std::string FilePath, LibState S, PathType K,
+                std::optional<BloomFilter> Filter = std::nullopt)
+        : FilePath(std::move(FilePath)), S(S), K(K), Filter(std::move(Filter)) {
+    }
+
+    StringRef getBasePath() const { return sys::path::parent_path(FilePath); }
+    StringRef getFileName() const { return sys::path::filename(FilePath); }
+
+    std::string getFullPath() const { return FilePath; }
+
+    void setFilter(BloomFilter F) {
+      std::lock_guard<std::shared_mutex> Lock(Mtx);
+      if (Filter)
+        return;
+      Filter.emplace(std::move(F));
+    }
+
+    void ensureFilterBuilt(const BloomFilterBuilder &FB,
+                           ArrayRef<StringRef> Symbols) {
+      std::lock_guard<std::shared_mutex> Lock(Mtx);
+      if (Filter)
+        return;
+      Filter.emplace(FB.build(Symbols));
+    }
+
+    bool mayContain(StringRef Symbol) const {
+      assert(hasFilter());
+      std::shared_lock<std::shared_mutex> Lock(Mtx);
+      return Filter->mayContain(Symbol);
+    }
+
+    bool hasFilter() const {
+      std::shared_lock<std::shared_mutex> Lock(Mtx);
+      return Filter.has_value();
+    }
+
+    LibState getState() const { return S.load(); }
+    PathType getKind() const { return K; }
+
+    void setState(LibState s) { S.store(s); }
+
+    bool operator==(const LibraryInfo &other) const {
+      return FilePath == other.FilePath;
+    }
+
+  private:
+    std::string FilePath;
+    std::atomic<LibState> S;
+    PathType K;
+    std::optional<BloomFilter> Filter;
+    mutable std::shared_mutex Mtx;
+  };
+
+  /// A read-only view of libraries filtered by state and kind.
+  ///
+  /// Lets you loop over only the libraries in a map that match a given State
+  /// and PathType.
+  class FilteredView {
+  public:
+    using Map = StringMap<std::shared_ptr<LibraryInfo>>;
+    using Iterator = Map::const_iterator;
+    class FilterIterator {
+    public:
+      FilterIterator(Iterator it_, Iterator end_, LibState S, PathType K)
+          : it(it_), end(end_), S(S), K(K) {
+        advance();
+      }
+
+      bool operator!=(const FilterIterator &other) const {
+        return it != other.it;
+      }
+
+      const std::shared_ptr<LibraryInfo> &operator*() const {
+        return it->second;
+      }
+
+      FilterIterator &operator++() {
+        ++it;
+        advance();
+        return *this;
+      }
+
+    private:
+      void advance() {
+        for (; it != end; ++it)
+          if (it->second->getState() == S && it->second->getKind() == K)
+            break;
+      }
+      Iterator it;
+      Iterator end;
+      LibState S;
+      PathType K;
+    };
+    FilteredView(Iterator begin, Iterator end, LibState s, PathType k)
+        : mapBegin(begin), mapEnd(end), state(s), kind(k) {}
+
+    FilterIterator begin() const {
+      return FilterIterator(mapBegin, mapEnd, state, kind);
+    }
+
+    FilterIterator end() const {
+      return FilterIterator(mapEnd, mapEnd, state, kind);
+    }
+
+  private:
+    Iterator mapBegin;
+    Iterator mapEnd;
+    LibState state;
+    PathType kind;
+  };
+
+private:
+  StringMap<std::shared_ptr<LibraryInfo>> Libraries;
+  mutable std::shared_mutex Mtx;
+
+public:
+  using LibraryVisitor = std::function<bool(const LibraryInfo &)>;
+
+  LibraryManager() = default;
+  ~LibraryManager() = default;
+
+  bool addLibrary(std::string Path, PathType Kind,
+                  std::optional<BloomFilter> Filter = std::nullopt) {
+    std::unique_lock<std::shared_mutex> Lock(Mtx);
+    if (Libraries.count(Path) > 0)
+      return false;
+    Libraries.insert({std::move(Path),
+                      std::make_shared<LibraryInfo>(Path, LibState::Unloaded,
+                                                    Kind, std::move(Filter))});
+    return true;
+  }
+
+  bool hasLibrary(StringRef Path) const {
+    std::shared_lock<std::shared_mutex> Lock(Mtx);
+    if (Libraries.count(Path) > 0)
+      return true;
+    return false;
+  }
+
+  void removeLibrary(StringRef Path) {
+    std::unique_lock<std::shared_mutex> Lock(Mtx);
+    auto I = Libraries.find(Path);
+    if (I == Libraries.end())
+      return;
+    Libraries.erase(I);
+  }
+
+  void markLoaded(StringRef Path) {
+    std::unique_lock<std::shared_mutex> Lock(Mtx);
+    if (auto It = Libraries.find(Path); It != Libraries.end())
+      It->second->setState(LibState::Loaded);
+  }
+
+  void markQueried(StringRef Path) {
+    std::unique_lock<std::shared_mutex> Lock(Mtx);
+    if (auto It = Libraries.find(Path); It != Libraries.end())
+      It->second->setState(LibState::Queried);
+  }
+
+  std::shared_ptr<LibraryInfo> getLibrary(StringRef Path) {
+    std::shared_lock<std::shared_mutex> Lock(Mtx);
+    if (auto It = Libraries.find(Path); It != Libraries.end())
+      return It->second;
+    return nullptr;
+  }
+
+  FilteredView getView(LibState S, PathType K) const {
+    std::shared_lock<std::shared_mutex> Lock(Mtx);
+    return FilteredView(Libraries.begin(), Libraries.end(), S, K);
+  }
+
+  void forEachLibrary(const LibraryVisitor &visitor) const {
+    std::unique_lock<std::shared_mutex> Lock(Mtx);
+    for (const auto &[_, entry] : Libraries) {
+      if (!visitor(*entry))
+        break;
+    }
+  }
+
+  bool isLoaded(StringRef Path) const {
+    std::unique_lock<std::shared_mutex> Lock(Mtx);
+    if (auto It = Libraries.find(Path.str()); It != Libraries.end())
+      return It->second->getState() == LibState::Loaded;
+    return false;
+  }
+
+  bool isQueried(StringRef Path) const {
+    std::unique_lock<std::shared_mutex> Lock(Mtx);
+    if (auto It = Libraries.find(Path.str()); It != Libraries.end())
+      return It->second->getState() == LibState::Queried;
+    return false;
+  }
+
+  void clear() {
+    std::unique_lock<std::shared_mutex> Lock(Mtx);
+    Libraries.clear();
+  }
+};
+
+using LibraryInfo = LibraryManager::LibraryInfo;
+
+struct SearchPlanEntry {
+  LibraryManager::LibState State; // Loaded, Queried, Unloaded
+  PathType Type;                  // User, System
+};
+
+struct SearchPolicy {
+  std::vector<SearchPlanEntry> Plan;
+
+  static SearchPolicy defaultPlan() {
+    return {{{LibraryManager::LibState::Loaded, PathType::User},
+             {LibraryManager::LibState::Queried, PathType::User},
+             {LibraryManager::LibState::Unloaded, PathType::User},
+             {LibraryManager::LibState::Loaded, PathType::System},
+             {LibraryManager::LibState::Queried, PathType::System},
+             {LibraryManager::LibState::Unloaded, PathType::System}}};
+  }
+};
+
+struct SymbolEnumeratorOptions {
+  enum Filter : uint32_t {
+    None = 0,
+    IgnoreUndefined = 1 << 0,
+    IgnoreWeak = 1 << 1,
+    IgnoreIndirect = 1 << 2,
+    IgnoreHidden = 1 << 3,
+    IgnoreNonGlobal = 1 << 4
+  };
+
+  static SymbolEnumeratorOptions defaultOptions() {
+    return {Filter::IgnoreUndefined | Filter::IgnoreWeak |
+            Filter::IgnoreIndirect};
+  }
+  uint32_t FilterFlags = Filter::None;
+};
+
+struct SearchConfig {
+  SearchPolicy Policy;
+  SymbolEnumeratorOptions Options;
+
+  SearchConfig()
+      : Policy(SearchPolicy::defaultPlan()), // default plan
+        Options(SymbolEnumeratorOptions::defaultOptions()) {}
+};
+
+/// Scans libraries and resolves Symbols across user and system paths.
+///
+/// Supports symbol enumeration and filtering via SymbolEnumerator, and tracks
+/// symbol resolution results through SymbolQuery. Thread-safe and uses
+/// LibraryScanHelper for efficient path resolution and caching.
+class LibraryResolver {
+  friend class LibraryResolutionDriver;
+
+public:
+  class SymbolEnumerator {
+  public:
+    enum class EnumerateResult { Continue, Stop, Error };
+
+    using OnEachSymbolFn = std::function<EnumerateResult(StringRef Sym)>;
+
+    static bool enumerateSymbols(StringRef Path, OnEachSymbolFn OnEach,
+                                 const SymbolEnumeratorOptions &Opts);
+  };
+
+  /// Tracks a set of symbols and the libraries where they are resolved.
+  ///
+  /// SymbolQuery is used to keep track of which symbols have been resolved
+  /// to which libraries. It supports concurrent read/write access using a
+  /// shared mutex, allowing multiple readers or a single writer at a time.
+  class SymbolQuery {
+  public:
+    /// Holds the result for a single symbol.
+    struct Result {
+      std::string Name;
+      std::string ResolvedLibPath;
+    };
+
+  private:
+    mutable std::shared_mutex Mtx;
+    StringMap<Result> Results;
+    std::atomic<size_t> ResolvedCount = 0;
+
+  public:
+    explicit SymbolQuery(const std::vector<std::string> &Symbols) {
+      for (const auto &s : Symbols) {
+        if (!Results.contains(s))
+          Results.insert({s, Result{s, ""}});
+      }
+    }
+
+    SmallVector<StringRef> getUnresolvedSymbols() const {
+      SmallVector<StringRef> Unresolved;
+      std::shared_lock<std::shared_mutex> Lock(Mtx);
+      for (const auto &[name, res] : Results) {
+        if (res.ResolvedLibPath.empty())
+          Unresolved.push_back(name);
+      }
+      return Unresolved;
+    }
+
+    void resolve(StringRef Sym, const std::string &LibPath) {
+      std::unique_lock<std::shared_mutex> Lock(Mtx);
+      auto It = Results.find(Sym);
+      if (It != Results.end() && It->second.ResolvedLibPath.empty()) {
+        It->second.ResolvedLibPath = LibPath;
+        ResolvedCount.fetch_add(1, std::memory_order_relaxed);
+      }
+    }
+
+    bool allResolved() const {
+      return ResolvedCount.load(std::memory_order_relaxed) == Results.size();
+    }
+
+    bool hasUnresolved() const {
+      return ResolvedCount.load(std::memory_order_relaxed) < Results.size();
+    }
+
+    std::optional<StringRef> getResolvedLib(StringRef Sym) const {
+      std::shared_lock<std::shared_mutex> Lock(Mtx);
+      auto It = Results.find(Sym);
+      if (It != Results.end() && !It->second.ResolvedLibPath.empty())
+        return StringRef(It->second.ResolvedLibPath);
+      return std::nullopt;
+    }
+
+    bool isResolved(StringRef Sym) const {
+      std::shared_lock<std::shared_mutex> Lock(Mtx);
+      auto It = Results.find(Sym.str());
+      return It != Results.end() && !It->second.ResolvedLibPath.empty();
+    }
+
+    std::vector<const Result *> getAllResults() const {
+      std::shared_lock<std::shared_mutex> Lock(Mtx);
+      std::vector<const Result *> Out;
+      Out.reserve(Results.size());
+      for (const auto &[_, res] : Results)
+        Out.push_back(&res);
+      return Out;
+    }
+  };
+
+  struct Setup {
+    std::vector<std::string> BasePaths;
+    std::shared_ptr<LibraryPathCache> Cache;
+    std::shared_ptr<PathResolver> PResolver;
+
+    size_t ScanBatchSize = 0;
+
+    LibraryScanner::ShouldScanFn ShouldScanCall = [](StringRef) {
+      return true;
+    };
+
+    BloomFilterBuilder FilterBuilder = BloomFilterBuilder();
+
+    static Setup
+    create(std::vector<std::string> BasePaths,
+           std::shared_ptr<LibraryPathCache> existingCache = nullptr,
+           std::shared_ptr<PathResolver> existingResolver = nullptr,
+           LibraryScanner::ShouldScanFn customShouldScan = nullptr) {
+      Setup S;
+      S.BasePaths = std::move(BasePaths);
+
+      S.Cache =
+          existingCache ? existingCache : std::make_shared<LibraryPathCache>();
+
+      S.PResolver = existingResolver ? existingResolver
+                                     : std::make_shared<PathResolver>(S.Cache);
+
+      if (customShouldScan)
+        S.ShouldScanCall = std::move(customShouldScan);
+
+      return S;
+    }
+  };
+
+  LibraryResolver() = delete;
+  explicit LibraryResolver(const Setup &S);
+  ~LibraryResolver() = default;
+
+  using OnSearchComplete = unique_function<void(SymbolQuery &)>;
+
+  void dump() {
+    int i = 0;
+    LibMgr.forEachLibrary([&](const LibraryInfo &Lib) -> bool {
+      dbgs() << ++i << ". Library Path : " << Lib.getFullPath() << " -> \n\t\t:"
+             << " ({Type : ("
+             << (Lib.getKind() == PathType::User ? "User" : "System")
+             << ") }, { State : "
+             << (Lib.getState() == LibraryManager::LibState::Loaded
+                     ? "Loaded"
+                     : "Unloaded")
+             << "})\n";
+      return true;
+    });
+  }
+
+  void searchSymbolsInLibraries(std::vector<std::string> &SymList,
+                                OnSearchComplete OnComplete,
+                                const SearchConfig &Config = SearchConfig());
+
+private:
+  bool scanLibrariesIfNeeded(PathType K, size_t BatchSize = 0);
+  void resolveSymbolsInLibrary(LibraryInfo &Lib, SymbolQuery &Q,
+                               const SymbolEnumeratorOptions &Opts);
+  bool
+  symbolExistsInLibrary(const LibraryInfo &Lib, StringRef Sym,
+                        std::vector<std::string> *MatchedSymbols = nullptr);
+
+  bool symbolExistsInLibrary(const LibraryInfo &Lib, StringRef SymName,
+                             std::vector<std::string> *AllSymbols,
+                             const SymbolEnumeratorOptions &Opts);
+
+  std::shared_ptr<LibraryPathCache> LibPathCache;
+  std::shared_ptr<PathResolver> LibPathResolver;
+  LibraryScanHelper ScanHelper;
+  BloomFilterBuilder FB;
+  LibraryManager LibMgr;
+  LibraryScanner::ShouldScanFn ShouldScanCall;
+  size_t scanBatchSize;
+};
+
+using SymbolEnumerator = LibraryResolver::SymbolEnumerator;
+using SymbolQuery = LibraryResolver::SymbolQuery;
+using EnumerateResult = SymbolEnumerator::EnumerateResult;
+
+class LibraryResolutionDriver {
+public:
+  static std::unique_ptr<LibraryResolutionDriver>
+  create(const LibraryResolver::Setup &S);
+
+  void addScanPath(const std::string &Path, PathType Kind);
+  bool markLibraryLoaded(StringRef Path);
+  bool markLibraryUnLoaded(StringRef Path);
+  bool isLibraryLoaded(StringRef Path) const {
+    return LR->LibMgr.isLoaded(Path);
+  }
+
+  void resetAll() {
+    LR->LibMgr.clear();
+    LR->ScanHelper.resetToScan();
+    LR->LibPathCache->clear();
+  }
+
+  void scanAll(size_t BatchSize = 0) {
+    LR->scanLibrariesIfNeeded(PathType::User, BatchSize);
+    LR->scanLibrariesIfNeeded(PathType::System, BatchSize);
+  }
+
+  void scan(PathType PK, size_t BatchSize = 0) {
+    LR->scanLibrariesIfNeeded(PK, BatchSize);
+  }
+
+  void resolveSymbols(std::vector<std::string> Symbols,
+                      LibraryResolver::OnSearchComplete OnCompletion,
+                      const SearchConfig &Config = SearchConfig());
+
+  ~LibraryResolutionDriver() = default;
+
+private:
+  LibraryResolutionDriver(std::unique_ptr<LibraryResolver> L)
+      : LR(std::move(L)) {}
+
+  std::unique_ptr<LibraryResolver> LR;
+};
+
+} // end namespace orc
+} // end namespace llvm
+
+#endif // LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_LIBRARYRESOLVER_H
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h
new file mode 100644
index 0000000000000..d1c201306bf54
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h
@@ -0,0 +1,474 @@
+//===- LibraryScanner.h - Scanner for Shared Libraries ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides functionality for scanning dynamic (shared) libraries.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_LIBRARYSCANNER_H
+#define LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_LIBRARYSCANNER_H
+
+#include "llvm/ADT/FunctionExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/StringSaver.h"
+
+#include <atomic>
+#include <mutex>
+#include <queue>
+#include <shared_mutex>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+
+namespace llvm {
+namespace orc {
+
+class LibraryManager;
+
+class LibraryPathCache {
+  friend class PathResolver;
+
+public:
+  LibraryPathCache() = default;
+
+  void clear(bool isRealPathCache = false) {
+    std::unique_lock<std::shared_mutex> lock(Mtx);
+    Seen.clear();
+    if (isRealPathCache) {
+      RealPathCache.clear();
+#ifndef _WIN32
+      ReadlinkCache.clear();
+      LstatCache.clear();
+#endif
+    }
+  }
+
+  void markSeen(const std::string &CanonPath) {
+    std::unique_lock<std::shared_mutex> lock(Mtx);
+    Seen.insert(CanonPath);
+  }
+
+  bool hasSeen(StringRef CanonPath) const {
+    std::shared_lock<std::shared_mutex> lock(Mtx);
+    return Seen.contains(CanonPath);
+  }
+
+  bool hasSeenOrMark(StringRef CanonPath) {
+    std::string s = CanonPath.str();
+    {
+      std::shared_lock<std::shared_mutex> lock(Mtx);
+      if (Seen.contains(s))
+        return true;
+    }
+    {
+      std::unique_lock<std::shared_mutex> lock(Mtx);
+      Seen.insert(s);
+    }
+    return false;
+  }
+
+private:
+  mutable std::shared_mutex Mtx;
+
+  struct PathInfo {
+    std::string canonicalPath;
+    std::error_code ErrnoCode;
+  };
+
+  void insert_realpath(StringRef Path, const PathInfo &Info) {
+    std::unique_lock<std::shared_mutex> lock(Mtx);
+    RealPathCache.insert({Path, Info});
+  }
+
+  std::optional<PathInfo> read_realpath(StringRef Path) const {
+    std::shared_lock<std::shared_mutex> lock(Mtx);
+    auto It = RealPathCache.find(Path);
+    if (It != RealPathCache.end())
+      return It->second;
+
+    return std::nullopt;
+  }
+
+  StringSet<> Seen;
+  StringMap<PathInfo> RealPathCache;
+
+#ifndef _WIN32
+  StringMap<std::string> ReadlinkCache;
+  StringMap<mode_t> LstatCache;
+
+  void insert_link(StringRef Path, const std::string &s) {
+    std::unique_lock<std::shared_mutex> lock(Mtx);
+    ReadlinkCache.insert({Path, s});
+  }
+
+  std::optional<std::string> read_link(StringRef Path) const {
+    std::shared_lock<std::shared_mutex> lock(Mtx);
+    auto It = ReadlinkCache.find(Path);
+    if (It != ReadlinkCache.end())
+      return It->second;
+
+    return std::nullopt;
+  }
+
+  void insert_lstat(StringRef Path, mode_t m) {
+    std::unique_lock<std::shared_mutex> lock(Mtx);
+    LstatCache.insert({Path, m});
+  }
+
+  std::optional<mode_t> read_lstat(StringRef Path) const {
+    std::shared_lock<std::shared_mutex> lock(Mtx);
+    auto It = LstatCache.find(Path);
+    if (It != LstatCache.end())
+      return It->second;
+
+    return std::nullopt;
+  }
+
+#endif
+};
+
+/// Resolves file system paths with optional caching of results.
+///
+/// Supports lstat, readlink, and realpath operations. Can resolve paths
+/// relative to a base and handle symbolic links. Caches results to reduce
+/// repeated system calls when enabled.
+class PathResolver {
+private:
+  std::shared_ptr<LibraryPathCache> LibPathCache;
+
+public:
+  PathResolver(std::shared_ptr<LibraryPathCache> cache)
+      : LibPathCache(std::move(cache)) {}
+
+  std::optional<std::string> resolve(StringRef Path, std::error_code &ec) {
+    return realpathCached(Path, ec);
+  }
+#ifndef _WIN32
+  mode_t lstatCached(StringRef Path);
+  std::optional<std::string> readlinkCached(StringRef Path);
+#endif
+  std::optional<std::string> realpathCached(StringRef Path, std::error_code &ec,
+                                            StringRef base = "",
+                                            bool baseIsResolved = false,
+                                            long symloopLevel = 40);
+};
+
+/// Performs placeholder substitution in dynamic library paths.
+///
+/// Configures known placeholders (like @loader_path) and replaces them
+/// in input paths with their resolved values.
+class DylibSubstitutor {
+public:
+  void configure(StringRef loaderPath);
+
+  std::string substitute(StringRef input) const {
+    for (const auto &[ph, value] : Placeholders) {
+      if (input.starts_with_insensitive(ph))
+        return (Twine(value) + input.drop_front(ph.size())).str();
+    }
+    return input.str();
+  }
+
+private:
+  StringMap<std::string> Placeholders;
+};
+
+/// Validates and normalizes dynamic library paths.
+///
+/// Uses a `PathResolver` to resolve paths to their canonical form and
+/// checks whether they point to valid shared libraries.
+class DylibPathValidator {
+public:
+  DylibPathValidator(PathResolver &PR) : LibPathResolver(PR) {}
+
+  static bool isSharedLibrary(StringRef Path);
+
+  std::optional<std::string> normalize(StringRef Path) const {
+    std::error_code ec;
+    auto real = LibPathResolver.resolve(Path, ec);
+    if (!real || ec)
+      return std::nullopt;
+
+    return real;
+  }
+
+  /// Validate the given path as a shared library.
+  std::optional<std::string> validate(StringRef Path) const {
+    auto realOpt = normalize(Path);
+    if (!realOpt)
+      return std::nullopt;
+
+    if (!isSharedLibrary(*realOpt))
+      return std::nullopt;
+
+    return realOpt;
+  }
+
+private:
+  PathResolver &LibPathResolver;
+};
+
+enum class SearchPathType {
+  RPath,
+  UsrOrSys,
+  RunPath,
+};
+
+struct SearchPathConfig {
+  ArrayRef<StringRef> Paths;
+  SearchPathType type;
+};
+
+class SearchPathResolver {
+public:
+  SearchPathResolver(const SearchPathConfig &Cfg,
+                     StringRef PlaceholderPrefix = "")
+      : Kind(Cfg.type), PlaceholderPrefix(PlaceholderPrefix) {
+    for (auto &path : Cfg.Paths)
+      Paths.emplace_back(path.str());
+  }
+
+  std::optional<std::string> resolve(StringRef libStem,
+                                     const DylibSubstitutor &Subst,
+                                     DylibPathValidator &Validator) const;
+  SearchPathType searchPathType() const { return Kind; }
+
+private:
+  std::vector<std::string> Paths;
+  SearchPathType Kind;
+  std::string PlaceholderPrefix;
+};
+
+class DylibResolverImpl {
+public:
+  DylibResolverImpl(DylibSubstitutor Substitutor, DylibPathValidator &Validator,
+                    std::vector<SearchPathResolver> Resolvers)
+      : Substitutor(std::move(Substitutor)), Validator(Validator),
+        Resolvers(std::move(Resolvers)) {}
+
+  std::optional<std::string> resolve(StringRef Stem,
+                                     bool VariateLibStem = false) const;
+
+private:
+  std::optional<std::string> tryWithExtensions(StringRef libstem) const;
+
+  DylibSubstitutor Substitutor;
+  DylibPathValidator &Validator;
+  std::vector<SearchPathResolver> Resolvers;
+};
+
+class DylibResolver {
+public:
+  DylibResolver(DylibPathValidator &Validator) : Validator(Validator) {}
+
+  void configure(StringRef loaderPath,
+                 ArrayRef<SearchPathConfig> SearchPathCfg) {
+    DylibSubstitutor Substitutor;
+    Substitutor.configure(loaderPath);
+
+    std::vector<SearchPathResolver> Resolvers;
+    for (const auto &cfg : SearchPathCfg) {
+      Resolvers.emplace_back(cfg,
+                             cfg.type == SearchPathType::RPath ? "@rpath" : "");
+    }
+
+    impl_ = std::make_unique<DylibResolverImpl>(
+        std::move(Substitutor), Validator, std::move(Resolvers));
+  }
+
+  std::optional<std::string> resolve(StringRef libStem,
+                                     bool VariateLibStem = false) const {
+    if (!impl_)
+      return std::nullopt;
+    return impl_->resolve(libStem, VariateLibStem);
+  }
+
+  static std::string resolvelinkerFlag(StringRef libStem,
+                                       StringRef loaderPath) {
+    DylibSubstitutor Substitutor;
+    Substitutor.configure(loaderPath);
+    return Substitutor.substitute(libStem);
+  }
+
+private:
+  DylibPathValidator &Validator;
+  std::unique_ptr<DylibResolverImpl> impl_;
+};
+
+enum class PathType : uint8_t { User, System, Unknown };
+
+enum class ScanState : uint8_t { NotScanned, Scanning, Scanned };
+
+struct LibrarySearchPath {
+  std::string BasePath; // Canonical base directory path
+  PathType Kind;        // User or System
+  std::atomic<ScanState> State;
+
+  LibrarySearchPath(std::string Base, PathType K)
+      : BasePath(std::move(Base)), Kind(K), State(ScanState::NotScanned) {}
+};
+
+/// Scans and tracks libraries for symbol resolution.
+///
+/// Maintains a list of library paths to scan, caches scanned units,
+/// and resolves paths canonically for consistent tracking.
+class LibraryScanHelper {
+public:
+  explicit LibraryScanHelper(const std::vector<std::string> &SPaths,
+                             std::shared_ptr<LibraryPathCache> LibPathCache,
+                             std::shared_ptr<PathResolver> LibPathResolver)
+      : LibPathCache(std::move(LibPathCache)),
+        LibPathResolver(std::move(LibPathResolver)) {
+    DEBUG_WITH_TYPE(
+        "orc", dbgs() << "LibraryScanHelper::LibraryScanHelper: base paths : "
+                      << SPaths.size() << "\n";);
+    for (const auto &p : SPaths)
+      addBasePath(p);
+  }
+
+  void
+  addBasePath(const std::string &P,
+              PathType Kind =
+                  PathType::Unknown); // Add a canonical directory for scanning
+  std::vector<std::shared_ptr<LibrarySearchPath>>
+  getNextBatch(PathType Kind, size_t batchSize);
+
+  bool leftToScan(PathType K) const;
+  void resetToScan();
+
+  bool isTrackedBasePath(StringRef P) const;
+  std::vector<std::shared_ptr<LibrarySearchPath>> getAllUnits() const;
+
+  SmallVector<StringRef> getSearchPaths() const {
+    SmallVector<StringRef> SearchPaths;
+    for (const auto &[_, SP] : LibSearchPaths)
+      SearchPaths.push_back(SP->BasePath);
+    return SearchPaths;
+  }
+
+  PathResolver &getPathResolver() const { return *LibPathResolver; }
+
+  LibraryPathCache &getCache() const { return *LibPathCache; }
+
+  bool hasSeenOrMark(StringRef P) const {
+    return LibPathCache->hasSeenOrMark(P);
+  }
+
+  std::optional<std::string> resolve(StringRef P, std::error_code &ec) const {
+    return LibPathResolver->resolve(P.str(), ec);
+  }
+
+private:
+  std::string resolveCanonical(StringRef P, std::error_code &ec) const;
+  PathType classifyKind(StringRef P) const;
+
+  mutable std::shared_mutex Mtx;
+  std::shared_ptr<LibraryPathCache> LibPathCache;
+  std::shared_ptr<PathResolver> LibPathResolver;
+
+  StringMap<std::shared_ptr<LibrarySearchPath>>
+      LibSearchPaths; // key: canonical path
+  std::deque<StringRef> UnscannedUsr;
+  std::deque<StringRef> UnscannedSys;
+};
+
+/// Loads an object file and provides access to it.
+///
+/// Owns the underlying `ObjectFile` and ensures it is valid.
+/// Any errors encountered during construction are stored and
+/// returned when attempting to access the file.
+class ObjectFileLoader {
+public:
+  /// Construct an object file loader from the given path.
+  explicit ObjectFileLoader(StringRef Path) {
+    auto ObjOrErr = loadObjectFileWithOwnership(Path);
+    if (ObjOrErr)
+      Obj = std::move(*ObjOrErr);
+    else {
+      consumeError(std::move(Err));
+      Err = ObjOrErr.takeError();
+    }
+  }
+
+  ObjectFileLoader(const ObjectFileLoader &) = delete;
+  ObjectFileLoader &operator=(const ObjectFileLoader &) = delete;
+
+  ObjectFileLoader(ObjectFileLoader &&) = default;
+  ObjectFileLoader &operator=(ObjectFileLoader &&) = default;
+
+  /// Get the loaded object file, or return an error if loading failed.
+  Expected<object::ObjectFile &> getObjectFile() {
+    if (Err)
+      return std::move(Err);
+    return *Obj.getBinary();
+  }
+
+  static bool isArchitectureCompatible(const object::ObjectFile &Obj);
+
+private:
+  object::OwningBinary<object::ObjectFile> Obj;
+  Error Err = Error::success();
+
+  static Expected<object::OwningBinary<object::ObjectFile>>
+  loadObjectFileWithOwnership(StringRef FilePath);
+};
+
+/// Scans libraries, resolves dependencies, and registers them.
+class LibraryScanner {
+public:
+  using ShouldScanFn = std::function<bool(StringRef)>;
+
+  LibraryScanner(
+      LibraryScanHelper &H, LibraryManager &LibMgr,
+      ShouldScanFn ShouldScanCall = [](StringRef path) { return true; })
+      : ScanHelper(H), LibMgr(LibMgr),
+        ShouldScanCall(std::move(ShouldScanCall)) {}
+
+  void scanNext(PathType Kind, size_t batchSize = 1);
+
+  /// Dependency info for a library.
+  struct LibraryDepsInfo {
+    llvm::BumpPtrAllocator Alloc;
+    llvm::StringSaver Saver{Alloc};
+
+    SmallVector<StringRef, 2> rpath;
+    SmallVector<StringRef, 2> runPath;
+    SmallVector<StringRef, 4> deps;
+    bool isPIE = false;
+
+    void addRPath(StringRef s) { rpath.push_back(Saver.save(s)); }
+
+    void addRunPath(StringRef s) { runPath.push_back(Saver.save(s)); }
+
+    void addDep(StringRef s) { deps.push_back(Saver.save(s)); }
+  };
+
+private:
+  LibraryScanHelper &ScanHelper;
+  LibraryManager &LibMgr;
+  ShouldScanFn ShouldScanCall;
+
+  std::optional<std::string> shouldScan(StringRef FilePath);
+  Expected<LibraryDepsInfo> extractDeps(StringRef FilePath);
+
+  void handleLibrary(StringRef P, PathType K, int level = 1);
+
+  void scanBaseDir(std::shared_ptr<LibrarySearchPath> U);
+};
+
+using LibraryDepsInfo = LibraryScanner::LibraryDepsInfo;
+
+} // end namespace orc
+} // end namespace llvm
+
+#endif // LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_LIBRARYSCANNER_H
diff --git a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
index 87b95200b2459..d7f0e3a3d49da 100644
--- a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
+++ b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
@@ -1167,6 +1167,14 @@ struct ThreadsT {
   using EmptyTrait = std::true_type;
 };
 
+// V6.0: [14.8] `threadset` clause
+template <typename T, typename I, typename E> //
+struct ThreadsetT {
+  ENUM(ThreadsetPolicy, Omp_Pool, Omp_Team);
+  using WrapperTrait = std::true_type;
+  ThreadsetPolicy v;
+};
+
 // V5.2: [5.9.1] `to` clause
 template <typename T, typename I, typename E> //
 struct ToT {
@@ -1352,9 +1360,9 @@ using WrapperClausesT = std::variant<
     ProcBindT<T, I, E>, ReverseOffloadT<T, I, E>, SafelenT<T, I, E>,
     SelfMapsT<T, I, E>, SeverityT<T, I, E>, SharedT<T, I, E>, SimdlenT<T, I, E>,
     SizesT<T, I, E>, PermutationT<T, I, E>, ThreadLimitT<T, I, E>,
-    UnifiedAddressT<T, I, E>, UnifiedSharedMemoryT<T, I, E>, UniformT<T, I, E>,
-    UpdateT<T, I, E>, UseDeviceAddrT<T, I, E>, UseDevicePtrT<T, I, E>,
-    UsesAllocatorsT<T, I, E>>;
+    ThreadsetT<T, I, E>, UnifiedAddressT<T, I, E>,
+    UnifiedSharedMemoryT<T, I, E>, UniformT<T, I, E>, UpdateT<T, I, E>,
+    UseDeviceAddrT<T, I, E>, UseDevicePtrT<T, I, E>, UsesAllocatorsT<T, I, E>>;
 
 template <typename T, typename I, typename E>
 using UnionOfAllClausesT = typename type::Union< //
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td
index 61a1a05f6e904..208609f64f418 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMP.td
+++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td
@@ -539,6 +539,10 @@ def OMPC_GroupPrivate : Clause<[Spelling<"groupprivate">]> {
 def OMPC_Threads : Clause<[Spelling<"threads">]> {
   let clangClass = "OMPThreadsClause";
 }
+def OMPC_Threadset : Clause<[Spelling<"threadset">]> {
+  let clangClass = "OMPThreadsetClause";
+  let flangClass = "OmpThreadsetClause";
+}
 def OMPC_To : Clause<[Spelling<"to">]> {
   let clangClass = "OMPToClause";
   let flangClass = "OmpToClause";
@@ -1254,6 +1258,7 @@ def OMP_Task : Directive<[Spelling<"task">]> {
     VersionedClause<OMPC_Final>,
     VersionedClause<OMPC_If>,
     VersionedClause<OMPC_Priority>,
+    VersionedClause<OMPC_Threadset, 60>,
     VersionedClause<OMPC_Replayable, 60>,
     VersionedClause<OMPC_Transparent, 60>,
   ];
@@ -1297,6 +1302,7 @@ def OMP_TaskLoop : Directive<[Spelling<"taskloop">]> {
     VersionedClause<OMPC_Final>,
     VersionedClause<OMPC_If>,
     VersionedClause<OMPC_Priority>,
+    VersionedClause<OMPC_Threadset, 60>,
     VersionedClause<OMPC_Replayable, 60>,
     VersionedClause<OMPC_Transparent, 60>,
   ];
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 5331cb5abdc6f..b3d7ab4acf303 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -2383,7 +2383,7 @@ class OpenMPIRBuilder {
     /// runtime library for debugging
     Value *MapNamesArray = nullptr;
 
-    explicit TargetDataRTArgs() {}
+    explicit TargetDataRTArgs() = default;
     explicit TargetDataRTArgs(Value *BasePointersArray, Value *PointersArray,
                               Value *SizesArray, Value *MapTypesArray,
                               Value *MapTypesArrayEnd, Value *MappersArray,
@@ -2451,7 +2451,7 @@ class OpenMPIRBuilder {
     bool HasNoWait = false;
 
     // Constructors for TargetKernelArgs.
-    TargetKernelArgs() {}
+    TargetKernelArgs() = default;
     TargetKernelArgs(unsigned NumTargetItems, TargetDataRTArgs RTArgs,
                      Value *NumIterations, ArrayRef<Value *> NumTeams,
                      ArrayRef<Value *> NumThreads, Value *DynCGGroupMem,
@@ -2494,7 +2494,7 @@ class OpenMPIRBuilder {
     /// Whether the `target ... data` directive has a `nowait` clause.
     bool HasNoWait = false;
 
-    explicit TargetDataInfo() {}
+    explicit TargetDataInfo() = default;
     explicit TargetDataInfo(bool RequiresDevicePointerInfo,
                             bool SeparateBeginEndCalls)
         : RequiresDevicePointerInfo(RequiresDevicePointerInfo),
diff --git a/llvm/include/llvm/IR/AbstractCallSite.h b/llvm/include/llvm/IR/AbstractCallSite.h
index 9e24ae7d1b431..f431e1d8a38ef 100644
--- a/llvm/include/llvm/IR/AbstractCallSite.h
+++ b/llvm/include/llvm/IR/AbstractCallSite.h
@@ -137,7 +137,7 @@ class AbstractCallSite {
 
   /// Return true if @p U is the use that defines the callee of this ACS.
   bool isCallee(const Use *U) const {
-    if (isDirectCall())
+    if (!isCallbackCall())
       return CB->isCallee(U);
 
     assert(!CI.ParameterEncoding.empty() &&
@@ -154,7 +154,7 @@ class AbstractCallSite {
 
   /// Return the number of parameters of the callee.
   unsigned getNumArgOperands() const {
-    if (isDirectCall())
+    if (!isCallbackCall())
       return CB->arg_size();
     // Subtract 1 for the callee encoding.
     return CI.ParameterEncoding.size() - 1;
@@ -169,7 +169,7 @@ class AbstractCallSite {
   /// Return the operand index of the underlying instruction associated with
   /// the function parameter number @p ArgNo or -1 if there is none.
   int getCallArgOperandNo(unsigned ArgNo) const {
-    if (isDirectCall())
+    if (!isCallbackCall())
       return ArgNo;
     // Add 1 for the callee encoding.
     return CI.ParameterEncoding[ArgNo + 1];
@@ -183,7 +183,7 @@ class AbstractCallSite {
   /// Return the operand of the underlying instruction associated with the
   /// function parameter number @p ArgNo or nullptr if there is none.
   Value *getCallArgOperand(unsigned ArgNo) const {
-    if (isDirectCall())
+    if (!isCallbackCall())
       return CB->getArgOperand(ArgNo);
     // Add 1 for the callee encoding.
     return CI.ParameterEncoding[ArgNo + 1] >= 0
@@ -210,7 +210,7 @@ class AbstractCallSite {
 
   /// Return the pointer to function that is being called.
   Value *getCalledOperand() const {
-    if (isDirectCall())
+    if (!isCallbackCall())
       return CB->getCalledOperand();
     return CB->getArgOperand(getCallArgOperandNoForCallee());
   }
diff --git a/llvm/include/llvm/IR/ConstantFold.h b/llvm/include/llvm/IR/ConstantFold.h
index f9f2b3516a4ca..4056f1feb4dd3 100644
--- a/llvm/include/llvm/IR/ConstantFold.h
+++ b/llvm/include/llvm/IR/ConstantFold.h
@@ -26,42 +26,66 @@
 #include <optional>
 
 namespace llvm {
-  template <typename T> class ArrayRef;
-  class Value;
-  class Constant;
-  class Type;
+template <typename T> class ArrayRef;
+class Value;
+class Constant;
+class Type;
 
-  // Constant fold various types of instruction...
-  LLVM_ABI Constant *
-  ConstantFoldCastInstruction(unsigned opcode, ///< The opcode of the cast
-                              Constant *V,     ///< The source constant
-                              Type *DestTy     ///< The destination type
-  );
-  LLVM_ABI Constant *ConstantFoldSelectInstruction(Constant *Cond, Constant *V1,
-                                                   Constant *V2);
-  LLVM_ABI Constant *ConstantFoldExtractElementInstruction(Constant *Val,
-                                                           Constant *Idx);
-  LLVM_ABI Constant *ConstantFoldInsertElementInstruction(Constant *Val,
-                                                          Constant *Elt,
-                                                          Constant *Idx);
-  LLVM_ABI Constant *ConstantFoldShuffleVectorInstruction(Constant *V1,
-                                                          Constant *V2,
-                                                          ArrayRef<int> Mask);
-  LLVM_ABI Constant *
-  ConstantFoldExtractValueInstruction(Constant *Agg, ArrayRef<unsigned> Idxs);
-  LLVM_ABI Constant *
-  ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val,
-                                     ArrayRef<unsigned> Idxs);
-  LLVM_ABI Constant *ConstantFoldUnaryInstruction(unsigned Opcode, Constant *V);
-  LLVM_ABI Constant *ConstantFoldBinaryInstruction(unsigned Opcode,
-                                                   Constant *V1, Constant *V2);
-  LLVM_ABI Constant *
-  ConstantFoldCompareInstruction(CmpInst::Predicate Predicate, Constant *C1,
-                                 Constant *C2);
-  LLVM_ABI Constant *
-  ConstantFoldGetElementPtr(Type *Ty, Constant *C,
-                            std::optional<ConstantRange> InRange,
-                            ArrayRef<Value *> Idxs);
-} // End llvm namespace
+// Constant fold various types of instruction...
+LLVM_ABI Constant *
+ConstantFoldCastInstruction(unsigned opcode, ///< The opcode of the cast
+                            Constant *V,     ///< The source constant
+                            Type *DestTy     ///< The destination type
+);
+
+/// Attempt to constant fold a select instruction with the specified
+/// operands. The constant result is returned if successful; if not, null is
+/// returned.
+LLVM_ABI Constant *ConstantFoldSelectInstruction(Constant *Cond, Constant *V1,
+                                                 Constant *V2);
+
+/// Attempt to constant fold an extractelement instruction with the
+/// specified operands and indices.  The constant result is returned if
+/// successful; if not, null is returned.
+LLVM_ABI Constant *ConstantFoldExtractElementInstruction(Constant *Val,
+                                                         Constant *Idx);
+
+/// Attempt to constant fold an insertelement instruction with the
+/// specified operands and indices.  The constant result is returned if
+/// successful; if not, null is returned.
+LLVM_ABI Constant *ConstantFoldInsertElementInstruction(Constant *Val,
+                                                        Constant *Elt,
+                                                        Constant *Idx);
+
+/// Attempt to constant fold a shufflevector instruction with the
+/// specified operands and mask.  See class ShuffleVectorInst for a description
+/// of the mask representation. The constant result is returned if successful;
+/// if not, null is returned.
+LLVM_ABI Constant *ConstantFoldShuffleVectorInstruction(Constant *V1,
+                                                        Constant *V2,
+                                                        ArrayRef<int> Mask);
+
+/// Attempt to constant fold an extractvalue instruction with the
+/// specified operands and indices.  The constant result is returned if
+/// successful; if not, null is returned.
+LLVM_ABI Constant *ConstantFoldExtractValueInstruction(Constant *Agg,
+                                                       ArrayRef<unsigned> Idxs);
+
+/// Attempt to constant fold an insertvalue instruction with the specified
+/// operands and indices.  The constant result is returned if successful; if
+/// not, null is returned.
+LLVM_ABI Constant *ConstantFoldInsertValueInstruction(Constant *Agg,
+                                                      Constant *Val,
+                                                      ArrayRef<unsigned> Idxs);
+LLVM_ABI Constant *ConstantFoldUnaryInstruction(unsigned Opcode, Constant *V);
+LLVM_ABI Constant *ConstantFoldBinaryInstruction(unsigned Opcode, Constant *V1,
+                                                 Constant *V2);
+LLVM_ABI Constant *ConstantFoldCompareInstruction(CmpInst::Predicate Predicate,
+                                                  Constant *C1, Constant *C2);
+LLVM_ABI Constant *
+ConstantFoldGetElementPtr(Type *Ty, Constant *C,
+                          std::optional<ConstantRange> InRange,
+                          ArrayRef<Value *> Idxs);
+} // namespace llvm
 
 #endif
diff --git a/llvm/include/llvm/IR/DIBuilder.h b/llvm/include/llvm/IR/DIBuilder.h
index f3839c9694f34..4228ec9c3ef7a 100644
--- a/llvm/include/llvm/IR/DIBuilder.h
+++ b/llvm/include/llvm/IR/DIBuilder.h
@@ -209,10 +209,15 @@ namespace llvm {
     /// \param NumExtraInhabitants The number of extra inhabitants of the type.
     /// An extra inhabitant is a bit pattern that does not represent a valid
     /// value for instances of a given type. This is used by the Swift language.
+    /// \param DataSizeInBits Optionally describes the number of bits used by
+    /// the value of the object when this is less than the storage size of
+    /// SizeInBits. Default value of zero indicates the object value and storage
+    /// sizes are equal.
     LLVM_ABI DIBasicType *
     createBasicType(StringRef Name, uint64_t SizeInBits, unsigned Encoding,
                     DINode::DIFlags Flags = DINode::FlagZero,
-                    uint32_t NumExtraInhabitants = 0);
+                    uint32_t NumExtraInhabitants = 0,
+                    uint32_t DataSizeInBits = 0);
 
     /// Create debugging information entry for a binary fixed-point type.
     /// \param Name        Type name.
diff --git a/llvm/include/llvm/IR/DataLayout.h b/llvm/include/llvm/IR/DataLayout.h
index 56fc749838ef9..54458201af0b3 100644
--- a/llvm/include/llvm/IR/DataLayout.h
+++ b/llvm/include/llvm/IR/DataLayout.h
@@ -590,7 +590,7 @@ class DataLayout {
   ///
   /// This is the amount that alloca reserves for this type. For example,
   /// returns 12 or 16 for x86_fp80, depending on alignment.
-  TypeSize getTypeAllocSize(Type *Ty) const;
+  LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const;
 
   /// Returns the offset in bits between successive objects of the
   /// specified type, including alignment padding; always a multiple of 8.
diff --git a/llvm/include/llvm/IR/DebugInfoMetadata.h b/llvm/include/llvm/IR/DebugInfoMetadata.h
index c626efc9daaa4..7ade6b8e13308 100644
--- a/llvm/include/llvm/IR/DebugInfoMetadata.h
+++ b/llvm/include/llvm/IR/DebugInfoMetadata.h
@@ -891,96 +891,114 @@ class DIBasicType : public DIType {
   friend class MDNode;
 
   unsigned Encoding;
+  /// Describes the number of bits used by the value of the object. Non-zero
+  /// when the value of an object does not fully occupy the storage size
+  /// specified by SizeInBits.
+  uint32_t DataSizeInBits;
 
 protected:
   DIBasicType(LLVMContext &C, StorageType Storage, unsigned Tag,
               uint32_t AlignInBits, unsigned Encoding,
-              uint32_t NumExtraInhabitants, DIFlags Flags,
-              ArrayRef<Metadata *> Ops)
+              uint32_t NumExtraInhabitants, uint32_t DataSizeInBits,
+              DIFlags Flags, ArrayRef<Metadata *> Ops)
       : DIType(C, DIBasicTypeKind, Storage, Tag, 0, AlignInBits,
                NumExtraInhabitants, Flags, Ops),
-        Encoding(Encoding) {}
+        Encoding(Encoding), DataSizeInBits(DataSizeInBits) {}
   DIBasicType(LLVMContext &C, unsigned ID, StorageType Storage, unsigned Tag,
               uint32_t AlignInBits, unsigned Encoding,
-              uint32_t NumExtraInhabitants, DIFlags Flags,
-              ArrayRef<Metadata *> Ops)
+              uint32_t NumExtraInhabitants, uint32_t DataSizeInBits,
+              DIFlags Flags, ArrayRef<Metadata *> Ops)
       : DIType(C, ID, Storage, Tag, 0, AlignInBits, NumExtraInhabitants, Flags,
                Ops),
-        Encoding(Encoding) {}
+        Encoding(Encoding), DataSizeInBits(DataSizeInBits) {}
   ~DIBasicType() = default;
 
   static DIBasicType *getImpl(LLVMContext &Context, unsigned Tag,
                               StringRef Name, uint64_t SizeInBits,
                               uint32_t AlignInBits, unsigned Encoding,
-                              uint32_t NumExtraInhabitants, DIFlags Flags,
+                              uint32_t NumExtraInhabitants,
+                              uint32_t DataSizeInBits, DIFlags Flags,
                               StorageType Storage, bool ShouldCreate = true) {
     return getImpl(Context, Tag, getCanonicalMDString(Context, Name),
                    SizeInBits, AlignInBits, Encoding, NumExtraInhabitants,
-                   Flags, Storage, ShouldCreate);
+                   DataSizeInBits, Flags, Storage, ShouldCreate);
   }
   static DIBasicType *getImpl(LLVMContext &Context, unsigned Tag,
                               MDString *Name, uint64_t SizeInBits,
                               uint32_t AlignInBits, unsigned Encoding,
-                              uint32_t NumExtraInhabitants, DIFlags Flags,
+                              uint32_t NumExtraInhabitants,
+                              uint32_t DataSizeInBits, DIFlags Flags,
                               StorageType Storage, bool ShouldCreate = true) {
     auto *SizeInBitsNode = ConstantAsMetadata::get(
         ConstantInt::get(Type::getInt64Ty(Context), SizeInBits));
     return getImpl(Context, Tag, Name, SizeInBitsNode, AlignInBits, Encoding,
-                   NumExtraInhabitants, Flags, Storage, ShouldCreate);
+                   NumExtraInhabitants, DataSizeInBits, Flags, Storage,
+                   ShouldCreate);
   }
-  LLVM_ABI static DIBasicType *getImpl(LLVMContext &Context, unsigned Tag,
-                                       MDString *Name, Metadata *SizeInBits,
-                                       uint32_t AlignInBits, unsigned Encoding,
-                                       uint32_t NumExtraInhabitants,
-                                       DIFlags Flags, StorageType Storage,
-                                       bool ShouldCreate = true);
+  LLVM_ABI static DIBasicType *
+  getImpl(LLVMContext &Context, unsigned Tag, MDString *Name,
+          Metadata *SizeInBits, uint32_t AlignInBits, unsigned Encoding,
+          uint32_t NumExtraInhabitants, uint32_t DataSizeInBits, DIFlags Flags,
+          StorageType Storage, bool ShouldCreate = true);
 
   TempDIBasicType cloneImpl() const {
     return getTemporary(getContext(), getTag(), getRawName(),
                         getRawSizeInBits(), getAlignInBits(), getEncoding(),
-                        getNumExtraInhabitants(), getFlags());
+                        getNumExtraInhabitants(), getDataSizeInBits(),
+                        getFlags());
   }
 
 public:
   DEFINE_MDNODE_GET(DIBasicType, (unsigned Tag, StringRef Name),
-                    (Tag, Name, 0, 0, 0, 0, FlagZero))
+                    (Tag, Name, 0, 0, 0, 0, 0, FlagZero))
   DEFINE_MDNODE_GET(DIBasicType,
                     (unsigned Tag, StringRef Name, uint64_t SizeInBits),
-                    (Tag, Name, SizeInBits, 0, 0, 0, FlagZero))
+                    (Tag, Name, SizeInBits, 0, 0, 0, 0, FlagZero))
   DEFINE_MDNODE_GET(DIBasicType,
                     (unsigned Tag, MDString *Name, uint64_t SizeInBits),
-                    (Tag, Name, SizeInBits, 0, 0, 0, FlagZero))
+                    (Tag, Name, SizeInBits, 0, 0, 0, 0, FlagZero))
   DEFINE_MDNODE_GET(DIBasicType,
                     (unsigned Tag, StringRef Name, uint64_t SizeInBits,
                      uint32_t AlignInBits, unsigned Encoding, DIFlags Flags),
-                    (Tag, Name, SizeInBits, AlignInBits, Encoding, 0, Flags))
+                    (Tag, Name, SizeInBits, AlignInBits, Encoding, 0, 0, Flags))
   DEFINE_MDNODE_GET(DIBasicType,
                     (unsigned Tag, MDString *Name, uint64_t SizeInBits,
                      uint32_t AlignInBits, unsigned Encoding, DIFlags Flags),
-                    (Tag, Name, SizeInBits, AlignInBits, Encoding, 0, Flags))
+                    (Tag, Name, SizeInBits, AlignInBits, Encoding, 0, 0, Flags))
   DEFINE_MDNODE_GET(DIBasicType,
                     (unsigned Tag, StringRef Name, uint64_t SizeInBits,
                      uint32_t AlignInBits, unsigned Encoding,
                      uint32_t NumExtraInhabitants, DIFlags Flags),
                     (Tag, Name, SizeInBits, AlignInBits, Encoding,
-                     NumExtraInhabitants, Flags))
+                     NumExtraInhabitants, 0, Flags))
+  DEFINE_MDNODE_GET(DIBasicType,
+                    (unsigned Tag, StringRef Name, uint64_t SizeInBits,
+                     uint32_t AlignInBits, unsigned Encoding,
+                     uint32_t NumExtraInhabitants, uint32_t DataSizeInBits,
+                     DIFlags Flags),
+                    (Tag, Name, SizeInBits, AlignInBits, Encoding,
+                     NumExtraInhabitants, DataSizeInBits, Flags))
   DEFINE_MDNODE_GET(DIBasicType,
                     (unsigned Tag, MDString *Name, uint64_t SizeInBits,
                      uint32_t AlignInBits, unsigned Encoding,
-                     uint32_t NumExtraInhabitants, DIFlags Flags),
+                     uint32_t NumExtraInhabitants, uint32_t DataSizeInBits,
+                     DIFlags Flags),
                     (Tag, Name, SizeInBits, AlignInBits, Encoding,
-                     NumExtraInhabitants, Flags))
+                     NumExtraInhabitants, DataSizeInBits, Flags))
   DEFINE_MDNODE_GET(DIBasicType,
                     (unsigned Tag, MDString *Name, Metadata *SizeInBits,
                      uint32_t AlignInBits, unsigned Encoding,
-                     uint32_t NumExtraInhabitants, DIFlags Flags),
+                     uint32_t NumExtraInhabitants, uint32_t DataSizeInBits,
+                     DIFlags Flags),
                     (Tag, Name, SizeInBits, AlignInBits, Encoding,
-                     NumExtraInhabitants, Flags))
+                     NumExtraInhabitants, DataSizeInBits, Flags))
 
   TempDIBasicType clone() const { return cloneImpl(); }
 
   unsigned getEncoding() const { return Encoding; }
 
+  uint32_t getDataSizeInBits() const { return DataSizeInBits; }
+
   enum class Signedness { Signed, Unsigned };
 
   /// Return the signedness of this type, or std::nullopt if this type is
@@ -1010,7 +1028,7 @@ class DIFixedPointType : public DIBasicType {
                    uint32_t AlignInBits, unsigned Encoding, DIFlags Flags,
                    unsigned Kind, int Factor, ArrayRef<Metadata *> Ops)
       : DIBasicType(C, DIFixedPointTypeKind, Storage, Tag, AlignInBits,
-                    Encoding, 0, Flags, Ops),
+                    Encoding, 0, 0, Flags, Ops),
         Kind(Kind), Factor(Factor) {
     assert(Kind == FixedPointBinary || Kind == FixedPointDecimal);
   }
@@ -1019,7 +1037,7 @@ class DIFixedPointType : public DIBasicType {
                    unsigned Kind, APInt Numerator, APInt Denominator,
                    ArrayRef<Metadata *> Ops)
       : DIBasicType(C, DIFixedPointTypeKind, Storage, Tag, AlignInBits,
-                    Encoding, 0, Flags, Ops),
+                    Encoding, 0, 0, Flags, Ops),
         Kind(Kind), Factor(0), Numerator(Numerator), Denominator(Denominator) {
     assert(Kind == FixedPointRational);
   }
@@ -1028,7 +1046,7 @@ class DIFixedPointType : public DIBasicType {
                    unsigned Kind, int Factor, APInt Numerator,
                    APInt Denominator, ArrayRef<Metadata *> Ops)
       : DIBasicType(C, DIFixedPointTypeKind, Storage, Tag, AlignInBits,
-                    Encoding, 0, Flags, Ops),
+                    Encoding, 0, 0, Flags, Ops),
         Kind(Kind), Factor(Factor), Numerator(Numerator),
         Denominator(Denominator) {}
   ~DIFixedPointType() = default;
diff --git a/llvm/include/llvm/IR/DebugProgramInstruction.h b/llvm/include/llvm/IR/DebugProgramInstruction.h
index 457c60e3bc929..66f44fe34d3f6 100644
--- a/llvm/include/llvm/IR/DebugProgramInstruction.h
+++ b/llvm/include/llvm/IR/DebugProgramInstruction.h
@@ -589,7 +589,7 @@ filterDbgVars(iterator_range<simple_ilist<DbgRecord>::iterator> R) {
 /// date.
 class DbgMarker {
 public:
-  DbgMarker() {}
+  DbgMarker() = default;
   /// Link back to the Instruction that owns this marker. Can be null during
   /// operations that move a marker from one instruction to another.
   Instruction *MarkedInstr = nullptr;
diff --git a/llvm/include/llvm/IR/DroppedVariableStats.h b/llvm/include/llvm/IR/DroppedVariableStats.h
index 42e86dd966751..8a1dbd6aeb60a 100644
--- a/llvm/include/llvm/IR/DroppedVariableStats.h
+++ b/llvm/include/llvm/IR/DroppedVariableStats.h
@@ -42,7 +42,7 @@ class DroppedVariableStats {
 public:
   LLVM_ABI DroppedVariableStats(bool DroppedVarStatsEnabled);
 
-  virtual ~DroppedVariableStats() {}
+  virtual ~DroppedVariableStats() = default;
 
   // We intend this to be unique per-compilation, thus no copies.
   DroppedVariableStats(const DroppedVariableStats &) = delete;
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index dacda0afc7f03..972a253344ddf 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -2191,7 +2191,7 @@ class IRBuilderBase {
                       FMFSource);
   }
   Value *CreatePtrToAddr(Value *V, const Twine &Name = "") {
-    return CreateCast(Instruction::PtrToInt, V,
+    return CreateCast(Instruction::PtrToAddr, V,
                       BB->getDataLayout().getAddressType(V->getType()), Name);
   }
   Value *CreatePtrToInt(Value *V, Type *DestTy,
diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td
index 3b7077c52db21..d6b85630eb979 100644
--- a/llvm/include/llvm/IR/IntrinsicsDirectX.td
+++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td
@@ -153,6 +153,8 @@ def int_dx_wave_any : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_i1_ty], [IntrCon
 def int_dx_wave_getlaneindex : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrConvergent, IntrNoMem]>;
 def int_dx_wave_reduce_max : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>;
 def int_dx_wave_reduce_umax : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>;
+def int_dx_wave_reduce_min : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>;
+def int_dx_wave_reduce_umin : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>;
 def int_dx_wave_reduce_sum : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>;
 def int_dx_wave_reduce_usum : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>;
 def int_dx_wave_is_first_lane : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrConvergent]>;
diff --git a/llvm/include/llvm/IR/IntrinsicsHexagonDep.td b/llvm/include/llvm/IR/IntrinsicsHexagonDep.td
index fe95377f8e1a5..dde4132791f06 100644
--- a/llvm/include/llvm/IR/IntrinsicsHexagonDep.td
+++ b/llvm/include/llvm/IR/IntrinsicsHexagonDep.td
@@ -6835,6 +6835,180 @@ Hexagon_v64i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsub_hf_f8_128B">;
 
 // V81 HVX Instructions.
 
+def int_hexagon_V6_vabs_qf16_hf :
+Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vabs_qf16_hf">;
+
+def int_hexagon_V6_vabs_qf16_hf_128B :
+Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vabs_qf16_hf_128B">;
+
+def int_hexagon_V6_vabs_qf16_qf16 :
+Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vabs_qf16_qf16">;
+
+def int_hexagon_V6_vabs_qf16_qf16_128B :
+Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vabs_qf16_qf16_128B">;
+
+def int_hexagon_V6_vabs_qf32_qf32 :
+Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vabs_qf32_qf32">;
+
+def int_hexagon_V6_vabs_qf32_qf32_128B :
+Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vabs_qf32_qf32_128B">;
+
+def int_hexagon_V6_vabs_qf32_sf :
+Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vabs_qf32_sf">;
+
+def int_hexagon_V6_vabs_qf32_sf_128B :
+Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vabs_qf32_sf_128B">;
+
+def int_hexagon_V6_valign4 :
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_valign4">;
+
+def int_hexagon_V6_valign4_128B :
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_valign4_128B">;
+
+def int_hexagon_V6_vconv_bf_qf32 :
+Hexagon_v16i32_v32i32_Intrinsic<"HEXAGON_V6_vconv_bf_qf32">;
+
+def int_hexagon_V6_vconv_bf_qf32_128B :
+Hexagon_v32i32_v64i32_Intrinsic<"HEXAGON_V6_vconv_bf_qf32_128B">;
+
+def int_hexagon_V6_vconv_f8_qf16 :
+Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vconv_f8_qf16">;
+
+def int_hexagon_V6_vconv_f8_qf16_128B :
+Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vconv_f8_qf16_128B">;
+
+def int_hexagon_V6_vconv_h_hf_rnd :
+Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vconv_h_hf_rnd">;
+
+def int_hexagon_V6_vconv_h_hf_rnd_128B :
+Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vconv_h_hf_rnd_128B">;
+
+def int_hexagon_V6_vconv_qf16_f8 :
+Hexagon_v32i32_v16i32_Intrinsic<"HEXAGON_V6_vconv_qf16_f8">;
+
+def int_hexagon_V6_vconv_qf16_f8_128B :
+Hexagon_v64i32_v32i32_Intrinsic<"HEXAGON_V6_vconv_qf16_f8_128B">;
+
+def int_hexagon_V6_vconv_qf16_hf :
+Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vconv_qf16_hf">;
+
+def int_hexagon_V6_vconv_qf16_hf_128B :
+Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vconv_qf16_hf_128B">;
+
+def int_hexagon_V6_vconv_qf16_qf16 :
+Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vconv_qf16_qf16">;
+
+def int_hexagon_V6_vconv_qf16_qf16_128B :
+Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vconv_qf16_qf16_128B">;
+
+def int_hexagon_V6_vconv_qf32_qf32 :
+Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vconv_qf32_qf32">;
+
+def int_hexagon_V6_vconv_qf32_qf32_128B :
+Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vconv_qf32_qf32_128B">;
+
+def int_hexagon_V6_vconv_qf32_sf :
+Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vconv_qf32_sf">;
+
+def int_hexagon_V6_vconv_qf32_sf_128B :
+Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vconv_qf32_sf_128B">;
+
+def int_hexagon_V6_veqhf :
+Hexagon_v64i1_v16i32v16i32_Intrinsic<"HEXAGON_V6_veqhf">;
+
+def int_hexagon_V6_veqhf_128B :
+Hexagon_v128i1_v32i32v32i32_Intrinsic<"HEXAGON_V6_veqhf_128B">;
+
+def int_hexagon_V6_veqhf_and :
+Hexagon_v64i1_v64i1v16i32v16i32_Intrinsic<"HEXAGON_V6_veqhf_and">;
+
+def int_hexagon_V6_veqhf_and_128B :
+Hexagon_v128i1_v128i1v32i32v32i32_Intrinsic<"HEXAGON_V6_veqhf_and_128B">;
+
+def int_hexagon_V6_veqhf_or :
+Hexagon_v64i1_v64i1v16i32v16i32_Intrinsic<"HEXAGON_V6_veqhf_or">;
+
+def int_hexagon_V6_veqhf_or_128B :
+Hexagon_v128i1_v128i1v32i32v32i32_Intrinsic<"HEXAGON_V6_veqhf_or_128B">;
+
+def int_hexagon_V6_veqhf_xor :
+Hexagon_v64i1_v64i1v16i32v16i32_Intrinsic<"HEXAGON_V6_veqhf_xor">;
+
+def int_hexagon_V6_veqhf_xor_128B :
+Hexagon_v128i1_v128i1v32i32v32i32_Intrinsic<"HEXAGON_V6_veqhf_xor_128B">;
+
+def int_hexagon_V6_veqsf :
+Hexagon_v64i1_v16i32v16i32_Intrinsic<"HEXAGON_V6_veqsf">;
+
+def int_hexagon_V6_veqsf_128B :
+Hexagon_v128i1_v32i32v32i32_Intrinsic<"HEXAGON_V6_veqsf_128B">;
+
+def int_hexagon_V6_veqsf_and :
+Hexagon_v64i1_v64i1v16i32v16i32_Intrinsic<"HEXAGON_V6_veqsf_and">;
+
+def int_hexagon_V6_veqsf_and_128B :
+Hexagon_v128i1_v128i1v32i32v32i32_Intrinsic<"HEXAGON_V6_veqsf_and_128B">;
+
+def int_hexagon_V6_veqsf_or :
+Hexagon_v64i1_v64i1v16i32v16i32_Intrinsic<"HEXAGON_V6_veqsf_or">;
+
+def int_hexagon_V6_veqsf_or_128B :
+Hexagon_v128i1_v128i1v32i32v32i32_Intrinsic<"HEXAGON_V6_veqsf_or_128B">;
+
+def int_hexagon_V6_veqsf_xor :
+Hexagon_v64i1_v64i1v16i32v16i32_Intrinsic<"HEXAGON_V6_veqsf_xor">;
+
+def int_hexagon_V6_veqsf_xor_128B :
+Hexagon_v128i1_v128i1v32i32v32i32_Intrinsic<"HEXAGON_V6_veqsf_xor_128B">;
+
+def int_hexagon_V6_vilog2_hf :
+Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vilog2_hf">;
+
+def int_hexagon_V6_vilog2_hf_128B :
+Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vilog2_hf_128B">;
+
+def int_hexagon_V6_vilog2_qf16 :
+Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vilog2_qf16">;
+
+def int_hexagon_V6_vilog2_qf16_128B :
+Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vilog2_qf16_128B">;
+
+def int_hexagon_V6_vilog2_qf32 :
+Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vilog2_qf32">;
+
+def int_hexagon_V6_vilog2_qf32_128B :
+Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vilog2_qf32_128B">;
+
+def int_hexagon_V6_vilog2_sf :
+Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vilog2_sf">;
+
+def int_hexagon_V6_vilog2_sf_128B :
+Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vilog2_sf_128B">;
+
+def int_hexagon_V6_vneg_qf16_hf :
+Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vneg_qf16_hf">;
+
+def int_hexagon_V6_vneg_qf16_hf_128B :
+Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vneg_qf16_hf_128B">;
+
+def int_hexagon_V6_vneg_qf16_qf16 :
+Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vneg_qf16_qf16">;
+
+def int_hexagon_V6_vneg_qf16_qf16_128B :
+Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vneg_qf16_qf16_128B">;
+
+def int_hexagon_V6_vneg_qf32_qf32 :
+Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vneg_qf32_qf32">;
+
+def int_hexagon_V6_vneg_qf32_qf32_128B :
+Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vneg_qf32_qf32_128B">;
+
+def int_hexagon_V6_vneg_qf32_sf :
+Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vneg_qf32_sf">;
+
+def int_hexagon_V6_vneg_qf32_sf_128B :
+Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vneg_qf32_sf_128B">;
+
 def int_hexagon_V6_vsub_hf_mix :
 Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsub_hf_mix">;
 
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 719181a09f475..2710853e17688 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1334,15 +1334,8 @@ let TargetPrefix = "nvvm" in {
   //
   let IntrProperties = [IntrNoMem] in {
     foreach ftz = ["", "_ftz"] in
-      def int_nvvm_ex2_approx # ftz # _f : NVVMBuiltin,
-          DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty]>;
-
-    def int_nvvm_ex2_approx_d : NVVMBuiltin,
-        DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty]>;
-    def int_nvvm_ex2_approx_f16 :
-        DefaultAttrsIntrinsic<[llvm_half_ty], [llvm_half_ty]>;
-    def int_nvvm_ex2_approx_f16x2 :
-        DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_v2f16_ty]>;
+      def int_nvvm_ex2_approx # ftz :
+          DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
 
     foreach ftz = ["", "_ftz"] in
       def int_nvvm_lg2_approx # ftz # _f : NVVMBuiltin,
diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
index 49a182be98acd..bc51fb639fd75 100644
--- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td
+++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
@@ -122,6 +122,8 @@ def int_spv_rsqrt : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty]
   def int_spv_wave_any : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_i1_ty], [IntrConvergent, IntrNoMem]>;
   def int_spv_wave_reduce_umax : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>;
   def int_spv_wave_reduce_max : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>;
+  def int_spv_wave_reduce_min : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>;
+  def int_spv_wave_reduce_umin : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>;
   def int_spv_wave_reduce_sum : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>;
   def int_spv_wave_is_first_lane : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrConvergent]>;
   def int_spv_wave_readlane : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrConvergent, IntrNoMem]>;
@@ -136,7 +138,7 @@ def int_spv_rsqrt : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty]
   def int_spv_sclamp : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
   def int_spv_nclamp : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
 
-  // Create resource handle given the binding information. Returns a 
+  // Create resource handle given the binding information. Returns a
   // type appropriate for the kind of resource given the set id, binding id,
   // array size of the binding, as well as an index and an indicator
   // whether that index may be non-uniform.
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index 81fbfbf0bb1b4..1dd23f60c7e1e 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -5505,46 +5505,6 @@ let TargetPrefix = "x86" in {
                         [ImmArg<ArgIndex<0>>,
                         ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
 
-  // AMX-TRANSPOSE
-  def int_x86_t2rpntlvwz0 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz0">,
-              Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty],
-                        [ImmArg<ArgIndex<0>>]>;
-  def int_x86_t2rpntlvwz0t1 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz0t1">,
-              Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty],
-                        [ImmArg<ArgIndex<0>>]>;
-  def int_x86_t2rpntlvwz1 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz1">,
-              Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty],
-                        [ImmArg<ArgIndex<0>>]>;
-  def int_x86_t2rpntlvwz1t1 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz1t1">,
-              Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty],
-                        [ImmArg<ArgIndex<0>>]>;
-  def int_x86_ttransposed : ClangBuiltin<"__builtin_ia32_ttransposed">,
-              Intrinsic<[], [llvm_i8_ty, llvm_i8_ty],
-                        [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;
-  def int_x86_ttdpbf16ps : ClangBuiltin<"__builtin_ia32_ttdpbf16ps">,
-              Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty],
-                        [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>,
-                         ImmArg<ArgIndex<2>>]>;
-  def int_x86_ttdpfp16ps : ClangBuiltin<"__builtin_ia32_ttdpfp16ps">,
-              Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty],
-                        [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>,
-                         ImmArg<ArgIndex<2>>]>;
-  def int_x86_ttcmmimfp16ps : ClangBuiltin<"__builtin_ia32_ttcmmimfp16ps">,
-              Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty],
-                        [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>,
-                         ImmArg<ArgIndex<2>>]>;
-  def int_x86_ttcmmrlfp16ps : ClangBuiltin<"__builtin_ia32_ttcmmrlfp16ps">,
-              Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty],
-                        [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>,
-                         ImmArg<ArgIndex<2>>]>;
-  def int_x86_tconjtcmmimfp16ps : ClangBuiltin<"__builtin_ia32_tconjtcmmimfp16ps">,
-              Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty],
-                        [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>,
-                         ImmArg<ArgIndex<2>>]>;
-  def int_x86_tconjtfp16 : ClangBuiltin<"__builtin_ia32_tconjtfp16">,
-              Intrinsic<[], [llvm_i8_ty, llvm_i8_ty],
-                        [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;
-
   // AMX-MORVS, AMX-TRANSPOSE
   def int_x86_t2rpntlvwz0rs : ClangBuiltin<"__builtin_ia32_t2rpntlvwz0rs">,
               Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty],
@@ -5685,61 +5645,6 @@ let TargetPrefix = "x86" in {
                         [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
                         [IntrArgMemOnly]>;
 
-  def int_x86_t2rpntlvwz0_internal :
-              Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty],
-                        [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
-                        []>;
-  def int_x86_t2rpntlvwz0t1_internal :
-              Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty],
-                        [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
-                        []>;
-  def int_x86_t2rpntlvwz1_internal :
-              Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty],
-                        [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
-                        []>;
-  def int_x86_t2rpntlvwz1t1_internal :
-              Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty],
-                        [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
-                        []>;
-  def int_x86_ttransposed_internal :
-              ClangBuiltin<"__builtin_ia32_ttransposed_internal">,
-              Intrinsic<[llvm_x86amx_ty],
-                        [llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty], []>;
-  def int_x86_ttdpbf16ps_internal :
-              ClangBuiltin<"__builtin_ia32_ttdpbf16ps_internal">,
-              Intrinsic<[llvm_x86amx_ty],
-                        [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty,
-                         llvm_x86amx_ty, llvm_x86amx_ty,
-                         llvm_x86amx_ty], []>;
-  def int_x86_ttdpfp16ps_internal :
-              ClangBuiltin<"__builtin_ia32_ttdpfp16ps_internal">,
-              Intrinsic<[llvm_x86amx_ty],
-                        [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty,
-                         llvm_x86amx_ty, llvm_x86amx_ty,
-                         llvm_x86amx_ty], []>;
-  def int_x86_ttcmmimfp16ps_internal :
-              ClangBuiltin<"__builtin_ia32_ttcmmimfp16ps_internal">,
-              Intrinsic<[llvm_x86amx_ty],
-                        [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty,
-                         llvm_x86amx_ty, llvm_x86amx_ty,
-                         llvm_x86amx_ty], []>;
-  def int_x86_ttcmmrlfp16ps_internal :
-              ClangBuiltin<"__builtin_ia32_ttcmmrlfp16ps_internal">,
-              Intrinsic<[llvm_x86amx_ty],
-                        [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty,
-                         llvm_x86amx_ty, llvm_x86amx_ty,
-                         llvm_x86amx_ty], []>;
-  def int_x86_tconjtcmmimfp16ps_internal :
-              ClangBuiltin<"__builtin_ia32_tconjtcmmimfp16ps_internal">,
-              Intrinsic<[llvm_x86amx_ty],
-                        [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty,
-                         llvm_x86amx_ty, llvm_x86amx_ty,
-                         llvm_x86amx_ty], []>;
-  def int_x86_tconjtfp16_internal :
-              ClangBuiltin<"__builtin_ia32_tconjtfp16_internal">,
-              Intrinsic<[llvm_x86amx_ty],
-                        [llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty], []>;
-
   def int_x86_tcvtrowd2ps_internal :
               ClangBuiltin<"__builtin_ia32_tcvtrowd2ps_internal">,
               Intrinsic<[llvm_v16f32_ty],
@@ -5775,20 +5680,11 @@ let TargetPrefix = "x86" in {
               Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty],
                         [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>,
                          ImmArg<ArgIndex<2>>]>;
-  def int_x86_ttmmultf32ps : ClangBuiltin<"__builtin_ia32_ttmmultf32ps">,
-              Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty],
-                        [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>,
-                         ImmArg<ArgIndex<2>>]>;
   def int_x86_tmmultf32ps_internal :
               ClangBuiltin<"__builtin_ia32_tmmultf32ps_internal">,
               Intrinsic<[llvm_x86amx_ty],
                         [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty,
                          llvm_x86amx_ty, llvm_x86amx_ty], []>;
-  def int_x86_ttmmultf32ps_internal :
-              ClangBuiltin<"__builtin_ia32_ttmmultf32ps_internal">,
-              Intrinsic<[llvm_x86amx_ty],
-                        [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty,
-                         llvm_x86amx_ty, llvm_x86amx_ty], []>;
 
   def int_x86_tdpbf8ps_internal :
                 ClangBuiltin<"__builtin_ia32_tdpbf8ps_internal">,
diff --git a/llvm/include/llvm/IR/Mangler.h b/llvm/include/llvm/IR/Mangler.h
index 232101a8926b7..4d387ba73127d 100644
--- a/llvm/include/llvm/IR/Mangler.h
+++ b/llvm/include/llvm/IR/Mangler.h
@@ -80,8 +80,7 @@ getArm64ECDemangledFunctionName(StringRef Name);
 
 /// Check if an ARM64EC function name is mangled.
 bool inline isArm64ECMangledFunctionName(StringRef Name) {
-  return Name[0] == '#' ||
-         (Name[0] == '?' && Name.find("@$$h") != StringRef::npos);
+  return Name[0] == '#' || (Name[0] == '?' && Name.contains("@$$h"));
 }
 
 } // End llvm namespace
diff --git a/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h b/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h
index 3381e1777217a..ccb77e75492af 100644
--- a/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h
+++ b/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h
@@ -79,7 +79,7 @@ struct CustomMappingTraits<
       }
       Args.push_back(Arg);
     }
-    io.mapRequired(Key.str().c_str(), V[Args]);
+    io.mapRequired(Key, V[Args]);
   }
   static void output(
       IO &io,
@@ -91,7 +91,7 @@ struct CustomMappingTraits<
           Key += ',';
         Key += llvm::utostr(Arg);
       }
-      io.mapRequired(Key.c_str(), P.second);
+      io.mapRequired(Key, P.second);
     }
   }
 };
@@ -122,11 +122,11 @@ struct CustomMappingTraits<std::map<uint64_t, WholeProgramDevirtResolution>> {
       io.setError("key not an integer");
       return;
     }
-    io.mapRequired(Key.str().c_str(), V[KeyInt]);
+    io.mapRequired(Key, V[KeyInt]);
   }
   static void output(IO &io, std::map<uint64_t, WholeProgramDevirtResolution> &V) {
     for (auto &P : V)
-      io.mapRequired(llvm::utostr(P.first).c_str(), P.second);
+      io.mapRequired(llvm::utostr(P.first), P.second);
   }
 };
 
@@ -215,7 +215,7 @@ namespace yaml {
 template <> struct CustomMappingTraits<GlobalValueSummaryMapTy> {
   static void inputOne(IO &io, StringRef Key, GlobalValueSummaryMapTy &V) {
     std::vector<GlobalValueSummaryYaml> GVSums;
-    io.mapRequired(Key.str().c_str(), GVSums);
+    io.mapRequired(Key, GVSums);
     uint64_t KeyInt;
     if (Key.getAsInteger(0, KeyInt)) {
       io.setError("key not an integer");
@@ -290,7 +290,7 @@ template <> struct CustomMappingTraits<GlobalValueSummaryMapTy> {
         }
       }
       if (!GVSums.empty())
-        io.mapRequired(llvm::utostr(P.first).c_str(), GVSums);
+        io.mapRequired(llvm::utostr(P.first), GVSums);
     }
   }
   static void fixAliaseeLinks(GlobalValueSummaryMapTy &V) {
@@ -313,12 +313,12 @@ template <> struct CustomMappingTraits<GlobalValueSummaryMapTy> {
 template <> struct CustomMappingTraits<TypeIdSummaryMapTy> {
   static void inputOne(IO &io, StringRef Key, TypeIdSummaryMapTy &V) {
     TypeIdSummary TId;
-    io.mapRequired(Key.str().c_str(), TId);
+    io.mapRequired(Key, TId);
     V.insert({GlobalValue::getGUIDAssumingExternalLinkage(Key), {Key, TId}});
   }
   static void output(IO &io, TypeIdSummaryMapTy &V) {
     for (auto &TidIter : V)
-      io.mapRequired(TidIter.second.first.str().c_str(), TidIter.second.second);
+      io.mapRequired(TidIter.second.first, TidIter.second.second);
   }
 };
 
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td b/llvm/include/llvm/IR/RuntimeLibcalls.td
index 7be1b654ca727..24c1b035d0dda 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.td
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.td
@@ -1585,7 +1585,7 @@ def __aeabi_f2ulz : RuntimeLibcallImpl<FPTOUINT_F32_I64>; // CallingConv::ARM_AA
 // RTABI chapter 4.1.2, Table 7
 def __aeabi_d2f : RuntimeLibcallImpl<FPROUND_F64_F32>; // CallingConv::ARM_AAPCS
 def __aeabi_d2h : RuntimeLibcallImpl<FPROUND_F64_F16>; // CallingConv::ARM_AAPCS
-def  __aeabi_f2d : RuntimeLibcallImpl<FPEXT_F32_F64>; // CallingConv::ARM_AAPCS
+def __aeabi_f2d : RuntimeLibcallImpl<FPEXT_F32_F64>; // CallingConv::ARM_AAPCS
 
 // Integer to floating-point conversions.
 // RTABI chapter 4.1.2, Table 8
diff --git a/llvm/include/llvm/IR/TrackingMDRef.h b/llvm/include/llvm/IR/TrackingMDRef.h
index d7377398b91b3..7ad7225d076fc 100644
--- a/llvm/include/llvm/IR/TrackingMDRef.h
+++ b/llvm/include/llvm/IR/TrackingMDRef.h
@@ -111,17 +111,14 @@ template <class T> class TypedTrackingMDRef {
   explicit TypedTrackingMDRef(T *MD) : Ref(static_cast<Metadata *>(MD)) {}
 
   TypedTrackingMDRef(TypedTrackingMDRef &&X) : Ref(std::move(X.Ref)) {}
-  TypedTrackingMDRef(const TypedTrackingMDRef &X) : Ref(X.Ref) {}
+  TypedTrackingMDRef(const TypedTrackingMDRef &X) = default;
 
   TypedTrackingMDRef &operator=(TypedTrackingMDRef &&X) {
     Ref = std::move(X.Ref);
     return *this;
   }
 
-  TypedTrackingMDRef &operator=(const TypedTrackingMDRef &X) {
-    Ref = X.Ref;
-    return *this;
-  }
+  TypedTrackingMDRef &operator=(const TypedTrackingMDRef &X) = default;
 
   T *get() const { return (T *)Ref.get(); }
   operator T *() const { return get(); }
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 581b4ad161daa..c8196d8a7ef48 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -90,7 +90,6 @@ LLVM_ABI void initializeDSELegacyPassPass(PassRegistry &);
 LLVM_ABI void initializeDXILMetadataAnalysisWrapperPassPass(PassRegistry &);
 LLVM_ABI void initializeDXILMetadataAnalysisWrapperPrinterPass(PassRegistry &);
 LLVM_ABI void initializeDXILResourceBindingWrapperPassPass(PassRegistry &);
-LLVM_ABI void initializeDXILResourceImplicitBindingLegacyPass(PassRegistry &);
 LLVM_ABI void initializeDXILResourceTypeWrapperPassPass(PassRegistry &);
 LLVM_ABI void initializeDXILResourceWrapperPassPass(PassRegistry &);
 LLVM_ABI void initializeDeadMachineInstructionElimPass(PassRegistry &);
diff --git a/llvm/include/llvm/MC/MCAssembler.h b/llvm/include/llvm/MC/MCAssembler.h
index 6e1d6421b8d33..bbb8bee515258 100644
--- a/llvm/include/llvm/MC/MCAssembler.h
+++ b/llvm/include/llvm/MC/MCAssembler.h
@@ -198,8 +198,8 @@ class MCAssembler {
   const_iterator end() const { return Sections.end(); }
 
   SmallVectorImpl<const MCSymbol *> &getSymbols() { return Symbols; }
-  iterator_range<pointee_iterator<
-      typename SmallVector<const MCSymbol *, 0>::const_iterator>>
+  iterator_range<
+      pointee_iterator<SmallVector<const MCSymbol *, 0>::const_iterator>>
   symbols() const {
     return make_pointee_range(Symbols);
   }
diff --git a/llvm/include/llvm/MC/MCParser/MCAsmParser.h b/llvm/include/llvm/MC/MCParser/MCAsmParser.h
index e3f44a08db641..5d74b76592df9 100644
--- a/llvm/include/llvm/MC/MCParser/MCAsmParser.h
+++ b/llvm/include/llvm/MC/MCParser/MCAsmParser.h
@@ -209,28 +209,25 @@ class LLVM_ABI MCAsmParser {
       MCInstPrinter *IP, MCAsmParserSemaCallback &SI) = 0;
 
   /// Emit a note at the location \p L, with the message \p Msg.
-  virtual void Note(SMLoc L, const Twine &Msg,
-                    SMRange Range = std::nullopt) = 0;
+  virtual void Note(SMLoc L, const Twine &Msg, SMRange Range = {}) = 0;
 
   /// Emit a warning at the location \p L, with the message \p Msg.
   ///
   /// \return The return value is true, if warnings are fatal.
-  virtual bool Warning(SMLoc L, const Twine &Msg,
-                       SMRange Range = std::nullopt) = 0;
+  virtual bool Warning(SMLoc L, const Twine &Msg, SMRange Range = {}) = 0;
 
   /// Return an error at the location \p L, with the message \p Msg. This
   /// may be modified before being emitted.
   ///
   /// \return The return value is always true, as an idiomatic convenience to
   /// clients.
-  bool Error(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt);
+  bool Error(SMLoc L, const Twine &Msg, SMRange Range = {});
 
   /// Emit an error at the location \p L, with the message \p Msg.
   ///
   /// \return The return value is always true, as an idiomatic convenience to
   /// clients.
-  virtual bool printError(SMLoc L, const Twine &Msg,
-                          SMRange Range = std::nullopt) = 0;
+  virtual bool printError(SMLoc L, const Twine &Msg, SMRange Range = {}) = 0;
 
   bool hasPendingError() { return !PendingErrors.empty(); }
 
@@ -255,7 +252,7 @@ class LLVM_ABI MCAsmParser {
   const AsmToken &getTok() const;
 
   /// Report an error at the current lexer location.
-  bool TokError(const Twine &Msg, SMRange Range = std::nullopt);
+  bool TokError(const Twine &Msg, SMRange Range = {});
 
   bool parseTokenLoc(SMLoc &Loc);
   bool parseToken(AsmToken::TokenKind T, const Twine &Msg = "unexpected token");
diff --git a/llvm/include/llvm/MC/MCRegisterInfo.h b/llvm/include/llvm/MC/MCRegisterInfo.h
index e6fc7077a2dc3..f611edd715398 100644
--- a/llvm/include/llvm/MC/MCRegisterInfo.h
+++ b/llvm/include/llvm/MC/MCRegisterInfo.h
@@ -272,7 +272,7 @@ class LLVM_ABI MCRegisterInfo {
   friend class MCRegUnitRootIterator;
   friend class MCRegAliasIterator;
 
-  virtual ~MCRegisterInfo() {}
+  virtual ~MCRegisterInfo() = default;
 
   /// Initialize MCRegisterInfo, called by TableGen
   /// auto-generated routines. *DO NOT USE*.
diff --git a/llvm/include/llvm/MCA/SourceMgr.h b/llvm/include/llvm/MCA/SourceMgr.h
index 16a60d1116ad6..300961cbfcd69 100644
--- a/llvm/include/llvm/MCA/SourceMgr.h
+++ b/llvm/include/llvm/MCA/SourceMgr.h
@@ -50,7 +50,7 @@ struct SourceMgr {
   /// Advance to the next \a SourceRef.
   virtual void updateNext() = 0;
 
-  virtual ~SourceMgr() {}
+  virtual ~SourceMgr() = default;
 };
 
 /// The default implementation of \a SourceMgr. It always takes a fixed number
diff --git a/llvm/include/llvm/ObjCopy/ConfigManager.h b/llvm/include/llvm/ObjCopy/ConfigManager.h
index 15687998820c5..45f847ff7c434 100644
--- a/llvm/include/llvm/ObjCopy/ConfigManager.h
+++ b/llvm/include/llvm/ObjCopy/ConfigManager.h
@@ -23,7 +23,7 @@ namespace llvm {
 namespace objcopy {
 
 struct LLVM_ABI ConfigManager : public MultiFormatConfig {
-  ~ConfigManager() override {}
+  ~ConfigManager() override = default;
 
   const CommonConfig &getCommonConfig() const override { return Common; }
 
diff --git a/llvm/include/llvm/ObjCopy/MultiFormatConfig.h b/llvm/include/llvm/ObjCopy/MultiFormatConfig.h
index bb93f64aa2788..91baf9b286c58 100644
--- a/llvm/include/llvm/ObjCopy/MultiFormatConfig.h
+++ b/llvm/include/llvm/ObjCopy/MultiFormatConfig.h
@@ -24,7 +24,7 @@ struct DXContainerConfig;
 
 class MultiFormatConfig {
 public:
-  virtual ~MultiFormatConfig() {}
+  virtual ~MultiFormatConfig() = default;
 
   virtual const CommonConfig &getCommonConfig() const = 0;
   virtual Expected<const ELFConfig &> getELFConfig() const = 0;
diff --git a/llvm/include/llvm/Object/ELF.h b/llvm/include/llvm/Object/ELF.h
index 59f63eb6b5bb6..cc1e5f9dcb9da 100644
--- a/llvm/include/llvm/Object/ELF.h
+++ b/llvm/include/llvm/Object/ELF.h
@@ -261,6 +261,8 @@ class ELFFile {
   ELFFile(const ELFFile &) = default;
   ELFFile &operator=(const ELFFile &) = default;
 
+  ELFFile(ELFFile &&) = default;
+
   // This is a callback that can be passed to a number of functions.
   // It can be used to ignore non-critical errors (warnings), which is
   // useful for dumpers, like llvm-readobj.
@@ -278,9 +280,46 @@ class ELFFile {
   std::vector<Elf_Shdr> FakeSections;
   SmallString<0> FakeSectionStrings;
 
+  // When the number of program headers is >= PN_XNUM, the actual number is
+  // contained in the sh_info field of the section header at index 0.
+  std::optional<uint32_t> RealPhNum;
+  // When the number of section headers is >= SHN_LORESERVE, the actual number
+  // is contained in the sh_size field of the section header at index 0.
+  std::optional<uint64_t> RealShNum;
+  // When the section index of the section name table is >= SHN_LORESERVE, the
+  // actual number is contained in the sh_link field of the section header at
+  // index 0.
+  std::optional<uint32_t> RealShStrNdx;
+
   ELFFile(StringRef Object);
 
+  Error readShdrZero();
+
 public:
+  Expected<uint32_t> getPhNum() const {
+    if (!RealPhNum) {
+      if (Error E = const_cast<ELFFile<ELFT> *>(this)->readShdrZero())
+        return std::move(E);
+    }
+    return *RealPhNum;
+  }
+
+  Expected<uint64_t> getShNum() const {
+    if (!RealShNum) {
+      if (Error E = const_cast<ELFFile<ELFT> *>(this)->readShdrZero())
+        return std::move(E);
+    }
+    return *RealShNum;
+  }
+
+  Expected<uint32_t> getShStrNdx() const {
+    if (!RealShStrNdx) {
+      if (Error E = const_cast<ELFFile<ELFT> *>(this)->readShdrZero())
+        return std::move(E);
+    }
+    return *RealShStrNdx;
+  }
+
   const Elf_Ehdr &getHeader() const {
     return *reinterpret_cast<const Elf_Ehdr *>(base());
   }
@@ -379,22 +418,26 @@ class ELFFile {
 
   /// Iterate over program header table.
   Expected<Elf_Phdr_Range> program_headers() const {
-    if (getHeader().e_phnum && getHeader().e_phentsize != sizeof(Elf_Phdr))
+    uint32_t NumPh;
+    if (Expected<uint32_t> PhNumOrErr = getPhNum())
+      NumPh = *PhNumOrErr;
+    else
+      return PhNumOrErr.takeError();
+    if (NumPh && getHeader().e_phentsize != sizeof(Elf_Phdr))
       return createError("invalid e_phentsize: " +
                          Twine(getHeader().e_phentsize));
 
-    uint64_t HeadersSize =
-        (uint64_t)getHeader().e_phnum * getHeader().e_phentsize;
+    uint64_t HeadersSize = (uint64_t)NumPh * getHeader().e_phentsize;
     uint64_t PhOff = getHeader().e_phoff;
     if (PhOff + HeadersSize < PhOff || PhOff + HeadersSize > getBufSize())
       return createError("program headers are longer than binary of size " +
                          Twine(getBufSize()) + ": e_phoff = 0x" +
                          Twine::utohexstr(getHeader().e_phoff) +
-                         ", e_phnum = " + Twine(getHeader().e_phnum) +
+                         ", e_phnum = " + Twine(NumPh) +
                          ", e_phentsize = " + Twine(getHeader().e_phentsize));
 
     auto *Begin = reinterpret_cast<const Elf_Phdr *>(base() + PhOff);
-    return ArrayRef(Begin, Begin + getHeader().e_phnum);
+    return ArrayRef(Begin, Begin + NumPh);
   }
 
   /// Get an iterator over notes in a program header.
@@ -772,19 +815,15 @@ template <class ELFT>
 Expected<StringRef>
 ELFFile<ELFT>::getSectionStringTable(Elf_Shdr_Range Sections,
                                      WarningHandler WarnHandler) const {
-  uint32_t Index = getHeader().e_shstrndx;
-  if (Index == ELF::SHN_XINDEX) {
-    // If the section name string table section index is greater than
-    // or equal to SHN_LORESERVE, then the actual index of the section name
-    // string table section is contained in the sh_link field of the section
-    // header at index 0.
-    if (Sections.empty())
-      return createError(
-          "e_shstrndx == SHN_XINDEX, but the section header table is empty");
+  Expected<uint32_t> ShStrNdxOrErr = getShStrNdx();
+  if (!ShStrNdxOrErr)
+    return ShStrNdxOrErr.takeError();
 
-    Index = Sections[0].sh_link;
-  }
+  if (*ShStrNdxOrErr == ELF::SHN_XINDEX && Sections.empty())
+    return createError(
+        "e_shstrndx == SHN_XINDEX, but the section header table is empty");
 
+  uint32_t Index = *ShStrNdxOrErr;
   // There is no section name string table. Return FakeSectionStrings which
   // is non-empty if we have created fake sections.
   if (!Index)
@@ -891,6 +930,35 @@ Expected<uint64_t> ELFFile<ELFT>::getDynSymtabSize() const {
 
 template <class ELFT> ELFFile<ELFT>::ELFFile(StringRef Object) : Buf(Object) {}
 
+template <class ELFT> Error ELFFile<ELFT>::readShdrZero() {
+  const Elf_Ehdr &Header = getHeader();
+
+  if ((Header.e_phnum == ELF::PN_XNUM || Header.e_shnum == 0 ||
+       Header.e_shstrndx == ELF::SHN_XINDEX) &&
+      Header.e_shoff != 0) {
+    // Pretend we have section 0 or sections() would call getShNum and thus
+    // become an infinite recursion.
+    RealShNum = 1;
+    auto SecOrErr = getSection(0);
+    if (!SecOrErr) {
+      RealShNum = std::nullopt;
+      return SecOrErr.takeError();
+    }
+
+    RealPhNum =
+        Header.e_phnum == ELF::PN_XNUM ? (*SecOrErr)->sh_info : Header.e_phnum;
+    RealShNum = Header.e_shnum == 0 ? (*SecOrErr)->sh_size : Header.e_shnum;
+    RealShStrNdx = Header.e_shstrndx == ELF::SHN_XINDEX ? (*SecOrErr)->sh_link
+                                                        : Header.e_shstrndx;
+  } else {
+    RealPhNum = Header.e_phnum;
+    RealShNum = Header.e_shnum;
+    RealShStrNdx = Header.e_shstrndx;
+  }
+
+  return Error::success();
+}
+
 template <class ELFT>
 Expected<ELFFile<ELFT>> ELFFile<ELFT>::create(StringRef Object) {
   if (sizeof(Elf_Ehdr) > Object.size())
@@ -956,9 +1024,11 @@ Expected<typename ELFT::ShdrRange> ELFFile<ELFT>::sections() const {
   const Elf_Shdr *First =
       reinterpret_cast<const Elf_Shdr *>(base() + SectionTableOffset);
 
-  uintX_t NumSections = getHeader().e_shnum;
-  if (NumSections == 0)
-    NumSections = First->sh_size;
+  uintX_t NumSections = 0;
+  if (Expected<uint64_t> ShNumOrErr = getShNum())
+    NumSections = *ShNumOrErr;
+  else
+    return ShNumOrErr.takeError();
 
   if (NumSections > UINT64_MAX / sizeof(Elf_Shdr))
     return createError("invalid number of sections specified in the NULL "
diff --git a/llvm/include/llvm/Object/ELFObjectFile.h b/llvm/include/llvm/Object/ELFObjectFile.h
index ced1afdd4cc6a..ca4135742bf6b 100644
--- a/llvm/include/llvm/Object/ELFObjectFile.h
+++ b/llvm/include/llvm/Object/ELFObjectFile.h
@@ -1218,12 +1218,12 @@ ELFObjectFile<ELFT>::ELFObjectFile(MemoryBufferRef Object, ELFFile<ELFT> EF,
     : ELFObjectFileBase(getELFType(ELFT::Endianness == llvm::endianness::little,
                                    ELFT::Is64Bits),
                         Object),
-      EF(EF), DotDynSymSec(DotDynSymSec), DotSymtabSec(DotSymtabSec),
+      EF(std::move(EF)), DotDynSymSec(DotDynSymSec), DotSymtabSec(DotSymtabSec),
       DotSymtabShndxSec(DotSymtabShndx) {}
 
 template <class ELFT>
 ELFObjectFile<ELFT>::ELFObjectFile(ELFObjectFile<ELFT> &&Other)
-    : ELFObjectFile(Other.Data, Other.EF, Other.DotDynSymSec,
+    : ELFObjectFile(Other.Data, std::move(Other.EF), Other.DotDynSymSec,
                     Other.DotSymtabSec, Other.DotSymtabShndxSec) {}
 
 template <class ELFT>
diff --git a/llvm/include/llvm/Object/ELFTypes.h b/llvm/include/llvm/Object/ELFTypes.h
index e9a417d3d4fb3..467ab6fd3c1e9 100644
--- a/llvm/include/llvm/Object/ELFTypes.h
+++ b/llvm/include/llvm/Object/ELFTypes.h
@@ -834,30 +834,32 @@ struct BBAddrMap {
     bool OmitBBEntries : 1;
     bool CallsiteEndOffsets : 1;
     bool BBHash : 1;
+    bool PostLinkCfg : 1;
 
     bool hasPGOAnalysis() const { return FuncEntryCount || BBFreq || BrProb; }
 
     bool hasPGOAnalysisBBData() const { return BBFreq || BrProb; }
 
     // Encodes to minimum bit width representation.
-    uint8_t encode() const {
-      return (static_cast<uint8_t>(FuncEntryCount) << 0) |
-             (static_cast<uint8_t>(BBFreq) << 1) |
-             (static_cast<uint8_t>(BrProb) << 2) |
-             (static_cast<uint8_t>(MultiBBRange) << 3) |
-             (static_cast<uint8_t>(OmitBBEntries) << 4) |
-             (static_cast<uint8_t>(CallsiteEndOffsets) << 5) |
-             (static_cast<uint8_t>(BBHash) << 6);
+    uint16_t encode() const {
+      return (static_cast<uint16_t>(FuncEntryCount) << 0) |
+             (static_cast<uint16_t>(BBFreq) << 1) |
+             (static_cast<uint16_t>(BrProb) << 2) |
+             (static_cast<uint16_t>(MultiBBRange) << 3) |
+             (static_cast<uint16_t>(OmitBBEntries) << 4) |
+             (static_cast<uint16_t>(CallsiteEndOffsets) << 5) |
+             (static_cast<uint16_t>(BBHash) << 6) |
+             (static_cast<uint16_t>(PostLinkCfg) << 7);
     }
 
     // Decodes from minimum bit width representation and validates no
     // unnecessary bits are used.
-    static Expected<Features> decode(uint8_t Val) {
+    static Expected<Features> decode(uint16_t Val) {
       Features Feat{
           static_cast<bool>(Val & (1 << 0)), static_cast<bool>(Val & (1 << 1)),
           static_cast<bool>(Val & (1 << 2)), static_cast<bool>(Val & (1 << 3)),
           static_cast<bool>(Val & (1 << 4)), static_cast<bool>(Val & (1 << 5)),
-          static_cast<bool>(Val & (1 << 6))};
+          static_cast<bool>(Val & (1 << 6)), static_cast<bool>(Val & (1 << 7))};
       if (Feat.encode() != Val)
         return createStringError(
             std::error_code(), "invalid encoding for BBAddrMap::Features: 0x%x",
@@ -867,10 +869,11 @@ struct BBAddrMap {
 
     bool operator==(const Features &Other) const {
       return std::tie(FuncEntryCount, BBFreq, BrProb, MultiBBRange,
-                      OmitBBEntries, CallsiteEndOffsets, BBHash) ==
+                      OmitBBEntries, CallsiteEndOffsets, BBHash, PostLinkCfg) ==
              std::tie(Other.FuncEntryCount, Other.BBFreq, Other.BrProb,
                       Other.MultiBBRange, Other.OmitBBEntries,
-                      Other.CallsiteEndOffsets, Other.BBHash);
+                      Other.CallsiteEndOffsets, Other.BBHash,
+                      Other.PostLinkCfg);
     }
   };
 
@@ -1010,23 +1013,30 @@ struct PGOAnalysisMap {
     /// probability associated with it.
     struct SuccessorEntry {
       /// Unique ID of this successor basic block.
-      uint32_t ID;
+      uint32_t ID = 0;
       /// Branch Probability of the edge to this successor taken from MBPI.
       BranchProbability Prob;
+      /// Raw edge count from the post link profile (e.g., from bolt or
+      /// propeller).
+      uint64_t PostLinkFreq = 0;
 
       bool operator==(const SuccessorEntry &Other) const {
-        return std::tie(ID, Prob) == std::tie(Other.ID, Other.Prob);
+        return std::tie(ID, Prob, PostLinkFreq) ==
+               std::tie(Other.ID, Other.Prob, Other.PostLinkFreq);
       }
     };
 
     /// Block frequency taken from MBFI
     BlockFrequency BlockFreq;
+    /// Raw block count taken from the post link profile (e.g., from bolt or
+    /// propeller).
+    uint64_t PostLinkBlockFreq = 0;
     /// List of successors of the current block
     llvm::SmallVector<SuccessorEntry, 2> Successors;
 
     bool operator==(const PGOBBEntry &Other) const {
-      return std::tie(BlockFreq, Successors) ==
-             std::tie(Other.BlockFreq, Other.Successors);
+      return std::tie(BlockFreq, PostLinkBlockFreq, Successors) ==
+             std::tie(Other.BlockFreq, PostLinkBlockFreq, Other.Successors);
     }
   };
 
diff --git a/llvm/include/llvm/Object/SFrameParser.h b/llvm/include/llvm/Object/SFrameParser.h
index 3ce5d70142a9f..23298357191b3 100644
--- a/llvm/include/llvm/Object/SFrameParser.h
+++ b/llvm/include/llvm/Object/SFrameParser.h
@@ -90,7 +90,7 @@ template <endianness E> class SFrameParser<E>::FallibleFREIterator {
                       uint32_t Idx, uint32_t Size, uint64_t Offset)
       : Data(Data), FREType(FREType), Idx(Idx), Size(Size), Offset(Offset) {}
 
-  Error inc();
+  LLVM_ABI Error inc();
   const FrameRowEntry &operator*() const { return FRE; }
 
   friend bool operator==(const FallibleFREIterator &LHS,
diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h
index b5b110d0f59a1..fbfe3069566d3 100644
--- a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h
+++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h
@@ -115,7 +115,7 @@ struct RootParameterHeaderYaml {
   dxbc::ShaderVisibility Visibility;
   uint32_t Offset;
 
-  RootParameterHeaderYaml(){};
+  RootParameterHeaderYaml() = default;
   RootParameterHeaderYaml(dxbc::RootParameterType T) : Type(T) {}
 };
 
@@ -123,7 +123,7 @@ struct RootParameterLocationYaml {
   RootParameterHeaderYaml Header;
   std::optional<size_t> IndexInSignature;
 
-  RootParameterLocationYaml(){};
+  RootParameterLocationYaml() = default;
   explicit RootParameterLocationYaml(RootParameterHeaderYaml Header)
       : Header(Header) {}
 };
diff --git a/llvm/include/llvm/ObjectYAML/ELFYAML.h b/llvm/include/llvm/ObjectYAML/ELFYAML.h
index a7c7c7c436dc2..a8236ca37b5ed 100644
--- a/llvm/include/llvm/ObjectYAML/ELFYAML.h
+++ b/llvm/include/llvm/ObjectYAML/ELFYAML.h
@@ -166,7 +166,7 @@ struct BBAddrMapEntry {
     std::optional<llvm::yaml::Hex64> Hash;
   };
   uint8_t Version;
-  llvm::yaml::Hex8 Feature;
+  llvm::yaml::Hex16 Feature;
 
   struct BBRangeEntry {
     llvm::yaml::Hex64 BaseAddress;
@@ -203,8 +203,10 @@ struct PGOAnalysisMapEntry {
     struct SuccessorEntry {
       uint32_t ID;
       llvm::yaml::Hex32 BrProb;
+      std::optional<uint32_t> PostLinkBrFreq;
     };
     std::optional<uint64_t> BBFreq;
+    std::optional<uint32_t> PostLinkBBFreq;
     std::optional<std::vector<SuccessorEntry>> Successors;
   };
   std::optional<uint64_t> FuncEntryCount;
diff --git a/llvm/include/llvm/ProfileData/DataAccessProf.h b/llvm/include/llvm/ProfileData/DataAccessProf.h
index 608306f02be66..ea256ef7b170b 100644
--- a/llvm/include/llvm/ProfileData/DataAccessProf.h
+++ b/llvm/include/llvm/ProfileData/DataAccessProf.h
@@ -42,7 +42,7 @@ struct SourceLocation {
       : FileName(FileNameRef.str()), Line(Line) {}
 
   // Empty constructor is used in yaml conversion.
-  SourceLocation() {}
+  SourceLocation() = default;
   /// The filename where the data is located.
   std::string FileName;
   /// The line number in the source code.
diff --git a/llvm/include/llvm/ProfileData/MemProfYAML.h b/llvm/include/llvm/ProfileData/MemProfYAML.h
index d66e16dda51d6..c55f7806d73a6 100644
--- a/llvm/include/llvm/ProfileData/MemProfYAML.h
+++ b/llvm/include/llvm/ProfileData/MemProfYAML.h
@@ -141,7 +141,7 @@ template <> struct CustomMappingTraits<memprof::PortableMemInfoBlock> {
 #define MIBEntryDef(NameTag, Name, Type)                                       \
   if (KeyStr == #Name) {                                                       \
     uint64_t Value;                                                            \
-    Io.mapRequired(KeyStr.str().c_str(), Value);                               \
+    Io.mapRequired(KeyStr, Value);                                             \
     MIB.Name = static_cast<Type>(Value);                                       \
     MIB.Schema.set(llvm::to_underlying(memprof::Meta::Name));                  \
     return;                                                                    \
diff --git a/llvm/include/llvm/SandboxIR/Context.h b/llvm/include/llvm/SandboxIR/Context.h
index 7d8b2c86e94a7..a8966db29ab26 100644
--- a/llvm/include/llvm/SandboxIR/Context.h
+++ b/llvm/include/llvm/SandboxIR/Context.h
@@ -51,7 +51,7 @@ class Context {
     // Uses a 64-bit integer so we don't have to worry about the unlikely case
     // of overflowing a 32-bit counter.
     using ValTy = uint64_t;
-    static constexpr const ValTy InvalidVal = 0;
+    static constexpr ValTy InvalidVal = 0;
 
   private:
     // Default initialization results in an invalid ID.
diff --git a/llvm/include/llvm/SandboxIR/Instruction.h b/llvm/include/llvm/SandboxIR/Instruction.h
index e1c1ca039a8a0..5e369a482be57 100644
--- a/llvm/include/llvm/SandboxIR/Instruction.h
+++ b/llvm/include/llvm/SandboxIR/Instruction.h
@@ -1866,7 +1866,7 @@ class SwitchInst : public SingleLLVMInstructionImpl<llvm::SwitchInst> {
   friend class Context; // For accessing the constructor in create*()
 
 public:
-  static constexpr const unsigned DefaultPseudoIndex =
+  static constexpr unsigned DefaultPseudoIndex =
       llvm::SwitchInst::DefaultPseudoIndex;
 
   LLVM_ABI static SwitchInst *create(Value *V, BasicBlock *Dest,
diff --git a/llvm/include/llvm/SandboxIR/Pass.h b/llvm/include/llvm/SandboxIR/Pass.h
index 267389a8a87a2..eb84f21483f8e 100644
--- a/llvm/include/llvm/SandboxIR/Pass.h
+++ b/llvm/include/llvm/SandboxIR/Pass.h
@@ -56,7 +56,7 @@ class Pass {
            "A pass name should not contain whitespaces!");
     assert(!Name.starts_with('-') && "A pass name should not start with '-'!");
   }
-  virtual ~Pass() {}
+  virtual ~Pass() = default;
   /// \Returns the name of the pass.
   StringRef getName() const { return Name; }
 #ifndef NDEBUG
diff --git a/llvm/include/llvm/SandboxIR/PassManager.h b/llvm/include/llvm/SandboxIR/PassManager.h
index 93ca710805dd4..a8117aa3b9fa8 100644
--- a/llvm/include/llvm/SandboxIR/PassManager.h
+++ b/llvm/include/llvm/SandboxIR/PassManager.h
@@ -59,10 +59,10 @@ class PassManager : public ParentPass {
     Passes.push_back(std::move(Pass));
   }
 
-  static constexpr const char EndToken = '\0';
-  static constexpr const char BeginArgsToken = '<';
-  static constexpr const char EndArgsToken = '>';
-  static constexpr const char PassDelimToken = ',';
+  static constexpr char EndToken = '\0';
+  static constexpr char BeginArgsToken = '<';
+  static constexpr char EndArgsToken = '>';
+  static constexpr char PassDelimToken = ',';
 
   /// Parses \p Pipeline as a comma-separated sequence of pass names and sets
   /// the pass pipeline, using \p CreatePass to instantiate passes by name.
diff --git a/llvm/include/llvm/Support/AutoConvert.h b/llvm/include/llvm/Support/AutoConvert.h
index 1e6792636e169..15f1ec8af6c57 100644
--- a/llvm/include/llvm/Support/AutoConvert.h
+++ b/llvm/include/llvm/Support/AutoConvert.h
@@ -18,6 +18,7 @@
 #include <_Ccsid.h>
 #endif
 #ifdef __cplusplus
+#include "llvm/ADT/Twine.h"
 #include "llvm/Support/Error.h"
 #include <system_error>
 #endif /* __cplusplus */
@@ -47,12 +48,12 @@ namespace llvm {
 std::error_code setzOSFileTag(int FD, int CCSID, bool Text);
 
 /** \brief Get the the tag ccsid for a file name or a file descriptor. */
-ErrorOr<__ccsid_t> getzOSFileTag(const char *FileName, const int FD = -1);
+ErrorOr<__ccsid_t> getzOSFileTag(const Twine &FileName, const int FD = -1);
 
 /** \brief Query the file tag to determine if it needs conversion to UTF-8
  *  codepage.
  */
-ErrorOr<bool> needzOSConversion(const char *FileName, const int FD = -1);
+ErrorOr<bool> needzOSConversion(const Twine &FileName, const int FD = -1);
 
 #endif /* __MVS__*/
 
@@ -87,7 +88,7 @@ inline std::error_code setFileTag(int FD, int CCSID, bool Text) {
   return std::error_code();
 }
 
-inline ErrorOr<bool> needConversion(const char *FileName, const int FD = -1) {
+inline ErrorOr<bool> needConversion(const Twine &FileName, const int FD = -1) {
 #ifdef __MVS__
   return needzOSConversion(FileName, FD);
 #endif
diff --git a/llvm/include/llvm/Support/BranchProbability.h b/llvm/include/llvm/Support/BranchProbability.h
index 42fe225709ef8..b15d6e1707afa 100644
--- a/llvm/include/llvm/Support/BranchProbability.h
+++ b/llvm/include/llvm/Support/BranchProbability.h
@@ -97,6 +97,9 @@ class BranchProbability {
   /// \return \c Num divided by \c this.
   LLVM_ABI uint64_t scaleByInverse(uint64_t Num) const;
 
+  /// Compute pow(Probability, N).
+  BranchProbability pow(unsigned N) const;
+
   BranchProbability &operator+=(BranchProbability RHS) {
     assert(N != UnknownN && RHS.N != UnknownN &&
            "Unknown probability cannot participate in arithmetics.");
diff --git a/llvm/include/llvm/Support/CommandLine.h b/llvm/include/llvm/Support/CommandLine.h
index 5a5f00e844705..d737fbcf891b3 100644
--- a/llvm/include/llvm/Support/CommandLine.h
+++ b/llvm/include/llvm/Support/CommandLine.h
@@ -2099,7 +2099,7 @@ getRegisteredOptions(SubCommand &Sub = SubCommand::getTopLevel());
 ///
 /// This interface is useful for defining subcommands in libraries and
 /// the dispatch from a single point (like in the main function).
-LLVM_ABI iterator_range<typename SmallPtrSet<SubCommand *, 4>::iterator>
+LLVM_ABI iterator_range<SmallPtrSet<SubCommand *, 4>::iterator>
 getRegisteredSubcommands();
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/Support/ELFAttributeParser.h b/llvm/include/llvm/Support/ELFAttributeParser.h
index 97350edb793c9..c2ad812b5d632 100644
--- a/llvm/include/llvm/Support/ELFAttributeParser.h
+++ b/llvm/include/llvm/Support/ELFAttributeParser.h
@@ -17,7 +17,7 @@ namespace llvm {
 
 class ELFAttributeParser {
 public:
-  virtual ~ELFAttributeParser() {}
+  virtual ~ELFAttributeParser() = default;
 
   virtual Error parse(ArrayRef<uint8_t> Section, llvm::endianness Endian) {
     return llvm::Error::success();
diff --git a/llvm/include/llvm/Support/FormatProviders.h b/llvm/include/llvm/Support/FormatProviders.h
index 8eaa5e382c73e..3377781873b8c 100644
--- a/llvm/include/llvm/Support/FormatProviders.h
+++ b/llvm/include/llvm/Support/FormatProviders.h
@@ -261,7 +261,7 @@ template <> struct format_provider<bool> {
                   .Case("y", B ? "yes" : "no")
                   .CaseLower("D", B ? "1" : "0")
                   .Case("T", B ? "TRUE" : "FALSE")
-                  .Cases("t", "", B ? "true" : "false")
+                  .Cases({"t", ""}, B ? "true" : "false")
                   .Default(B ? "1" : "0");
   }
 };
diff --git a/llvm/include/llvm/Support/FormattedStream.h b/llvm/include/llvm/Support/FormattedStream.h
index 011a6aea238e3..402cd3e3235dc 100644
--- a/llvm/include/llvm/Support/FormattedStream.h
+++ b/llvm/include/llvm/Support/FormattedStream.h
@@ -180,7 +180,8 @@ class LLVM_ABI formatted_raw_ostream : public raw_ostream {
     return *this;
   }
 
-  raw_ostream &changeColor(enum Colors Color, bool Bold, bool BG) override {
+  raw_ostream &changeColor(enum Colors Color, bool Bold = false,
+                           bool BG = false) override {
     if (colors_enabled()) {
       DisableScanScope S(this);
       raw_ostream::changeColor(Color, Bold, BG);
diff --git a/llvm/include/llvm/Support/GenericLoopInfo.h b/llvm/include/llvm/Support/GenericLoopInfo.h
index 2775a8734dd47..b6bb360d9868f 100644
--- a/llvm/include/llvm/Support/GenericLoopInfo.h
+++ b/llvm/include/llvm/Support/GenericLoopInfo.h
@@ -615,6 +615,17 @@ template <class BlockT, class LoopT> class LoopInfoBase {
     return L ? L->getLoopDepth() : 0;
   }
 
+  /// \brief Find the innermost loop containing both given loops.
+  ///
+  /// \returns the innermost loop containing both \p A and \p B
+  ///          or nullptr if there is no such loop.
+  LoopT *getSmallestCommonLoop(LoopT *A, LoopT *B) const;
+  /// \brief Find the innermost loop containing both given blocks.
+  ///
+  /// \returns the innermost loop containing both \p A and \p B
+  ///          or nullptr if there is no such loop.
+  LoopT *getSmallestCommonLoop(BlockT *A, BlockT *B) const;
+
   // True if the block is a loop header node
   bool isLoopHeader(const BlockT *BB) const {
     const LoopT *L = getLoopFor(BB);
diff --git a/llvm/include/llvm/Support/GenericLoopInfoImpl.h b/llvm/include/llvm/Support/GenericLoopInfoImpl.h
index 6fc508b0e0cca..541678001a8ff 100644
--- a/llvm/include/llvm/Support/GenericLoopInfoImpl.h
+++ b/llvm/include/llvm/Support/GenericLoopInfoImpl.h
@@ -355,7 +355,7 @@ void LoopBase<BlockT, LoopT>::verifyLoop() const {
     if (BB == getHeader()) {
       assert(!OutsideLoopPreds.empty() && "Loop is unreachable!");
     } else if (!OutsideLoopPreds.empty()) {
-      // A non-header loop shouldn't be reachable from outside the loop,
+      // A non-header loop block shouldn't be reachable from outside the loop,
       // though it is permitted if the predecessor is not itself actually
       // reachable.
       BlockT *EntryBB = &BB->getParent()->front();
@@ -645,6 +645,36 @@ LoopInfoBase<BlockT, LoopT>::getLoopsInReverseSiblingPreorder() const {
   return PreOrderLoops;
 }
 
+template <class BlockT, class LoopT>
+LoopT *LoopInfoBase<BlockT, LoopT>::getSmallestCommonLoop(LoopT *A,
+                                                          LoopT *B) const {
+  if (!A || !B)
+    return nullptr;
+
+  // If lops A and B have different depth replace them with parent loop
+  // until they have the same depth.
+  while (A->getLoopDepth() > B->getLoopDepth())
+    A = A->getParentLoop();
+  while (B->getLoopDepth() > A->getLoopDepth())
+    B = B->getParentLoop();
+
+  // Loops A and B are at same depth but may be disjoint, replace them with
+  // parent loops until we find loop that contains both or we run out of
+  // parent loops.
+  while (A != B) {
+    A = A->getParentLoop();
+    B = B->getParentLoop();
+  }
+
+  return A;
+}
+
+template <class BlockT, class LoopT>
+LoopT *LoopInfoBase<BlockT, LoopT>::getSmallestCommonLoop(BlockT *A,
+                                                          BlockT *B) const {
+  return getSmallestCommonLoop(getLoopFor(A), getLoopFor(B));
+}
+
 // Debugging
 template <class BlockT, class LoopT>
 void LoopInfoBase<BlockT, LoopT>::print(raw_ostream &OS) const {
diff --git a/llvm/include/llvm/Support/GraphWriter.h b/llvm/include/llvm/Support/GraphWriter.h
index 3bef75cc7e508..43d9b0cfddef7 100644
--- a/llvm/include/llvm/Support/GraphWriter.h
+++ b/llvm/include/llvm/Support/GraphWriter.h
@@ -128,7 +128,7 @@ template <typename GraphType, typename Derived> class GraphWriterBase {
     DTraits = DOTTraits(SN);
     RenderUsingHTML = DTraits.renderNodesUsingHTML();
   }
-  virtual ~GraphWriterBase() {}
+  virtual ~GraphWriterBase() = default;
 
   void writeGraph(const std::string &Title = "") {
     // Output the header for the graph...
@@ -369,7 +369,7 @@ class GraphWriter : public GraphWriterBase<GraphType, GraphWriter<GraphType>> {
 public:
   GraphWriter(raw_ostream &o, const GraphType &g, bool SN)
       : GraphWriterBase<GraphType, GraphWriter<GraphType>>(o, g, SN) {}
-  ~GraphWriter() override {}
+  ~GraphWriter() override = default;
 };
 
 template <typename GraphType>
diff --git a/llvm/include/llvm/Support/JSON.h b/llvm/include/llvm/Support/JSON.h
index d8c6de49b4bc6..37baa7b45e4eb 100644
--- a/llvm/include/llvm/Support/JSON.h
+++ b/llvm/include/llvm/Support/JSON.h
@@ -154,7 +154,7 @@ class Object {
   LLVM_ABI const json::Array *getArray(StringRef K) const;
   LLVM_ABI json::Array *getArray(StringRef K);
 
-  friend bool operator==(const Object &LHS, const Object &RHS);
+  friend LLVM_ABI bool operator==(const Object &LHS, const Object &RHS);
 };
 LLVM_ABI bool operator==(const Object &LHS, const Object &RHS);
 inline bool operator!=(const Object &LHS, const Object &RHS) {
@@ -318,7 +318,7 @@ class Value {
   Value(std::string V) : Type(T_String) {
     if (LLVM_UNLIKELY(!isUTF8(V))) {
       assert(false && "Invalid UTF-8 in value used as JSON");
-      V = fixUTF8(std::move(V));
+      V = fixUTF8(V);
     }
     create<std::string>(std::move(V));
   }
@@ -549,10 +549,10 @@ inline const Value &Array::back() const { return V.back(); }
 inline Value *Array::data() { return V.data(); }
 inline const Value *Array::data() const { return V.data(); }
 
-inline typename Array::iterator Array::begin() { return V.begin(); }
-inline typename Array::const_iterator Array::begin() const { return V.begin(); }
-inline typename Array::iterator Array::end() { return V.end(); }
-inline typename Array::const_iterator Array::end() const { return V.end(); }
+inline Array::iterator Array::begin() { return V.begin(); }
+inline Array::const_iterator Array::begin() const { return V.begin(); }
+inline Array::iterator Array::end() { return V.end(); }
+inline Array::const_iterator Array::end() const { return V.end(); }
 
 inline bool Array::empty() const { return V.empty(); }
 inline size_t Array::size() const { return V.size(); }
@@ -565,18 +565,18 @@ template <typename... Args> inline void Array::emplace_back(Args &&...A) {
   V.emplace_back(std::forward<Args>(A)...);
 }
 inline void Array::pop_back() { V.pop_back(); }
-inline typename Array::iterator Array::insert(const_iterator P, const Value &E) {
+inline Array::iterator Array::insert(const_iterator P, const Value &E) {
   return V.insert(P, E);
 }
-inline typename Array::iterator Array::insert(const_iterator P, Value &&E) {
+inline Array::iterator Array::insert(const_iterator P, Value &&E) {
   return V.insert(P, std::move(E));
 }
 template <typename It>
-inline typename Array::iterator Array::insert(const_iterator P, It A, It Z) {
+inline Array::iterator Array::insert(const_iterator P, It A, It Z) {
   return V.insert(P, A, Z);
 }
 template <typename... Args>
-inline typename Array::iterator Array::emplace(const_iterator P, Args &&...A) {
+inline Array::iterator Array::emplace(const_iterator P, Args &&...A) {
   return V.emplace(P, std::forward<Args>(A)...);
 }
 inline bool operator==(const Array &L, const Array &R) { return L.V == R.V; }
@@ -591,7 +591,7 @@ class ObjectKey {
   ObjectKey(std::string S) : Owned(new std::string(std::move(S))) {
     if (LLVM_UNLIKELY(!isUTF8(*Owned))) {
       assert(false && "Invalid UTF-8 in value used as JSON");
-      *Owned = fixUTF8(std::move(*Owned));
+      *Owned = fixUTF8(*Owned);
     }
     Data = *Owned;
   }
diff --git a/llvm/include/llvm/Support/LEB128.h b/llvm/include/llvm/Support/LEB128.h
index 898b4ea1f19ab..4e2262fb15c56 100644
--- a/llvm/include/llvm/Support/LEB128.h
+++ b/llvm/include/llvm/Support/LEB128.h
@@ -29,8 +29,7 @@ inline unsigned encodeSLEB128(int64_t Value, raw_ostream &OS,
     uint8_t Byte = Value & 0x7f;
     // NOTE: this assumes that this signed shift is an arithmetic right shift.
     Value >>= 7;
-    More = !((((Value == 0 ) && ((Byte & 0x40) == 0)) ||
-              ((Value == -1) && ((Byte & 0x40) != 0))));
+    More = Value != ((Byte & 0x40) ? -1 : 0);
     Count++;
     if (More || Count < PadTo)
       Byte |= 0x80; // Mark this byte to show that more bytes will follow.
@@ -58,8 +57,7 @@ inline unsigned encodeSLEB128(int64_t Value, uint8_t *p, unsigned PadTo = 0) {
     uint8_t Byte = Value & 0x7f;
     // NOTE: this assumes that this signed shift is an arithmetic right shift.
     Value >>= 7;
-    More = !((((Value == 0 ) && ((Byte & 0x40) == 0)) ||
-              ((Value == -1) && ((Byte & 0x40) != 0))));
+    More = Value != ((Byte & 0x40) ? -1 : 0);
     Count++;
     if (More || Count < PadTo)
       Byte |= 0x80; // Mark this byte to show that more bytes will follow.
diff --git a/llvm/include/llvm/Support/SMLoc.h b/llvm/include/llvm/Support/SMLoc.h
index c80969b1d83dc..b7ae6e488cde9 100644
--- a/llvm/include/llvm/Support/SMLoc.h
+++ b/llvm/include/llvm/Support/SMLoc.h
@@ -15,7 +15,6 @@
 #define LLVM_SUPPORT_SMLOC_H
 
 #include <cassert>
-#include <optional>
 
 namespace llvm {
 
@@ -50,7 +49,6 @@ class SMRange {
   SMLoc Start, End;
 
   SMRange() = default;
-  SMRange(std::nullopt_t) {}
   SMRange(SMLoc St, SMLoc En) : Start(St), End(En) {
     assert(Start.isValid() == End.isValid() &&
            "Start and End should either both be valid or both be invalid!");
diff --git a/llvm/include/llvm/Support/SourceMgr.h b/llvm/include/llvm/Support/SourceMgr.h
index 8320006ff5f6e..43f7e27c26ba1 100644
--- a/llvm/include/llvm/Support/SourceMgr.h
+++ b/llvm/include/llvm/Support/SourceMgr.h
@@ -103,7 +103,7 @@ class SourceMgr {
 
 public:
   /// Create new source manager without support for include files.
-  SourceMgr();
+  LLVM_ABI SourceMgr();
   /// Create new source manager with the capability of finding include files
   /// via the provided file system.
   explicit SourceMgr(IntrusiveRefCntPtr<vfs::FileSystem> FS);
@@ -111,10 +111,10 @@ class SourceMgr {
   SourceMgr &operator=(const SourceMgr &) = delete;
   SourceMgr(SourceMgr &&);
   SourceMgr &operator=(SourceMgr &&);
-  ~SourceMgr();
+  LLVM_ABI ~SourceMgr();
 
   IntrusiveRefCntPtr<vfs::FileSystem> getVirtualFileSystem() const;
-  void setVirtualFileSystem(IntrusiveRefCntPtr<vfs::FileSystem> FS);
+  LLVM_ABI void setVirtualFileSystem(IntrusiveRefCntPtr<vfs::FileSystem> FS);
 
   /// Return the include directories of this source manager.
   ArrayRef<std::string> getIncludeDirs() const { return IncludeDirectories; }
diff --git a/llvm/include/llvm/Support/VirtualFileSystem.h b/llvm/include/llvm/Support/VirtualFileSystem.h
index c8911a0225f86..dbd5a5c137fd1 100644
--- a/llvm/include/llvm/Support/VirtualFileSystem.h
+++ b/llvm/include/llvm/Support/VirtualFileSystem.h
@@ -1116,8 +1116,9 @@ class LLVM_ABI RedirectingFileSystem
 /// Collect all pairs of <virtual path, real path> entries from the
 /// \p VFS. This is used by the module dependency collector to forward
 /// the entries into the reproducer output VFS YAML file.
-void collectVFSEntries(RedirectingFileSystem &VFS,
-                       SmallVectorImpl<YAMLVFSEntry> &CollectedEntries);
+LLVM_ABI void
+collectVFSEntries(RedirectingFileSystem &VFS,
+                  SmallVectorImpl<YAMLVFSEntry> &CollectedEntries);
 
 class YAMLVFSWriter {
   std::vector<YAMLVFSEntry> Mappings;
diff --git a/llvm/include/llvm/Support/VirtualOutputBackend.h b/llvm/include/llvm/Support/VirtualOutputBackend.h
index 85caa021c2aae..78ed4b9b66607 100644
--- a/llvm/include/llvm/Support/VirtualOutputBackend.h
+++ b/llvm/include/llvm/Support/VirtualOutputBackend.h
@@ -32,7 +32,7 @@ namespace llvm::vfs {
 /// If virtual functions are added here, also add them to \a
 /// ProxyOutputBackend.
 class OutputBackend : public RefCountedBase<OutputBackend> {
-  virtual void anchor();
+  LLVM_ABI virtual void anchor();
 
 public:
   /// Get a backend that points to the same destination as this one but that
@@ -47,7 +47,7 @@ class OutputBackend : public RefCountedBase<OutputBackend> {
   /// have been customized).
   ///
   /// Thread-safe.
-  Expected<OutputFile>
+  LLVM_ABI Expected<OutputFile>
   createFile(const Twine &Path,
              std::optional<OutputConfig> Config = std::nullopt);
 
diff --git a/llvm/include/llvm/Support/VirtualOutputBackends.h b/llvm/include/llvm/Support/VirtualOutputBackends.h
index 219bc30cfa6db..13a9611f7613a 100644
--- a/llvm/include/llvm/Support/VirtualOutputBackends.h
+++ b/llvm/include/llvm/Support/VirtualOutputBackends.h
@@ -77,14 +77,14 @@ class ProxyOutputBackend : public OutputBackend {
 
 /// An output backend that creates files on disk, wrapping APIs in sys::fs.
 class OnDiskOutputBackend : public OutputBackend {
-  void anchor() override;
+  LLVM_ABI void anchor() override;
 
 protected:
   IntrusiveRefCntPtr<OutputBackend> cloneImpl() const override {
     return clone();
   }
 
-  Expected<std::unique_ptr<OutputFileImpl>>
+  LLVM_ABI Expected<std::unique_ptr<OutputFileImpl>>
   createFileImpl(StringRef Path, std::optional<OutputConfig> Config) override;
 
 public:
diff --git a/llvm/include/llvm/Support/VirtualOutputError.h b/llvm/include/llvm/Support/VirtualOutputError.h
index 2293ff982a6b4..44590a1fb5ed0 100644
--- a/llvm/include/llvm/Support/VirtualOutputError.h
+++ b/llvm/include/llvm/Support/VirtualOutputError.h
@@ -43,7 +43,7 @@ class OutputError : public ErrorInfo<OutputError, ECError> {
   void log(raw_ostream &OS) const override;
 
   // Used by ErrorInfo::classID.
-  static char ID;
+  LLVM_ABI static char ID;
 
   OutputError(const Twine &OutputPath, std::error_code EC)
       : ErrorInfo<OutputError, ECError>(EC), OutputPath(OutputPath.str()) {
@@ -99,7 +99,7 @@ class TempFileOutputError : public ErrorInfo<TempFileOutputError, OutputError> {
   void log(raw_ostream &OS) const override;
 
   // Used by ErrorInfo::classID.
-  static char ID;
+  LLVM_ABI static char ID;
 
   TempFileOutputError(const Twine &TempPath, const Twine &OutputPath,
                       std::error_code EC)
diff --git a/llvm/include/llvm/Support/VirtualOutputFile.h b/llvm/include/llvm/Support/VirtualOutputFile.h
index dd50437605deb..d53701c130479 100644
--- a/llvm/include/llvm/Support/VirtualOutputFile.h
+++ b/llvm/include/llvm/Support/VirtualOutputFile.h
@@ -80,13 +80,13 @@ class OutputFile {
   ///
   /// If there's an open proxy from \a createProxy(), calls \a discard() to
   /// clean up temporaries followed by \a report_fatal_error().
-  Error keep();
+  LLVM_ABI Error keep();
 
   /// Discard an output, cleaning up any temporary state. Errors if clean-up
   /// fails.
   ///
   /// If it has already been closed, calls \a report_fatal_error().
-  Error discard();
+  LLVM_ABI Error discard();
 
   /// Discard the output when destroying it if it's still open, sending the
   /// result to \a Handler.
@@ -98,7 +98,7 @@ class OutputFile {
   /// producer. Errors if there's already a proxy. The proxy must be deleted
   /// before calling \a keep(). The proxy will crash if it's written to after
   /// calling \a discard().
-  Expected<std::unique_ptr<raw_pwrite_stream>> createProxy();
+  LLVM_ABI Expected<std::unique_ptr<raw_pwrite_stream>> createProxy();
 
   bool hasOpenProxy() const { return OpenProxy; }
 
@@ -132,7 +132,7 @@ class OutputFile {
 private:
   /// Destroy \a Impl. Reports fatal error if the file is open and there's no
   /// handler from \a discardOnDestroy().
-  void destroy();
+  LLVM_ABI void destroy();
   OutputFile &moveFrom(OutputFile &O) {
     Path = std::move(O.Path);
     Impl = std::move(O.Impl);
diff --git a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h
index 4aa6c01d29cc2..6f6f65dc075f3 100644
--- a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h
+++ b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h
@@ -511,7 +511,6 @@ enum OperandEncoding { ENCODINGS ENCODING_max };
   ENUM_ENTRY(TYPE_VK, "mask register")                                         \
   ENUM_ENTRY(TYPE_VK_PAIR, "mask register pair")                               \
   ENUM_ENTRY(TYPE_TMM, "tile")                                                 \
-  ENUM_ENTRY(TYPE_TMM_PAIR, "tile pair")                                       \
   ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand")                      \
   ENUM_ENTRY(TYPE_DEBUGREG, "Debug register operand")                          \
   ENUM_ENTRY(TYPE_CONTROLREG, "Control register operand")                      \
diff --git a/llvm/include/llvm/Support/YAMLTraits.h b/llvm/include/llvm/Support/YAMLTraits.h
index 3d36f41ca1a04..b53b28dd00fd1 100644
--- a/llvm/include/llvm/Support/YAMLTraits.h
+++ b/llvm/include/llvm/Support/YAMLTraits.h
@@ -1921,12 +1921,12 @@ template <typename T> struct StdMapStringCustomMappingTraitsImpl {
   using map_type = std::map<std::string, T>;
 
   static void inputOne(IO &io, StringRef key, map_type &v) {
-    io.mapRequired(key.str().c_str(), v[std::string(key)]);
+    io.mapRequired(key, v[std::string(key)]);
   }
 
   static void output(IO &io, map_type &v) {
     for (auto &p : v)
-      io.mapRequired(p.first.c_str(), p.second);
+      io.mapRequired(p.first, p.second);
   }
 };
 
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 47d5d68174b38..119695e53c3cb 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1013,10 +1013,18 @@ def extract_vec_elt_combines : GICombineGroup<[
 def funnel_shift_from_or_shift : GICombineRule<
   (defs root:$root, build_fn_matchinfo:$info),
   (match (wip_match_opcode G_OR):$root,
-    [{ return Helper.matchOrShiftToFunnelShift(*${root}, ${info}); }]),
+    [{ return Helper.matchOrShiftToFunnelShift(*${root}, false, ${info}); }]),
   (apply [{ Helper.applyBuildFn(*${root}, ${info}); }])
 >;
 
+def funnel_shift_from_or_shift_constants_are_legal : GICombineRule<
+  (defs root:$root, build_fn_matchinfo:$info),
+  (match (wip_match_opcode G_OR):$root,
+    [{ return Helper.matchOrShiftToFunnelShift(*${root}, true, ${info}); }]),
+  (apply [{ Helper.applyBuildFn(*${root}, ${info}); }])
+>;
+
+
 def funnel_shift_to_rotate : GICombineRule<
   (defs root:$root),
   (match (wip_match_opcode G_FSHL, G_FSHR):$root,
diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def
index a94eab1d7ae34..78cf46406192e 100644
--- a/llvm/include/llvm/TargetParser/X86TargetParser.def
+++ b/llvm/include/llvm/TargetParser/X86TargetParser.def
@@ -268,7 +268,6 @@ X86_FEATURE_COMPAT(AVX10_2_512,     "avx10.2-512",            0)
 X86_FEATURE       (MOVRS,           "movrs")
 X86_FEATURE       (ZU,              "zu")
 X86_FEATURE       (AMX_FP8,         "amx-fp8")
-X86_FEATURE       (AMX_TRANSPOSE,   "amx-transpose")
 X86_FEATURE       (AMX_MOVRS,       "amx-movrs")
 X86_FEATURE       (AMX_AVX512,      "amx-avx512")
 X86_FEATURE       (AMX_TF32,        "amx-tf32")
diff --git a/llvm/include/llvm/Transforms/Coroutines/CoroAnnotationElide.h b/llvm/include/llvm/Transforms/Coroutines/CoroAnnotationElide.h
index 352c9e1452669..2061098b6ea6a 100644
--- a/llvm/include/llvm/Transforms/Coroutines/CoroAnnotationElide.h
+++ b/llvm/include/llvm/Transforms/Coroutines/CoroAnnotationElide.h
@@ -24,7 +24,7 @@
 namespace llvm {
 
 struct CoroAnnotationElidePass : PassInfoMixin<CoroAnnotationElidePass> {
-  CoroAnnotationElidePass() {}
+  CoroAnnotationElidePass() = default;
 
   PreservedAnalyses run(LazyCallGraph::SCC &C, CGSCCAnalysisManager &AM,
                         LazyCallGraph &CG, CGSCCUpdateResult &UR);
diff --git a/llvm/include/llvm/Transforms/IPO/FatLTOCleanup.h b/llvm/include/llvm/Transforms/IPO/FatLTOCleanup.h
index 17eab8568f4dc..6fc1b2623e163 100644
--- a/llvm/include/llvm/Transforms/IPO/FatLTOCleanup.h
+++ b/llvm/include/llvm/Transforms/IPO/FatLTOCleanup.h
@@ -26,7 +26,7 @@ class ModuleSummaryIndex;
 
 class FatLtoCleanup : public PassInfoMixin<FatLtoCleanup> {
 public:
-  FatLtoCleanup() {}
+  FatLtoCleanup() = default;
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
   static bool isRequired() { return true; }
 };
diff --git a/llvm/include/llvm/Transforms/IPO/InferFunctionAttrs.h b/llvm/include/llvm/Transforms/IPO/InferFunctionAttrs.h
index 8addf49fc0d81..272b96037c753 100644
--- a/llvm/include/llvm/Transforms/IPO/InferFunctionAttrs.h
+++ b/llvm/include/llvm/Transforms/IPO/InferFunctionAttrs.h
@@ -23,7 +23,7 @@ class Module;
 /// A pass which infers function attributes from the names and signatures of
 /// function declarations in a module.
 struct InferFunctionAttrsPass : PassInfoMixin<InferFunctionAttrsPass> {
-  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+  LLVM_ABI PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
 };
 
 }
diff --git a/llvm/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h b/llvm/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h
index ced446dacb6cc..9dcd4b53a0dbe 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h
@@ -26,8 +26,6 @@
 
 namespace llvm {
 
-LLVM_ABI extern cl::opt<bool> DebugInfoCorrelate;
-
 class Function;
 class Instruction;
 class Module;
diff --git a/llvm/include/llvm/Transforms/Instrumentation/SanitizerCoverage.h b/llvm/include/llvm/Transforms/Instrumentation/SanitizerCoverage.h
index a8a09fb95c4bd..346e7f06eaa43 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/SanitizerCoverage.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/SanitizerCoverage.h
@@ -33,7 +33,7 @@ class FileSystem;
 /// appends globals to llvm.compiler.used.
 class SanitizerCoveragePass : public PassInfoMixin<SanitizerCoveragePass> {
 public:
-  explicit SanitizerCoveragePass(
+  LLVM_ABI explicit SanitizerCoveragePass(
       SanitizerCoverageOptions Options = SanitizerCoverageOptions(),
       IntrusiveRefCntPtr<vfs::FileSystem> VFS = nullptr,
       const std::vector<std::string> &AllowlistFiles = {},
diff --git a/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h b/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h
index e677cbf2d8968..49885b7f06a15 100644
--- a/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h
@@ -19,6 +19,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CycleInfo.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Printable.h"
@@ -262,6 +263,34 @@ LLVM_ABI BasicBlock *SplitEdge(BasicBlock *From, BasicBlock *To,
                                MemorySSAUpdater *MSSAU = nullptr,
                                const Twine &BBName = "");
 
+/// \brief Create a new intermediate target block for a callbr edge.
+///
+/// Create a new basic block between a callbr instruction and one of its
+/// successors. The new block replaces the original successor in the callbr
+/// instruction and unconditionally branches to the original successor. This
+/// is useful for normalizing control flow, e.g., when transforming
+/// irreducible loops.
+///
+/// \param CallBrBlock    block containing the callbr instruction
+/// \param Succ           original successor block
+/// \param SuccIdx        index of the original successor in the callbr
+///                       instruction
+/// \param DTU            optional \p DomTreeUpdater for updating the
+///                       dominator tree
+/// \param CI             optional \p CycleInfo for updating cycle membership
+/// \param LI             optional \p LoopInfo for updating loop membership
+/// \param UpdatedLI      optional output flag indicating if \p LoopInfo has
+///                       been updated
+///
+/// \returns newly created intermediate target block
+///
+/// \note This function updates PHI nodes, dominator tree, loop info, and
+/// cycle info as needed.
+LLVM_ABI BasicBlock *
+SplitCallBrEdge(BasicBlock *CallBrBlock, BasicBlock *Succ, unsigned SuccIdx,
+                DomTreeUpdater *DTU = nullptr, CycleInfo *CI = nullptr,
+                LoopInfo *LI = nullptr, bool *UpdatedLI = nullptr);
+
 /// Sets the unwind edge of an instruction to a particular successor.
 LLVM_ABI void setUnwindEdgeTo(Instruction *TI, BasicBlock *Succ);
 
diff --git a/llvm/include/llvm/Transforms/Utils/ControlFlowUtils.h b/llvm/include/llvm/Transforms/Utils/ControlFlowUtils.h
index 810fef29f4010..17cde82b084d8 100644
--- a/llvm/include/llvm/Transforms/Utils/ControlFlowUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/ControlFlowUtils.h
@@ -15,10 +15,13 @@
 
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/IR/CycleInfo.h"
 
 namespace llvm {
 
 class BasicBlock;
+class CallBrInst;
+class LoopInfo;
 class DomTreeUpdater;
 
 /// Given a set of branch descriptors [BB, Succ0, Succ1], create a "hub" such
@@ -104,7 +107,8 @@ struct ControlFlowHub {
         : BB(BB), Succ0(Succ0), Succ1(Succ1) {}
   };
 
-  void addBranch(BasicBlock *BB, BasicBlock *Succ0, BasicBlock *Succ1) {
+  void addBranch(BasicBlock *BB, BasicBlock *Succ0,
+                 BasicBlock *Succ1 = nullptr) {
     assert(BB);
     assert(Succ0 || Succ1);
     Branches.emplace_back(BB, Succ0, Succ1);
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index 2d2355d6be68a..86eb21389756c 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -365,6 +365,40 @@ LLVM_ABI bool setLoopEstimatedTripCount(
     Loop *L, unsigned EstimatedTripCount,
     std::optional<unsigned> EstimatedLoopInvocationWeight = std::nullopt);
 
+/// Based on branch weight metadata, return either:
+/// - An unknown probability if the implementation is unable to handle the loop
+///   form of \p L (e.g., \p L must have a latch block that controls the loop
+///   exit).
+/// - The probability \c P that, at the end of any iteration, the latch of \p L
+///   will start another iteration such that `1 - P` is the probability of
+///   exiting the loop.
+BranchProbability getLoopProbability(Loop *L);
+
+/// Set branch weight metadata for the latch of \p L to indicate that, at the
+/// end of any iteration, \p P and `1 - P` are the probabilities of starting
+/// another iteration and exiting the loop, respectively.  Return false if the
+/// implementation is unable to handle the loop form of \p L (e.g., \p L must
+/// have a latch block that controls the loop exit).  Otherwise, return true.
+bool setLoopProbability(Loop *L, BranchProbability P);
+
+/// Based on branch weight metadata, return either:
+/// - An unknown probability if the implementation cannot extract the
+///   probability (e.g., \p B must have exactly two target labels, so it must be
+///   a conditional branch).
+/// - The probability \c P that control flows from \p B to its first target
+///   label such that `1 - P` is the probability of control flowing to its
+///   second target label, or vice-versa if \p ForFirstTarget is false.
+BranchProbability getBranchProbability(BranchInst *B, bool ForFirstTarget);
+
+/// Set branch weight metadata for \p B to indicate that \p P and `1 - P` are
+/// the probabilities of control flowing to its first and second target labels,
+/// respectively, or vice-versa if \p ForFirstTarget is false.  Return false if
+/// the implementation cannot set the probability (e.g., \p B must have exactly
+/// two target labels, so it must be a conditional branch).  Otherwise, return
+/// true.
+bool setBranchProbability(BranchInst *B, BranchProbability P,
+                          bool ForFirstTarget);
+
 /// Check inner loop (L) backedge count is known to be invariant on all
 /// iterations of its outer loop. If the loop has no parent, this is trivially
 /// true.
diff --git a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
index 871c13d972470..a3efc43c62dc3 100644
--- a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
+++ b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
@@ -97,7 +97,9 @@ LLVM_ABI bool UnrollRuntimeLoopRemainder(
     LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
     const TargetTransformInfo *TTI, bool PreserveLCSSA,
     unsigned SCEVExpansionBudget, bool RuntimeUnrollMultiExit,
-    Loop **ResultLoop = nullptr);
+    Loop **ResultLoop = nullptr,
+    std::optional<unsigned> OriginalTripCount = std::nullopt,
+    BranchProbability OriginalLoopProb = BranchProbability::getUnknown());
 
 LLVM_ABI LoopUnrollResult UnrollAndJamLoop(
     Loop *L, unsigned Count, unsigned TripCount, unsigned TripMultiple,
diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h
index 96a2348403932..3d76cdaad6240 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h
@@ -167,7 +167,7 @@ class LegalityResult {
   LegalityResult &operator=(const LegalityResult &) = delete;
 
 public:
-  virtual ~LegalityResult() {}
+  virtual ~LegalityResult() = default;
   LegalityResultID getSubclassID() const { return ID; }
 #ifndef NDEBUG
   virtual void print(raw_ostream &OS) const {
diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h
index b289520fa83af..821382b0b12d0 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h
@@ -36,7 +36,7 @@ class SeedBundle {
   /// No need to allow copies.
   SeedBundle(const SeedBundle &) = delete;
   SeedBundle &operator=(const SeedBundle &) = delete;
-  virtual ~SeedBundle() {}
+  virtual ~SeedBundle() = default;
 
   using iterator = SmallVector<Instruction *>::iterator;
   using const_iterator = SmallVector<Instruction *>::const_iterator;
diff --git a/llvm/lib/Analysis/AliasAnalysis.cpp b/llvm/lib/Analysis/AliasAnalysis.cpp
index f2dc25fa5dbf5..26a560252d9aa 100644
--- a/llvm/lib/Analysis/AliasAnalysis.cpp
+++ b/llvm/lib/Analysis/AliasAnalysis.cpp
@@ -75,7 +75,7 @@ AAResults::AAResults(const TargetLibraryInfo &TLI) : TLI(TLI) {}
 AAResults::AAResults(AAResults &&Arg)
     : TLI(Arg.TLI), AAs(std::move(Arg.AAs)), AADeps(std::move(Arg.AADeps)) {}
 
-AAResults::~AAResults() {}
+AAResults::~AAResults() = default;
 
 bool AAResults::invalidate(Function &F, const PreservedAnalyses &PA,
                            FunctionAnalysisManager::Invalidator &Inv) {
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index e9e2e7d0316c7..da32542cf7870 100755
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -2163,18 +2163,42 @@ Constant *ConstantFoldBinaryFP(double (*NativeFP)(double, double),
 }
 
 Constant *constantFoldVectorReduce(Intrinsic::ID IID, Constant *Op) {
-  FixedVectorType *VT = dyn_cast<FixedVectorType>(Op->getType());
-  if (!VT)
-    return nullptr;
-
-  // This isn't strictly necessary, but handle the special/common case of zero:
-  // all integer reductions of a zero input produce zero.
-  if (isa<ConstantAggregateZero>(Op))
-    return ConstantInt::get(VT->getElementType(), 0);
+  auto *OpVT = cast<VectorType>(Op->getType());
 
   // This is the same as the underlying binops - poison propagates.
-  if (isa<PoisonValue>(Op) || Op->containsPoisonElement())
-    return PoisonValue::get(VT->getElementType());
+  if (Op->containsPoisonElement())
+    return PoisonValue::get(OpVT->getElementType());
+
+  // Shortcut non-accumulating reductions.
+  if (Constant *SplatVal = Op->getSplatValue()) {
+    switch (IID) {
+    case Intrinsic::vector_reduce_and:
+    case Intrinsic::vector_reduce_or:
+    case Intrinsic::vector_reduce_smin:
+    case Intrinsic::vector_reduce_smax:
+    case Intrinsic::vector_reduce_umin:
+    case Intrinsic::vector_reduce_umax:
+      return SplatVal;
+    case Intrinsic::vector_reduce_add:
+      if (SplatVal->isNullValue())
+        return SplatVal;
+      break;
+    case Intrinsic::vector_reduce_mul:
+      if (SplatVal->isNullValue() || SplatVal->isOneValue())
+        return SplatVal;
+      break;
+    case Intrinsic::vector_reduce_xor:
+      if (SplatVal->isNullValue())
+        return SplatVal;
+      if (OpVT->getElementCount().isKnownMultipleOf(2))
+        return Constant::getNullValue(OpVT->getElementType());
+      break;
+    }
+  }
+
+  FixedVectorType *VT = dyn_cast<FixedVectorType>(OpVT);
+  if (!VT)
+    return nullptr;
 
   // TODO: Handle undef.
   auto *EltC = dyn_cast_or_null<ConstantInt>(Op->getAggregateElement(0U));
diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp
index 84ee8c0bf3e18..11d829492a10e 100644
--- a/llvm/lib/Analysis/DependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/DependenceAnalysis.cpp
@@ -2854,14 +2854,18 @@ bool DependenceInfo::testMIV(const SCEV *Src, const SCEV *Dst,
          banerjeeMIVtest(Src, Dst, Loops, Result);
 }
 
-// Given a product, e.g., 10*X*Y, returns the first constant operand,
-// in this case 10. If there is no constant part, returns std::nullopt.
-static std::optional<APInt> getConstantPart(const SCEV *Expr) {
+/// Given a SCEVMulExpr, returns its first operand if its first operand is a
+/// constant and the product doesn't overflow in a signed sense. Otherwise,
+/// returns std::nullopt. For example, given (10 * X * Y)<nsw>, it returns 10.
+/// Notably, if it doesn't have nsw, the multiplication may overflow, and if
+/// so, it may not a multiple of 10.
+static std::optional<APInt> getConstanCoefficient(const SCEV *Expr) {
   if (const auto *Constant = dyn_cast<SCEVConstant>(Expr))
     return Constant->getAPInt();
   if (const auto *Product = dyn_cast<SCEVMulExpr>(Expr))
     if (const auto *Constant = dyn_cast<SCEVConstant>(Product->getOperand(0)))
-      return Constant->getAPInt();
+      if (Product->hasNoSignedWrap())
+        return Constant->getAPInt();
   return std::nullopt;
 }
 
@@ -2887,7 +2891,7 @@ bool DependenceInfo::accumulateCoefficientsGCD(const SCEV *Expr,
   if (AddRec->getLoop() == CurLoop) {
     CurLoopCoeff = Step;
   } else {
-    std::optional<APInt> ConstCoeff = getConstantPart(Step);
+    std::optional<APInt> ConstCoeff = getConstanCoefficient(Step);
 
     // If the coefficient is the product of a constant and other stuff, we can
     // use the constant in the GCD computation.
@@ -2940,7 +2944,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst,
     const SCEV *Coeff = AddRec->getStepRecurrence(*SE);
     // If the coefficient is the product of a constant and other stuff,
     // we can use the constant in the GCD computation.
-    std::optional<APInt> ConstCoeff = getConstantPart(Coeff);
+    std::optional<APInt> ConstCoeff = getConstanCoefficient(Coeff);
     if (!ConstCoeff)
       return false;
     RunningGCD = APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff->abs());
@@ -2958,7 +2962,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst,
     const SCEV *Coeff = AddRec->getStepRecurrence(*SE);
     // If the coefficient is the product of a constant and other stuff,
     // we can use the constant in the GCD computation.
-    std::optional<APInt> ConstCoeff = getConstantPart(Coeff);
+    std::optional<APInt> ConstCoeff = getConstanCoefficient(Coeff);
     if (!ConstCoeff)
       return false;
     RunningGCD = APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff->abs());
@@ -2979,7 +2983,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst,
       } else if (const SCEVMulExpr *Product = dyn_cast<SCEVMulExpr>(Operand)) {
         // Search for constant operand to participate in GCD;
         // If none found; return false.
-        std::optional<APInt> ConstOp = getConstantPart(Product);
+        std::optional<APInt> ConstOp = getConstanCoefficient(Product);
         if (!ConstOp)
           return false;
         ExtraGCD = APIntOps::GreatestCommonDivisor(ExtraGCD, ConstOp->abs());
@@ -3032,7 +3036,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst,
     Delta = SE->getMinusSCEV(SrcCoeff, DstCoeff);
     // If the coefficient is the product of a constant and other stuff,
     // we can use the constant in the GCD computation.
-    std::optional<APInt> ConstCoeff = getConstantPart(Delta);
+    std::optional<APInt> ConstCoeff = getConstanCoefficient(Delta);
     if (!ConstCoeff)
       // The difference of the two coefficients might not be a product
       // or constant, in which case we give up on this direction.
diff --git a/llvm/lib/Analysis/HashRecognize.cpp b/llvm/lib/Analysis/HashRecognize.cpp
index 4529123508a7c..8974ce5734b13 100644
--- a/llvm/lib/Analysis/HashRecognize.cpp
+++ b/llvm/lib/Analysis/HashRecognize.cpp
@@ -468,8 +468,11 @@ std::variant<PolynomialInfo, StringRef> HashRecognize::recognizeCRC() const {
 
     // Ensure that the PHIs have exactly two uses:
     // the bit-shift, and the XOR (or a cast feeding into the XOR).
+    // Also ensure that the SimpleRecurrence's evolution doesn't have stray
+    // users.
     if (!ConditionalRecurrence.Phi->hasNUses(2) ||
-        !SimpleRecurrence.Phi->hasNUses(2))
+        !SimpleRecurrence.Phi->hasNUses(2) ||
+        SimpleRecurrence.BO->getUniqueUndroppableUser() != SimpleRecurrence.Phi)
       return "Recurrences have stray uses";
 
     // Check that the SelectInst ConditionalRecurrence.Step is conditional on
diff --git a/llvm/lib/Analysis/HeatUtils.cpp b/llvm/lib/Analysis/HeatUtils.cpp
index a1cc7071f0e22..08e9428059e7e 100644
--- a/llvm/lib/Analysis/HeatUtils.cpp
+++ b/llvm/lib/Analysis/HeatUtils.cpp
@@ -64,10 +64,7 @@ std::string llvm::getHeatColor(uint64_t Freq, uint64_t MaxFreq) {
 }
 
 std::string llvm::getHeatColor(double Percent) {
-  if (Percent > 1.0)
-    Percent = 1.0;
-  if (Percent < 0.0)
-    Percent = 0.0;
+  Percent = std::clamp(Percent, 0.0, 1.0);
   unsigned ColorID = unsigned(round(Percent * (HeatSize - 1.0)));
   return HeatPalette[ColorID];
 }
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 7597f3ad685a0..a31f17b1936d6 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -2424,10 +2424,10 @@ ScalarEvolution::getStrengthenedNoWrapFlagsFromBinOp(
 // We're trying to construct a SCEV of type `Type' with `Ops' as operands and
 // `OldFlags' as can't-wrap behavior.  Infer a more aggressive set of
 // can't-overflow flags for the operation if possible.
-static SCEV::NoWrapFlags
-StrengthenNoWrapFlags(ScalarEvolution *SE, SCEVTypes Type,
-                      const ArrayRef<const SCEV *> Ops,
-                      SCEV::NoWrapFlags Flags) {
+static SCEV::NoWrapFlags StrengthenNoWrapFlags(ScalarEvolution *SE,
+                                               SCEVTypes Type,
+                                               ArrayRef<const SCEV *> Ops,
+                                               SCEV::NoWrapFlags Flags) {
   using namespace std::placeholders;
 
   using OBO = OverflowingBinaryOperator;
@@ -2540,7 +2540,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
   unsigned Idx = isa<SCEVConstant>(Ops[0]) ? 1 : 0;
 
   // Delay expensive flag strengthening until necessary.
-  auto ComputeFlags = [this, OrigFlags](const ArrayRef<const SCEV *> Ops) {
+  auto ComputeFlags = [this, OrigFlags](ArrayRef<const SCEV *> Ops) {
     return StrengthenNoWrapFlags(this, scAddExpr, Ops, OrigFlags);
   };
 
@@ -3125,7 +3125,7 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
     return Folded;
 
   // Delay expensive flag strengthening until necessary.
-  auto ComputeFlags = [this, OrigFlags](const ArrayRef<const SCEV *> Ops) {
+  auto ComputeFlags = [this, OrigFlags](ArrayRef<const SCEV *> Ops) {
     return StrengthenNoWrapFlags(this, scMulExpr, Ops, OrigFlags);
   };
 
@@ -15510,6 +15510,78 @@ static const SCEV *getNextSCEVDivisibleByDivisor(const SCEV *Expr,
   return SE.getConstant(*ExprVal + DivisorVal - Rem);
 }
 
+static bool collectDivisibilityInformation(
+    ICmpInst::Predicate Predicate, const SCEV *LHS, const SCEV *RHS,
+    DenseMap<const SCEV *, const SCEV *> &DivInfo,
+    DenseMap<const SCEV *, APInt> &Multiples, ScalarEvolution &SE) {
+  // If we have LHS == 0, check if LHS is computing a property of some unknown
+  // SCEV %v which we can rewrite %v to express explicitly.
+  if (Predicate != CmpInst::ICMP_EQ || !match(RHS, m_scev_Zero()))
+    return false;
+  // If LHS is A % B, i.e. A % B == 0, rewrite A to (A /u B) * B to
+  // explicitly express that.
+  const SCEVUnknown *URemLHS = nullptr;
+  const SCEV *URemRHS = nullptr;
+  if (!match(LHS, m_scev_URem(m_SCEVUnknown(URemLHS), m_SCEV(URemRHS), SE)))
+    return false;
+
+  const SCEV *Multiple =
+      SE.getMulExpr(SE.getUDivExpr(URemLHS, URemRHS), URemRHS);
+  DivInfo[URemLHS] = Multiple;
+  if (auto *C = dyn_cast<SCEVConstant>(URemRHS))
+    Multiples[URemLHS] = C->getAPInt();
+  return true;
+}
+
+// Check if the condition is a divisibility guard (A % B == 0).
+static bool isDivisibilityGuard(const SCEV *LHS, const SCEV *RHS,
+                                ScalarEvolution &SE) {
+  const SCEV *X, *Y;
+  return match(LHS, m_scev_URem(m_SCEV(X), m_SCEV(Y), SE)) && RHS->isZero();
+}
+
+// Apply divisibility by \p Divisor on MinMaxExpr with constant values,
+// recursively. This is done by aligning up/down the constant value to the
+// Divisor.
+static const SCEV *applyDivisibilityOnMinMaxExpr(const SCEV *MinMaxExpr,
+                                                 APInt Divisor,
+                                                 ScalarEvolution &SE) {
+  // Return true if \p Expr is a MinMax SCEV expression with a non-negative
+  // constant operand. If so, return in \p SCTy the SCEV type and in \p RHS
+  // the non-constant operand and in \p LHS the constant operand.
+  auto IsMinMaxSCEVWithNonNegativeConstant =
+      [&](const SCEV *Expr, SCEVTypes &SCTy, const SCEV *&LHS,
+          const SCEV *&RHS) {
+        if (auto *MinMax = dyn_cast<SCEVMinMaxExpr>(Expr)) {
+          if (MinMax->getNumOperands() != 2)
+            return false;
+          if (auto *C = dyn_cast<SCEVConstant>(MinMax->getOperand(0))) {
+            if (C->getAPInt().isNegative())
+              return false;
+            SCTy = MinMax->getSCEVType();
+            LHS = MinMax->getOperand(0);
+            RHS = MinMax->getOperand(1);
+            return true;
+          }
+        }
+        return false;
+      };
+
+  const SCEV *MinMaxLHS = nullptr, *MinMaxRHS = nullptr;
+  SCEVTypes SCTy;
+  if (!IsMinMaxSCEVWithNonNegativeConstant(MinMaxExpr, SCTy, MinMaxLHS,
+                                           MinMaxRHS))
+    return MinMaxExpr;
+  auto IsMin = isa<SCEVSMinExpr>(MinMaxExpr) || isa<SCEVUMinExpr>(MinMaxExpr);
+  assert(SE.isKnownNonNegative(MinMaxLHS) && "Expected non-negative operand!");
+  auto *DivisibleExpr =
+      IsMin ? getPreviousSCEVDivisibleByDivisor(MinMaxLHS, Divisor, SE)
+            : getNextSCEVDivisibleByDivisor(MinMaxLHS, Divisor, SE);
+  SmallVector<const SCEV *> Ops = {
+      applyDivisibilityOnMinMaxExpr(MinMaxRHS, Divisor, SE), DivisibleExpr};
+  return SE.getMinMaxExpr(SCTy, Ops);
+}
+
 void ScalarEvolution::LoopGuards::collectFromBlock(
     ScalarEvolution &SE, ScalarEvolution::LoopGuards &Guards,
     const BasicBlock *Block, const BasicBlock *Pred,
@@ -15520,19 +15592,13 @@ void ScalarEvolution::LoopGuards::collectFromBlock(
   SmallVector<const SCEV *> ExprsToRewrite;
   auto CollectCondition = [&](ICmpInst::Predicate Predicate, const SCEV *LHS,
                               const SCEV *RHS,
-                              DenseMap<const SCEV *, const SCEV *>
-                                  &RewriteMap) {
+                              DenseMap<const SCEV *, const SCEV *> &RewriteMap,
+                              const LoopGuards &DivGuards) {
     // WARNING: It is generally unsound to apply any wrap flags to the proposed
     // replacement SCEV which isn't directly implied by the structure of that
     // SCEV.  In particular, using contextual facts to imply flags is *NOT*
     // legal.  See the scoping rules for flags in the header to understand why.
 
-    // If LHS is a constant, apply information to the other expression.
-    if (isa<SCEVConstant>(LHS)) {
-      std::swap(LHS, RHS);
-      Predicate = CmpInst::getSwappedPredicate(Predicate);
-    }
-
     // Check for a condition of the form (-C1 + X < C2).  InstCombine will
     // create this form when combining two checks of the form (X u< C2 + C1) and
     // (X >=u C1).
@@ -15565,67 +15631,6 @@ void ScalarEvolution::LoopGuards::collectFromBlock(
     if (MatchRangeCheckIdiom())
       return;
 
-    // Return true if \p Expr is a MinMax SCEV expression with a non-negative
-    // constant operand. If so, return in \p SCTy the SCEV type and in \p RHS
-    // the non-constant operand and in \p LHS the constant operand.
-    auto IsMinMaxSCEVWithNonNegativeConstant =
-        [&](const SCEV *Expr, SCEVTypes &SCTy, const SCEV *&LHS,
-            const SCEV *&RHS) {
-          const APInt *C;
-          SCTy = Expr->getSCEVType();
-          return match(Expr, m_scev_MinMax(m_SCEV(LHS), m_SCEV(RHS))) &&
-                 match(LHS, m_scev_APInt(C)) && C->isNonNegative();
-        };
-
-    // Apply divisibilty by \p Divisor on MinMaxExpr with constant values,
-    // recursively. This is done by aligning up/down the constant value to the
-    // Divisor.
-    std::function<const SCEV *(const SCEV *, const SCEV *)>
-        ApplyDivisibiltyOnMinMaxExpr = [&](const SCEV *MinMaxExpr,
-                                           const SCEV *Divisor) {
-          auto *ConstDivisor = dyn_cast<SCEVConstant>(Divisor);
-          if (!ConstDivisor)
-            return MinMaxExpr;
-          const APInt &DivisorVal = ConstDivisor->getAPInt();
-
-          const SCEV *MinMaxLHS = nullptr, *MinMaxRHS = nullptr;
-          SCEVTypes SCTy;
-          if (!IsMinMaxSCEVWithNonNegativeConstant(MinMaxExpr, SCTy, MinMaxLHS,
-                                                   MinMaxRHS))
-            return MinMaxExpr;
-          auto IsMin =
-              isa<SCEVSMinExpr>(MinMaxExpr) || isa<SCEVUMinExpr>(MinMaxExpr);
-          assert(SE.isKnownNonNegative(MinMaxLHS) &&
-                 "Expected non-negative operand!");
-          auto *DivisibleExpr =
-              IsMin
-                  ? getPreviousSCEVDivisibleByDivisor(MinMaxLHS, DivisorVal, SE)
-                  : getNextSCEVDivisibleByDivisor(MinMaxLHS, DivisorVal, SE);
-          SmallVector<const SCEV *> Ops = {
-              ApplyDivisibiltyOnMinMaxExpr(MinMaxRHS, Divisor), DivisibleExpr};
-          return SE.getMinMaxExpr(SCTy, Ops);
-        };
-
-    // If we have LHS == 0, check if LHS is computing a property of some unknown
-    // SCEV %v which we can rewrite %v to express explicitly.
-    if (Predicate == CmpInst::ICMP_EQ && match(RHS, m_scev_Zero())) {
-      // If LHS is A % B, i.e. A % B == 0, rewrite A to (A /u B) * B to
-      // explicitly express that.
-      const SCEVUnknown *URemLHS = nullptr;
-      const SCEV *URemRHS = nullptr;
-      if (match(LHS,
-                m_scev_URem(m_SCEVUnknown(URemLHS), m_SCEV(URemRHS), SE))) {
-        auto I = RewriteMap.find(URemLHS);
-        const SCEV *RewrittenLHS = I != RewriteMap.end() ? I->second : URemLHS;
-        RewrittenLHS = ApplyDivisibiltyOnMinMaxExpr(RewrittenLHS, URemRHS);
-        const auto *Multiple =
-            SE.getMulExpr(SE.getUDivExpr(RewrittenLHS, URemRHS), URemRHS);
-        RewriteMap[URemLHS] = Multiple;
-        ExprsToRewrite.push_back(URemLHS);
-        return;
-      }
-    }
-
     // Do not apply information for constants or if RHS contains an AddRec.
     if (isa<SCEVConstant>(LHS) || SE.containsAddRecurrence(RHS))
       return;
@@ -15655,7 +15660,9 @@ void ScalarEvolution::LoopGuards::collectFromBlock(
     };
 
     const SCEV *RewrittenLHS = GetMaybeRewritten(LHS);
-    const APInt &DividesBy = SE.getConstantMultiple(RewrittenLHS);
+    // Apply divisibility information when computing the constant multiple.
+    const APInt &DividesBy =
+        SE.getConstantMultiple(DivGuards.rewrite(RewrittenLHS));
 
     // Collect rewrites for LHS and its transitive operands based on the
     // condition.
@@ -15670,31 +15677,31 @@ void ScalarEvolution::LoopGuards::collectFromBlock(
     // predicate.
     const SCEV *One = SE.getOne(RHS->getType());
     switch (Predicate) {
-      case CmpInst::ICMP_ULT:
-        if (RHS->getType()->isPointerTy())
-          return;
-        RHS = SE.getUMaxExpr(RHS, One);
-        [[fallthrough]];
-      case CmpInst::ICMP_SLT: {
-        RHS = SE.getMinusSCEV(RHS, One);
-        RHS = getPreviousSCEVDivisibleByDivisor(RHS, DividesBy, SE);
-        break;
-      }
-      case CmpInst::ICMP_UGT:
-      case CmpInst::ICMP_SGT:
-        RHS = SE.getAddExpr(RHS, One);
-        RHS = getNextSCEVDivisibleByDivisor(RHS, DividesBy, SE);
-        break;
-      case CmpInst::ICMP_ULE:
-      case CmpInst::ICMP_SLE:
-        RHS = getPreviousSCEVDivisibleByDivisor(RHS, DividesBy, SE);
-        break;
-      case CmpInst::ICMP_UGE:
-      case CmpInst::ICMP_SGE:
-        RHS = getNextSCEVDivisibleByDivisor(RHS, DividesBy, SE);
-        break;
-      default:
-        break;
+    case CmpInst::ICMP_ULT:
+      if (RHS->getType()->isPointerTy())
+        return;
+      RHS = SE.getUMaxExpr(RHS, One);
+      [[fallthrough]];
+    case CmpInst::ICMP_SLT: {
+      RHS = SE.getMinusSCEV(RHS, One);
+      RHS = getPreviousSCEVDivisibleByDivisor(RHS, DividesBy, SE);
+      break;
+    }
+    case CmpInst::ICMP_UGT:
+    case CmpInst::ICMP_SGT:
+      RHS = SE.getAddExpr(RHS, One);
+      RHS = getNextSCEVDivisibleByDivisor(RHS, DividesBy, SE);
+      break;
+    case CmpInst::ICMP_ULE:
+    case CmpInst::ICMP_SLE:
+      RHS = getPreviousSCEVDivisibleByDivisor(RHS, DividesBy, SE);
+      break;
+    case CmpInst::ICMP_UGE:
+    case CmpInst::ICMP_SGE:
+      RHS = getNextSCEVDivisibleByDivisor(RHS, DividesBy, SE);
+      break;
+    default:
+      break;
     }
 
     SmallVector<const SCEV *, 16> Worklist(1, LHS);
@@ -15840,8 +15847,11 @@ void ScalarEvolution::LoopGuards::collectFromBlock(
 
   // Now apply the information from the collected conditions to
   // Guards.RewriteMap. Conditions are processed in reverse order, so the
-  // earliest conditions is processed first. This ensures the SCEVs with the
+  // earliest conditions is processed first, except guards with divisibility
+  // information, which are moved to the back. This ensures the SCEVs with the
   // shortest dependency chains are constructed first.
+  SmallVector<std::tuple<CmpInst::Predicate, const SCEV *, const SCEV *>>
+      GuardsToProcess;
   for (auto [Term, EnterIfTrue] : reverse(Terms)) {
     SmallVector<Value *, 8> Worklist;
     SmallPtrSet<Value *, 8> Visited;
@@ -15856,7 +15866,14 @@ void ScalarEvolution::LoopGuards::collectFromBlock(
             EnterIfTrue ? Cmp->getPredicate() : Cmp->getInversePredicate();
         const auto *LHS = SE.getSCEV(Cmp->getOperand(0));
         const auto *RHS = SE.getSCEV(Cmp->getOperand(1));
-        CollectCondition(Predicate, LHS, RHS, Guards.RewriteMap);
+        // If LHS is a constant, apply information to the other expression.
+        // TODO: If LHS is not a constant, check if using CompareSCEVComplexity
+        // can improve results.
+        if (isa<SCEVConstant>(LHS)) {
+          std::swap(LHS, RHS);
+          Predicate = CmpInst::getSwappedPredicate(Predicate);
+        }
+        GuardsToProcess.emplace_back(Predicate, LHS, RHS);
         continue;
       }
 
@@ -15869,6 +15886,31 @@ void ScalarEvolution::LoopGuards::collectFromBlock(
     }
   }
 
+  // Process divisibility guards in reverse order to populate DivGuards early.
+  DenseMap<const SCEV *, APInt> Multiples;
+  LoopGuards DivGuards(SE);
+  for (const auto &[Predicate, LHS, RHS] : GuardsToProcess) {
+    if (!isDivisibilityGuard(LHS, RHS, SE))
+      continue;
+    collectDivisibilityInformation(Predicate, LHS, RHS, DivGuards.RewriteMap,
+                                   Multiples, SE);
+  }
+
+  for (const auto &[Predicate, LHS, RHS] : GuardsToProcess)
+    CollectCondition(Predicate, LHS, RHS, Guards.RewriteMap, DivGuards);
+
+  // Apply divisibility information last. This ensures it is applied to the
+  // outermost expression after other rewrites for the given value.
+  for (const auto &[K, Divisor] : Multiples) {
+    const SCEV *DivisorSCEV = SE.getConstant(Divisor);
+    Guards.RewriteMap[K] =
+        SE.getMulExpr(SE.getUDivExpr(applyDivisibilityOnMinMaxExpr(
+                                         Guards.rewrite(K), Divisor, SE),
+                                     DivisorSCEV),
+                      DivisorSCEV);
+    ExprsToRewrite.push_back(K);
+  }
+
   // Let the rewriter preserve NUW/NSW flags if the unsigned/signed ranges of
   // the replacement expressions are contained in the ranges of the replaced
   // expressions.
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index c47a1c1b23a37..0426ac7e62fab 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1353,9 +1353,9 @@ TargetTransformInfo::getInlineCallPenalty(const Function *F,
   return TTIImpl->getInlineCallPenalty(F, Call, DefaultCallPenalty);
 }
 
-bool TargetTransformInfo::areTypesABICompatible(
-    const Function *Caller, const Function *Callee,
-    const ArrayRef<Type *> &Types) const {
+bool TargetTransformInfo::areTypesABICompatible(const Function *Caller,
+                                                const Function *Callee,
+                                                ArrayRef<Type *> Types) const {
   return TTIImpl->areTypesABICompatible(Caller, Callee, Types);
 }
 
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 5164cec33e6f5..8e3ce4990f437 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -4538,6 +4538,9 @@ bool LLParser::parseValID(ValID &ID, PerFunctionState *PFS, Type *ExpectedTy) {
       if (!Indices.empty() && !Ty->isSized(&Visited))
         return error(ID.Loc, "base element of getelementptr must be sized");
 
+      if (!ConstantExpr::isSupportedGetElementPtr(Ty))
+        return error(ID.Loc, "invalid base element for constant getelementptr");
+
       if (!GetElementPtrInst::getIndexedType(Ty, Indices))
         return error(ID.Loc, "invalid getelementptr indices");
 
@@ -5639,16 +5642,17 @@ bool LLParser::parseDIBasicType(MDNode *&Result, bool IsDistinct) {
   OPTIONAL(name, MDStringField, );                                             \
   OPTIONAL(size, MDUnsignedOrMDField, (0, UINT64_MAX));                        \
   OPTIONAL(align, MDUnsignedField, (0, UINT32_MAX));                           \
+  OPTIONAL(dataSize, MDUnsignedField, (0, UINT32_MAX));                        \
   OPTIONAL(encoding, DwarfAttEncodingField, );                                 \
   OPTIONAL(num_extra_inhabitants, MDUnsignedField, (0, UINT32_MAX));           \
   OPTIONAL(flags, DIFlagField, );
   PARSE_MD_FIELDS();
 #undef VISIT_MD_FIELDS
 
-  Result = GET_OR_DISTINCT(DIBasicType, (Context, tag.Val, name.Val,
-                                         size.getValueAsMetadata(Context),
-                                         align.Val, encoding.Val,
-                                         num_extra_inhabitants.Val, flags.Val));
+  Result = GET_OR_DISTINCT(
+      DIBasicType,
+      (Context, tag.Val, name.Val, size.getValueAsMetadata(Context), align.Val,
+       encoding.Val, num_extra_inhabitants.Val, dataSize.Val, flags.Val));
   return false;
 }
 
@@ -6341,8 +6345,8 @@ bool LLParser::parseDIObjCProperty(MDNode *&Result, bool IsDistinct) {
 #undef VISIT_MD_FIELDS
 
   Result = GET_OR_DISTINCT(DIObjCProperty,
-                           (Context, name.Val, file.Val, line.Val, setter.Val,
-                            getter.Val, attributes.Val, type.Val));
+                           (Context, name.Val, file.Val, line.Val, getter.Val,
+                            setter.Val, attributes.Val, type.Val));
   return false;
 }
 
diff --git a/llvm/lib/BinaryFormat/Dwarf.cpp b/llvm/lib/BinaryFormat/Dwarf.cpp
index 55fa2df632bfa..a6c7e6afdbe7a 100644
--- a/llvm/lib/BinaryFormat/Dwarf.cpp
+++ b/llvm/lib/BinaryFormat/Dwarf.cpp
@@ -1076,10 +1076,3 @@ StringRef (*const llvm::dwarf::EnumTraits<LineNumberOps>::StringFn)(unsigned) =
     LNStandardString;
 StringRef (*const llvm::dwarf::EnumTraits<Index>::StringFn)(unsigned) =
     IndexString;
-
-constexpr char llvm::dwarf::EnumTraits<Attribute>::Type[];
-constexpr char llvm::dwarf::EnumTraits<Form>::Type[];
-constexpr char llvm::dwarf::EnumTraits<Index>::Type[];
-constexpr char llvm::dwarf::EnumTraits<Tag>::Type[];
-constexpr char llvm::dwarf::EnumTraits<LineNumberOps>::Type[];
-constexpr char llvm::dwarf::EnumTraits<LocationAtom>::Type[];
diff --git a/llvm/lib/BinaryFormat/MsgPackDocumentYAML.cpp b/llvm/lib/BinaryFormat/MsgPackDocumentYAML.cpp
index 3de3dccce0c6c..80b421d5f752e 100644
--- a/llvm/lib/BinaryFormat/MsgPackDocumentYAML.cpp
+++ b/llvm/lib/BinaryFormat/MsgPackDocumentYAML.cpp
@@ -209,12 +209,12 @@ template <> struct CustomMappingTraits<MapDocNode> {
   static void inputOne(IO &IO, StringRef Key, MapDocNode &M) {
     ScalarDocNode KeyObj = M.getDocument()->getNode();
     KeyObj.fromString(Key, "");
-    IO.mapRequired(Key.str().c_str(), M.getMap()[KeyObj]);
+    IO.mapRequired(Key, M.getMap()[KeyObj]);
   }
 
   static void output(IO &IO, MapDocNode &M) {
     for (auto I : M.getMap()) {
-      IO.mapRequired(I.first.toString().c_str(), I.second);
+      IO.mapRequired(I.first.toString(), I.second);
     }
   }
 };
diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
index ed0443f599a44..c63dc8f00785e 100644
--- a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
+++ b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
@@ -1531,7 +1531,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
     break;
   }
   case bitc::METADATA_BASIC_TYPE: {
-    if (Record.size() < 6 || Record.size() > 8)
+    if (Record.size() < 6 || Record.size() > 9)
       return error("Invalid record");
 
     IsDistinct = Record[0] & 1;
@@ -1540,13 +1540,13 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
                                 ? static_cast<DINode::DIFlags>(Record[6])
                                 : DINode::FlagZero;
     uint32_t NumExtraInhabitants = (Record.size() > 7) ? Record[7] : 0;
-
+    uint32_t DataSizeInBits = (Record.size() > 8) ? Record[8] : 0;
     Metadata *SizeInBits = getMetadataOrConstant(SizeIsMetadata, Record[3]);
-
     MetadataList.assignValue(
         GET_OR_DISTINCT(DIBasicType,
                         (Context, Record[1], getMDString(Record[2]), SizeInBits,
-                         Record[4], Record[5], NumExtraInhabitants, Flags)),
+                         Record[4], Record[5], NumExtraInhabitants,
+                         DataSizeInBits, Flags)),
         NextMetadataNo);
     NextMetadataNo++;
     break;
@@ -2323,8 +2323,9 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
         GET_OR_DISTINCT(DIObjCProperty,
                         (Context, getMDString(Record[1]),
                          getMDOrNull(Record[2]), Record[3],
-                         getMDString(Record[4]), getMDString(Record[5]),
-                         Record[6], getDITypeRefOrNull(Record[7]))),
+                         /*GetterName=*/getMDString(Record[5]),
+                         /*SetterName=*/getMDString(Record[4]), Record[6],
+                         getDITypeRefOrNull(Record[7]))),
         NextMetadataNo);
     NextMetadataNo++;
     break;
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 61aa7c2f5af53..f17656c7c3b03 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -1925,6 +1925,7 @@ void ModuleBitcodeWriter::writeDIBasicType(const DIBasicType *N,
   Record.push_back(N->getEncoding());
   Record.push_back(N->getFlags());
   Record.push_back(N->getNumExtraInhabitants());
+  Record.push_back(N->getDataSizeInBits());
 
   Stream.EmitRecord(bitc::METADATA_BASIC_TYPE, Record, Abbrev);
   Record.clear();
diff --git a/llvm/lib/CGData/OutlinedHashTreeRecord.cpp b/llvm/lib/CGData/OutlinedHashTreeRecord.cpp
index cc760634d7fae..2b6e2f0537524 100644
--- a/llvm/lib/CGData/OutlinedHashTreeRecord.cpp
+++ b/llvm/lib/CGData/OutlinedHashTreeRecord.cpp
@@ -37,7 +37,7 @@ template <> struct MappingTraits<HashNodeStable> {
 template <> struct CustomMappingTraits<IdHashNodeStableMapTy> {
   static void inputOne(IO &io, StringRef Key, IdHashNodeStableMapTy &V) {
     HashNodeStable NodeStable;
-    io.mapRequired(Key.str().c_str(), NodeStable);
+    io.mapRequired(Key, NodeStable);
     unsigned Id;
     if (Key.getAsInteger(0, Id)) {
       io.setError("Id not an integer");
@@ -48,7 +48,7 @@ template <> struct CustomMappingTraits<IdHashNodeStableMapTy> {
 
   static void output(IO &io, IdHashNodeStableMapTy &V) {
     for (auto Iter = V.begin(); Iter != V.end(); ++Iter)
-      io.mapRequired(utostr(Iter->first).c_str(), Iter->second);
+      io.mapRequired(utostr(Iter->first), Iter->second);
   }
 };
 
diff --git a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp
index e5c85d588b45e..1ea30d8ab3c2b 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp
@@ -745,11 +745,6 @@ void AppleAccelTableStaticTypeData::emit(AsmPrinter *Asm) const {
   Asm->emitInt32(QualifiedNameHash);
 }
 
-constexpr AppleAccelTableData::Atom AppleAccelTableTypeData::Atoms[];
-constexpr AppleAccelTableData::Atom AppleAccelTableOffsetData::Atoms[];
-constexpr AppleAccelTableData::Atom AppleAccelTableStaticOffsetData::Atoms[];
-constexpr AppleAccelTableData::Atom AppleAccelTableStaticTypeData::Atoms[];
-
 #ifndef NDEBUG
 void AppleAccelTableWriter::Header::print(raw_ostream &OS) const {
   OS << "Magic: " << format("0x%x", Magic) << "\n"
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 8aa488f0efd8f..f65d88a669f13 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -1443,7 +1443,7 @@ getBBAddrMapFeature(const MachineFunction &MF, int NumMBBSectionRanges,
           MF.hasBBSections() && NumMBBSectionRanges > 1,
           // Use static_cast to avoid breakage of tests on windows.
           static_cast<bool>(BBAddrMapSkipEmitBBEntries), HasCalls,
-          static_cast<bool>(EmitBBHash)};
+          static_cast<bool>(EmitBBHash), false};
 }
 
 void AsmPrinter::emitBBAddrMapSection(const MachineFunction &MF) {
diff --git a/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp b/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp
index 171fb8394990d..98cdada3d8add 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp
@@ -112,8 +112,7 @@ void DbgValueHistoryMap::Entry::endEntry(EntryIndex Index) {
 /// to the first intersecting scope range if one exists.
 static std::optional<ArrayRef<InsnRange>::iterator>
 intersects(const MachineInstr *StartMI, const MachineInstr *EndMI,
-           const ArrayRef<InsnRange> &Ranges,
-           const InstructionOrdering &Ordering) {
+           ArrayRef<InsnRange> Ranges, const InstructionOrdering &Ordering) {
   for (auto RangesI = Ranges.begin(), RangesE = Ranges.end();
        RangesI != RangesE; ++RangesI) {
     if (EndMI && Ordering.isBefore(EndMI, RangesI->first))
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index 518121e200190..751d3735d3b2b 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -1793,9 +1793,13 @@ void DwarfCompileUnit::createBaseTypeDIEs() {
                     "_" + Twine(Btr.BitSize)).toStringRef(Str));
     addUInt(Die, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1, Btr.Encoding);
     // Round up to smallest number of bytes that contains this number of bits.
+    // ExprRefedBaseTypes is populated with types referenced by
+    // DW_OP_LLVM_convert operations in location expressions. These are often
+    // byte-sized, but one common counter-example is 1-bit sized conversions
+    // from `i1` types. TODO: Should these use DW_AT_bit_size? See
+    // DwarfUnit::constructTypeDIE.
     addUInt(Die, dwarf::DW_AT_byte_size, std::nullopt,
             divideCeil(Btr.BitSize, 8));
-
     Btr.Die = &Die;
   }
 }
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index e40fb768027b8..b16e131529ac3 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -766,8 +766,19 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIBasicType *BTy) {
     addUInt(Buffer, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1,
             BTy->getEncoding());
 
-  uint64_t Size = BTy->getSizeInBits() >> 3;
-  addUInt(Buffer, dwarf::DW_AT_byte_size, std::nullopt, Size);
+  uint64_t SizeInBytes = divideCeil(BTy->getSizeInBits(), 8);
+  addUInt(Buffer, dwarf::DW_AT_byte_size, std::nullopt, SizeInBytes);
+  if (BTy->getTag() == dwarf::Tag::DW_TAG_base_type) {
+    // DW_TAG_base_type:
+    // If the value of an object of the given type does not fully occupy the
+    // storage described by a byte size attribute, the base type entry may also
+    // have a DW_AT_bit_size [...] attribute.
+    // TODO: Do big endian targets need DW_AT_data_bit_offset? See discussion in
+    // pull request #164372.
+    if (uint64_t DataSizeInBits = BTy->getDataSizeInBits();
+        DataSizeInBits && DataSizeInBits != SizeInBytes * 8)
+      addUInt(Buffer, dwarf::DW_AT_bit_size, std::nullopt, DataSizeInBits);
+  }
 
   if (BTy->isBigEndian())
     addUInt(Buffer, dwarf::DW_AT_endianity, std::nullopt, dwarf::DW_END_big);
@@ -1109,7 +1120,7 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) {
           constructMemberDIE(Buffer, DDTy);
         }
       } else if (auto *Property = dyn_cast<DIObjCProperty>(Element)) {
-        DIE &ElemDie = createAndAddDIE(Property->getTag(), Buffer);
+        DIE &ElemDie = createAndAddDIE(Property->getTag(), Buffer, Property);
         StringRef PropertyName = Property->getName();
         addString(ElemDie, dwarf::DW_AT_APPLE_property_name, PropertyName);
         if (Property->getType())
diff --git a/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp b/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp
index fbcd614b85d18..485b44ae4c4aa 100644
--- a/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp
+++ b/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp
@@ -287,6 +287,25 @@ Error BasicBlockSectionsProfileReader::ReadV1Profile() {
       }
       continue;
     }
+    case 'h': { // Basic block hash secifier.
+      // Skip the profile when the profile iterator (FI) refers to the
+      // past-the-end element.
+      if (FI == ProgramPathAndClusterInfo.end())
+        continue;
+      for (auto BBIDHashStr : Values) {
+        auto [BBIDStr, HashStr] = BBIDHashStr.split(':');
+        unsigned long long BBID = 0, Hash = 0;
+        if (getAsUnsignedInteger(BBIDStr, 10, BBID))
+          return createProfileParseError(Twine("unsigned integer expected: '") +
+                                         BBIDStr + "'");
+        if (getAsUnsignedInteger(HashStr, 16, Hash))
+          return createProfileParseError(
+              Twine("unsigned integer expected in hex format: '") + HashStr +
+              "'");
+        FI->second.BBHashes[BBID] = Hash;
+      }
+      continue;
+    }
     default:
       return createProfileParseError(Twine("invalid specifier: '") +
                                      Twine(Specifier) + "'");
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 8ea132626a5af..0309e225d9df4 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -368,7 +368,7 @@ class CodeGenPrepare {
   std::unique_ptr<DominatorTree> DT;
 
 public:
-  CodeGenPrepare(){};
+  CodeGenPrepare() = default;
   CodeGenPrepare(const TargetMachine *TM) : TM(TM){};
   /// If encounter huge function, we need to limit the build time.
   bool IsHugeFunc = false;
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 1f104784a97ec..ec4d13f1cd1b3 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -589,8 +589,8 @@ bool CombinerHelper::matchCombineShuffleVector(
   return true;
 }
 
-void CombinerHelper::applyCombineShuffleVector(
-    MachineInstr &MI, const ArrayRef<Register> Ops) const {
+void CombinerHelper::applyCombineShuffleVector(MachineInstr &MI,
+                                               ArrayRef<Register> Ops) const {
   Register DstReg = MI.getOperand(0).getReg();
   Builder.setInsertPt(*MI.getParent(), MI);
   Register NewDstReg = MRI.cloneVirtualRegister(DstReg);
@@ -4425,6 +4425,7 @@ void CombinerHelper::applyBuildFnNoErase(
 }
 
 bool CombinerHelper::matchOrShiftToFunnelShift(MachineInstr &MI,
+                                               bool AllowScalarConstants,
                                                BuildFnTy &MatchInfo) const {
   assert(MI.getOpcode() == TargetOpcode::G_OR);
 
@@ -4444,31 +4445,29 @@ bool CombinerHelper::matchOrShiftToFunnelShift(MachineInstr &MI,
 
   // Given constants C0 and C1 such that C0 + C1 is bit-width:
   // (or (shl x, C0), (lshr y, C1)) -> (fshl x, y, C0) or (fshr x, y, C1)
-  int64_t CstShlAmt, CstLShrAmt;
+  int64_t CstShlAmt = 0, CstLShrAmt;
   if (mi_match(ShlAmt, MRI, m_ICstOrSplat(CstShlAmt)) &&
       mi_match(LShrAmt, MRI, m_ICstOrSplat(CstLShrAmt)) &&
       CstShlAmt + CstLShrAmt == BitWidth) {
     FshOpc = TargetOpcode::G_FSHR;
     Amt = LShrAmt;
-
   } else if (mi_match(LShrAmt, MRI,
                       m_GSub(m_SpecificICstOrSplat(BitWidth), m_Reg(Amt))) &&
              ShlAmt == Amt) {
     // (or (shl x, amt), (lshr y, (sub bw, amt))) -> (fshl x, y, amt)
     FshOpc = TargetOpcode::G_FSHL;
-
   } else if (mi_match(ShlAmt, MRI,
                       m_GSub(m_SpecificICstOrSplat(BitWidth), m_Reg(Amt))) &&
              LShrAmt == Amt) {
     // (or (shl x, (sub bw, amt)), (lshr y, amt)) -> (fshr x, y, amt)
     FshOpc = TargetOpcode::G_FSHR;
-
   } else {
     return false;
   }
 
   LLT AmtTy = MRI.getType(Amt);
-  if (!isLegalOrBeforeLegalizer({FshOpc, {Ty, AmtTy}}))
+  if (!isLegalOrBeforeLegalizer({FshOpc, {Ty, AmtTy}}) &&
+      (!AllowScalarConstants || CstShlAmt == 0 || !Ty.isScalar()))
     return false;
 
   MatchInfo = [=](MachineIRBuilder &B) {
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index ca82857319abc..5fab6ec506e94 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -1893,6 +1893,8 @@ static bool canCreateUndefOrPoison(Register Reg, const MachineRegisterInfo &MRI,
   case TargetOpcode::G_UADDSAT:
   case TargetOpcode::G_SSUBSAT:
   case TargetOpcode::G_USUBSAT:
+  case TargetOpcode::G_SBFX:
+  case TargetOpcode::G_UBFX:
     return false;
   case TargetOpcode::G_SSHLSAT:
   case TargetOpcode::G_USHLSAT:
diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp
index bb9c76ff0c729..8c6d2194433d0 100644
--- a/llvm/lib/CodeGen/MachineOperand.cpp
+++ b/llvm/lib/CodeGen/MachineOperand.cpp
@@ -363,8 +363,9 @@ bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const {
   case MachineOperand::MO_RegisterMask:
   case MachineOperand::MO_RegisterLiveOut: {
     // Shallow compare of the two RegMasks
-    const uint32_t *RegMask = getRegMask();
-    const uint32_t *OtherRegMask = Other.getRegMask();
+    const uint32_t *RegMask = isRegMask() ? getRegMask() : getRegLiveOut();
+    const uint32_t *OtherRegMask =
+        isRegMask() ? Other.getRegMask() : Other.getRegLiveOut();
     if (RegMask == OtherRegMask)
       return true;
 
@@ -434,7 +435,8 @@ hash_code llvm::hash_value(const MachineOperand &MO) {
     if (const MachineFunction *MF = getMFIfAvailable(MO)) {
       const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
       unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
-      const uint32_t *RegMask = MO.getRegMask();
+      const uint32_t *RegMask =
+          MO.isRegMask() ? MO.getRegMask() : MO.getRegLiveOut();
       std::vector<stable_hash> RegMaskHashes(RegMask, RegMask + RegMaskSize);
       return hash_combine(MO.getType(), MO.getTargetFlags(),
                           stable_hash_combine(RegMaskHashes));
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 3ed10454f76c5..f18c051142960 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -334,7 +334,7 @@ class MachineSchedulerImpl : public MachineSchedulerBase {
     LiveIntervals &LIS;
   };
 
-  MachineSchedulerImpl() {}
+  MachineSchedulerImpl() = default;
   // Migration only
   void setLegacyPass(MachineFunctionPass *P) { this->P = P; }
   void setMFAM(MachineFunctionAnalysisManager *MFAM) { this->MFAM = MFAM; }
@@ -358,7 +358,7 @@ class PostMachineSchedulerImpl : public MachineSchedulerBase {
     MachineLoopInfo &MLI;
     AAResults &AA;
   };
-  PostMachineSchedulerImpl() {}
+  PostMachineSchedulerImpl() = default;
   // Migration only
   void setLegacyPass(MachineFunctionPass *P) { this->P = P; }
   void setMFAM(MachineFunctionAnalysisManager *MFAM) { this->MFAM = MFAM; }
diff --git a/llvm/lib/CodeGen/MachineStableHash.cpp b/llvm/lib/CodeGen/MachineStableHash.cpp
index 9d56696079478..6da708d51b95f 100644
--- a/llvm/lib/CodeGen/MachineStableHash.cpp
+++ b/llvm/lib/CodeGen/MachineStableHash.cpp
@@ -136,7 +136,8 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) {
           const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
           unsigned RegMaskSize =
               MachineOperand::getRegMaskSize(TRI->getNumRegs());
-          const uint32_t *RegMask = MO.getRegMask();
+          const uint32_t *RegMask =
+              MO.isRegMask() ? MO.getRegMask() : MO.getRegLiveOut();
           std::vector<llvm::stable_hash> RegMaskHashes(RegMask,
                                                        RegMask + RegMaskSize);
           return stable_hash_combine(MO.getType(), MO.getTargetFlags(),
diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp
index 697b779e10106..ec6ffd4809246 100644
--- a/llvm/lib/CodeGen/RegAllocFast.cpp
+++ b/llvm/lib/CodeGen/RegAllocFast.cpp
@@ -206,7 +206,7 @@ class RegAllocFastImpl {
     bool Error = false;              ///< Could not allocate.
 
     explicit LiveReg(Register VirtReg) : VirtReg(VirtReg) {}
-    explicit LiveReg() {}
+    explicit LiveReg() = default;
 
     unsigned getSparseSetIndex() const { return VirtReg.virtRegIndex(); }
   };
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index e17a214b9a27d..38f6deb39ddf3 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -378,7 +378,7 @@ class RegisterCoalescer : private LiveRangeEdit::Delegate {
 
 public:
   // For legacy pass only.
-  RegisterCoalescer() {}
+  RegisterCoalescer() = default;
   RegisterCoalescer &operator=(RegisterCoalescer &&Other) = default;
 
   RegisterCoalescer(LiveIntervals *LIS, SlotIndexes *SI,
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index cf221bba1e3a3..46c4bb85a7420 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2715,6 +2715,12 @@ SDValue DAGCombiner::visitPTRADD(SDNode *N) {
           (N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap;
       SDValue Add = DAG.getNode(ISD::ADD, DL, IntVT, {Y, Z}, Flags);
       AddToWorklist(Add.getNode());
+      // We can't set InBounds even if both original ptradds were InBounds and
+      // NUW: SDAG usually represents pointers as integers, therefore, the
+      // matched pattern behaves as if it had implicit casts:
+      //   (ptradd inbounds (inttoptr (ptrtoint (ptradd inbounds x, y))), z)
+      // The outer inbounds ptradd might therefore rely on a provenance that x
+      // does not have.
       return DAG.getMemBasePlusOffset(X, Add, DL, Flags);
     }
   }
@@ -2740,6 +2746,12 @@ SDValue DAGCombiner::visitPTRADD(SDNode *N) {
         // that.
         SDNodeFlags Flags =
             (N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap;
+        // We can't set InBounds even if both original ptradds were InBounds and
+        // NUW: SDAG usually represents pointers as integers, therefore, the
+        // matched pattern behaves as if it had implicit casts:
+        //   (ptradd inbounds (inttoptr (ptrtoint (ptradd inbounds GA, v))), c)
+        // The outer inbounds ptradd might therefore rely on a provenance that
+        // GA does not have.
         SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags);
         AddToWorklist(Inner.getNode());
         return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags);
@@ -2763,8 +2775,13 @@ SDValue DAGCombiner::visitPTRADD(SDNode *N) {
     bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
 
     // If both additions in the original were NUW, reassociation preserves that.
-    SDNodeFlags ReassocFlags =
-        (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
+    SDNodeFlags CommonFlags = N->getFlags() & N1->getFlags();
+    SDNodeFlags ReassocFlags = CommonFlags & SDNodeFlags::NoUnsignedWrap;
+    if (CommonFlags.hasNoUnsignedWrap()) {
+      // If both operations are NUW and the PTRADD is inbounds, the offests are
+      // both non-negative, so the reassociated PTRADDs are also inbounds.
+      ReassocFlags |= N->getFlags() & SDNodeFlags::InBounds;
+    }
 
     if (ZIsConstant != YIsConstant) {
       if (YIsConstant)
@@ -9357,7 +9374,7 @@ static unsigned bigEndianByteAt(unsigned BW, unsigned i) {
 // Check if the bytes offsets we are looking at match with either big or
 // little endian value loaded. Return true for big endian, false for little
 // endian, and std::nullopt if match failed.
-static std::optional<bool> isBigEndian(const ArrayRef<int64_t> ByteOffsets,
+static std::optional<bool> isBigEndian(ArrayRef<int64_t> ByteOffsets,
                                        int64_t FirstOffset) {
   // The endian can be decided only when it is 2 bytes at least.
   unsigned Width = ByteOffsets.size();
@@ -22743,7 +22760,10 @@ SDValue DAGCombiner::replaceStoreOfInsertLoad(StoreSDNode *ST) {
     NewPtr = DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(COffset), DL);
     PointerInfo = ST->getPointerInfo().getWithOffset(COffset);
   } else {
-    NewPtr = TLI.getVectorElementPointer(DAG, Ptr, Value.getValueType(), Idx);
+    // The original DAG loaded the entire vector from memory, so arithmetic
+    // within it must be inbounds.
+    NewPtr = TLI.getInboundsVectorElementPointer(DAG, Ptr, Value.getValueType(),
+                                                 Idx);
   }
 
   return DAG.getStore(Chain, DL, Elt, NewPtr, PointerInfo, ST->getAlign(),
@@ -23506,6 +23526,93 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
     // inselt undef, InVal, EltNo --> build_vector < InVal, InVal, ... >
     if (InVec.isUndef() && TLI.shouldSplatInsEltVarIndex(VT))
       return DAG.getSplat(VT, DL, InVal);
+
+    // Extend this type to be byte-addressable
+    EVT OldVT = VT;
+    EVT EltVT = VT.getVectorElementType();
+    bool IsByteSized = EltVT.isByteSized();
+    if (!IsByteSized) {
+      EltVT =
+          EltVT.changeTypeToInteger().getRoundIntegerType(*DAG.getContext());
+      VT = VT.changeElementType(EltVT);
+    }
+
+    // Check if this operation will be handled the default way for its type.
+    auto IsTypeDefaultHandled = [this](EVT VT) {
+      return TLI.getTypeAction(*DAG.getContext(), VT) ==
+                 TargetLowering::TypeSplitVector ||
+             TLI.isOperationExpand(ISD::INSERT_VECTOR_ELT, VT);
+    };
+
+    // Check if this operation is illegal and will be handled the default way,
+    // even after extending the type to be byte-addressable.
+    if (IsTypeDefaultHandled(OldVT) && IsTypeDefaultHandled(VT)) {
+      // For each dynamic insertelt, the default way will save the vector to
+      // the stack, store at an offset, and load the modified vector. This can
+      // dramatically increase code size if we have a chain of insertelts on a
+      // large vector: requiring O(V*C) stores/loads where V = length of
+      // vector and C is length of chain. If each insertelt is only fed into the
+      // next, the vector is write-only across this chain, and we can just
+      // save once before the chain and load after in O(V + C) operations.
+      SmallVector<SDNode *> Seq{N};
+      unsigned NumDynamic = 1;
+      while (true) {
+        SDValue InVec = Seq.back()->getOperand(0);
+        if (InVec.getOpcode() != ISD::INSERT_VECTOR_ELT)
+          break;
+        Seq.push_back(InVec.getNode());
+        NumDynamic += !isa<ConstantSDNode>(InVec.getOperand(2));
+      }
+
+      // It always and only makes sense to lower this sequence when we have more
+      // than one dynamic insertelt, since we will not have more than V constant
+      // insertelts, so we will be reducing the total number of stores+loads.
+      if (NumDynamic > 1) {
+        // In cases where the vector is illegal it will be broken down into
+        // parts and stored in parts - we should use the alignment for the
+        // smallest part.
+        Align SmallestAlign = DAG.getReducedAlign(VT, /*UseABI=*/false);
+        SDValue StackPtr =
+            DAG.CreateStackTemporary(VT.getStoreSize(), SmallestAlign);
+        auto &MF = DAG.getMachineFunction();
+        int FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+        auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
+
+        // Save the vector to the stack
+        SDValue InVec = Seq.back()->getOperand(0);
+        if (!IsByteSized)
+          InVec = DAG.getNode(ISD::ANY_EXTEND, DL, VT, InVec);
+        SDValue Store = DAG.getStore(DAG.getEntryNode(), DL, InVec, StackPtr,
+                                     PtrInfo, SmallestAlign);
+
+        // Lower each dynamic insertelt to a store
+        for (SDNode *N : reverse(Seq)) {
+          SDValue Elmnt = N->getOperand(1);
+          SDValue Index = N->getOperand(2);
+
+          // Check if we have to extend the element type
+          if (!IsByteSized && Elmnt.getValueType().bitsLT(EltVT))
+            Elmnt = DAG.getNode(ISD::ANY_EXTEND, DL, EltVT, Elmnt);
+
+          // Store the new element. This may be larger than the vector element
+          // type, so use a truncating store.
+          SDValue EltPtr =
+              TLI.getVectorElementPointer(DAG, StackPtr, VT, Index);
+          EVT EltVT = Elmnt.getValueType();
+          Store = DAG.getTruncStore(
+              Store, DL, Elmnt, EltPtr, MachinePointerInfo::getUnknownStack(MF),
+              EltVT,
+              commonAlignment(SmallestAlign, EltVT.getFixedSizeInBits() / 8));
+        }
+
+        // Load the saved vector from the stack
+        SDValue Load =
+            DAG.getLoad(VT, DL, Store, StackPtr, PtrInfo, SmallestAlign);
+        SDValue LoadV = Load.getValue(0);
+        return IsByteSized ? LoadV : DAG.getAnyExtOrTrunc(LoadV, DL, OldVT);
+      }
+    }
+
     return SDValue();
   }
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 5fb7e63cfb605..431a81002074f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2400,10 +2400,11 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
   Results.push_back(Rem);
 }
 
-/// Return true if sincos libcall is available.
+/// Return true if sincos or __sincos_stret libcall is available.
 static bool isSinCosLibcallAvailable(SDNode *Node, const TargetLowering &TLI) {
-  RTLIB::Libcall LC = RTLIB::getSINCOS(Node->getSimpleValueType(0).SimpleTy);
-  return TLI.getLibcallName(LC) != nullptr;
+  MVT::SimpleValueType VT = Node->getSimpleValueType(0).SimpleTy;
+  return TLI.getLibcallImpl(RTLIB::getSINCOS(VT)) != RTLIB::Unsupported ||
+         TLI.getLibcallImpl(RTLIB::getSINCOS_STRET(VT)) != RTLIB::Unsupported;
 }
 
 /// Only issue sincos libcall if both sin and cos are needed.
@@ -3752,9 +3753,9 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     EVT VT = Node->getValueType(0);
     // Turn fsin / fcos into ISD::FSINCOS node if there are a pair of fsin /
     // fcos which share the same operand and both are used.
-    if ((TLI.isOperationLegalOrCustom(ISD::FSINCOS, VT) ||
-         isSinCosLibcallAvailable(Node, TLI))
-        && useSinCos(Node)) {
+    if ((TLI.isOperationLegal(ISD::FSINCOS, VT) ||
+         isSinCosLibcallAvailable(Node, TLI)) &&
+        useSinCos(Node)) {
       SDVTList VTs = DAG.getVTList(VT, VT);
       Tmp1 = DAG.getNode(ISD::FSINCOS, dl, VTs, Node->getOperand(0));
       if (Node->getOpcode() == ISD::FCOS)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index bf1abfe50327e..58983cb57d7f6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -1172,6 +1172,12 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) {
   case ISD::FAKE_USE:
     Res = SoftenFloatOp_FAKE_USE(N);
     break;
+  case ISD::STACKMAP:
+    Res = SoftenFloatOp_STACKMAP(N, OpNo);
+    break;
+  case ISD::PATCHPOINT:
+    Res = SoftenFloatOp_PATCHPOINT(N, OpNo);
+    break;
   }
 
   // If the result is null, the sub-method took care of registering results etc.
@@ -1512,6 +1518,20 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FAKE_USE(SDNode *N) {
                      N->getOperand(0), Op1);
 }
 
+SDValue DAGTypeLegalizer::SoftenFloatOp_STACKMAP(SDNode *N, unsigned OpNo) {
+  assert(OpNo > 1); // Because the first two arguments are guaranteed legal.
+  SmallVector<SDValue> NewOps(N->ops());
+  NewOps[OpNo] = GetSoftenedFloat(NewOps[OpNo]);
+  return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_PATCHPOINT(SDNode *N, unsigned OpNo) {
+  assert(OpNo >= 7);
+  SmallVector<SDValue> NewOps(N->ops());
+  NewOps[OpNo] = GetSoftenedFloat(NewOps[OpNo]);
+  return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+}
+
 //===----------------------------------------------------------------------===//
 //  Float Result Expansion
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index b1776eaae6e86..44e5a187c4281 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -2871,18 +2871,14 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SET_ROUNDING(SDNode *N) {
 SDValue DAGTypeLegalizer::PromoteIntOp_STACKMAP(SDNode *N, unsigned OpNo) {
   assert(OpNo > 1); // Because the first two arguments are guaranteed legal.
   SmallVector<SDValue> NewOps(N->ops());
-  SDValue Operand = N->getOperand(OpNo);
-  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Operand.getValueType());
-  NewOps[OpNo] = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), NVT, Operand);
+  NewOps[OpNo] = GetPromotedInteger(NewOps[OpNo]);
   return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
 }
 
 SDValue DAGTypeLegalizer::PromoteIntOp_PATCHPOINT(SDNode *N, unsigned OpNo) {
   assert(OpNo >= 7);
   SmallVector<SDValue> NewOps(N->ops());
-  SDValue Operand = N->getOperand(OpNo);
-  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Operand.getValueType());
-  NewOps[OpNo] = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), NVT, Operand);
+  NewOps[OpNo] = GetPromotedInteger(NewOps[OpNo]);
   return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
 }
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 9656a30321efa..ede522eff6df3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -658,6 +658,8 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue SoftenFloatOp_ATOMIC_STORE(SDNode *N, unsigned OpNo);
   SDValue SoftenFloatOp_FCOPYSIGN(SDNode *N);
   SDValue SoftenFloatOp_FAKE_USE(SDNode *N);
+  SDValue SoftenFloatOp_STACKMAP(SDNode *N, unsigned OpNo);
+  SDValue SoftenFloatOp_PATCHPOINT(SDNode *N, unsigned OpNo);
 
   //===--------------------------------------------------------------------===//
   // Float Expansion Support: LegalizeFloatTypes.cpp
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index da4e40953b39a..9bdf82210fed1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -10668,19 +10668,20 @@ static SDValue clampDynamicVectorIndex(SelectionDAG &DAG, SDValue Idx,
                      DAG.getConstant(MaxIndex, dl, IdxVT));
 }
 
-SDValue TargetLowering::getVectorElementPointer(SelectionDAG &DAG,
-                                                SDValue VecPtr, EVT VecVT,
-                                                SDValue Index) const {
+SDValue
+TargetLowering::getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr,
+                                        EVT VecVT, SDValue Index,
+                                        const SDNodeFlags PtrArithFlags) const {
   return getVectorSubVecPointer(
       DAG, VecPtr, VecVT,
       EVT::getVectorVT(*DAG.getContext(), VecVT.getVectorElementType(), 1),
-      Index);
+      Index, PtrArithFlags);
 }
 
-SDValue TargetLowering::getVectorSubVecPointer(SelectionDAG &DAG,
-                                               SDValue VecPtr, EVT VecVT,
-                                               EVT SubVecVT,
-                                               SDValue Index) const {
+SDValue
+TargetLowering::getVectorSubVecPointer(SelectionDAG &DAG, SDValue VecPtr,
+                                       EVT VecVT, EVT SubVecVT, SDValue Index,
+                                       const SDNodeFlags PtrArithFlags) const {
   SDLoc dl(Index);
   // Make sure the index type is big enough to compute in.
   Index = DAG.getZExtOrTrunc(Index, dl, VecPtr.getValueType());
@@ -10704,7 +10705,7 @@ SDValue TargetLowering::getVectorSubVecPointer(SelectionDAG &DAG,
 
   Index = DAG.getNode(ISD::MUL, dl, IdxVT, Index,
                       DAG.getConstant(EltSize, dl, IdxVT));
-  return DAG.getMemBasePlusOffset(VecPtr, Index, dl);
+  return DAG.getMemBasePlusOffset(VecPtr, Index, dl, PtrArithFlags);
 }
 
 //===----------------------------------------------------------------------===//
@@ -12382,8 +12383,10 @@ SDValue TargetLowering::scalarizeExtractedVectorLoad(EVT ResultVT,
       !IsFast)
     return SDValue();
 
-  SDValue NewPtr =
-      getVectorElementPointer(DAG, OriginalLoad->getBasePtr(), InVecVT, EltNo);
+  // The original DAG loaded the entire vector from memory, so arithmetic
+  // within it must be inbounds.
+  SDValue NewPtr = getInboundsVectorElementPointer(
+      DAG, OriginalLoad->getBasePtr(), InVecVT, EltNo);
 
   // We are replacing a vector load with a scalar load. The new load must have
   // identical memory op ordering to the original.
diff --git a/llvm/lib/CodeGen/SwitchLoweringUtils.cpp b/llvm/lib/CodeGen/SwitchLoweringUtils.cpp
index 038c499fe236e..3fa8243c03423 100644
--- a/llvm/lib/CodeGen/SwitchLoweringUtils.cpp
+++ b/llvm/lib/CodeGen/SwitchLoweringUtils.cpp
@@ -198,7 +198,6 @@ bool SwitchCG::SwitchLowering::buildJumpTable(const CaseClusterVector &Clusters,
   assert(First <= Last);
 
   auto Prob = BranchProbability::getZero();
-  unsigned NumCmps = 0;
   std::vector<MachineBasicBlock*> Table;
   DenseMap<MachineBasicBlock*, BranchProbability> JTProbs;
 
@@ -206,12 +205,16 @@ bool SwitchCG::SwitchLowering::buildJumpTable(const CaseClusterVector &Clusters,
   for (unsigned I = First; I <= Last; ++I)
     JTProbs[Clusters[I].MBB] = BranchProbability::getZero();
 
+  DenseMap<const BasicBlock *, unsigned int> DestMap;
   for (unsigned I = First; I <= Last; ++I) {
     assert(Clusters[I].Kind == CC_Range);
     Prob += Clusters[I].Prob;
     const APInt &Low = Clusters[I].Low->getValue();
     const APInt &High = Clusters[I].High->getValue();
-    NumCmps += (Low == High) ? 1 : 2;
+    unsigned int NumCmp = (Low == High) ? 1 : 2;
+    const BasicBlock *BB = Clusters[I].MBB->getBasicBlock();
+    DestMap[BB] += NumCmp;
+
     if (I != First) {
       // Fill the gap between this and the previous cluster.
       const APInt &PreviousHigh = Clusters[I - 1].High->getValue();
@@ -226,9 +229,7 @@ bool SwitchCG::SwitchLowering::buildJumpTable(const CaseClusterVector &Clusters,
     JTProbs[Clusters[I].MBB] += Clusters[I].Prob;
   }
 
-  unsigned NumDests = JTProbs.size();
-  if (TLI->isSuitableForBitTests(NumDests, NumCmps,
-                                 Clusters[First].Low->getValue(),
+  if (TLI->isSuitableForBitTests(DestMap, Clusters[First].Low->getValue(),
                                  Clusters[Last].High->getValue(), *DL)) {
     // Clusters[First..Last] should be lowered as bit tests instead.
     return false;
@@ -372,20 +373,19 @@ bool SwitchCG::SwitchLowering::buildBitTests(CaseClusterVector &Clusters,
   if (First == Last)
     return false;
 
-  BitVector Dests(FuncInfo.MF->getNumBlockIDs());
-  unsigned NumCmps = 0;
+  DenseMap<const BasicBlock *, unsigned int> DestMap;
   for (int64_t I = First; I <= Last; ++I) {
     assert(Clusters[I].Kind == CC_Range);
-    Dests.set(Clusters[I].MBB->getNumber());
-    NumCmps += (Clusters[I].Low == Clusters[I].High) ? 1 : 2;
+    unsigned NumCmp = (Clusters[I].Low == Clusters[I].High) ? 1 : 2;
+    const BasicBlock *BB = Clusters[I].MBB->getBasicBlock();
+    DestMap[BB] += NumCmp;
   }
-  unsigned NumDests = Dests.count();
 
   APInt Low = Clusters[First].Low->getValue();
   APInt High = Clusters[Last].High->getValue();
   assert(Low.slt(High));
 
-  if (!TLI->isSuitableForBitTests(NumDests, NumCmps, Low, High, *DL))
+  if (!TLI->isSuitableForBitTests(DestMap, Low, High, *DL))
     return false;
 
   APInt LowBound;
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 59798b3cf201a..b3535eaca5e9d 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
@@ -90,6 +91,11 @@ static cl::opt<unsigned> OptsizeJumpTableDensity(
     cl::desc("Minimum density for building a jump table in "
              "an optsize function"));
 
+static cl::opt<unsigned> MinimumBitTestCmpsOverride(
+    "min-bit-test-cmps", cl::init(2), cl::Hidden,
+    cl::desc("Set minimum of largest number of comparisons "
+             "to use bit test for switch."));
+
 // FIXME: This option is only to test if the strict fp operation processed
 // correctly by preventing mutating strict fp operation to normal fp operation
 // during development. When the backend supports strict float operation, this
@@ -428,6 +434,11 @@ RTLIB::Libcall RTLIB::getSINCOSPI(EVT RetVT) {
                       SINCOSPI_F128, SINCOSPI_PPCF128);
 }
 
+RTLIB::Libcall RTLIB::getSINCOS_STRET(EVT RetVT) {
+  return getFPLibCall(RetVT, SINCOS_STRET_F32, SINCOS_STRET_F64,
+                      UNKNOWN_LIBCALL, UNKNOWN_LIBCALL, UNKNOWN_LIBCALL);
+}
+
 RTLIB::Libcall RTLIB::getMODF(EVT RetVT) {
   return getFPLibCall(RetVT, MODF_F32, MODF_F64, MODF_F80, MODF_F128,
                       MODF_PPCF128);
@@ -719,6 +730,8 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm)
 
   MinCmpXchgSizeInBits = 0;
   SupportsUnalignedAtomics = false;
+
+  MinimumBitTestCmps = MinimumBitTestCmpsOverride;
 }
 
 // Define the virtual destructor out-of-line to act as a key method to anchor
@@ -2129,6 +2142,14 @@ bool TargetLoweringBase::isJumpTableRelative() const {
   return getTargetMachine().isPositionIndependent();
 }
 
+unsigned TargetLoweringBase::getMinimumBitTestCmps() const {
+  return MinimumBitTestCmps;
+}
+
+void TargetLoweringBase::setMinimumBitTestCmps(unsigned Val) {
+  MinimumBitTestCmps = Val;
+}
+
 Align TargetLoweringBase::getPrefLoopAlignment(MachineLoop *ML) const {
   if (TM.Options.LoopAlignment)
     return Align(TM.Options.LoopAlignment);
diff --git a/llvm/lib/CodeGenTypes/LowLevelType.cpp b/llvm/lib/CodeGenTypes/LowLevelType.cpp
index 4785f2652b00e..92b7fad3a0e24 100644
--- a/llvm/lib/CodeGenTypes/LowLevelType.cpp
+++ b/llvm/lib/CodeGenTypes/LowLevelType.cpp
@@ -54,9 +54,3 @@ LLVM_DUMP_METHOD void LLT::dump() const {
   dbgs() << '\n';
 }
 #endif
-
-const constexpr LLT::BitFieldInfo LLT::ScalarSizeFieldInfo;
-const constexpr LLT::BitFieldInfo LLT::PointerSizeFieldInfo;
-const constexpr LLT::BitFieldInfo LLT::PointerAddressSpaceFieldInfo;
-const constexpr LLT::BitFieldInfo LLT::VectorElementsFieldInfo;
-const constexpr LLT::BitFieldInfo LLT::VectorScalableFieldInfo;
diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerUnit.h b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerUnit.h
index 84757aea7045d..970abdc38f417 100644
--- a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerUnit.h
+++ b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerUnit.h
@@ -28,7 +28,7 @@ using MacroOffset2UnitMapTy = DenseMap<uint64_t, DwarfUnit *>;
 /// Base class for all Dwarf units(Compile unit/Type table unit).
 class DwarfUnit : public OutputSections {
 public:
-  virtual ~DwarfUnit() {}
+  virtual ~DwarfUnit() = default;
   DwarfUnit(LinkingGlobalData &GlobalData, unsigned ID,
             StringRef ClangModuleName)
       : OutputSections(GlobalData), ID(ID), ClangModuleName(ClangModuleName),
diff --git a/llvm/lib/DWARFLinker/Parallel/StringEntryToDwarfStringPoolEntryMap.h b/llvm/lib/DWARFLinker/Parallel/StringEntryToDwarfStringPoolEntryMap.h
index f67536ef7a1a8..8ccb4a502aaba 100644
--- a/llvm/lib/DWARFLinker/Parallel/StringEntryToDwarfStringPoolEntryMap.h
+++ b/llvm/lib/DWARFLinker/Parallel/StringEntryToDwarfStringPoolEntryMap.h
@@ -22,7 +22,7 @@ class StringEntryToDwarfStringPoolEntryMap {
 public:
   StringEntryToDwarfStringPoolEntryMap(LinkingGlobalData &GlobalData)
       : GlobalData(GlobalData) {}
-  ~StringEntryToDwarfStringPoolEntryMap() {}
+  ~StringEntryToDwarfStringPoolEntryMap() = default;
 
   /// Create DwarfStringPoolEntry for specified StringEntry if necessary.
   /// Initialize DwarfStringPoolEntry with initial values.
diff --git a/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp b/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp
index 6c23ba8f3c466..23ab5344df1ed 100644
--- a/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp
+++ b/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp
@@ -102,7 +102,8 @@ std::optional<CVType> LazyRandomTypeCollection::tryGetType(TypeIndex Index) {
     return std::nullopt;
   }
 
-  assert(contains(Index));
+  if (!contains(Index))
+    return std::nullopt;
   return Records[Index.toArrayIndex()].Type;
 }
 
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
index db5cc37c93f90..6c78ef05e1b61 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
@@ -129,6 +129,25 @@ prettyLanguageVersionString(const DWARFAttribute &AttrValue,
       static_cast<SourceLanguageName>(*LName), *LVersion);
 }
 
+static llvm::Expected<llvm::StringRef>
+getApplePropertyName(const DWARFDie &PropDIE) {
+  if (!PropDIE)
+    return llvm::createStringError("invalid DIE");
+
+  if (PropDIE.getTag() != DW_TAG_APPLE_property)
+    return llvm::createStringError("not referencing a DW_TAG_APPLE_property");
+
+  auto PropNameForm = PropDIE.find(DW_AT_APPLE_property_name);
+  if (!PropNameForm)
+    return "";
+
+  auto NameOrErr = PropNameForm->getAsCString();
+  if (!NameOrErr)
+    return NameOrErr.takeError();
+
+  return *NameOrErr;
+}
+
 static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die,
                           const DWARFAttribute &AttrValue, unsigned Indent,
                           DIDumpOptions DumpOpts) {
@@ -233,6 +252,15 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die,
             Die.getAttributeValueAsReferencedDie(FormValue).getName(
                 DINameKind::LinkageName))
       OS << Space << "\"" << Name << '\"';
+  } else if (Attr == DW_AT_APPLE_property) {
+    auto PropDIE = Die.getAttributeValueAsReferencedDie(FormValue);
+    if (auto PropNameOrErr = getApplePropertyName(PropDIE))
+      OS << Space << "\"" << *PropNameOrErr << '\"';
+    else
+      DumpOpts.RecoverableErrorHandler(createStringError(
+          errc::invalid_argument,
+          llvm::formatv("decoding DW_AT_APPLE_property_name: {}",
+                        toString(PropNameOrErr.takeError()))));
   } else if (Attr == DW_AT_type || Attr == DW_AT_containing_type) {
     DWARFDie D = resolveReferencedType(Die, FormValue);
     if (D && !D.isNULL()) {
diff --git a/llvm/lib/DebugInfo/GSYM/GsymCreator.cpp b/llvm/lib/DebugInfo/GSYM/GsymCreator.cpp
index 93ff3b924db32..d87cb4d2210a6 100644
--- a/llvm/lib/DebugInfo/GSYM/GsymCreator.cpp
+++ b/llvm/lib/DebugInfo/GSYM/GsymCreator.cpp
@@ -552,7 +552,7 @@ llvm::Error GsymCreator::saveSegments(StringRef Path,
         createSegment(SegmentSize, FuncIdx);
     if (ExpectedGC) {
       GsymCreator *GC = ExpectedGC->get();
-      if (GC == NULL)
+      if (!GC)
         break; // We had not more functions to encode.
       // Don't collect any messages at all
       OutputAggregator Out(nullptr);
diff --git a/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp b/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp
index 7e606c6a473b6..4e7db822776cc 100644
--- a/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp
@@ -27,7 +27,7 @@
 namespace llvm {
 namespace orc {
 
-MemoryMapper::~MemoryMapper() {}
+MemoryMapper::~MemoryMapper() = default;
 
 InProcessMemoryMapper::InProcessMemoryMapper(size_t PageSize)
     : PageSize(PageSize) {}
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt b/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt
index 927558649eb4d..ca8192bb99492 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt
@@ -16,9 +16,11 @@ add_llvm_component_library(LLVMOrcTargetProcess
   ExecutorSharedMemoryMapperService.cpp
   DefaultHostBootstrapValues.cpp
   ExecutorResolver.cpp
+  LibraryResolver.cpp
   JITLoaderGDB.cpp
   JITLoaderPerf.cpp
   JITLoaderVTune.cpp
+  LibraryScanner.cpp
   OrcRTBootstrap.cpp
   RegisterEHFrames.cpp
   SimpleExecutorDylibManager.cpp
@@ -36,6 +38,8 @@ add_llvm_component_library(LLVMOrcTargetProcess
 
   LINK_COMPONENTS
   ${intel_jit_profiling}
+  BinaryFormat
+  Object
   OrcShared
   Support
   TargetParser
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.cpp
index 1a61d3188a820..e609a7d3dc08e 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.cpp
@@ -55,7 +55,7 @@ struct PerfState {
   std::unique_ptr<raw_fd_ostream> Dumpstream;
 
   // perf mmap marker
-  void *MarkerAddr = NULL;
+  void *MarkerAddr = nullptr;
 };
 
 // prevent concurrent dumps from messing up the output file
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp
new file mode 100644
index 0000000000000..35da82a10306a
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp
@@ -0,0 +1,370 @@
+//===- LibraryResolver.cpp - Library Resolution of Unresolved Symbols ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Library resolution impl for unresolved symbols
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h"
+#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h"
+
+#include "llvm/ADT/StringSet.h"
+
+#include "llvm/BinaryFormat/MachO.h"
+#include "llvm/Object/COFF.h"
+#include "llvm/Object/ELF.h"
+#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/Object/MachO.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/Error.h"
+
+#include <mutex>
+#include <thread>
+
+#define DEBUG_TYPE "orc-resolver"
+
+namespace llvm::orc {
+
+LibraryResolver::LibraryResolver(const LibraryResolver::Setup &S)
+    : LibPathCache(S.Cache ? S.Cache : std::make_shared<LibraryPathCache>()),
+      LibPathResolver(S.PResolver
+                          ? S.PResolver
+                          : std::make_shared<PathResolver>(LibPathCache)),
+      ScanHelper(S.BasePaths, LibPathCache, LibPathResolver),
+      FB(S.FilterBuilder), LibMgr(),
+      ShouldScanCall(S.ShouldScanCall ? S.ShouldScanCall
+                                      : [](StringRef) -> bool { return true; }),
+      scanBatchSize(S.ScanBatchSize) {
+
+  if (ScanHelper.getAllUnits().empty()) {
+    LLVM_DEBUG(dbgs() << "Warning: No base paths provided for scanning.\n");
+  }
+}
+
+std::unique_ptr<LibraryResolutionDriver>
+LibraryResolutionDriver::create(const LibraryResolver::Setup &S) {
+  auto LR = std::make_unique<LibraryResolver>(S);
+  return std::unique_ptr<LibraryResolutionDriver>(
+      new LibraryResolutionDriver(std::move(LR)));
+}
+
+void LibraryResolutionDriver::addScanPath(const std::string &Path, PathType K) {
+  LR->ScanHelper.addBasePath(Path, K);
+}
+
+bool LibraryResolutionDriver::markLibraryLoaded(StringRef Path) {
+  auto Lib = LR->LibMgr.getLibrary(Path);
+  if (!Lib)
+    return false;
+
+  Lib->setState(LibraryManager::LibState::Loaded);
+
+  return true;
+}
+
+bool LibraryResolutionDriver::markLibraryUnLoaded(StringRef Path) {
+  auto Lib = LR->LibMgr.getLibrary(Path);
+  if (!Lib)
+    return false;
+
+  Lib->setState(LibraryManager::LibState::Unloaded);
+
+  return true;
+}
+
+void LibraryResolutionDriver::resolveSymbols(
+    std::vector<std::string> Syms,
+    LibraryResolver::OnSearchComplete OnCompletion,
+    const SearchConfig &Config) {
+  LR->searchSymbolsInLibraries(Syms, std::move(OnCompletion), Config);
+}
+
+static bool shouldIgnoreSymbol(const object::SymbolRef &Sym,
+                               uint32_t IgnoreFlags) {
+  Expected<uint32_t> FlagsOrErr = Sym.getFlags();
+  if (!FlagsOrErr) {
+    consumeError(FlagsOrErr.takeError());
+    return true;
+  }
+
+  uint32_t Flags = *FlagsOrErr;
+
+  using Filter = SymbolEnumeratorOptions;
+  if ((IgnoreFlags & Filter::IgnoreUndefined) &&
+      (Flags & object::SymbolRef::SF_Undefined))
+    return true;
+  if ((IgnoreFlags & Filter::IgnoreIndirect) &&
+      (Flags & object::SymbolRef::SF_Indirect))
+    return true;
+  if ((IgnoreFlags & Filter::IgnoreWeak) &&
+      (Flags & object::SymbolRef::SF_Weak))
+    return true;
+
+  return false;
+}
+
+bool SymbolEnumerator::enumerateSymbols(StringRef Path, OnEachSymbolFn OnEach,
+                                        const SymbolEnumeratorOptions &Opts) {
+  if (Path.empty())
+    return false;
+
+  ObjectFileLoader ObjLoader(Path);
+
+  auto ObjOrErr = ObjLoader.getObjectFile();
+  if (!ObjOrErr) {
+    std::string ErrMsg;
+    handleAllErrors(ObjOrErr.takeError(),
+                    [&](const ErrorInfoBase &EIB) { ErrMsg = EIB.message(); });
+    LLVM_DEBUG(dbgs() << "Failed loading object file: " << Path
+                      << "\nError: " << ErrMsg << "\n");
+    return false;
+  }
+
+  object::ObjectFile *Obj = &ObjOrErr.get();
+
+  auto processSymbolRange =
+      [&](object::ObjectFile::symbol_iterator_range Range) -> EnumerateResult {
+    for (const auto &Sym : Range) {
+      if (shouldIgnoreSymbol(Sym, Opts.FilterFlags))
+        continue;
+
+      auto NameOrErr = Sym.getName();
+      if (!NameOrErr) {
+        consumeError(NameOrErr.takeError());
+        continue;
+      }
+
+      StringRef Name = *NameOrErr;
+      if (Name.empty())
+        continue;
+
+      EnumerateResult Res = OnEach(Name);
+      if (Res != EnumerateResult::Continue)
+        return Res;
+    }
+    return EnumerateResult::Continue;
+  };
+
+  EnumerateResult Res = processSymbolRange(Obj->symbols());
+  if (Res != EnumerateResult::Continue)
+    return Res == EnumerateResult::Stop;
+
+  if (Obj->isELF()) {
+    const auto *ElfObj = cast<object::ELFObjectFileBase>(Obj);
+    Res = processSymbolRange(ElfObj->getDynamicSymbolIterators());
+    if (Res != EnumerateResult::Continue)
+      return Res == EnumerateResult::Stop;
+  } else if (Obj->isCOFF()) {
+    const auto *CoffObj = cast<object::COFFObjectFile>(Obj);
+    for (auto I = CoffObj->export_directory_begin(),
+              E = CoffObj->export_directory_end();
+         I != E; ++I) {
+      StringRef Name;
+      if (I->getSymbolName(Name))
+        continue;
+      if (Name.empty())
+        continue;
+
+      EnumerateResult Res = OnEach(Name);
+      if (Res != EnumerateResult::Continue)
+        return Res == EnumerateResult::Stop;
+    }
+  } else if (Obj->isMachO()) {
+  }
+
+  return true;
+}
+
+class SymbolSearchContext {
+public:
+  SymbolSearchContext(SymbolQuery &Q) : Q(Q) {}
+
+  bool hasSearched(LibraryInfo *Lib) const { return Searched.count(Lib); }
+
+  void markSearched(LibraryInfo *Lib) { Searched.insert(Lib); }
+
+  inline bool allResolved() const { return Q.allResolved(); }
+
+  SymbolQuery &query() { return Q; }
+
+private:
+  SymbolQuery &Q;
+  DenseSet<LibraryInfo *> Searched;
+};
+
+void LibraryResolver::resolveSymbolsInLibrary(
+    LibraryInfo &Lib, SymbolQuery &UnresolvedSymbols,
+    const SymbolEnumeratorOptions &Opts) {
+  LLVM_DEBUG(dbgs() << "Checking unresolved symbols "
+                    << " in library : " << Lib.getFileName() << "\n";);
+  StringSet<> DiscoveredSymbols;
+
+  if (!UnresolvedSymbols.hasUnresolved()) {
+    LLVM_DEBUG(dbgs() << "Skipping library: " << Lib.getFullPath()
+                      << " — unresolved symbols exist.\n";);
+    return;
+  }
+
+  bool HasEnumerated = false;
+  auto enumerateSymbolsIfNeeded = [&]() {
+    if (HasEnumerated)
+      return;
+
+    HasEnumerated = true;
+
+    LLVM_DEBUG(dbgs() << "Enumerating symbols in library: " << Lib.getFullPath()
+                      << "\n";);
+    SymbolEnumerator::enumerateSymbols(
+        Lib.getFullPath(),
+        [&](StringRef sym) {
+          DiscoveredSymbols.insert(sym);
+          return EnumerateResult::Continue;
+        },
+        Opts);
+
+    if (DiscoveredSymbols.empty()) {
+      LLVM_DEBUG(dbgs() << "  No symbols and remove library : "
+                        << Lib.getFullPath() << "\n";);
+      LibMgr.removeLibrary(Lib.getFullPath());
+      return;
+    }
+  };
+
+  if (!Lib.hasFilter()) {
+    LLVM_DEBUG(dbgs() << "Building filter for library: " << Lib.getFullPath()
+                      << "\n";);
+    enumerateSymbolsIfNeeded();
+    SmallVector<StringRef> SymbolVec;
+    SymbolVec.reserve(DiscoveredSymbols.size());
+    for (const auto &KV : DiscoveredSymbols)
+      SymbolVec.push_back(KV.first());
+
+    Lib.ensureFilterBuilt(FB, SymbolVec);
+    LLVM_DEBUG({
+      dbgs() << "DiscoveredSymbols : " << DiscoveredSymbols.size() << "\n";
+      for (const auto &KV : DiscoveredSymbols)
+        dbgs() << "DiscoveredSymbols : " << KV.first() << "\n";
+    });
+  }
+
+  const auto &Unresolved = UnresolvedSymbols.getUnresolvedSymbols();
+  bool HadAnySym = false;
+  LLVM_DEBUG(dbgs() << "Total unresolved symbols : " << Unresolved.size()
+                    << "\n";);
+  for (const auto &Sym : Unresolved) {
+    if (Lib.mayContain(Sym)) {
+      LLVM_DEBUG(dbgs() << "Checking symbol '" << Sym
+                        << "' in library: " << Lib.getFullPath() << "\n";);
+      enumerateSymbolsIfNeeded();
+      if (DiscoveredSymbols.count(Sym) > 0) {
+        LLVM_DEBUG(dbgs() << "  Resolved symbol: " << Sym
+                          << " in library: " << Lib.getFullPath() << "\n";);
+        UnresolvedSymbols.resolve(Sym, Lib.getFullPath());
+        HadAnySym = true;
+      }
+    }
+  }
+
+  using LibraryState = LibraryManager::LibState;
+  if (HadAnySym && Lib.getState() != LibraryState::Loaded)
+    Lib.setState(LibraryState::Queried);
+}
+
+void LibraryResolver::searchSymbolsInLibraries(
+    std::vector<std::string> &SymbolList, OnSearchComplete OnComplete,
+    const SearchConfig &Config) {
+  SymbolQuery Q(SymbolList);
+
+  using LibraryState = LibraryManager::LibState;
+  using LibraryType = PathType;
+  auto tryResolveFrom = [&](LibraryState S, LibraryType K) {
+    LLVM_DEBUG(dbgs() << "Trying resolve from state=" << static_cast<int>(S)
+                      << " type=" << static_cast<int>(K) << "\n";);
+
+    SymbolSearchContext Ctx(Q);
+    while (!Ctx.allResolved()) {
+
+      for (auto &Lib : LibMgr.getView(S, K)) {
+        if (Ctx.hasSearched(Lib.get()))
+          continue;
+
+        // can use Async here?
+        resolveSymbolsInLibrary(*Lib, Ctx.query(), Config.Options);
+        Ctx.markSearched(Lib.get());
+
+        if (Ctx.allResolved())
+          return;
+      }
+
+      if (Ctx.allResolved())
+        return;
+
+      if (!scanLibrariesIfNeeded(K, scanBatchSize))
+        break; // no more new libs to scan
+    }
+  };
+
+  for (const auto &[St, Ty] : Config.Policy.Plan) {
+    tryResolveFrom(St, Ty);
+    if (Q.allResolved())
+      break;
+  }
+
+  // done:
+  LLVM_DEBUG({
+    dbgs() << "Search complete.\n";
+    for (const auto &r : Q.getAllResults())
+      dbgs() << "Resolved Symbol:" << r->Name << " -> " << r->ResolvedLibPath
+             << "\n";
+  });
+
+  OnComplete(Q);
+}
+
+bool LibraryResolver::scanLibrariesIfNeeded(PathType PK, size_t BatchSize) {
+  LLVM_DEBUG(dbgs() << "LibraryResolver::scanLibrariesIfNeeded: Scanning for "
+                    << (PK == PathType::User ? "User" : "System")
+                    << " libraries\n";);
+  if (!ScanHelper.leftToScan(PK))
+    return false;
+
+  LibraryScanner Scanner(ScanHelper, LibMgr, ShouldScanCall);
+  Scanner.scanNext(PK, BatchSize);
+  return true;
+}
+
+bool LibraryResolver::symbolExistsInLibrary(const LibraryInfo &Lib,
+                                            StringRef SymName,
+                                            std::vector<std::string> *AllSyms) {
+  SymbolEnumeratorOptions Opts;
+  return symbolExistsInLibrary(Lib, SymName, AllSyms, Opts);
+}
+
+bool LibraryResolver::symbolExistsInLibrary(
+    const LibraryInfo &Lib, StringRef SymName,
+    std::vector<std::string> *AllSyms, const SymbolEnumeratorOptions &Opts) {
+  bool Found = false;
+
+  SymbolEnumerator::enumerateSymbols(
+      Lib.getFullPath(),
+      [&](StringRef Sym) {
+        if (AllSyms)
+          AllSyms->emplace_back(Sym.str());
+
+        if (Sym == SymName) {
+          Found = true;
+        }
+
+        return EnumerateResult::Continue;
+      },
+      Opts);
+
+  return Found;
+}
+
+} // end namespace llvm::orc
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp
new file mode 100644
index 0000000000000..d93f68622fcc2
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp
@@ -0,0 +1,1161 @@
+//===- LibraryScanner.cpp - Provide Library Scanning Implementation ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h"
+#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h"
+
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Object/COFF.h"
+#include "llvm/Object/ELF.h"
+#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/Object/ELFTypes.h"
+#include "llvm/Object/MachO.h"
+#include "llvm/Object/MachOUniversal.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Program.h"
+#include "llvm/TargetParser/Host.h"
+#include "llvm/TargetParser/Triple.h"
+
+#ifdef LLVM_ON_UNIX
+#include <sys/stat.h>
+#include <unistd.h>
+#endif // LLVM_ON_UNIX
+
+#ifdef __APPLE__
+#include <sys/stat.h>
+#undef LC_LOAD_DYLIB
+#undef LC_RPATH
+#endif // __APPLE__
+
+#define DEBUG_TYPE "orc-scanner"
+
+namespace llvm::orc {
+
+void handleError(Error Err, StringRef context = "") {
+  consumeError(handleErrors(std::move(Err), [&](const ErrorInfoBase &EIB) {
+    dbgs() << "LLVM Error";
+    if (!context.empty())
+      dbgs() << " [" << context << "]";
+    dbgs() << ": " << EIB.message() << "\n";
+  }));
+}
+
+bool ObjectFileLoader::isArchitectureCompatible(const object::ObjectFile &Obj) {
+  Triple HostTriple(sys::getDefaultTargetTriple());
+  Triple ObjTriple = Obj.makeTriple();
+
+  LLVM_DEBUG({
+    dbgs() << "Host triple: " << HostTriple.str()
+           << ", Object triple: " << ObjTriple.str() << "\n";
+  });
+
+  if (ObjTriple.getArch() != Triple::UnknownArch &&
+      HostTriple.getArch() != ObjTriple.getArch())
+    return false;
+
+  if (ObjTriple.getOS() != Triple::UnknownOS &&
+      HostTriple.getOS() != ObjTriple.getOS())
+    return false;
+
+  if (ObjTriple.getEnvironment() != Triple::UnknownEnvironment &&
+      HostTriple.getEnvironment() != Triple::UnknownEnvironment &&
+      HostTriple.getEnvironment() != ObjTriple.getEnvironment())
+    return false;
+
+  return true;
+}
+
+Expected<object::OwningBinary<object::ObjectFile>>
+ObjectFileLoader::loadObjectFileWithOwnership(StringRef FilePath) {
+  LLVM_DEBUG(dbgs() << "ObjectFileLoader: Attempting to open file " << FilePath
+                    << "\n";);
+  auto BinOrErr = object::createBinary(FilePath);
+  if (!BinOrErr) {
+    LLVM_DEBUG(dbgs() << "ObjectFileLoader: Failed to open file " << FilePath
+                      << "\n";);
+    return BinOrErr.takeError();
+  }
+
+  LLVM_DEBUG(dbgs() << "ObjectFileLoader: Successfully opened file " << FilePath
+                    << "\n";);
+
+  auto OwningBin = BinOrErr->takeBinary();
+  object::Binary *Bin = OwningBin.first.get();
+
+  if (Bin->isArchive()) {
+    LLVM_DEBUG(dbgs() << "ObjectFileLoader: File is an archive, not supported: "
+                      << FilePath << "\n";);
+    return createStringError(std::errc::invalid_argument,
+                             "Archive files are not supported: %s",
+                             FilePath.str().c_str());
+  }
+
+#if defined(__APPLE__)
+  if (auto *UB = dyn_cast<object::MachOUniversalBinary>(Bin)) {
+    LLVM_DEBUG(dbgs() << "ObjectFileLoader: Detected Mach-O universal binary: "
+                      << FilePath << "\n";);
+    for (auto ObjForArch : UB->objects()) {
+      auto ObjOrErr = ObjForArch.getAsObjectFile();
+      if (!ObjOrErr) {
+        LLVM_DEBUG(
+            dbgs()
+                << "ObjectFileLoader: Skipping invalid architecture slice\n";);
+
+        consumeError(ObjOrErr.takeError());
+        continue;
+      }
+
+      std::unique_ptr<object::ObjectFile> Obj = std::move(ObjOrErr.get());
+      if (isArchitectureCompatible(*Obj)) {
+        LLVM_DEBUG(
+            dbgs() << "ObjectFileLoader: Found compatible object slice\n";);
+
+        return object::OwningBinary<object::ObjectFile>(
+            std::move(Obj), std::move(OwningBin.second));
+
+      } else {
+        LLVM_DEBUG(dbgs() << "ObjectFileLoader: Incompatible architecture "
+                             "slice skipped\n";);
+      }
+    }
+    LLVM_DEBUG(dbgs() << "ObjectFileLoader: No compatible slices found in "
+                         "universal binary\n";);
+    return createStringError(inconvertibleErrorCode(),
+                             "No compatible object found in fat binary: %s",
+                             FilePath.str().c_str());
+  }
+#endif
+
+  auto ObjOrErr =
+      object::ObjectFile::createObjectFile(Bin->getMemoryBufferRef());
+  if (!ObjOrErr) {
+    LLVM_DEBUG(dbgs() << "ObjectFileLoader: Failed to create object file\n";);
+    return ObjOrErr.takeError();
+  }
+  LLVM_DEBUG(dbgs() << "ObjectFileLoader: Detected object file\n";);
+
+  std::unique_ptr<object::ObjectFile> Obj = std::move(*ObjOrErr);
+  if (!isArchitectureCompatible(*Obj)) {
+    LLVM_DEBUG(dbgs() << "ObjectFileLoader: Incompatible architecture: "
+                      << FilePath << "\n";);
+    return createStringError(inconvertibleErrorCode(),
+                             "Incompatible object file: %s",
+                             FilePath.str().c_str());
+  }
+
+  LLVM_DEBUG(dbgs() << "ObjectFileLoader: Object file is compatible\n";);
+
+  return object::OwningBinary<object::ObjectFile>(std::move(Obj),
+                                                  std::move(OwningBin.second));
+}
+
+template <class ELFT>
+bool isELFSharedLibrary(const object::ELFFile<ELFT> &ELFObj) {
+  if (ELFObj.getHeader().e_type != ELF::ET_DYN)
+    return false;
+
+  auto PHOrErr = ELFObj.program_headers();
+  if (!PHOrErr) {
+    consumeError(PHOrErr.takeError());
+    return true;
+  }
+
+  for (auto Phdr : *PHOrErr) {
+    if (Phdr.p_type == ELF::PT_INTERP)
+      return false;
+  }
+
+  return true;
+}
+
+bool isSharedLibraryObject(object::ObjectFile &Obj) {
+  if (Obj.isELF()) {
+    if (auto *ELF32LE = dyn_cast<object::ELF32LEObjectFile>(&Obj))
+      return isELFSharedLibrary(ELF32LE->getELFFile());
+    if (auto *ELF64LE = dyn_cast<object::ELF64LEObjectFile>(&Obj))
+      return isELFSharedLibrary(ELF64LE->getELFFile());
+    if (auto *ELF32BE = dyn_cast<object::ELF32BEObjectFile>(&Obj))
+      return isELFSharedLibrary(ELF32BE->getELFFile());
+    if (auto *ELF64BE = dyn_cast<object::ELF64BEObjectFile>(&Obj))
+      return isELFSharedLibrary(ELF64BE->getELFFile());
+  } else if (Obj.isMachO()) {
+    const object::MachOObjectFile *MachO =
+        dyn_cast<object::MachOObjectFile>(&Obj);
+    if (!MachO) {
+      LLVM_DEBUG(dbgs() << "Failed to cast to MachOObjectFile.\n";);
+      return false;
+    }
+    LLVM_DEBUG({
+      bool Result =
+          MachO->getHeader().filetype == MachO::HeaderFileType::MH_DYLIB;
+      dbgs() << "Mach-O filetype: " << MachO->getHeader().filetype
+             << " (MH_DYLIB == " << MachO::HeaderFileType::MH_DYLIB
+             << "), shared: " << Result << "\n";
+    });
+
+    return MachO->getHeader().filetype == MachO::HeaderFileType::MH_DYLIB;
+  } else if (Obj.isCOFF()) {
+    const object::COFFObjectFile *coff = dyn_cast<object::COFFObjectFile>(&Obj);
+    if (!coff)
+      return false;
+    return coff->getCharacteristics() & COFF::IMAGE_FILE_DLL;
+  } else {
+    LLVM_DEBUG(dbgs() << "Binary is not an ObjectFile.\n";);
+  }
+
+  return false;
+}
+
+bool DylibPathValidator::isSharedLibrary(StringRef Path) {
+  LLVM_DEBUG(dbgs() << "Checking if path is a shared library: " << Path
+                    << "\n";);
+
+  auto FileType = sys::fs::get_file_type(Path, /*Follow*/ true);
+  if (FileType != sys::fs::file_type::regular_file) {
+    LLVM_DEBUG(dbgs() << "File type is not a regular file for path: " << Path
+                      << "\n";);
+    return false;
+  }
+
+  file_magic MagicCode;
+  identify_magic(Path, MagicCode);
+
+  // Skip archives.
+  if (MagicCode == file_magic::archive)
+    return false;
+
+  // Universal binary handling.
+#if defined(__APPLE__)
+  if (MagicCode == file_magic::macho_universal_binary) {
+    ObjectFileLoader ObjLoader(Path);
+    auto ObjOrErr = ObjLoader.getObjectFile();
+    if (!ObjOrErr) {
+      consumeError(ObjOrErr.takeError());
+      return false;
+    }
+    return isSharedLibraryObject(ObjOrErr.get());
+  }
+#endif
+
+  // Object file inspection for PE/COFF, ELF, and Mach-O
+  bool NeedsObjectInspection =
+#if defined(_WIN32)
+      (MagicCode == file_magic::pecoff_executable);
+#elif defined(__APPLE__)
+      (MagicCode == file_magic::macho_fixed_virtual_memory_shared_lib ||
+       MagicCode == file_magic::macho_dynamically_linked_shared_lib ||
+       MagicCode == file_magic::macho_dynamically_linked_shared_lib_stub);
+#elif defined(LLVM_ON_UNIX)
+#ifdef __CYGWIN__
+      (MagicCode == file_magic::pecoff_executable);
+#else
+      (MagicCode == file_magic::elf_shared_object);
+#endif
+#else
+#error "Unsupported platform."
+#endif
+
+  if (NeedsObjectInspection) {
+    ObjectFileLoader ObjLoader(Path);
+    auto ObjOrErr = ObjLoader.getObjectFile();
+    if (!ObjOrErr) {
+      consumeError(ObjOrErr.takeError());
+      return false;
+    }
+    return isSharedLibraryObject(ObjOrErr.get());
+  }
+
+  LLVM_DEBUG(dbgs() << "Path is not identified as a shared library: " << Path
+                    << "\n";);
+  return false;
+}
+
+void DylibSubstitutor::configure(StringRef LoaderPath) {
+  SmallString<512> ExecPath(sys::fs::getMainExecutable(nullptr, nullptr));
+  sys::path::remove_filename(ExecPath);
+
+  SmallString<512> LoaderDir;
+  if (LoaderPath.empty()) {
+    LoaderDir = ExecPath;
+  } else {
+    LoaderDir = LoaderPath.str();
+    if (!sys::fs::is_directory(LoaderPath))
+      sys::path::remove_filename(LoaderDir);
+  }
+
+#ifdef __APPLE__
+  Placeholders["@loader_path"] = std::string(LoaderDir);
+  Placeholders["@executable_path"] = std::string(ExecPath);
+#else
+  Placeholders["$origin"] = std::string(LoaderDir);
+#endif
+}
+
+std::optional<std::string>
+SearchPathResolver::resolve(StringRef Stem, const DylibSubstitutor &Subst,
+                            DylibPathValidator &Validator) const {
+  for (const auto &SP : Paths) {
+    std::string Base = Subst.substitute(SP);
+
+    SmallString<512> FullPath(Base);
+    if (!PlaceholderPrefix.empty() &&
+        Stem.starts_with_insensitive(PlaceholderPrefix))
+      FullPath.append(Stem.drop_front(PlaceholderPrefix.size()));
+    else
+      sys::path::append(FullPath, Stem);
+
+    LLVM_DEBUG(dbgs() << "SearchPathResolver::resolve FullPath = " << FullPath
+                      << "\n";);
+
+    if (auto Valid = Validator.validate(FullPath.str()))
+      return Valid;
+  }
+
+  return std::nullopt;
+}
+
+std::optional<std::string>
+DylibResolverImpl::tryWithExtensions(StringRef LibStem) const {
+  LLVM_DEBUG(dbgs() << "tryWithExtensions: baseName = " << LibStem << "\n";);
+  SmallVector<SmallString<256>, 8> Candidates;
+
+  // Add extensions by platform
+#if defined(__APPLE__)
+  Candidates.emplace_back(LibStem);
+  Candidates.back() += ".dylib";
+#elif defined(_WIN32)
+  Candidates.emplace_back(LibStem);
+  Candidates.back() += ".dll";
+#else
+  Candidates.emplace_back(LibStem);
+  Candidates.back() += ".so";
+#endif
+
+  // Optionally try "lib" prefix if not already there
+  StringRef FileName = sys::path::filename(LibStem);
+  StringRef Base = sys::path::parent_path(LibStem);
+  if (!FileName.starts_with("lib")) {
+    SmallString<256> WithPrefix(Base);
+    if (!WithPrefix.empty())
+      sys::path::append(WithPrefix, ""); // ensure separator if needed
+    WithPrefix += "lib";
+    WithPrefix += FileName;
+
+#if defined(__APPLE__)
+    WithPrefix += ".dylib";
+#elif defined(_WIN32)
+    WithPrefix += ".dll";
+#else
+    WithPrefix += ".so";
+#endif
+
+    Candidates.push_back(std::move(WithPrefix));
+  }
+
+  LLVM_DEBUG({
+    dbgs() << "  Candidates to try:\n";
+    for (const auto &C : Candidates)
+      dbgs() << "    " << C << "\n";
+  });
+
+  // Try all variants using tryAllPaths
+  for (const auto &Name : Candidates) {
+
+    LLVM_DEBUG(dbgs() << "  Trying candidate: " << Name << "\n";);
+
+    for (const auto &R : Resolvers) {
+      if (auto Res = R.resolve(Name, Substitutor, Validator))
+        return Res;
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "  -> No candidate Resolved.\n";);
+
+  return std::nullopt;
+}
+
+std::optional<std::string>
+DylibResolverImpl::resolve(StringRef LibStem, bool VariateLibStem) const {
+  LLVM_DEBUG(dbgs() << "Resolving library stem: " << LibStem << "\n";);
+
+  // If it is an absolute path, don't try iterate over the paths.
+  if (sys::path::is_absolute(LibStem)) {
+    LLVM_DEBUG(dbgs() << "  -> Absolute path detected.\n";);
+    return Validator.validate(LibStem);
+  }
+
+  if (!LibStem.starts_with_insensitive("@rpath")) {
+    if (auto norm = Validator.validate(Substitutor.substitute(LibStem))) {
+      LLVM_DEBUG(dbgs() << "  -> Resolved after substitution: " << *norm
+                        << "\n";);
+
+      return norm;
+    }
+  }
+
+  for (const auto &R : Resolvers) {
+    LLVM_DEBUG(dbgs() << "  -> Resolving via search path ... \n";);
+    if (auto Result = R.resolve(LibStem, Substitutor, Validator)) {
+      LLVM_DEBUG(dbgs() << "  -> Resolved via search path: " << *Result
+                        << "\n";);
+
+      return Result;
+    }
+  }
+
+  // Expand libStem with paths, extensions, etc.
+  // std::string foundName;
+  if (VariateLibStem) {
+    LLVM_DEBUG(dbgs() << "  -> Trying with extensions...\n";);
+
+    if (auto Norm = tryWithExtensions(LibStem)) {
+      LLVM_DEBUG(dbgs() << "  -> Resolved via tryWithExtensions: " << *Norm
+                        << "\n";);
+
+      return Norm;
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "  -> Could not resolve: " << LibStem << "\n";);
+
+  return std::nullopt;
+}
+
+#ifndef _WIN32
+mode_t PathResolver::lstatCached(StringRef Path) {
+  // If already cached - retun cached result
+  if (auto Cache = LibPathCache->read_lstat(Path))
+    return *Cache;
+
+  // Not cached: perform lstat and store
+  struct stat buf{};
+  mode_t st_mode = (lstat(Path.str().c_str(), &buf) == -1) ? 0 : buf.st_mode;
+
+  LibPathCache->insert_lstat(Path, st_mode);
+
+  return st_mode;
+}
+
+std::optional<std::string> PathResolver::readlinkCached(StringRef Path) {
+  // If already cached - retun cached result
+  if (auto Cache = LibPathCache->read_link(Path))
+    return Cache;
+
+  // If result not in cache - call system function and cache result
+  char buf[PATH_MAX];
+  ssize_t len;
+  if ((len = readlink(Path.str().c_str(), buf, sizeof(buf))) != -1) {
+    buf[len] = '\0';
+    std::string s(buf);
+    LibPathCache->insert_link(Path, s);
+    return s;
+  }
+  return std::nullopt;
+}
+
+void createComponent(StringRef Path, StringRef BasePath, bool BaseIsResolved,
+                     SmallVector<StringRef, 16> &Component) {
+  StringRef Separator = sys::path::get_separator();
+  if (!BaseIsResolved) {
+    if (Path[0] == '~' &&
+        (Path.size() == 1 || sys::path::is_separator(Path[1]))) {
+      static SmallString<128> HomeP;
+      if (HomeP.str().empty())
+        sys::path::home_directory(HomeP);
+      StringRef(HomeP).split(Component, Separator, /*MaxSplit*/ -1,
+                             /*KeepEmpty*/ false);
+    } else if (BasePath.empty()) {
+      static SmallString<256> CurrentPath;
+      if (CurrentPath.str().empty())
+        sys::fs::current_path(CurrentPath);
+      StringRef(CurrentPath)
+          .split(Component, Separator, /*MaxSplit*/ -1, /*KeepEmpty*/ false);
+    } else {
+      BasePath.split(Component, Separator, /*MaxSplit*/ -1,
+                     /*KeepEmpty*/ false);
+    }
+  }
+
+  Path.split(Component, Separator, /*MaxSplit*/ -1, /*KeepEmpty*/ false);
+}
+
+void normalizePathSegments(SmallVector<StringRef, 16> &PathParts) {
+  SmallVector<StringRef, 16> NormalizedPath;
+  for (auto &Part : PathParts) {
+    if (Part == ".") {
+      continue;
+    } else if (Part == "..") {
+      if (!NormalizedPath.empty() && NormalizedPath.back() != "..") {
+        NormalizedPath.pop_back();
+      } else {
+        NormalizedPath.push_back("..");
+      }
+    } else {
+      NormalizedPath.push_back(Part);
+    }
+  }
+  PathParts.swap(NormalizedPath);
+}
+#endif
+
+std::optional<std::string> PathResolver::realpathCached(StringRef Path,
+                                                        std::error_code &EC,
+                                                        StringRef Base,
+                                                        bool BaseIsResolved,
+                                                        long SymLoopLevel) {
+  EC.clear();
+
+  if (Path.empty()) {
+    EC = std::make_error_code(std::errc::no_such_file_or_directory);
+    LLVM_DEBUG(dbgs() << "PathResolver::realpathCached: Empty path\n";);
+
+    return std::nullopt;
+  }
+
+  if (SymLoopLevel <= 0) {
+    EC = std::make_error_code(std::errc::too_many_symbolic_link_levels);
+    LLVM_DEBUG(
+        dbgs() << "PathResolver::realpathCached: Too many Symlink levels: "
+               << Path << "\n";);
+
+    return std::nullopt;
+  }
+
+  // If already cached - retun cached result
+  bool isRelative = sys::path::is_relative(Path);
+  if (!isRelative) {
+    if (auto Cached = LibPathCache->read_realpath(Path)) {
+      EC = Cached->ErrnoCode;
+      if (EC) {
+        LLVM_DEBUG(dbgs() << "PathResolver::realpathCached: Cached (error) for "
+                          << Path << "\n";);
+      } else {
+        LLVM_DEBUG(
+            dbgs() << "PathResolver::realpathCached: Cached (success) for "
+                   << Path << " => " << Cached->canonicalPath << "\n";);
+      }
+      return Cached->canonicalPath.empty()
+                 ? std::nullopt
+                 : std::make_optional(Cached->canonicalPath);
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "PathResolver::realpathCached: Resolving path: " << Path
+                    << "\n";);
+
+  // If result not in cache - call system function and cache result
+
+  StringRef Separator(sys::path::get_separator());
+  SmallString<256> Resolved(Separator);
+#ifndef _WIN32
+  SmallVector<StringRef, 16> Components;
+
+  if (isRelative) {
+    if (BaseIsResolved) {
+      Resolved.assign(Base);
+      LLVM_DEBUG(dbgs() << "  Using Resolved base: " << Base << "\n";);
+    }
+    createComponent(Path, Base, BaseIsResolved, Components);
+  } else {
+    Path.split(Components, Separator, /*MaxSplit*/ -1, /*KeepEmpty*/ false);
+  }
+
+  normalizePathSegments(Components);
+  LLVM_DEBUG({
+    for (auto &C : Components)
+      dbgs() << " " << C << " ";
+
+    dbgs() << "\n";
+  });
+
+  // Handle path list items
+  for (const auto &Component : Components) {
+    if (Component == ".")
+      continue;
+    if (Component == "..") {
+      // collapse "a/b/../c" to "a/c"
+      size_t S = Resolved.rfind(Separator);
+      if (S != llvm::StringRef::npos)
+        Resolved.resize(S);
+      if (Resolved.empty())
+        Resolved = Separator;
+      continue;
+    }
+
+    size_t oldSize = Resolved.size();
+    sys::path::append(Resolved, Component);
+    const char *ResolvedPath = Resolved.c_str();
+    LLVM_DEBUG(dbgs() << "  Processing Component: " << Component << " => "
+                      << ResolvedPath << "\n";);
+    mode_t st_mode = lstatCached(ResolvedPath);
+
+    if (S_ISLNK(st_mode)) {
+      LLVM_DEBUG(dbgs() << "    Found symlink: " << ResolvedPath << "\n";);
+
+      auto SymlinkOpt = readlinkCached(ResolvedPath);
+      if (!SymlinkOpt) {
+        EC = std::make_error_code(std::errc::no_such_file_or_directory);
+        LibPathCache->insert_realpath(Path, LibraryPathCache::PathInfo{"", EC});
+        LLVM_DEBUG(dbgs() << "    Failed to read symlink: " << ResolvedPath
+                          << "\n";);
+
+        return std::nullopt;
+      }
+
+      StringRef Symlink = *SymlinkOpt;
+      LLVM_DEBUG(dbgs() << "    Symlink points to: " << Symlink << "\n";);
+
+      std::string resolvedBase = "";
+      if (sys::path::is_relative(Symlink)) {
+        Resolved.resize(oldSize);
+        resolvedBase = Resolved.str().str();
+      }
+
+      auto RealSymlink =
+          realpathCached(Symlink, EC, resolvedBase,
+                         /*BaseIsResolved=*/true, SymLoopLevel - 1);
+      if (!RealSymlink) {
+        LibPathCache->insert_realpath(Path, LibraryPathCache::PathInfo{"", EC});
+        LLVM_DEBUG(dbgs() << "    Failed to resolve symlink target: " << Symlink
+                          << "\n";);
+
+        return std::nullopt;
+      }
+
+      Resolved.assign(*RealSymlink);
+      LLVM_DEBUG(dbgs() << "    Symlink Resolved to: " << Resolved << "\n";);
+
+    } else if (st_mode == 0) {
+      EC = std::make_error_code(std::errc::no_such_file_or_directory);
+      LibPathCache->insert_realpath(Path, LibraryPathCache::PathInfo{"", EC});
+      LLVM_DEBUG(dbgs() << "    Component does not exist: " << ResolvedPath
+                        << "\n";);
+
+      return std::nullopt;
+    }
+  }
+#else
+  EC = sys::fs::real_path(Path, Resolved); // Windows fallback
+#endif
+
+  std::string Canonical = Resolved.str().str();
+  {
+    LibPathCache->insert_realpath(Path, LibraryPathCache::PathInfo{
+                                            Canonical,
+                                            std::error_code() // success
+                                        });
+  }
+  LLVM_DEBUG(dbgs() << "PathResolver::realpathCached: Final Resolved: " << Path
+                    << " => " << Canonical << "\n";);
+  return Canonical;
+}
+
+void LibraryScanHelper::addBasePath(const std::string &Path, PathType K) {
+  std::error_code EC;
+  std::string Canon = resolveCanonical(Path, EC);
+  if (EC) {
+    LLVM_DEBUG(
+        dbgs()
+            << "LibraryScanHelper::addBasePath: Failed to canonicalize path: "
+            << Path << "\n";);
+    return;
+  }
+  std::unique_lock<std::shared_mutex> Lock(Mtx);
+  if (LibSearchPaths.count(Canon)) {
+    LLVM_DEBUG(dbgs() << "LibraryScanHelper::addBasePath: Already added: "
+                      << Canon << "\n";);
+    return;
+  }
+  K = K == PathType::Unknown ? classifyKind(Canon) : K;
+  auto SP = std::make_shared<LibrarySearchPath>(Canon, K);
+  LibSearchPaths[Canon] = SP;
+
+  if (K == PathType::User) {
+    LLVM_DEBUG(dbgs() << "LibraryScanHelper::addBasePath: Added User path: "
+                      << Canon << "\n";);
+    UnscannedUsr.push_back(StringRef(SP->BasePath));
+  } else {
+    LLVM_DEBUG(dbgs() << "LibraryScanHelper::addBasePath: Added System path: "
+                      << Canon << "\n";);
+    UnscannedSys.push_back(StringRef(SP->BasePath));
+  }
+}
+
+std::vector<std::shared_ptr<LibrarySearchPath>>
+LibraryScanHelper::getNextBatch(PathType K, size_t BatchSize) {
+  std::vector<std::shared_ptr<LibrarySearchPath>> Result;
+  auto &Queue = (K == PathType::User) ? UnscannedUsr : UnscannedSys;
+
+  std::unique_lock<std::shared_mutex> Lock(Mtx);
+
+  while (!Queue.empty() && (BatchSize == 0 || Result.size() < BatchSize)) {
+    StringRef Base = Queue.front();
+    auto It = LibSearchPaths.find(Base);
+    if (It != LibSearchPaths.end()) {
+      auto &SP = It->second;
+      ScanState Expected = ScanState::NotScanned;
+      if (SP->State.compare_exchange_strong(Expected, ScanState::Scanning)) {
+        Result.push_back(SP);
+      }
+    }
+    Queue.pop_front();
+  }
+
+  return Result;
+}
+
+bool LibraryScanHelper::isTrackedBasePath(StringRef Path) const {
+  std::error_code EC;
+  std::string Canon = resolveCanonical(Path, EC);
+  if (EC)
+    return false;
+
+  std::shared_lock<std::shared_mutex> Lock(Mtx);
+  return LibSearchPaths.count(Canon) > 0;
+}
+
+bool LibraryScanHelper::leftToScan(PathType K) const {
+  std::shared_lock<std::shared_mutex> Lock(Mtx);
+  for (const auto &KV : LibSearchPaths) {
+    const auto &SP = KV.second;
+    if (SP->Kind == K && SP->State == ScanState::NotScanned)
+      return true;
+  }
+  return false;
+}
+
+void LibraryScanHelper::resetToScan() {
+  std::shared_lock<std::shared_mutex> Lock(Mtx);
+
+  for (auto &[_, SP] : LibSearchPaths) {
+    ScanState Expected = ScanState::Scanned;
+
+    if (!SP->State.compare_exchange_strong(Expected, ScanState::NotScanned))
+      continue;
+
+    auto &TargetList =
+        (SP->Kind == PathType::User) ? UnscannedUsr : UnscannedSys;
+    TargetList.emplace_back(SP->BasePath);
+  }
+}
+
+std::vector<std::shared_ptr<LibrarySearchPath>>
+LibraryScanHelper::getAllUnits() const {
+  std::shared_lock<std::shared_mutex> Lock(Mtx);
+  std::vector<std::shared_ptr<LibrarySearchPath>> Result;
+  Result.reserve(LibSearchPaths.size());
+  for (const auto &[_, SP] : LibSearchPaths) {
+    Result.push_back(SP);
+  }
+  return Result;
+}
+
+std::string LibraryScanHelper::resolveCanonical(StringRef Path,
+                                                std::error_code &EC) const {
+  auto Canon = LibPathResolver->resolve(Path, EC);
+  return EC ? Path.str() : *Canon;
+}
+
+PathType LibraryScanHelper::classifyKind(StringRef Path) const {
+  // Detect home directory
+  const char *Home = getenv("HOME");
+  if (Home && Path.find(Home) == 0)
+    return PathType::User;
+
+  static const std::array<std::string, 5> UserPrefixes = {
+      "/usr/local",    // often used by users for manual installs
+      "/opt/homebrew", // common on macOS
+      "/opt/local",    // MacPorts
+      "/home",         // Linux home dirs
+      "/Users",        // macOS user dirs
+  };
+
+  for (const auto &Prefix : UserPrefixes) {
+    if (Path.find(Prefix) == 0)
+      return PathType::User;
+  }
+
+  return PathType::System;
+}
+
+Expected<LibraryDepsInfo> parseMachODeps(const object::MachOObjectFile &Obj) {
+  LibraryDepsInfo Libdeps;
+  LLVM_DEBUG(dbgs() << "Parsing Mach-O dependencies...\n";);
+  for (const auto &Command : Obj.load_commands()) {
+    switch (Command.C.cmd) {
+    case MachO::LC_LOAD_DYLIB: {
+      MachO::dylib_command dylibCmd = Obj.getDylibIDLoadCommand(Command);
+      const char *name = Command.Ptr + dylibCmd.dylib.name;
+      Libdeps.addDep(name);
+      LLVM_DEBUG(dbgs() << "  Found LC_LOAD_DYLIB: " << name << "\n";);
+    } break;
+    case MachO::LC_LOAD_WEAK_DYLIB:
+    case MachO::LC_REEXPORT_DYLIB:
+    case MachO::LC_LOAD_UPWARD_DYLIB:
+    case MachO::LC_LAZY_LOAD_DYLIB:
+      break;
+    case MachO::LC_RPATH: {
+      // Extract RPATH
+      MachO::rpath_command rpathCmd = Obj.getRpathCommand(Command);
+      const char *rpath = Command.Ptr + rpathCmd.path;
+      LLVM_DEBUG(dbgs() << "  Found LC_RPATH: " << rpath << "\n";);
+
+      SmallVector<StringRef, 4> RawPaths;
+      SplitString(StringRef(rpath), RawPaths,
+                  sys::EnvPathSeparator == ':' ? ":" : ";");
+
+      for (const auto &raw : RawPaths) {
+        Libdeps.addRPath(raw.str()); // Convert to std::string
+        LLVM_DEBUG(dbgs() << "    Parsed RPATH entry: " << raw << "\n";);
+      }
+      break;
+    }
+    }
+  }
+
+  return Expected<LibraryDepsInfo>(std::move(Libdeps));
+}
+
+template <class ELFT>
+static Expected<StringRef> getDynamicStrTab(const object::ELFFile<ELFT> &Elf) {
+  auto DynamicEntriesOrError = Elf.dynamicEntries();
+  if (!DynamicEntriesOrError)
+    return DynamicEntriesOrError.takeError();
+
+  for (const typename ELFT::Dyn &Dyn : *DynamicEntriesOrError) {
+    if (Dyn.d_tag == ELF::DT_STRTAB) {
+      auto MappedAddrOrError = Elf.toMappedAddr(Dyn.getPtr());
+      if (!MappedAddrOrError)
+        return MappedAddrOrError.takeError();
+      return StringRef(reinterpret_cast<const char *>(*MappedAddrOrError));
+    }
+  }
+
+  // If the dynamic segment is not present, we fall back on the sections.
+  auto SectionsOrError = Elf.sections();
+  if (!SectionsOrError)
+    return SectionsOrError.takeError();
+
+  for (const typename ELFT::Shdr &Sec : *SectionsOrError) {
+    if (Sec.sh_type == ELF::SHT_DYNSYM)
+      return Elf.getStringTableForSymtab(Sec);
+  }
+
+  return make_error<StringError>("dynamic string table not found",
+                                 inconvertibleErrorCode());
+}
+
+template <typename ELFT>
+Expected<LibraryDepsInfo> parseELF(const object::ELFFile<ELFT> &Elf) {
+  LibraryDepsInfo Deps;
+  Expected<StringRef> StrTabOrErr = getDynamicStrTab(Elf);
+  if (!StrTabOrErr)
+    return StrTabOrErr.takeError();
+
+  const char *Data = StrTabOrErr->data();
+
+  auto DynamicEntriesOrError = Elf.dynamicEntries();
+  if (!DynamicEntriesOrError) {
+    return DynamicEntriesOrError.takeError();
+  }
+
+  for (const typename ELFT::Dyn &Dyn : *DynamicEntriesOrError) {
+    switch (Dyn.d_tag) {
+    case ELF::DT_NEEDED:
+      Deps.addDep(Data + Dyn.d_un.d_val);
+      break;
+    case ELF::DT_RPATH: {
+      SmallVector<StringRef, 4> RawPaths;
+      SplitString(Data + Dyn.d_un.d_val, RawPaths,
+                  sys::EnvPathSeparator == ':' ? ":" : ";");
+      for (const auto &raw : RawPaths)
+        Deps.addRPath(raw.str());
+      break;
+    }
+    case ELF::DT_RUNPATH: {
+      SmallVector<StringRef, 4> RawPaths;
+      SplitString(Data + Dyn.d_un.d_val, RawPaths,
+                  sys::EnvPathSeparator == ':' ? ":" : ";");
+      for (const auto &raw : RawPaths)
+        Deps.addRunPath(raw.str());
+      break;
+    }
+    case ELF::DT_FLAGS_1:
+      // Check if this is not a pie executable.
+      if (Dyn.d_un.d_val & ELF::DF_1_PIE)
+        Deps.isPIE = true;
+      break;
+      // (Dyn.d_tag == ELF::DT_NULL) continue;
+      // (Dyn.d_tag == ELF::DT_AUXILIARY || Dyn.d_tag == ELF::DT_FILTER)
+    default:
+      break;
+    }
+  }
+
+  return Expected<LibraryDepsInfo>(std::move(Deps));
+}
+
+Expected<LibraryDepsInfo> parseELFDeps(const object::ELFObjectFileBase &Obj) {
+  using namespace object;
+  LLVM_DEBUG(dbgs() << "parseELFDeps: Detected ELF object\n";);
+  if (const auto *ELF = dyn_cast<ELF32LEObjectFile>(&Obj))
+    return parseELF(ELF->getELFFile());
+  else if (const auto *ELF = dyn_cast<ELF32BEObjectFile>(&Obj))
+    return parseELF(ELF->getELFFile());
+  else if (const auto *ELF = dyn_cast<ELF64LEObjectFile>(&Obj))
+    return parseELF(ELF->getELFFile());
+  else if (const auto *ELF = dyn_cast<ELF64BEObjectFile>(&Obj))
+    return parseELF(ELF->getELFFile());
+
+  LLVM_DEBUG(dbgs() << "parseELFDeps: Unknown ELF format\n";);
+  return createStringError(std::errc::not_supported, "Unknown ELF format");
+}
+
+Expected<LibraryDepsInfo> LibraryScanner::extractDeps(StringRef FilePath) {
+  LLVM_DEBUG(dbgs() << "extractDeps: Attempting to open file " << FilePath
+                    << "\n";);
+
+  ObjectFileLoader ObjLoader(FilePath);
+  auto ObjOrErr = ObjLoader.getObjectFile();
+  if (!ObjOrErr) {
+    LLVM_DEBUG(dbgs() << "extractDeps: Failed to open " << FilePath << "\n";);
+    return ObjOrErr.takeError();
+  }
+
+  object::ObjectFile *Obj = &ObjOrErr.get();
+
+  if (auto *elfObj = dyn_cast<object::ELFObjectFileBase>(Obj)) {
+    LLVM_DEBUG(dbgs() << "extractDeps: File " << FilePath
+                      << " is an ELF object\n";);
+
+    return parseELFDeps(*elfObj);
+  }
+
+  if (auto *macho = dyn_cast<object::MachOObjectFile>(Obj)) {
+    LLVM_DEBUG(dbgs() << "extractDeps: File " << FilePath
+                      << " is a Mach-O object\n";);
+    return parseMachODeps(*macho);
+  }
+
+  if (Obj->isCOFF()) {
+    // TODO: COFF support
+    return LibraryDepsInfo();
+  }
+
+  LLVM_DEBUG(dbgs() << "extractDeps: Unsupported binary format for file "
+                    << FilePath << "\n";);
+  return createStringError(inconvertibleErrorCode(),
+                           "Unsupported binary format: %s",
+                           FilePath.str().c_str());
+}
+
+std::optional<std::string> LibraryScanner::shouldScan(StringRef FilePath) {
+  std::error_code EC;
+
+  LLVM_DEBUG(dbgs() << "[shouldScan] Checking: " << FilePath << "\n";);
+
+  // [1] Check file existence early
+  if (!sys::fs::exists(FilePath)) {
+    LLVM_DEBUG(dbgs() << "  -> Skipped: file does not exist.\n";);
+
+    return std::nullopt;
+  }
+
+  // [2] Resolve to canonical path
+  auto CanonicalPathOpt = ScanHelper.resolve(FilePath, EC);
+  if (EC || !CanonicalPathOpt) {
+    LLVM_DEBUG(dbgs() << "  -> Skipped: failed to resolve path (EC="
+                      << EC.message() << ").\n";);
+
+    return std::nullopt;
+  }
+
+  const std::string &CanonicalPath = *CanonicalPathOpt;
+  LLVM_DEBUG(dbgs() << "  -> Canonical path: " << CanonicalPath << "\n");
+
+  // [3] Check if it's a directory — skip directories
+  if (sys::fs::is_directory(CanonicalPath)) {
+    LLVM_DEBUG(dbgs() << "  -> Skipped: path is a directory.\n";);
+
+    return std::nullopt;
+  }
+
+  // [4] Skip if it's not a shared library.
+  if (!DylibPathValidator::isSharedLibrary(CanonicalPath)) {
+    LLVM_DEBUG(dbgs() << "  -> Skipped: not a shared library.\n";);
+    return std::nullopt;
+  }
+
+  // [5] Skip if we've already seen this path (via cache)
+  if (ScanHelper.hasSeenOrMark(CanonicalPath)) {
+    LLVM_DEBUG(dbgs() << "  -> Skipped: already seen.\n";);
+
+    return std::nullopt;
+  }
+
+  // [6] Already tracked in LibraryManager?
+  if (LibMgr.hasLibrary(CanonicalPath)) {
+    LLVM_DEBUG(dbgs() << "  -> Skipped: already tracked by LibraryManager.\n";);
+
+    return std::nullopt;
+  }
+
+  // [7] Run user-defined hook (default: always true)
+  if (!ShouldScanCall(CanonicalPath)) {
+    LLVM_DEBUG(dbgs() << "  -> Skipped: user-defined hook rejected.\n";);
+
+    return std::nullopt;
+  }
+
+  LLVM_DEBUG(dbgs() << "  -> Accepted: ready to scan " << CanonicalPath
+                    << "\n";);
+  return CanonicalPath;
+}
+
+void LibraryScanner::handleLibrary(StringRef FilePath, PathType K, int level) {
+  LLVM_DEBUG(dbgs() << "LibraryScanner::handleLibrary: Scanning: " << FilePath
+                    << ", level=" << level << "\n";);
+  auto CanonPathOpt = shouldScan(FilePath);
+  if (!CanonPathOpt) {
+    LLVM_DEBUG(dbgs() << "  Skipped (shouldScan returned false): " << FilePath
+                      << "\n";);
+
+    return;
+  }
+  const std::string CanonicalPath = *CanonPathOpt;
+
+  auto DepsOrErr = extractDeps(CanonicalPath);
+  if (!DepsOrErr) {
+    LLVM_DEBUG(dbgs() << "  Failed to extract deps for: " << CanonicalPath
+                      << "\n";);
+    handleError(DepsOrErr.takeError());
+    return;
+  }
+
+  LibraryDepsInfo &Deps = *DepsOrErr;
+
+  LLVM_DEBUG({
+    dbgs() << "    Found deps : \n";
+    for (const auto &dep : Deps.deps)
+      dbgs() << "        : " << dep << "\n";
+    dbgs() << "    Found @rpath : " << Deps.rpath.size() << "\n";
+    for (const auto &r : Deps.rpath)
+      dbgs() << "     : " << r << "\n";
+    dbgs() << "    Found @runpath : \n";
+    for (const auto &r : Deps.runPath)
+      dbgs() << "     : " << r << "\n";
+  });
+
+  if (Deps.isPIE && level == 0) {
+    LLVM_DEBUG(dbgs() << "  Skipped PIE executable at top level: "
+                      << CanonicalPath << "\n";);
+
+    return;
+  }
+
+  bool Added = LibMgr.addLibrary(CanonicalPath, K);
+  if (!Added) {
+    LLVM_DEBUG(dbgs() << "  Already added: " << CanonicalPath << "\n";);
+    return;
+  }
+
+  // Heuristic 1: No RPATH/RUNPATH, skip deps
+  if (Deps.rpath.empty() && Deps.runPath.empty()) {
+    LLVM_DEBUG(
+        dbgs() << "LibraryScanner::handleLibrary: Skipping deps (Heuristic1): "
+               << CanonicalPath << "\n";);
+    return;
+  }
+
+  // Heuristic 2: All RPATH and RUNPATH already tracked
+  auto allTracked = [&](const auto &Paths) {
+    LLVM_DEBUG(dbgs() << "   Checking : " << Paths.size() << "\n";);
+    return std::all_of(Paths.begin(), Paths.end(), [&](StringRef P) {
+      LLVM_DEBUG(dbgs() << "      Checking isTrackedBasePath : " << P << "\n";);
+      return ScanHelper.isTrackedBasePath(
+          DylibResolver::resolvelinkerFlag(P, CanonicalPath));
+    });
+  };
+
+  if (allTracked(Deps.rpath) && allTracked(Deps.runPath)) {
+    LLVM_DEBUG(
+        dbgs() << "LibraryScanner::handleLibrary: Skipping deps (Heuristic2): "
+               << CanonicalPath << "\n";);
+    return;
+  }
+
+  DylibPathValidator Validator(ScanHelper.getPathResolver());
+  DylibResolver Resolver(Validator);
+  Resolver.configure(CanonicalPath,
+                     {{Deps.rpath, SearchPathType::RPath},
+                      {ScanHelper.getSearchPaths(), SearchPathType::UsrOrSys},
+                      {Deps.runPath, SearchPathType::RunPath}});
+  for (StringRef Dep : Deps.deps) {
+    LLVM_DEBUG(dbgs() << "  Resolving dep: " << Dep << "\n";);
+    auto DepFullOpt = Resolver.resolve(Dep);
+    if (!DepFullOpt) {
+      LLVM_DEBUG(dbgs() << "    Failed to resolve dep: " << Dep << "\n";);
+
+      continue;
+    }
+    LLVM_DEBUG(dbgs() << "    Resolved dep to: " << *DepFullOpt << "\n";);
+
+    handleLibrary(*DepFullOpt, K, level + 1);
+  }
+}
+
+void LibraryScanner::scanBaseDir(std::shared_ptr<LibrarySearchPath> SP) {
+  if (!sys::fs::is_directory(SP->BasePath) || SP->BasePath.empty()) {
+    LLVM_DEBUG(
+        dbgs() << "LibraryScanner::scanBaseDir: Invalid or empty basePath: "
+               << SP->BasePath << "\n";);
+    return;
+  }
+
+  LLVM_DEBUG(dbgs() << "LibraryScanner::scanBaseDir: Scanning directory: "
+                    << SP->BasePath << "\n";);
+  std::error_code EC;
+
+  SP->State.store(ScanState::Scanning);
+
+  for (sys::fs::directory_iterator It(SP->BasePath, EC), end; It != end && !EC;
+       It.increment(EC)) {
+    auto Entry = *It;
+    if (!Entry.status())
+      continue;
+
+    auto Status = *Entry.status();
+    if (sys::fs::is_regular_file(Status) || sys::fs::is_symlink_file(Status)) {
+      LLVM_DEBUG(dbgs() << "  Found file: " << Entry.path() << "\n";);
+      // async support ?
+      handleLibrary(Entry.path(), SP->Kind);
+    }
+  }
+
+  SP->State.store(ScanState::Scanned);
+}
+
+void LibraryScanner::scanNext(PathType K, size_t BatchSize) {
+  LLVM_DEBUG(dbgs() << "LibraryScanner::scanNext: Scanning next batch of size "
+                    << BatchSize << " for kind "
+                    << (K == PathType::User ? "User" : "System") << "\n";);
+
+  auto SearchPaths = ScanHelper.getNextBatch(K, BatchSize);
+  for (auto &SP : SearchPaths) {
+    LLVM_DEBUG(dbgs() << "  Scanning unit with basePath: " << SP->BasePath
+                      << "\n";);
+
+    scanBaseDir(SP);
+  }
+}
+
+} // end namespace llvm::orc
diff --git a/llvm/lib/FileCheck/FileCheckImpl.h b/llvm/lib/FileCheck/FileCheckImpl.h
index a08502e4497e3..5851cfc4b5d5c 100644
--- a/llvm/lib/FileCheck/FileCheckImpl.h
+++ b/llvm/lib/FileCheck/FileCheckImpl.h
@@ -528,7 +528,7 @@ class ErrorDiagnostic : public ErrorInfo<ErrorDiagnostic> {
   SMRange getRange() const { return Range; }
 
   static Error get(const SourceMgr &SM, SMLoc Loc, const Twine &ErrMsg,
-                   SMRange Range = std::nullopt) {
+                   SMRange Range = {}) {
     return make_error<ErrorDiagnostic>(
         SM.GetMessage(Loc, SourceMgr::DK_Error, ErrMsg), Range);
   }
diff --git a/llvm/lib/Frontend/Driver/CodeGenOptions.cpp b/llvm/lib/Frontend/Driver/CodeGenOptions.cpp
index df884908845d2..b546e816419e3 100644
--- a/llvm/lib/Frontend/Driver/CodeGenOptions.cpp
+++ b/llvm/lib/Frontend/Driver/CodeGenOptions.cpp
@@ -12,7 +12,6 @@
 #include "llvm/TargetParser/Triple.h"
 
 namespace llvm {
-extern llvm::cl::opt<bool> DebugInfoCorrelate;
 extern llvm::cl::opt<llvm::InstrProfCorrelator::ProfCorrelatorKind>
     ProfileCorrelate;
 } // namespace llvm
@@ -64,8 +63,7 @@ TargetLibraryInfoImpl *createTLII(const llvm::Triple &TargetTriple,
 }
 
 std::string getDefaultProfileGenName() {
-  return llvm::DebugInfoCorrelate ||
-                 llvm::ProfileCorrelate != InstrProfCorrelator::NONE
+  return llvm::ProfileCorrelate != InstrProfCorrelator::NONE
              ? "default_%m.proflite"
              : "default_%m.profraw";
 }
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 286ed039b1214..fff9a815e5368 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -528,7 +528,7 @@ void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
   Value *Version = Builder.getInt32(OMP_KERNEL_ARG_VERSION);
   Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
   auto Int32Ty = Type::getInt32Ty(Builder.getContext());
-  constexpr const size_t MaxDim = 3;
+  constexpr size_t MaxDim = 3;
   Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));
   Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);
 
@@ -5473,7 +5473,8 @@ OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
     }
 
     // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
-    CollapsedTripCount = Builder.CreateNUWMul(CollapsedTripCount, OrigTripCount);
+    CollapsedTripCount =
+        Builder.CreateNUWMul(CollapsedTripCount, OrigTripCount);
   }
 
   // Create the collapsed loop control flow.
@@ -9338,9 +9339,8 @@ OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc,
     // target does not support `atomicrmw` of the size of the struct
     LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
     OldVal->setAtomic(AO);
-    const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
-    unsigned LoadSize =
-        LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
+    const DataLayout &DL = OldVal->getModule()->getDataLayout();
+    unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
     OpenMPIRBuilder::AtomicInfo atomicInfo(
         &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
         OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
@@ -9384,9 +9384,8 @@ OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc,
     XSt->setAtomic(AO);
   } else if (XElemTy->isStructTy()) {
     LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
-    const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
-    unsigned LoadSize =
-        LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
+    const DataLayout &DL = OldVal->getModule()->getDataLayout();
+    unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
     OpenMPIRBuilder::AtomicInfo atomicInfo(
         &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
         OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
@@ -9581,7 +9580,7 @@ Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
     OldVal->setAtomic(AO);
     // CurBB
     // |     /---\
-		// ContBB    |
+    // ContBB    |
     // |     \---/
     // ExitBB
     BasicBlock *CurBB = Builder.GetInsertBlock();
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 3c222f54fd406..95d954f6b8174 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -2199,6 +2199,7 @@ static void writeDIBasicType(raw_ostream &Out, const DIBasicType *N,
   Printer.printString("name", N->getName());
   Printer.printMetadataOrInt("size", N->getRawSizeInBits(), true);
   Printer.printInt("align", N->getAlignInBits());
+  Printer.printInt("dataSize", N->getDataSizeInBits());
   Printer.printDwarfEnum("encoding", N->getEncoding(),
                          dwarf::AttributeEncodingString);
   Printer.printInt("num_extra_inhabitants", N->getNumExtraInhabitants());
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index b838e36c8824f..58b7ddd0381e5 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -730,7 +730,7 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F,
       // (arm|aarch64).neon.bfdot.*'.
       Intrinsic::ID ID =
           StringSwitch<Intrinsic::ID>(Name)
-              .Cases("v2f32.v8i8", "v4f32.v16i8",
+              .Cases({"v2f32.v8i8", "v4f32.v16i8"},
                      IsArm ? (Intrinsic::ID)Intrinsic::arm_neon_bfdot
                            : (Intrinsic::ID)Intrinsic::aarch64_neon_bfdot)
               .Default(Intrinsic::not_intrinsic);
@@ -1456,7 +1456,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
       if (F->arg_size() == 1) {
         Intrinsic::ID IID =
             StringSwitch<Intrinsic::ID>(Name)
-                .Cases("brev32", "brev64", Intrinsic::bitreverse)
+                .Cases({"brev32", "brev64"}, Intrinsic::bitreverse)
                 .Case("clz.i", Intrinsic::ctlz)
                 .Case("popc.i", Intrinsic::ctpop)
                 .Default(Intrinsic::not_intrinsic);
@@ -1504,6 +1504,10 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
       else if (Name.consume_front("fabs."))
         // nvvm.fabs.{f,ftz.f,d}
         Expand = Name == "f" || Name == "ftz.f" || Name == "d";
+      else if (Name.consume_front("ex2.approx."))
+        // nvvm.ex2.approx.{f,ftz.f,d,f16x2}
+        Expand =
+            Name == "f" || Name == "ftz.f" || Name == "d" || Name == "f16x2";
       else if (Name.consume_front("max.") || Name.consume_front("min."))
         // nvvm.{min,max}.{i,ii,ui,ull}
         Expand = Name == "s" || Name == "i" || Name == "ll" || Name == "us" ||
@@ -2550,6 +2554,11 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI,
     Intrinsic::ID IID = (Name == "fabs.ftz.f") ? Intrinsic::nvvm_fabs_ftz
                                                : Intrinsic::nvvm_fabs;
     Rep = Builder.CreateUnaryIntrinsic(IID, CI->getArgOperand(0));
+  } else if (Name.consume_front("ex2.approx.")) {
+    // nvvm.ex2.approx.{f,ftz.f,d,f16x2}
+    Intrinsic::ID IID = Name.starts_with("ftz") ? Intrinsic::nvvm_ex2_approx_ftz
+                                                : Intrinsic::nvvm_ex2_approx;
+    Rep = Builder.CreateUnaryIntrinsic(IID, CI->getArgOperand(0));
   } else if (Name.starts_with("atomic.load.add.f32.p") ||
              Name.starts_with("atomic.load.add.f64.p")) {
     Value *Ptr = CI->getArgOperand(0);
diff --git a/llvm/lib/IR/ConstantsContext.h b/llvm/lib/IR/ConstantsContext.h
index 51fb40bad201d..e3e8d895a63f4 100644
--- a/llvm/lib/IR/ConstantsContext.h
+++ b/llvm/lib/IR/ConstantsContext.h
@@ -535,7 +535,7 @@ struct ConstantPtrAuthKeyType {
 
   unsigned getHash() const { return hash_combine_range(Operands); }
 
-  using TypeClass = typename ConstantInfo<ConstantPtrAuth>::TypeClass;
+  using TypeClass = ConstantInfo<ConstantPtrAuth>::TypeClass;
 
   ConstantPtrAuth *create(TypeClass *Ty) const {
     return new ConstantPtrAuth(Operands[0], cast<ConstantInt>(Operands[1]),
diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp
index 07a870f0630a5..ca11ecf2f473e 100644
--- a/llvm/lib/IR/DIBuilder.cpp
+++ b/llvm/lib/IR/DIBuilder.cpp
@@ -261,10 +261,12 @@ DIBasicType *DIBuilder::createNullPtrType() {
 DIBasicType *DIBuilder::createBasicType(StringRef Name, uint64_t SizeInBits,
                                         unsigned Encoding,
                                         DINode::DIFlags Flags,
-                                        uint32_t NumExtraInhabitants) {
+                                        uint32_t NumExtraInhabitants,
+                                        uint32_t DataSizeInBits) {
   assert(!Name.empty() && "Unable to create type without name");
   return DIBasicType::get(VMContext, dwarf::DW_TAG_base_type, Name, SizeInBits,
-                          0, Encoding, NumExtraInhabitants, Flags);
+                          0, Encoding, NumExtraInhabitants, DataSizeInBits,
+                          Flags);
 }
 
 DIFixedPointType *
diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp
index e30df88e6b56b..fafc3254120de 100644
--- a/llvm/lib/IR/DebugInfoMetadata.cpp
+++ b/llvm/lib/IR/DebugInfoMetadata.cpp
@@ -872,15 +872,18 @@ DIEnumerator *DIEnumerator::getImpl(LLVMContext &Context, const APInt &Value,
 DIBasicType *DIBasicType::getImpl(LLVMContext &Context, unsigned Tag,
                                   MDString *Name, Metadata *SizeInBits,
                                   uint32_t AlignInBits, unsigned Encoding,
-                                  uint32_t NumExtraInhabitants, DIFlags Flags,
+                                  uint32_t NumExtraInhabitants,
+                                  uint32_t DataSizeInBits, DIFlags Flags,
                                   StorageType Storage, bool ShouldCreate) {
   assert(isCanonical(Name) && "Expected canonical MDString");
-  DEFINE_GETIMPL_LOOKUP(DIBasicType, (Tag, Name, SizeInBits, AlignInBits,
-                                      Encoding, NumExtraInhabitants, Flags));
+  DEFINE_GETIMPL_LOOKUP(DIBasicType,
+                        (Tag, Name, SizeInBits, AlignInBits, Encoding,
+                         NumExtraInhabitants, DataSizeInBits, Flags));
   Metadata *Ops[] = {nullptr, nullptr, Name, SizeInBits, nullptr};
-  DEFINE_GETIMPL_STORE(DIBasicType,
-                       (Tag, AlignInBits, Encoding, NumExtraInhabitants, Flags),
-                       Ops);
+  DEFINE_GETIMPL_STORE(
+      DIBasicType,
+      (Tag, AlignInBits, Encoding, NumExtraInhabitants, DataSizeInBits, Flags),
+      Ops);
 }
 
 std::optional<DIBasicType::Signedness> DIBasicType::getSignedness() const {
diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h
index e03f993297e54..2c9921df0422e 100644
--- a/llvm/lib/IR/LLVMContextImpl.h
+++ b/llvm/lib/IR/LLVMContextImpl.h
@@ -480,20 +480,22 @@ template <> struct MDNodeKeyImpl<DIBasicType> {
   uint32_t AlignInBits;
   unsigned Encoding;
   uint32_t NumExtraInhabitants;
+  uint32_t DataSizeInBits;
   unsigned Flags;
 
   MDNodeKeyImpl(unsigned Tag, MDString *Name, Metadata *SizeInBits,
                 uint32_t AlignInBits, unsigned Encoding,
-                uint32_t NumExtraInhabitants, unsigned Flags)
+                uint32_t NumExtraInhabitants, uint32_t DataSizeInBits,
+                unsigned Flags)
       : Tag(Tag), Name(Name), SizeInBits(SizeInBits), AlignInBits(AlignInBits),
         Encoding(Encoding), NumExtraInhabitants(NumExtraInhabitants),
-        Flags(Flags) {}
+        DataSizeInBits(DataSizeInBits), Flags(Flags) {}
   MDNodeKeyImpl(const DIBasicType *N)
       : Tag(N->getTag()), Name(N->getRawName()),
         SizeInBits(N->getRawSizeInBits()), AlignInBits(N->getAlignInBits()),
         Encoding(N->getEncoding()),
-        NumExtraInhabitants(N->getNumExtraInhabitants()), Flags(N->getFlags()) {
-  }
+        NumExtraInhabitants(N->getNumExtraInhabitants()),
+        DataSizeInBits(N->getDataSizeInBits()), Flags(N->getFlags()) {}
 
   bool isKeyOf(const DIBasicType *RHS) const {
     return Tag == RHS->getTag() && Name == RHS->getRawName() &&
@@ -501,6 +503,7 @@ template <> struct MDNodeKeyImpl<DIBasicType> {
            AlignInBits == RHS->getAlignInBits() &&
            Encoding == RHS->getEncoding() &&
            NumExtraInhabitants == RHS->getNumExtraInhabitants() &&
+           DataSizeInBits == RHS->getDataSizeInBits() &&
            Flags == RHS->getFlags();
   }
 
diff --git a/llvm/lib/IR/ModuleSummaryIndex.cpp b/llvm/lib/IR/ModuleSummaryIndex.cpp
index 62fd62caad3d6..33947542bcf16 100644
--- a/llvm/lib/IR/ModuleSummaryIndex.cpp
+++ b/llvm/lib/IR/ModuleSummaryIndex.cpp
@@ -34,8 +34,6 @@ static cl::opt<bool> ImportConstantsWithRefs(
     "import-constants-with-refs", cl::init(true), cl::Hidden,
     cl::desc("Import constant global variables with references"));
 
-constexpr uint32_t FunctionSummary::ParamAccess::RangeWidth;
-
 FunctionSummary FunctionSummary::ExternalNode =
     FunctionSummary::makeDummyFunctionSummary(
         SmallVector<FunctionSummary::EdgeTy, 0>());
@@ -88,8 +86,6 @@ std::pair<unsigned, unsigned> FunctionSummary::specialRefCounts() const {
   return {RORefCnt, WORefCnt};
 }
 
-constexpr uint64_t ModuleSummaryIndex::BitcodeSummaryVersion;
-
 uint64_t ModuleSummaryIndex::getFlags() const {
   uint64_t Flags = 0;
   // Flags & 0x4 is reserved. DO NOT REUSE.
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index b6182222f6f80..23be42f9d60ce 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1076,63 +1076,59 @@ Expected<ArrayRef<SymbolResolution>>
 LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
                 ArrayRef<SymbolResolution> Res) {
   llvm::TimeTraceScope timeScope("LTO add thin LTO");
+  const auto BMID = BM.getModuleIdentifier();
   ArrayRef<SymbolResolution> ResTmp = Res;
   for (const InputFile::Symbol &Sym : Syms) {
     assert(!ResTmp.empty());
     const SymbolResolution &R = ResTmp.consume_front();
 
-    if (!Sym.getIRName().empty()) {
+    if (!Sym.getIRName().empty() && R.Prevailing) {
       auto GUID = GlobalValue::getGUIDAssumingExternalLinkage(
           GlobalValue::getGlobalIdentifier(Sym.getIRName(),
                                            GlobalValue::ExternalLinkage, ""));
-      if (R.Prevailing)
-        ThinLTO.setPrevailingModuleForGUID(GUID, BM.getModuleIdentifier());
+      ThinLTO.setPrevailingModuleForGUID(GUID, BMID);
     }
   }
 
-  if (Error Err =
-          BM.readSummary(ThinLTO.CombinedIndex, BM.getModuleIdentifier(),
-                         [&](GlobalValue::GUID GUID) {
-                           return ThinLTO.isPrevailingModuleForGUID(
-                               GUID, BM.getModuleIdentifier());
-                         }))
+  if (Error Err = BM.readSummary(
+          ThinLTO.CombinedIndex, BMID, [&](GlobalValue::GUID GUID) {
+            return ThinLTO.isPrevailingModuleForGUID(GUID, BMID);
+          }))
     return Err;
-  LLVM_DEBUG(dbgs() << "Module " << BM.getModuleIdentifier() << "\n");
+  LLVM_DEBUG(dbgs() << "Module " << BMID << "\n");
 
   for (const InputFile::Symbol &Sym : Syms) {
     assert(!Res.empty());
     const SymbolResolution &R = Res.consume_front();
 
-    if (!Sym.getIRName().empty()) {
+    if (!Sym.getIRName().empty() &&
+        (R.Prevailing || R.FinalDefinitionInLinkageUnit)) {
       auto GUID = GlobalValue::getGUIDAssumingExternalLinkage(
           GlobalValue::getGlobalIdentifier(Sym.getIRName(),
                                            GlobalValue::ExternalLinkage, ""));
       if (R.Prevailing) {
-        assert(
-            ThinLTO.isPrevailingModuleForGUID(GUID, BM.getModuleIdentifier()));
+        assert(ThinLTO.isPrevailingModuleForGUID(GUID, BMID));
 
         // For linker redefined symbols (via --wrap or --defsym) we want to
         // switch the linkage to `weak` to prevent IPOs from happening.
         // Find the summary in the module for this very GV and record the new
         // linkage so that we can switch it when we import the GV.
         if (R.LinkerRedefined)
-          if (auto S = ThinLTO.CombinedIndex.findSummaryInModule(
-                  GUID, BM.getModuleIdentifier()))
+          if (auto S = ThinLTO.CombinedIndex.findSummaryInModule(GUID, BMID))
             S->setLinkage(GlobalValue::WeakAnyLinkage);
       }
 
       // If the linker resolved the symbol to a local definition then mark it
       // as local in the summary for the module we are adding.
       if (R.FinalDefinitionInLinkageUnit) {
-        if (auto S = ThinLTO.CombinedIndex.findSummaryInModule(
-                GUID, BM.getModuleIdentifier())) {
+        if (auto S = ThinLTO.CombinedIndex.findSummaryInModule(GUID, BMID)) {
           S->setDSOLocal(true);
         }
       }
     }
   }
 
-  if (!ThinLTO.ModuleMap.insert({BM.getModuleIdentifier(), BM}).second)
+  if (!ThinLTO.ModuleMap.insert({BMID, BM}).second)
     return make_error<StringError>(
         "Expected at most one ThinLTO module per bitcode file",
         inconvertibleErrorCode());
@@ -1143,10 +1139,10 @@ LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
     // This is a fuzzy name matching where only modules with name containing the
     // specified switch values are going to be compiled.
     for (const std::string &Name : Conf.ThinLTOModulesToCompile) {
-      if (BM.getModuleIdentifier().contains(Name)) {
-        ThinLTO.ModulesToCompile->insert({BM.getModuleIdentifier(), BM});
-        LLVM_DEBUG(dbgs() << "[ThinLTO] Selecting " << BM.getModuleIdentifier()
-                          << " to compile\n");
+      if (BMID.contains(Name)) {
+        ThinLTO.ModulesToCompile->insert({BMID, BM});
+        LLVM_DEBUG(dbgs() << "[ThinLTO] Selecting " << BMID << " to compile\n");
+        break;
       }
     }
   }
diff --git a/llvm/lib/MC/GOFFObjectWriter.cpp b/llvm/lib/MC/GOFFObjectWriter.cpp
index 71bd39763956e..a3eaaa743039d 100644
--- a/llvm/lib/MC/GOFFObjectWriter.cpp
+++ b/llvm/lib/MC/GOFFObjectWriter.cpp
@@ -520,7 +520,7 @@ GOFFObjectWriter::GOFFObjectWriter(
     std::unique_ptr<MCGOFFObjectTargetWriter> MOTW, raw_pwrite_stream &OS)
     : TargetObjectWriter(std::move(MOTW)), OS(OS) {}
 
-GOFFObjectWriter::~GOFFObjectWriter() {}
+GOFFObjectWriter::~GOFFObjectWriter() = default;
 
 uint64_t GOFFObjectWriter::writeObject() {
   uint64_t Size = GOFFWriter(OS, *Asm).writeObject();
diff --git a/llvm/lib/MC/MCDXContainerWriter.cpp b/llvm/lib/MC/MCDXContainerWriter.cpp
index 5eda039853ca8..ebed411454087 100644
--- a/llvm/lib/MC/MCDXContainerWriter.cpp
+++ b/llvm/lib/MC/MCDXContainerWriter.cpp
@@ -16,7 +16,7 @@
 
 using namespace llvm;
 
-MCDXContainerTargetWriter::~MCDXContainerTargetWriter() {}
+MCDXContainerTargetWriter::~MCDXContainerTargetWriter() = default;
 
 uint64_t DXContainerObjectWriter::writeObject() {
   auto &Asm = *this->Asm;
diff --git a/llvm/lib/MC/MCGOFFStreamer.cpp b/llvm/lib/MC/MCGOFFStreamer.cpp
index 8b228db0e8b30..ad6397bce70f0 100644
--- a/llvm/lib/MC/MCGOFFStreamer.cpp
+++ b/llvm/lib/MC/MCGOFFStreamer.cpp
@@ -20,7 +20,7 @@
 
 using namespace llvm;
 
-MCGOFFStreamer::~MCGOFFStreamer() {}
+MCGOFFStreamer::~MCGOFFStreamer() = default;
 
 GOFFObjectWriter &MCGOFFStreamer::getWriter() {
   return static_cast<GOFFObjectWriter &>(getAssembler().getWriter());
diff --git a/llvm/lib/MC/MCParser/AsmLexer.cpp b/llvm/lib/MC/MCParser/AsmLexer.cpp
index a6188f0676937..1af4a297babaa 100644
--- a/llvm/lib/MC/MCParser/AsmLexer.cpp
+++ b/llvm/lib/MC/MCParser/AsmLexer.cpp
@@ -16,7 +16,6 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCParser/AsmLexer.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/SMLoc.h"
 #include "llvm/Support/SaveAndRestore.h"
diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp
index dd1bc2be5feb4..3c9ab8e108ddd 100644
--- a/llvm/lib/MC/MCParser/AsmParser.cpp
+++ b/llvm/lib/MC/MCParser/AsmParser.cpp
@@ -228,11 +228,9 @@ class AsmParser : public MCAsmParser {
     AssemblerDialect = i;
   }
 
-  void Note(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt) override;
-  bool Warning(SMLoc L, const Twine &Msg,
-               SMRange Range = std::nullopt) override;
-  bool printError(SMLoc L, const Twine &Msg,
-                  SMRange Range = std::nullopt) override;
+  void Note(SMLoc L, const Twine &Msg, SMRange Range = {}) override;
+  bool Warning(SMLoc L, const Twine &Msg, SMRange Range = {}) override;
+  bool printError(SMLoc L, const Twine &Msg, SMRange Range = {}) override;
 
   const AsmToken &Lex() override;
 
@@ -312,7 +310,7 @@ class AsmParser : public MCAsmParser {
 
   void printMacroInstantiations();
   void printMessage(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Msg,
-                    SMRange Range = std::nullopt) const {
+                    SMRange Range = {}) const {
     ArrayRef<SMRange> Ranges(Range);
     SrcMgr.PrintMessage(Loc, Kind, Msg, Ranges);
   }
diff --git a/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/llvm/lib/MC/MCParser/ELFAsmParser.cpp
index 1a3752f71f065..911d92c51b59b 100644
--- a/llvm/lib/MC/MCParser/ELFAsmParser.cpp
+++ b/llvm/lib/MC/MCParser/ELFAsmParser.cpp
@@ -695,15 +695,15 @@ bool ELFAsmParser::parseDirectivePrevious(StringRef DirName, SMLoc) {
 
 static MCSymbolAttr MCAttrForString(StringRef Type) {
   return StringSwitch<MCSymbolAttr>(Type)
-          .Cases("STT_FUNC", "function", MCSA_ELF_TypeFunction)
-          .Cases("STT_OBJECT", "object", MCSA_ELF_TypeObject)
-          .Cases("STT_TLS", "tls_object", MCSA_ELF_TypeTLS)
-          .Cases("STT_COMMON", "common", MCSA_ELF_TypeCommon)
-          .Cases("STT_NOTYPE", "notype", MCSA_ELF_TypeNoType)
-          .Cases("STT_GNU_IFUNC", "gnu_indirect_function",
-                 MCSA_ELF_TypeIndFunction)
-          .Case("gnu_unique_object", MCSA_ELF_TypeGnuUniqueObject)
-          .Default(MCSA_Invalid);
+      .Cases({"STT_FUNC", "function"}, MCSA_ELF_TypeFunction)
+      .Cases({"STT_OBJECT", "object"}, MCSA_ELF_TypeObject)
+      .Cases({"STT_TLS", "tls_object"}, MCSA_ELF_TypeTLS)
+      .Cases({"STT_COMMON", "common"}, MCSA_ELF_TypeCommon)
+      .Cases({"STT_NOTYPE", "notype"}, MCSA_ELF_TypeNoType)
+      .Cases({"STT_GNU_IFUNC", "gnu_indirect_function"},
+             MCSA_ELF_TypeIndFunction)
+      .Case("gnu_unique_object", MCSA_ELF_TypeGnuUniqueObject)
+      .Default(MCSA_Invalid);
 }
 
 /// parseDirectiveELFType
diff --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp
index 8a8f11122673f..3a85770a2783d 100644
--- a/llvm/lib/MC/MCParser/MasmParser.cpp
+++ b/llvm/lib/MC/MCParser/MasmParser.cpp
@@ -483,11 +483,9 @@ class MasmParser : public MCAsmParser {
     AssemblerDialect = i;
   }
 
-  void Note(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt) override;
-  bool Warning(SMLoc L, const Twine &Msg,
-               SMRange Range = std::nullopt) override;
-  bool printError(SMLoc L, const Twine &Msg,
-                  SMRange Range = std::nullopt) override;
+  void Note(SMLoc L, const Twine &Msg, SMRange Range = {}) override;
+  bool Warning(SMLoc L, const Twine &Msg, SMRange Range = {}) override;
+  bool printError(SMLoc L, const Twine &Msg, SMRange Range = {}) override;
 
   enum ExpandKind { ExpandMacros, DoNotExpandMacros };
   const AsmToken &Lex(ExpandKind ExpandNextToken);
@@ -592,7 +590,7 @@ class MasmParser : public MCAsmParser {
   bool expandStatement(SMLoc Loc);
 
   void printMessage(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Msg,
-                    SMRange Range = std::nullopt) const {
+                    SMRange Range = {}) const {
     ArrayRef<SMRange> Ranges(Range);
     SrcMgr.PrintMessage(Loc, Kind, Msg, Ranges);
   }
@@ -5325,10 +5323,10 @@ void MasmParser::initializeDirectiveKindMap() {
 bool MasmParser::isMacroLikeDirective() {
   if (getLexer().is(AsmToken::Identifier)) {
     bool IsMacroLike = StringSwitch<bool>(getTok().getIdentifier())
-                           .CasesLower("repeat", "rept", true)
+                           .CasesLower({"repeat", "rept"}, true)
                            .CaseLower("while", true)
-                           .CasesLower("for", "irp", true)
-                           .CasesLower("forc", "irpc", true)
+                           .CasesLower({"for", "irp"}, true)
+                           .CasesLower({"forc", "irpc"}, true)
                            .Default(false);
     if (IsMacroLike)
       return true;
diff --git a/llvm/lib/ObjCopy/COFF/COFFWriter.h b/llvm/lib/ObjCopy/COFF/COFFWriter.h
index 66d7f01c87f18..3ee0e06b92ae4 100644
--- a/llvm/lib/ObjCopy/COFF/COFFWriter.h
+++ b/llvm/lib/ObjCopy/COFF/COFFWriter.h
@@ -50,7 +50,7 @@ class COFFWriter {
   Expected<uint32_t> virtualAddressToFileAddress(uint32_t RVA);
 
 public:
-  virtual ~COFFWriter() {}
+  virtual ~COFFWriter() = default;
   Error write();
 
   COFFWriter(Object &Obj, raw_ostream &Out)
diff --git a/llvm/lib/ObjCopy/ELF/ELFObject.h b/llvm/lib/ObjCopy/ELF/ELFObject.h
index 4f6473f515ddd..2783ef27ac9de 100644
--- a/llvm/lib/ObjCopy/ELF/ELFObject.h
+++ b/llvm/lib/ObjCopy/ELF/ELFObject.h
@@ -134,7 +134,7 @@ template <class ELFT> class ELFSectionWriter : public SectionWriter {
   using Elf_Sym = typename ELFT::Sym;
 
 public:
-  ~ELFSectionWriter() override {}
+  ~ELFSectionWriter() override = default;
   Error visit(const SymbolTableSection &Sec) override;
   Error visit(const RelocationSection &Sec) override;
   Error visit(const GnuDebugLinkSection &Sec) override;
@@ -180,7 +180,7 @@ template <class ELFT> class ELFSectionSizer : public MutableSectionVisitor {
 
 class BinarySectionWriter : public SectionWriter {
 public:
-  ~BinarySectionWriter() override {}
+  ~BinarySectionWriter() override = default;
 
   Error visit(const SymbolTableSection &Sec) override;
   Error visit(const RelocationSection &Sec) override;
@@ -346,7 +346,7 @@ template <class ELFT> class ELFWriter : public Writer {
   size_t totalSize() const;
 
 public:
-  ~ELFWriter() override {}
+  ~ELFWriter() override = default;
   bool WriteSectionHeaders;
 
   // For --only-keep-debug, select an alternative section/segment layout
@@ -367,7 +367,7 @@ class BinaryWriter : public Writer {
   uint64_t TotalSize = 0;
 
 public:
-  ~BinaryWriter() override {}
+  ~BinaryWriter() override = default;
   Error finalize() override;
   Error write() override;
   BinaryWriter(Object &Obj, raw_ostream &Out, const CommonConfig &Config)
@@ -784,7 +784,7 @@ class SectionIndexSection : public SectionBase {
   SymbolTableSection *Symbols = nullptr;
 
 public:
-  ~SectionIndexSection() override {}
+  ~SectionIndexSection() override = default;
   void addIndex(uint32_t Index) {
     assert(Size > 0);
     Indexes.push_back(Index);
diff --git a/llvm/lib/ObjCopy/MachO/MachOReader.h b/llvm/lib/ObjCopy/MachO/MachOReader.h
index e315e6fd9b117..940ba4c2d879e 100644
--- a/llvm/lib/ObjCopy/MachO/MachOReader.h
+++ b/llvm/lib/ObjCopy/MachO/MachOReader.h
@@ -23,7 +23,7 @@ namespace macho {
 // raw binaries and regular MachO object files.
 class Reader {
 public:
-  virtual ~Reader(){};
+  virtual ~Reader() = default;
   virtual Expected<std::unique_ptr<Object>> create() const = 0;
 };
 
diff --git a/llvm/lib/ObjCopy/XCOFF/XCOFFWriter.h b/llvm/lib/ObjCopy/XCOFF/XCOFFWriter.h
index 8620548ed5991..47639ad82fa75 100644
--- a/llvm/lib/ObjCopy/XCOFF/XCOFFWriter.h
+++ b/llvm/lib/ObjCopy/XCOFF/XCOFFWriter.h
@@ -20,7 +20,7 @@ namespace xcoff {
 
 class XCOFFWriter {
 public:
-  virtual ~XCOFFWriter() {}
+  virtual ~XCOFFWriter() = default;
   XCOFFWriter(Object &Obj, raw_ostream &Out) : Obj(Obj), Out(Out) {}
   Error write();
 
diff --git a/llvm/lib/Object/ELF.cpp b/llvm/lib/Object/ELF.cpp
index 6da97f9b3755d..354c51d66419c 100644
--- a/llvm/lib/Object/ELF.cpp
+++ b/llvm/lib/Object/ELF.cpp
@@ -831,17 +831,17 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF,
   };
 
   uint8_t Version = 0;
-  uint8_t Feature = 0;
+  uint16_t Feature = 0;
   BBAddrMap::Features FeatEnable{};
   while (!ULEBSizeErr && !MetadataDecodeErr && Cur &&
          Cur.tell() < Content.size()) {
     Version = Data.getU8(Cur);
     if (!Cur)
       break;
-    if (Version < 2 || Version > 4)
+    if (Version < 2 || Version > 5)
       return createError("unsupported SHT_LLVM_BB_ADDR_MAP version: " +
                          Twine(static_cast<int>(Version)));
-    Feature = Data.getU8(Cur); // Feature byte
+    Feature = Version < 5 ? Data.getU8(Cur) : Data.getU16(Cur);
     if (!Cur)
       break;
     auto FeatEnableOrErr = BBAddrMap::Features::decode(Feature);
@@ -858,6 +858,11 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF,
                          "basic block hash feature is enabled: version = " +
                          Twine(static_cast<int>(Version)) +
                          " feature = " + Twine(static_cast<int>(Feature)));
+    if (FeatEnable.PostLinkCfg && Version < 5)
+      return createError("version should be >= 5 for SHT_LLVM_BB_ADDR_MAP when "
+                         "post link cfg feature is enabled: version = " +
+                         Twine(static_cast<int>(Version)) +
+                         " feature = " + Twine(static_cast<int>(Feature)));
     uint32_t NumBlocksInBBRange = 0;
     uint32_t NumBBRanges = 1;
     typename ELFFile<ELFT>::uintX_t RangeBaseAddress = 0;
@@ -946,6 +951,10 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF,
         uint64_t BBF = FeatEnable.BBFreq
                            ? readULEB128As<uint64_t>(Data, Cur, ULEBSizeErr)
                            : 0;
+        uint32_t PostLinkBBFreq =
+            FeatEnable.PostLinkCfg
+                ? readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr)
+                : 0;
 
         // Branch probability
         llvm::SmallVector<PGOAnalysisMap::PGOBBEntry::SuccessorEntry, 2>
@@ -955,13 +964,20 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF,
           for (uint64_t I = 0; I < SuccCount; ++I) {
             uint32_t BBID = readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr);
             uint32_t BrProb = readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr);
+            uint32_t PostLinkFreq =
+                FeatEnable.PostLinkCfg
+                    ? readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr)
+                    : 0;
+
             if (PGOAnalyses)
-              Successors.push_back({BBID, BranchProbability::getRaw(BrProb)});
+              Successors.push_back(
+                  {BBID, BranchProbability::getRaw(BrProb), PostLinkFreq});
           }
         }
 
         if (PGOAnalyses)
-          PGOBBEntries.push_back({BlockFrequency(BBF), std::move(Successors)});
+          PGOBBEntries.push_back(
+              {BlockFrequency(BBF), PostLinkBBFreq, std::move(Successors)});
       }
 
       if (PGOAnalyses)
diff --git a/llvm/lib/Object/WindowsMachineFlag.cpp b/llvm/lib/Object/WindowsMachineFlag.cpp
index caf357e8c136f..14c14f693ca96 100644
--- a/llvm/lib/Object/WindowsMachineFlag.cpp
+++ b/llvm/lib/Object/WindowsMachineFlag.cpp
@@ -23,8 +23,8 @@ using namespace llvm;
 COFF::MachineTypes llvm::getMachineType(StringRef S) {
   // Flags must be a superset of Microsoft lib.exe /machine flags.
   return StringSwitch<COFF::MachineTypes>(S.lower())
-      .Cases("x64", "amd64", COFF::IMAGE_FILE_MACHINE_AMD64)
-      .Cases("x86", "i386", COFF::IMAGE_FILE_MACHINE_I386)
+      .Cases({"x64", "amd64"}, COFF::IMAGE_FILE_MACHINE_AMD64)
+      .Cases({"x86", "i386"}, COFF::IMAGE_FILE_MACHINE_I386)
       .Case("arm", COFF::IMAGE_FILE_MACHINE_ARMNT)
       .Case("arm64", COFF::IMAGE_FILE_MACHINE_ARM64)
       .Case("arm64ec", COFF::IMAGE_FILE_MACHINE_ARM64EC)
diff --git a/llvm/lib/ObjectYAML/ELFEmitter.cpp b/llvm/lib/ObjectYAML/ELFEmitter.cpp
index 8b75fbe8291f0..8530785d07c93 100644
--- a/llvm/lib/ObjectYAML/ELFEmitter.cpp
+++ b/llvm/lib/ObjectYAML/ELFEmitter.cpp
@@ -1465,13 +1465,19 @@ void ELFState<ELFT>::writeSectionContent(
   for (const auto &[Idx, E] : llvm::enumerate(*Section.Entries)) {
     // Write version and feature values.
     if (Section.Type == llvm::ELF::SHT_LLVM_BB_ADDR_MAP) {
-      if (E.Version > 4)
+      if (E.Version > 5)
         WithColor::warning() << "unsupported SHT_LLVM_BB_ADDR_MAP version: "
                              << static_cast<int>(E.Version)
                              << "; encoding using the most recent version";
       CBA.write(E.Version);
-      CBA.write(E.Feature);
-      SHeader.sh_size += 2;
+      SHeader.sh_size += 1;
+      if (E.Version < 5) {
+        CBA.write(static_cast<uint8_t>(E.Feature));
+        SHeader.sh_size += 1;
+      } else {
+        CBA.write<uint16_t>(E.Feature, ELFT::Endianness);
+        SHeader.sh_size += 2;
+      }
     }
     auto FeatureOrErr = llvm::object::BBAddrMap::Features::decode(E.Feature);
     bool MultiBBRangeFeatureEnabled = false;
@@ -1556,11 +1562,15 @@ void ELFState<ELFT>::writeSectionContent(
     for (const auto &PGOBBE : PGOBBEntries) {
       if (PGOBBE.BBFreq)
         SHeader.sh_size += CBA.writeULEB128(*PGOBBE.BBFreq);
+      if (FeatureOrErr->PostLinkCfg || PGOBBE.PostLinkBBFreq.has_value())
+        SHeader.sh_size += CBA.writeULEB128(PGOBBE.PostLinkBBFreq.value_or(0));
       if (PGOBBE.Successors) {
         SHeader.sh_size += CBA.writeULEB128(PGOBBE.Successors->size());
-        for (const auto &[ID, BrProb] : *PGOBBE.Successors) {
+        for (const auto &[ID, BrProb, PostLinkBrFreq] : *PGOBBE.Successors) {
           SHeader.sh_size += CBA.writeULEB128(ID);
           SHeader.sh_size += CBA.writeULEB128(BrProb);
+          if (FeatureOrErr->PostLinkCfg || PostLinkBrFreq.has_value())
+            SHeader.sh_size += CBA.writeULEB128(PostLinkBrFreq.value_or(0));
         }
       }
     }
diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp
index f8a84b075b779..e5e5fc20728e8 100644
--- a/llvm/lib/ObjectYAML/ELFYAML.cpp
+++ b/llvm/lib/ObjectYAML/ELFYAML.cpp
@@ -1886,7 +1886,7 @@ void MappingTraits<ELFYAML::BBAddrMapEntry>::mapping(
     IO &IO, ELFYAML::BBAddrMapEntry &E) {
   assert(IO.getContext() && "The IO context is not initialized");
   IO.mapRequired("Version", E.Version);
-  IO.mapOptional("Feature", E.Feature, Hex8(0));
+  IO.mapOptional("Feature", E.Feature, Hex16(0));
   IO.mapOptional("NumBBRanges", E.NumBBRanges);
   IO.mapOptional("BBRanges", E.BBRanges);
 }
@@ -1920,6 +1920,7 @@ void MappingTraits<ELFYAML::PGOAnalysisMapEntry::PGOBBEntry>::mapping(
     IO &IO, ELFYAML::PGOAnalysisMapEntry::PGOBBEntry &E) {
   assert(IO.getContext() && "The IO context is not initialized");
   IO.mapOptional("BBFreq", E.BBFreq);
+  IO.mapOptional("PostLinkBBFreq", E.PostLinkBBFreq);
   IO.mapOptional("Successors", E.Successors);
 }
 
@@ -1929,6 +1930,7 @@ void MappingTraits<ELFYAML::PGOAnalysisMapEntry::PGOBBEntry::SuccessorEntry>::
   assert(IO.getContext() && "The IO context is not initialized");
   IO.mapRequired("ID", E.ID);
   IO.mapRequired("BrProb", E.BrProb);
+  IO.mapOptional("PostLinkBrFreq", E.PostLinkBrFreq);
 }
 
 void MappingTraits<ELFYAML::GnuHashHeader>::mapping(IO &IO,
diff --git a/llvm/lib/ObjectYAML/GOFFYAML.cpp b/llvm/lib/ObjectYAML/GOFFYAML.cpp
index 60bc1f70274b2..ecd7fb646ea36 100644
--- a/llvm/lib/ObjectYAML/GOFFYAML.cpp
+++ b/llvm/lib/ObjectYAML/GOFFYAML.cpp
@@ -15,7 +15,7 @@
 namespace llvm {
 namespace GOFFYAML {
 
-Object::Object() {}
+Object::Object() = default;
 
 } // namespace GOFFYAML
 
diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp
index 7290a86503120..6b7e980d048a4 100644
--- a/llvm/lib/Passes/StandardInstrumentations.cpp
+++ b/llvm/lib/Passes/StandardInstrumentations.cpp
@@ -537,7 +537,7 @@ void IRChangedPrinter::handleAfter(StringRef PassID, std::string &Name,
   Out << "*** IR Dump After " << PassID << " on " << Name << " ***\n" << After;
 }
 
-IRChangedTester::~IRChangedTester() {}
+IRChangedTester::~IRChangedTester() = default;
 
 void IRChangedTester::registerCallbacks(PassInstrumentationCallbacks &PIC) {
   if (TestChanged != "")
@@ -1566,7 +1566,7 @@ void InLineChangePrinter::registerCallbacks(PassInstrumentationCallbacks &PIC) {
     TextChangeReporter<IRDataT<EmptyData>>::registerRequiredCallbacks(PIC);
 }
 
-TimeProfilingPassesHandler::TimeProfilingPassesHandler() {}
+TimeProfilingPassesHandler::TimeProfilingPassesHandler() = default;
 
 void TimeProfilingPassesHandler::registerCallbacks(
     PassInstrumentationCallbacks &PIC) {
diff --git a/llvm/lib/Remarks/RemarkFormat.cpp b/llvm/lib/Remarks/RemarkFormat.cpp
index 1c52e352f9392..f9fd4af20e047 100644
--- a/llvm/lib/Remarks/RemarkFormat.cpp
+++ b/llvm/lib/Remarks/RemarkFormat.cpp
@@ -19,7 +19,7 @@ using namespace llvm::remarks;
 
 Expected<Format> llvm::remarks::parseFormat(StringRef FormatStr) {
   auto Result = StringSwitch<Format>(FormatStr)
-                    .Cases("", "yaml", Format::YAML)
+                    .Cases({"", "yaml"}, Format::YAML)
                     .Case("bitstream", Format::Bitstream)
                     .Default(Format::Unknown);
 
diff --git a/llvm/lib/SandboxIR/Context.cpp b/llvm/lib/SandboxIR/Context.cpp
index 70ac68abbcb0d..6f5d072fb6913 100644
--- a/llvm/lib/SandboxIR/Context.cpp
+++ b/llvm/lib/SandboxIR/Context.cpp
@@ -443,7 +443,7 @@ Argument *Context::getOrCreateArgument(llvm::Argument *LLVMArg) {
 }
 
 Constant *Context::getOrCreateConstant(llvm::Constant *LLVMC) {
-  return cast<Constant>(getOrCreateValueInternal(LLVMC, 0));
+  return cast<Constant>(getOrCreateValueInternal(LLVMC, nullptr));
 }
 
 BasicBlock *Context::createBasicBlock(llvm::BasicBlock *LLVMBB) {
@@ -637,7 +637,7 @@ Context::Context(LLVMContext &LLVMCtx)
     : LLVMCtx(LLVMCtx), IRTracker(*this),
       LLVMIRBuilder(LLVMCtx, ConstantFolder()) {}
 
-Context::~Context() {}
+Context::~Context() = default;
 
 void Context::clear() {
   // TODO: Ideally we should clear only function-scope objects, and keep global
diff --git a/llvm/lib/Support/AArch64BuildAttributes.cpp b/llvm/lib/Support/AArch64BuildAttributes.cpp
index 4a6b2fd538803..be4d1f1a8914e 100644
--- a/llvm/lib/Support/AArch64BuildAttributes.cpp
+++ b/llvm/lib/Support/AArch64BuildAttributes.cpp
@@ -67,8 +67,8 @@ StringRef AArch64BuildAttributes::getTypeStr(unsigned Type) {
 }
 SubsectionType AArch64BuildAttributes::getTypeID(StringRef Type) {
   return StringSwitch<SubsectionType>(Type)
-      .Cases("uleb128", "ULEB128", ULEB128)
-      .Cases("ntbs", "NTBS", NTBS)
+      .Cases({"uleb128", "ULEB128"}, ULEB128)
+      .Cases({"ntbs", "NTBS"}, NTBS)
       .Default(TYPE_NOT_FOUND);
 }
 StringRef AArch64BuildAttributes::getSubsectionTypeUnknownError() {
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index e21cf8e13d4dc..e2645fa46bbcd 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -269,12 +269,6 @@ bool APFloatBase::isRepresentableBy(const fltSemantics &A,
          A.precision <= B.precision;
 }
 
-constexpr RoundingMode APFloatBase::rmNearestTiesToEven;
-constexpr RoundingMode APFloatBase::rmTowardPositive;
-constexpr RoundingMode APFloatBase::rmTowardNegative;
-constexpr RoundingMode APFloatBase::rmTowardZero;
-constexpr RoundingMode APFloatBase::rmNearestTiesToAway;
-
 /* A tight upper bound on number of parts required to hold the value
    pow(5, power) is
 
diff --git a/llvm/lib/Support/AutoConvert.cpp b/llvm/lib/Support/AutoConvert.cpp
index 0b6928e10ef5a..741bb7bd2c5b0 100644
--- a/llvm/lib/Support/AutoConvert.cpp
+++ b/llvm/lib/Support/AutoConvert.cpp
@@ -96,7 +96,7 @@ std::error_code llvm::setzOSFileTag(int FD, int CCSID, bool Text) {
   return std::error_code();
 }
 
-ErrorOr<__ccsid_t> llvm::getzOSFileTag(const char *FileName, const int FD) {
+ErrorOr<__ccsid_t> llvm::getzOSFileTag(const Twine &FileName, const int FD) {
   // If we have a file descriptor, use it to find out file tagging. Otherwise we
   // need to use stat() with the file path.
   if (FD != -1) {
@@ -110,12 +110,12 @@ ErrorOr<__ccsid_t> llvm::getzOSFileTag(const char *FileName, const int FD) {
     return Query.fccsid;
   }
   struct stat Attr;
-  if (stat(FileName, &Attr) == -1)
+  if (stat(FileName.str().c_str(), &Attr) == -1)
     return std::error_code(errno, std::generic_category());
   return Attr.st_tag.ft_ccsid;
 }
 
-ErrorOr<bool> llvm::needzOSConversion(const char *FileName, const int FD) {
+ErrorOr<bool> llvm::needzOSConversion(const Twine &FileName, const int FD) {
   ErrorOr<__ccsid_t> Ccsid = getzOSFileTag(FileName, FD);
   if (std::error_code EC = Ccsid.getError())
     return EC;
diff --git a/llvm/lib/Support/BranchProbability.cpp b/llvm/lib/Support/BranchProbability.cpp
index e3763449d16cb..143e58a05d3b7 100644
--- a/llvm/lib/Support/BranchProbability.cpp
+++ b/llvm/lib/Support/BranchProbability.cpp
@@ -20,8 +20,6 @@
 
 using namespace llvm;
 
-constexpr uint32_t BranchProbability::D;
-
 raw_ostream &BranchProbability::print(raw_ostream &OS) const {
   if (isUnknown())
     return OS << "?%";
@@ -111,3 +109,10 @@ uint64_t BranchProbability::scale(uint64_t Num) const {
 uint64_t BranchProbability::scaleByInverse(uint64_t Num) const {
   return ::scale<0>(Num, D, N);
 }
+
+BranchProbability BranchProbability::pow(unsigned N) const {
+  BranchProbability Res = BranchProbability::getOne();
+  for (unsigned I = 0; I < N; ++I)
+    Res *= *this;
+  return Res;
+}
diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp
index 9491ec049f79d..de5bd795403dc 100644
--- a/llvm/lib/Support/CommandLine.cpp
+++ b/llvm/lib/Support/CommandLine.cpp
@@ -382,7 +382,7 @@ class CommandLineParser {
     RegisteredSubCommands.erase(sub);
   }
 
-  iterator_range<typename SmallPtrSet<SubCommand *, 4>::iterator>
+  iterator_range<SmallPtrSet<SubCommand *, 4>::iterator>
   getRegisteredSubcommands() {
     return make_range(RegisteredSubCommands.begin(),
                       RegisteredSubCommands.end());
@@ -2830,7 +2830,7 @@ StringMap<Option *> &cl::getRegisteredOptions(SubCommand &Sub) {
   return Sub.OptionsMap;
 }
 
-iterator_range<typename SmallPtrSet<SubCommand *, 4>::iterator>
+iterator_range<SmallPtrSet<SubCommand *, 4>::iterator>
 cl::getRegisteredSubcommands() {
   return GlobalParser->getRegisteredSubcommands();
 }
diff --git a/llvm/lib/Support/MemoryBuffer.cpp b/llvm/lib/Support/MemoryBuffer.cpp
index 1c4645ad83641..23b9f8c5790d2 100644
--- a/llvm/lib/Support/MemoryBuffer.cpp
+++ b/llvm/lib/Support/MemoryBuffer.cpp
@@ -512,7 +512,7 @@ getOpenFileImpl(sys::fs::file_t FD, const Twine &Filename, uint64_t FileSize,
   }
 
 #ifdef __MVS__
-  ErrorOr<bool> NeedsConversion = needConversion(Filename.str().c_str(), FD);
+  ErrorOr<bool> NeedsConversion = needConversion(Filename, FD);
   if (std::error_code EC = NeedsConversion.getError())
     return EC;
   // File size may increase due to EBCDIC -> UTF-8 conversion, therefore we
diff --git a/llvm/lib/Support/StringRef.cpp b/llvm/lib/Support/StringRef.cpp
index b6a2f8aeadccf..2e8fba8cbfa37 100644
--- a/llvm/lib/Support/StringRef.cpp
+++ b/llvm/lib/Support/StringRef.cpp
@@ -17,11 +17,6 @@
 
 using namespace llvm;
 
-// MSVC emits references to this into the translation units which reference it.
-#ifndef _MSC_VER
-constexpr size_t StringRef::npos;
-#endif
-
 // strncasecmp() is not available on non-POSIX systems, so define an
 // alternative function here.
 static int ascii_strncasecmp(StringRef LHS, StringRef RHS) {
diff --git a/llvm/lib/Support/UnicodeNameToCodepoint.cpp b/llvm/lib/Support/UnicodeNameToCodepoint.cpp
index 6f8e0915ab632..8f0d24ea1c1c6 100644
--- a/llvm/lib/Support/UnicodeNameToCodepoint.cpp
+++ b/llvm/lib/Support/UnicodeNameToCodepoint.cpp
@@ -251,10 +251,10 @@ constexpr const char *const HangulSyllables[][3] = {
 
 // Unicode 15.0
 // 3.12 Conjoining Jamo Behavior Common constants
-constexpr const char32_t SBase = 0xAC00;
-constexpr const uint32_t LCount = 19;
-constexpr const uint32_t VCount = 21;
-constexpr const uint32_t TCount = 28;
+constexpr char32_t SBase = 0xAC00;
+constexpr uint32_t LCount = 19;
+constexpr uint32_t VCount = 21;
+constexpr uint32_t TCount = 28;
 
 static std::size_t findSyllable(StringRef Name, bool Strict,
                                 char &PreviousInName, int &Pos, int Column) {
diff --git a/llvm/lib/Support/Windows/Signals.inc b/llvm/lib/Support/Windows/Signals.inc
index 648d6a50287ec..da68994970ebb 100644
--- a/llvm/lib/Support/Windows/Signals.inc
+++ b/llvm/lib/Support/Windows/Signals.inc
@@ -421,8 +421,13 @@ bool sys::RemoveFileOnSignal(StringRef Filename, std::string *ErrMsg) {
     return true;
   }
 
-  if (FilesToRemove == NULL)
+  if (FilesToRemove == NULL) {
     FilesToRemove = new std::vector<std::string>;
+    std::atexit([]() {
+      delete FilesToRemove;
+      FilesToRemove = NULL;
+    });
+  }
 
   FilesToRemove->push_back(std::string(Filename));
 
diff --git a/llvm/lib/Support/raw_ostream.cpp b/llvm/lib/Support/raw_ostream.cpp
index 07b99896543bd..d6f27fb7e7b63 100644
--- a/llvm/lib/Support/raw_ostream.cpp
+++ b/llvm/lib/Support/raw_ostream.cpp
@@ -61,17 +61,6 @@
 
 using namespace llvm;
 
-constexpr raw_ostream::Colors raw_ostream::BLACK;
-constexpr raw_ostream::Colors raw_ostream::RED;
-constexpr raw_ostream::Colors raw_ostream::GREEN;
-constexpr raw_ostream::Colors raw_ostream::YELLOW;
-constexpr raw_ostream::Colors raw_ostream::BLUE;
-constexpr raw_ostream::Colors raw_ostream::MAGENTA;
-constexpr raw_ostream::Colors raw_ostream::CYAN;
-constexpr raw_ostream::Colors raw_ostream::WHITE;
-constexpr raw_ostream::Colors raw_ostream::SAVEDCOLOR;
-constexpr raw_ostream::Colors raw_ostream::RESET;
-
 raw_ostream::~raw_ostream() {
   // raw_ostream's subclasses should take care to flush the buffer
   // in their destructors.
diff --git a/llvm/lib/Support/raw_socket_stream.cpp b/llvm/lib/Support/raw_socket_stream.cpp
index 3b510d357fd5d..f71631730d072 100644
--- a/llvm/lib/Support/raw_socket_stream.cpp
+++ b/llvm/lib/Support/raw_socket_stream.cpp
@@ -332,7 +332,7 @@ ListeningSocket::~ListeningSocket() {
 raw_socket_stream::raw_socket_stream(int SocketFD)
     : raw_fd_stream(SocketFD, true) {}
 
-raw_socket_stream::~raw_socket_stream() {}
+raw_socket_stream::~raw_socket_stream() = default;
 
 Expected<std::unique_ptr<raw_socket_stream>>
 raw_socket_stream::createConnectedUnix(StringRef SocketPath) {
diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp
index 30eae6e7837cb..e8e64695e1ac4 100644
--- a/llvm/lib/TableGen/TGLexer.cpp
+++ b/llvm/lib/TableGen/TGLexer.cpp
@@ -682,8 +682,10 @@ tgtok::TokKind TGLexer::LexExclaim() {
           .Case("instances", tgtok::XInstances)
           .Case("substr", tgtok::XSubstr)
           .Case("find", tgtok::XFind)
-          .Cases("setdagop", "setop", tgtok::XSetDagOp) // !setop is deprecated.
-          .Cases("getdagop", "getop", tgtok::XGetDagOp) // !getop is deprecated.
+          .Cases({"setdagop", "setop"},
+                 tgtok::XSetDagOp) // !setop is deprecated.
+          .Cases({"getdagop", "getop"},
+                 tgtok::XGetDagOp) // !getop is deprecated.
           .Case("setdagopname", tgtok::XSetDagOpName)
           .Case("getdagopname", tgtok::XGetDagOpName)
           .Case("getdagarg", tgtok::XGetDagArg)
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index b3ec65cab51fa..278314792bfb9 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -366,6 +366,7 @@ def AArch64PostLegalizerCombiner
                         select_to_minmax, or_to_bsp, combine_concat_vector,
                         commute_constant_to_rhs, extract_vec_elt_combines,
                         push_freeze_to_prevent_poison_from_propagating,
-                        combine_mul_cmlt, combine_use_vector_truncate, 
-                        extmultomull, truncsat_combines, lshr_of_trunc_of_lshr]> {
+                        combine_mul_cmlt, combine_use_vector_truncate,
+                        extmultomull, truncsat_combines, lshr_of_trunc_of_lshr,
+                        funnel_shift_from_or_shift_constants_are_legal]> {
 }
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d16b11686e3c1..60aa61e993b26 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9028,11 +9028,12 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
   CallingConv::ID CallerCC = CallerF.getCallingConv();
 
   // SME Streaming functions are not eligible for TCO as they may require
-  // the streaming mode or ZA to be restored after returning from the call.
+  // the streaming mode or ZA/ZT0 to be restored after returning from the call.
   SMECallAttrs CallAttrs =
       getSMECallAttrs(CallerF, getRuntimeLibcallsInfo(), CLI);
   if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
       CallAttrs.requiresPreservingAllZAState() ||
+      CallAttrs.requiresPreservingZT0() ||
       CallAttrs.caller().hasStreamingBody())
     return false;
 
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 457e5402e0f46..ccc8eb8a9706d 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -122,7 +122,7 @@ unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
     NumBytes = Desc.getSize() ? Desc.getSize() : 4;
 
     const auto *MFI = MF->getInfo<AArch64FunctionInfo>();
-    if (!MFI->shouldSignReturnAddress(MF))
+    if (!MFI->shouldSignReturnAddress(*MF))
       return NumBytes;
 
     const auto &STI = MF->getSubtarget<AArch64Subtarget>();
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index b9e299ef37454..2871a20e28b65 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1805,14 +1805,22 @@ def : SHA3_pattern<EOR3, int_aarch64_crypto_eor3u, v8i16>;
 def : SHA3_pattern<EOR3, int_aarch64_crypto_eor3u, v4i32>;
 def : SHA3_pattern<EOR3, int_aarch64_crypto_eor3u, v2i64>;
 
-class EOR3_pattern<ValueType VecTy>
-  : Pat<(xor (xor (VecTy V128:$Vn), (VecTy V128:$Vm)), (VecTy V128:$Va)),
-        (EOR3 (VecTy V128:$Vn), (VecTy V128:$Vm), (VecTy V128:$Va))>;
-
-def : EOR3_pattern<v16i8>;
-def : EOR3_pattern<v8i16>;
-def : EOR3_pattern<v4i32>;
-def : EOR3_pattern<v2i64>;
+multiclass EOR3_pattern<ValueType Vec128Ty, ValueType Vec64Ty>{
+  def : Pat<(xor (xor (Vec128Ty V128:$Vn), (Vec128Ty V128:$Vm)), (Vec128Ty V128:$Va)),
+        (EOR3 (Vec128Ty V128:$Vn), (Vec128Ty V128:$Vm), (Vec128Ty V128:$Va))>;
+  def : Pat<(xor (xor (Vec64Ty V64:$Vn), (Vec64Ty V64:$Vm)), (Vec64Ty V64:$Va)),
+            (EXTRACT_SUBREG
+              (EOR3
+                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vn, dsub),
+                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vm, dsub),
+                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Va, dsub)),
+              dsub)>;
+}
+
+defm : EOR3_pattern<v16i8, v8i8>;
+defm : EOR3_pattern<v8i16, v4i16>;
+defm : EOR3_pattern<v4i32, v2i32>;
+defm : EOR3_pattern<v2i64, v1i64>;
 
 class BCAX_pattern<ValueType VecTy>
   : Pat<(xor (VecTy V128:$Vn), (and (VecTy V128:$Vm), (vnot (VecTy V128:$Va)))),
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index fede586cf35bc..5b5565afd62b1 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -308,9 +308,9 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
   return (EffectiveCallerBits & EffectiveCalleeBits) == EffectiveCalleeBits;
 }
 
-bool AArch64TTIImpl::areTypesABICompatible(
-    const Function *Caller, const Function *Callee,
-    const ArrayRef<Type *> &Types) const {
+bool AArch64TTIImpl::areTypesABICompatible(const Function *Caller,
+                                           const Function *Callee,
+                                           ArrayRef<Type *> Types) const {
   if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
     return false;
 
@@ -1032,6 +1032,13 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     }
     break;
   }
+  case Intrinsic::experimental_vector_extract_last_active:
+    if (ST->isSVEorStreamingSVEAvailable()) {
+      auto [LegalCost, _] = getTypeLegalizationCost(ICA.getArgTypes()[0]);
+      // This should turn into chained clastb instructions.
+      return LegalCost;
+    }
+    break;
   default:
     break;
   }
@@ -2220,7 +2227,7 @@ static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
   return std::nullopt;
 }
 
-template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
+template <Intrinsic::ID MulOpc, Intrinsic::ID FuseOpc>
 static std::optional<Instruction *>
 instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
                                   bool MergeIntoAddendOp) {
@@ -6650,10 +6657,15 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
           Ops.push_back(&Ext->getOperandUse(0));
         Ops.push_back(&Op);
 
-        if (isa<SExtInst>(Ext))
+        if (isa<SExtInst>(Ext)) {
           NumSExts++;
-        else
+        } else {
           NumZExts++;
+          // A zext(a) is also a sext(zext(a)), if we take more than 2 steps.
+          if (Ext->getOperand(0)->getType()->getScalarSizeInBits() * 2 <
+              I->getType()->getScalarSizeInBits())
+            NumSExts++;
+        }
 
         continue;
       }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index fe2e849258e3f..b39546a9a381d 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -84,7 +84,7 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
                            const Function *Callee) const override;
 
   bool areTypesABICompatible(const Function *Caller, const Function *Callee,
-                             const ArrayRef<Type *> &Types) const override;
+                             ArrayRef<Type *> Types) const override;
 
   unsigned getInlineCallPenalty(const Function *F, const CallBase &Call,
                                 unsigned DefaultCallPenalty) const override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index ce2b4a5f6f2e9..67042b700c047 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -69,7 +69,7 @@ FunctionPass *createAMDGPUPreloadKernArgPrologLegacyPass();
 ModulePass *createAMDGPUPreloadKernelArgumentsLegacyPass(const TargetMachine *);
 
 struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> {
-  AMDGPUSimplifyLibCallsPass() {}
+  AMDGPUSimplifyLibCallsPass() = default;
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
 };
 
@@ -371,13 +371,13 @@ class AMDGPUPreloadKernelArgumentsPass
 class AMDGPUAnnotateUniformValuesPass
     : public PassInfoMixin<AMDGPUAnnotateUniformValuesPass> {
 public:
-  AMDGPUAnnotateUniformValuesPass() {}
+  AMDGPUAnnotateUniformValuesPass() = default;
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
 };
 
 class SIModeRegisterPass : public PassInfoMixin<SIModeRegisterPass> {
 public:
-  SIModeRegisterPass() {}
+  SIModeRegisterPass() = default;
   PreservedAnalyses run(MachineFunction &F, MachineFunctionAnalysisManager &AM);
 };
 
@@ -562,9 +562,13 @@ class AMDGPURewriteAGPRCopyMFMAPass
 void initializeAMDGPURewriteAGPRCopyMFMALegacyPass(PassRegistry &);
 extern char &AMDGPURewriteAGPRCopyMFMALegacyID;
 
+void initializeAMDGPUUniformIntrinsicCombineLegacyPass(PassRegistry &);
+extern char &AMDGPUUniformIntrinsicCombineLegacyPassID;
+FunctionPass *createAMDGPUUniformIntrinsicCombineLegacyPass();
+
 struct AMDGPUUniformIntrinsicCombinePass
     : public PassInfoMixin<AMDGPUUniformIntrinsicCombinePass> {
-  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
 };
 
 namespace AMDGPU {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
index 1064e57b9da9e..dad94b83aa84f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -96,7 +96,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) {
 }
 
 struct KernArgPreloadDescriptor : public ArgDescriptor {
-  KernArgPreloadDescriptor() {}
+  KernArgPreloadDescriptor() = default;
   SmallVector<MCRegister> Regs;
 };
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 9907c88f4dfb8..8669978637f40 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1555,7 +1555,7 @@ struct AAAMDGPUClusterDimsFunction : public AAAMDGPUClusterDims {
 
   AMDGPU::ClusterDimsAttr Attr;
 
-  static constexpr const char AttrName[] = "amdgpu-cluster-dims";
+  static constexpr char AttrName[] = "amdgpu-cluster-dims";
 };
 
 AAAMDGPUClusterDims &
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
index 0eb00cbc2f466..529da8d28a3c1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -50,6 +50,7 @@ const D16ImageDimIntrinsic *lookupD16ImageDimIntrinsic(unsigned Intr);
 struct ImageDimIntrinsicInfo {
   unsigned Intr;
   unsigned BaseOpcode;
+  unsigned AtomicNoRetBaseOpcode;
   MIMGDim Dim;
 
   uint8_t NumOffsetArgs;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 97c2c9c5316b3..0c977416f1793 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -221,12 +221,21 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
 bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
   const DebugLoc &DL = I.getDebugLoc();
   MachineBasicBlock *BB = I.getParent();
+  Register VCCReg = I.getOperand(1).getReg();
+  MachineInstr *Cmp;
+
+  // Set SCC as a side effect with S_CMP or S_OR.
+  if (STI.hasScalarCompareEq64()) {
+    unsigned CmpOpc =
+        STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
+    Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)).addReg(VCCReg).addImm(0);
+  } else {
+    Register DeadDst = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+    Cmp = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_OR_B64), DeadDst)
+              .addReg(VCCReg)
+              .addReg(VCCReg);
+  }
 
-  unsigned CmpOpc =
-      STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
-  MachineInstr *Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc))
-                          .addReg(I.getOperand(1).getReg())
-                          .addImm(0);
   if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI))
     return false;
 
@@ -2006,19 +2015,27 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
   MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
   MachineBasicBlock *MBB = MI.getParent();
   const DebugLoc &DL = MI.getDebugLoc();
+  unsigned IntrOpcode = Intr->BaseOpcode;
+
+  // For image atomic: use no-return opcode if result is unused.
+  if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode) {
+    Register ResultDef = MI.getOperand(0).getReg();
+    if (MRI->use_nodbg_empty(ResultDef))
+      IntrOpcode = Intr->AtomicNoRetBaseOpcode;
+  }
 
   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
-    AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
+      AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode);
 
   const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
-  unsigned IntrOpcode = Intr->BaseOpcode;
   const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
   const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
   const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
 
   const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
 
-  Register VDataIn, VDataOut;
+  Register VDataIn = AMDGPU::NoRegister;
+  Register VDataOut = AMDGPU::NoRegister;
   LLT VDataTy;
   int NumVDataDwords = -1;
   bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
@@ -2049,7 +2066,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
   unsigned DMaskLanes = 0;
 
   if (BaseOpcode->Atomic) {
-    VDataOut = MI.getOperand(0).getReg();
+    if (!BaseOpcode->NoReturn)
+      VDataOut = MI.getOperand(0).getReg();
     VDataIn = MI.getOperand(2).getReg();
     LLT Ty = MRI->getType(VDataIn);
 
@@ -2099,8 +2117,9 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
   assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
 
   unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
-  if (BaseOpcode->Atomic)
-    CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
+  // Keep GLC only when the atomic's result is actually used.
+  if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
+    CPol |= AMDGPU::CPol::GLC;
   if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
                AMDGPU::CPol::VOLATILE))
     return false;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
index 1e6589eb42c15..d7d0292083e1c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
@@ -58,6 +58,8 @@ class AMDGPULowerVGPREncoding {
   static constexpr unsigned BitsPerField = 2;
   static constexpr unsigned NumFields = 4;
   static constexpr unsigned FieldMask = (1 << BitsPerField) - 1;
+  static constexpr unsigned ModeWidth = NumFields * BitsPerField;
+  static constexpr unsigned ModeMask = (1 << ModeWidth) - 1;
   using ModeType = PackedVector<unsigned, BitsPerField,
                                 std::bitset<BitsPerField * NumFields>>;
 
@@ -82,12 +84,12 @@ class AMDGPULowerVGPREncoding {
   const SIInstrInfo *TII;
   const SIRegisterInfo *TRI;
 
+  // Current basic block.
+  MachineBasicBlock *MBB;
+
   /// Most recent s_set_* instruction.
   MachineInstr *MostRecentModeSet;
 
-  /// Whether the current mode is known.
-  bool CurrentModeKnown;
-
   /// Current mode bits.
   ModeTy CurrentMode;
 
@@ -108,10 +110,13 @@ class AMDGPULowerVGPREncoding {
   MachineInstr *Clause;
 
   /// Insert mode change before \p I. \returns true if mode was changed.
-  bool setMode(ModeTy NewMode, ModeTy Mask, MachineInstr *I);
+  bool setMode(ModeTy NewMode, ModeTy Mask,
+               MachineBasicBlock::instr_iterator I);
 
   /// Reset mode to default.
-  void resetMode(MachineInstr *I) { setMode(ModeTy(), ModeTy::fullMask(), I); }
+  void resetMode(MachineBasicBlock::instr_iterator I) {
+    setMode(ModeTy(), ModeTy::fullMask(), I);
+  }
 
   /// If \p MO references VGPRs, return the MSBs. Otherwise, return nullopt.
   std::optional<unsigned> getMSBs(const MachineOperand &MO) const;
@@ -130,38 +135,43 @@ class AMDGPULowerVGPREncoding {
   /// Check if an instruction \p I is within a clause and returns a suitable
   /// iterator to insert mode change. It may also modify the S_CLAUSE
   /// instruction to extend it or drop the clause if it cannot be adjusted.
-  MachineInstr *handleClause(MachineInstr *I);
+  MachineBasicBlock::instr_iterator
+  handleClause(MachineBasicBlock::instr_iterator I);
 };
 
 bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode, ModeTy Mask,
-                                      MachineInstr *I) {
+                                      MachineBasicBlock::instr_iterator I) {
   assert((NewMode.raw_bits() & ~Mask.raw_bits()).none());
 
-  if (CurrentModeKnown) {
-    auto Delta = NewMode.raw_bits() ^ CurrentMode.raw_bits();
+  auto Delta = NewMode.raw_bits() ^ CurrentMode.raw_bits();
 
-    if ((Delta & Mask.raw_bits()).none()) {
-      CurrentMask |= Mask;
-      return false;
-    }
+  if ((Delta & Mask.raw_bits()).none()) {
+    CurrentMask |= Mask;
+    return false;
+  }
 
-    if (MostRecentModeSet && (Delta & CurrentMask.raw_bits()).none()) {
-      CurrentMode |= NewMode;
-      CurrentMask |= Mask;
+  if (MostRecentModeSet && (Delta & CurrentMask.raw_bits()).none()) {
+    CurrentMode |= NewMode;
+    CurrentMask |= Mask;
 
-      MostRecentModeSet->getOperand(0).setImm(CurrentMode);
-      return true;
-    }
+    MachineOperand &Op = MostRecentModeSet->getOperand(0);
+
+    // Carry old mode bits from the existing instruction.
+    int64_t OldModeBits = Op.getImm() & (ModeMask << ModeWidth);
+
+    Op.setImm(CurrentMode | OldModeBits);
+    return true;
   }
 
+  // Record previous mode into high 8 bits of the immediate.
+  int64_t OldModeBits = CurrentMode << ModeWidth;
+
   I = handleClause(I);
-  MostRecentModeSet =
-      BuildMI(*I->getParent(), I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB))
-          .addImm(NewMode);
+  MostRecentModeSet = BuildMI(*MBB, I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB))
+                          .addImm(NewMode | OldModeBits);
 
   CurrentMode = NewMode;
   CurrentMask = Mask;
-  CurrentModeKnown = true;
   return true;
 }
 
@@ -233,21 +243,22 @@ bool AMDGPULowerVGPREncoding::runOnMachineInstr(MachineInstr &MI) {
   if (Ops.first) {
     ModeTy NewMode, Mask;
     computeMode(NewMode, Mask, MI, Ops.first, Ops.second);
-    return setMode(NewMode, Mask, &MI);
+    return setMode(NewMode, Mask, MI.getIterator());
   }
   assert(!TII->hasVGPRUses(MI) || MI.isMetaInstruction() || MI.isPseudo());
 
   return false;
 }
 
-MachineInstr *AMDGPULowerVGPREncoding::handleClause(MachineInstr *I) {
+MachineBasicBlock::instr_iterator
+AMDGPULowerVGPREncoding::handleClause(MachineBasicBlock::instr_iterator I) {
   if (!ClauseRemaining)
     return I;
 
   // A clause cannot start with a special instruction, place it right before
   // the clause.
   if (ClauseRemaining == ClauseLen) {
-    I = Clause->getPrevNode();
+    I = Clause->getPrevNode()->getIterator();
     assert(I->isBundle());
     return I;
   }
@@ -284,9 +295,9 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) {
   ClauseLen = ClauseRemaining = 0;
   CurrentMode.reset();
   CurrentMask.reset();
-  CurrentModeKnown = true;
   for (auto &MBB : MF) {
     MostRecentModeSet = nullptr;
+    this->MBB = &MBB;
 
     for (auto &MI : llvm::make_early_inc_range(MBB.instrs())) {
       if (MI.isMetaInstruction())
@@ -294,17 +305,16 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) {
 
       if (MI.isTerminator() || MI.isCall()) {
         if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
-            MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
+            MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED)
           CurrentMode.reset();
-          CurrentModeKnown = true;
-        } else
-          resetMode(&MI);
+        else
+          resetMode(MI.getIterator());
         continue;
       }
 
       if (MI.isInlineAsm()) {
         if (TII->hasVGPRUses(MI))
-          resetMode(&MI);
+          resetMode(MI.getIterator());
         continue;
       }
 
@@ -323,14 +333,8 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) {
         --ClauseRemaining;
     }
 
-    // If we're falling through to a block that has at least one other
-    // predecessor, we no longer know the mode.
-    MachineBasicBlock *Next = MBB.getNextNode();
-    if (Next && Next->pred_size() >= 2 &&
-        llvm::is_contained(Next->predecessors(), &MBB)) {
-      if (CurrentMode.raw_bits().any())
-        CurrentModeKnown = false;
-    }
+    // Reset the mode if we are falling through.
+    resetMode(MBB.instr_end());
   }
 
   return Changed;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 680e7eb3de6be..844649ebb9ae6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -412,7 +412,7 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
                              *OutStreamer);
 
     if (isVerbose() && MI->getOpcode() == AMDGPU::S_SET_VGPR_MSB) {
-      unsigned V = MI->getOperand(0).getImm();
+      unsigned V = MI->getOperand(0).getImm() & 0xff;
       OutStreamer->AddComment(
           " msbs: dst=" + Twine(V >> 6) + " src0=" + Twine(V & 3) +
           " src1=" + Twine((V >> 2) & 3) + " src2=" + Twine((V >> 4) & 3));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index a6074eaf78fd0..bf6f1a9dbf576 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -30,7 +30,6 @@ MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass(
 MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
 MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this))
 MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))
-MODULE_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass())
 #undef MODULE_PASS
 
 #ifndef MODULE_PASS_WITH_PARAMS
@@ -69,6 +68,7 @@ FUNCTION_PASS("amdgpu-unify-divergent-exit-nodes",
               AMDGPUUnifyDivergentExitNodesPass())
 FUNCTION_PASS("amdgpu-usenative", AMDGPUUseNativeCallsPass())
 FUNCTION_PASS("si-annotate-control-flow", SIAnnotateControlFlowPass(*static_cast<const GCNTargetMachine *>(this)))
+FUNCTION_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass())
 #undef FUNCTION_PASS
 
 #ifndef FUNCTION_ANALYSIS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
index cf2ab82537800..a3be0f51c2c2f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
@@ -48,7 +48,7 @@ class AMDGPUPerfHintAnalysis {
   FuncInfoMap FIM;
 
 public:
-  AMDGPUPerfHintAnalysis() {}
+  AMDGPUPerfHintAnalysis() = default;
 
   // OldPM
   bool runOnSCC(const GCNTargetMachine &TM, CallGraphSCC &SCC);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
index e1879598f098a..907f8300de6d2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
@@ -24,6 +24,7 @@
 #include "llvm/CodeGen/GlobalISel/CSEInfo.h"
 #include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
 #include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineUniformityAnalysis.h"
@@ -34,9 +35,17 @@
 
 using namespace llvm;
 using namespace AMDGPU;
+using namespace llvm::MIPatternMatch;
 
 namespace {
 
+// AMDGPU-specific pattern matchers
+template <typename SrcTy>
+inline UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE>
+m_GAMDGPUReadAnyLane(const SrcTy &Src) {
+  return UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE>(Src);
+}
+
 class AMDGPURegBankLegalize : public MachineFunctionPass {
 public:
   static char ID;
@@ -160,10 +169,18 @@ AMDGPURegBankLegalizeCombiner::tryMatchRALFromUnmerge(Register Src) {
 
 Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) {
   // Src = G_AMDGPU_READANYLANE RALSrc
-  auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
-  if (RAL)
+  Register RALSrc;
+  if (mi_match(Src, MRI, m_GAMDGPUReadAnyLane(m_Reg(RALSrc))))
     return RALSrc;
 
+  // TruncSrc = G_AMDGPU_READANYLANE RALSrc
+  // AextSrc = G_TRUNC TruncSrc
+  // Src = G_ANYEXT AextSrc
+  if (mi_match(Src, MRI,
+               m_GAnyExt(m_GTrunc(m_GAMDGPUReadAnyLane(m_Reg(RALSrc)))))) {
+    return RALSrc;
+  }
+
   // LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc
   // LoSgpr = G_AMDGPU_READANYLANE LoVgpr
   // HiSgpr = G_AMDGPU_READANYLANE HiVgpr
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index 540756653dd22..dc8fa7f0eef49 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -500,6 +500,16 @@ void RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
   MI.eraseFromParent();
 }
 
+void RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &MI) {
+  auto [Op1Lo, Op1Hi] = unpackAExt(MI.getOperand(1).getReg());
+  auto [Op2Lo, Op2Hi] = unpackAExt(MI.getOperand(2).getReg());
+  auto ResLo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo});
+  auto ResHi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi});
+  B.buildBuildVectorTrunc(MI.getOperand(0).getReg(),
+                          {ResLo.getReg(0), ResHi.getReg(0)});
+  MI.eraseFromParent();
+}
+
 static bool isSignedBFE(MachineInstr &MI) {
   if (GIntrinsic *GI = dyn_cast<GIntrinsic>(&MI))
     return (GI->is(Intrinsic::amdgcn_sbfe));
@@ -616,6 +626,23 @@ void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
   MI.eraseFromParent();
 }
 
+void RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
+  Register Dst = MI.getOperand(0).getReg();
+  assert(MRI.getType(Dst) == V2S16);
+  auto [Op1Lo32, Op1Hi32] = unpackAExt(MI.getOperand(1).getReg());
+  auto [Op2Lo32, Op2Hi32] = unpackAExt(MI.getOperand(2).getReg());
+  unsigned Opc = MI.getOpcode();
+  auto Flags = MI.getFlags();
+  auto Op1Lo = B.buildTrunc(SgprRB_S16, Op1Lo32);
+  auto Op1Hi = B.buildTrunc(SgprRB_S16, Op1Hi32);
+  auto Op2Lo = B.buildTrunc(SgprRB_S16, Op2Lo32);
+  auto Op2Hi = B.buildTrunc(SgprRB_S16, Op2Hi32);
+  auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags);
+  auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags);
+  B.buildMergeLikeInstr(Dst, {Lo, Hi});
+  MI.eraseFromParent();
+}
+
 void RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
   Register Dst = MI.getOperand(0).getReg();
   LLT DstTy = MRI.getType(Dst);
@@ -688,6 +715,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
     return lowerUnpackBitShift(MI);
   case UnpackMinMax:
     return lowerUnpackMinMax(MI);
+  case ScalarizeToS16:
+    return lowerSplitTo16(MI);
   case Ext32To64: {
     const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
     MachineInstrBuilder Hi;
@@ -804,6 +833,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
     }
     break;
   }
+  case UnpackAExt:
+    return lowerUnpackAExt(MI);
   case WidenMMOToS32:
     return widenMMOToS32(cast<GAnyLoad>(MI));
   }
@@ -837,6 +868,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
     return LLT::scalar(32);
   case Sgpr64:
   case Vgpr64:
+  case UniInVgprS64:
     return LLT::scalar(64);
   case Sgpr128:
   case Vgpr128:
@@ -960,6 +992,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
   case UniInVcc:
   case UniInVgprS16:
   case UniInVgprS32:
+  case UniInVgprS64:
   case UniInVgprV2S16:
   case UniInVgprV4S32:
   case UniInVgprB32:
@@ -1092,6 +1125,7 @@ void RegBankLegalizeHelper::applyMappingDst(
       break;
     }
     case UniInVgprS32:
+    case UniInVgprS64:
     case UniInVgprV2S16:
     case UniInVgprV4S32: {
       assert(Ty == getTyFromID(MethodIDs[OpIdx]));
@@ -1120,7 +1154,8 @@ void RegBankLegalizeHelper::applyMappingDst(
       assert(RB == SgprRB);
       Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
       Op.setReg(NewDst);
-      B.buildTrunc(Reg, NewDst);
+      if (!MRI.use_empty(Reg))
+        B.buildTrunc(Reg, NewDst);
       break;
     }
     case InvalidMapping: {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
index d937815bf4714..e7598f888e4b5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
@@ -72,6 +72,7 @@ class RegBankLegalizeHelper {
   static constexpr LLT P6 = LLT::pointer(6, 32);
 
   MachineRegisterInfo::VRegAttrs SgprRB_S32 = {SgprRB, S32};
+  MachineRegisterInfo::VRegAttrs SgprRB_S16 = {SgprRB, S16};
   MachineRegisterInfo::VRegAttrs VgprRB_S32 = {VgprRB, S32};
   MachineRegisterInfo::VRegAttrs VccRB_S1 = {VccRB, S1};
 
@@ -121,9 +122,11 @@ class RegBankLegalizeHelper {
   void lowerV_BFE(MachineInstr &MI);
   void lowerS_BFE(MachineInstr &MI);
   void lowerSplitTo32(MachineInstr &MI);
+  void lowerSplitTo16(MachineInstr &MI);
   void lowerSplitTo32Select(MachineInstr &MI);
   void lowerSplitTo32SExtInReg(MachineInstr &MI);
   void lowerUnpackMinMax(MachineInstr &MI);
+  void lowerUnpackAExt(MachineInstr &MI);
 };
 
 } // end namespace AMDGPU
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index a67b12a22589c..dd474ac52c3c8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -202,7 +202,7 @@ bool PredicateMapping::match(const MachineInstr &MI,
   return true;
 }
 
-SetOfRulesForOpcode::SetOfRulesForOpcode() {}
+SetOfRulesForOpcode::SetOfRulesForOpcode() = default;
 
 SetOfRulesForOpcode::SetOfRulesForOpcode(FastRulesTypes FastTypes)
     : FastTypes(FastTypes) {}
@@ -470,7 +470,19 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
       .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}})
       .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
       .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
-      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
+      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
+      .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, UnpackAExt})
+      .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
+      .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr64}})
+      .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}});
+
+  addRulesForGOpcs({G_UADDO, G_USUBO}, Standard)
+      .Uni(S32, {{Sgpr32, Sgpr32Trunc}, {Sgpr32, Sgpr32}})
+      .Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32}});
+
+  addRulesForGOpcs({G_UADDE, G_USUBE}, Standard)
+      .Uni(S32, {{Sgpr32, Sgpr32Trunc}, {Sgpr32, Sgpr32, Sgpr32AExtBoolInReg}})
+      .Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32, Vcc}});
 
   addRulesForGOpcs({G_MUL}, Standard).Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
 
@@ -901,14 +913,26 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
 
   addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}});
 
-  addRulesForGOpcs({G_READSTEADYCOUNTER}, Standard).Uni(S64, {{Sgpr64}, {}});
+  addRulesForGOpcs({G_READSTEADYCOUNTER, G_READCYCLECOUNTER}, Standard)
+      .Uni(S64, {{Sgpr64}, {}});
 
   bool hasSALUFloat = ST->hasSALUFloatInsts();
 
   addRulesForGOpcs({G_FADD}, Standard)
+      .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat)
+      .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat)
+      .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
       .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat)
       .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat)
-      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
+      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
+      .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}})
+      .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}})
+      .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}}, !hasSALUFloat)
+      .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, ScalarizeToS16},
+           hasSALUFloat)
+      .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
+      .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32, VgprV2S32}}})
+      .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32, VgprV2S32}}});
 
   addRulesForGOpcs({G_FPTOUI})
       .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
index 93e0efda77fdd..e6df5d87a2edc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
@@ -92,8 +92,10 @@ enum UniformityLLTOpPredicateID {
   V4S32,
 
   UniV2S16,
+  UniV2S32,
 
   DivV2S16,
+  DivV2S32,
 
   // B types
   B32,
@@ -178,7 +180,9 @@ enum RegBankLLTMappingApplyID {
   UniInVcc,
   UniInVgprS16,
   UniInVgprS32,
+  UniInVgprS64,
   UniInVgprV2S16,
+  UniInVgprV2S32,
   UniInVgprV4S32,
   UniInVgprB32,
   UniInVgprB64,
@@ -217,13 +221,15 @@ enum LoweringMethodID {
   V_BFE,
   VgprToVccCopy,
   SplitTo32,
+  ScalarizeToS16,
   SplitTo32Select,
   SplitTo32SExtInReg,
   Ext32To64,
   UniCstExt,
   SplitLoad,
   WidenLoad,
-  WidenMMOToS32
+  WidenMMOToS32,
+  UnpackAExt
 };
 
 enum FastRulesTypes {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 6214f4db87e1e..b87b54ffc4f12 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -619,6 +619,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR);
   initializeAMDGPUWaitSGPRHazardsLegacyPass(*PR);
   initializeAMDGPUPreloadKernelArgumentsLegacyPass(*PR);
+  initializeAMDGPUUniformIntrinsicCombineLegacyPass(*PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -815,7 +816,7 @@ parseAMDGPUAtomicOptimizerStrategy(StringRef Params) {
   Params.consume_front("strategy=");
   auto Result = StringSwitch<std::optional<ScanOptions>>(Params)
                     .Case("dpp", ScanOptions::DPP)
-                    .Cases("iterative", "", ScanOptions::Iterative)
+                    .Cases({"iterative", ""}, ScanOptions::Iterative)
                     .Case("none", ScanOptions::None)
                     .Default(std::nullopt);
   if (Result)
@@ -887,9 +888,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
 
         if (EarlyInlineAll && !EnableFunctionCalls)
           PM.addPass(AMDGPUAlwaysInlinePass());
-
-        if (EnableUniformIntrinsicCombine)
-          PM.addPass(AMDGPUUniformIntrinsicCombinePass());
       });
 
   PB.registerPeepholeEPCallback(
@@ -900,6 +898,9 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
         FPM.addPass(AMDGPUUseNativeCallsPass());
         if (EnableLibCallSimplify)
           FPM.addPass(AMDGPUSimplifyLibCallsPass());
+
+        if (EnableUniformIntrinsicCombine)
+          FPM.addPass(AMDGPUUniformIntrinsicCombinePass());
       });
 
   PB.registerCGSCCOptimizerLateEPCallback(
@@ -1314,6 +1315,9 @@ void AMDGPUPassConfig::addIRPasses() {
       isPassEnabled(EnableImageIntrinsicOptimizer))
     addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));
 
+  if (EnableUniformIntrinsicCombine)
+    addPass(createAMDGPUUniformIntrinsicCombineLegacyPass());
+
   // This can be disabled by passing ::Disable here or on the command line
   // with --expand-variadics-override=disable.
   addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));
@@ -2065,6 +2069,8 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const {
   if (isPassEnabled(EnableImageIntrinsicOptimizer))
     addPass(AMDGPUImageIntrinsicOptimizerPass(TM));
 
+  if (EnableUniformIntrinsicCombine)
+    addPass(AMDGPUUniformIntrinsicCombinePass());
   // This can be disabled by passing ::Disable here or on the command line
   // with --expand-variadics-override=disable.
   addPass(ExpandVariadicsPass(ExpandVariadicsMode::Lowering));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
index 50c78d8c67251..65e6ed9d1d428 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -16,12 +16,6 @@
 /// uniformity. And every instruction that's downstream and cares about dynamic
 /// uniformity must be convergent (and isel will introduce v_readfirstlane for
 /// them if their operands can't be proven statically uniform).
-///
-/// This pass is implemented as a ModulePass because intrinsic declarations
-/// exist at the module scope, allowing us to skip processing entirely if no
-/// declarations are present and to traverse their user lists directly when
-/// they are. A FunctionPass would instead require scanning every instruction
-/// in every function to find relevant intrinsics, which is far less efficient.
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
@@ -97,14 +91,12 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II,
           Tracker[NotOp] = true; // NOT preserves uniformity
           LLVM_DEBUG(dbgs() << "Replacing ICMP_EQ: " << *NotOp << '\n');
           ICmp->replaceAllUsesWith(NotOp);
-          ICmp->eraseFromParent();
           Changed = true;
         } else if (Pred == ICmpInst::ICMP_NE && match(OtherOp, m_Zero())) {
           // Case: (icmp ne %ballot, 0) -> %ballot_arg
           LLVM_DEBUG(dbgs() << "Replacing ICMP_NE with ballot argument: "
                             << *Src << '\n');
           ICmp->replaceAllUsesWith(Src);
-          ICmp->eraseFromParent();
           Changed = true;
         }
       }
@@ -120,15 +112,17 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II,
   return false;
 }
 
-/// Iterates over intrinsic declarations in the module to optimize their uses.
-static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) {
+/// Iterates over intrinsic calls in the Function to optimize.
+static bool runUniformIntrinsicCombine(Function &F, const UniformityInfo &UI) {
   bool IsChanged = false;
   ValueMap<const Value *, bool> Tracker;
 
-  FunctionAnalysisManager &FAM =
-      AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
-  for (Function &F : M) {
-    switch (F.getIntrinsicID()) {
+  for (Instruction &I : make_early_inc_range(instructions(F))) {
+    auto *II = dyn_cast<IntrinsicInst>(&I);
+    if (!II)
+      continue;
+
+    switch (II->getIntrinsicID()) {
     case Intrinsic::amdgcn_permlane64:
     case Intrinsic::amdgcn_readfirstlane:
     case Intrinsic::amdgcn_readlane:
@@ -137,23 +131,61 @@ static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) {
     default:
       continue;
     }
-
-    for (User *U : make_early_inc_range(F.users())) {
-      auto *II = cast<IntrinsicInst>(U);
-      Function *ParentF = II->getFunction();
-      const auto &UI = FAM.getResult<UniformityInfoAnalysis>(*ParentF);
-      IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker);
-    }
+    IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker);
   }
   return IsChanged;
 }
 
 PreservedAnalyses
-AMDGPUUniformIntrinsicCombinePass::run(Module &M, ModuleAnalysisManager &AM) {
-  if (!runUniformIntrinsicCombine(M, AM))
+AMDGPUUniformIntrinsicCombinePass::run(Function &F,
+                                       FunctionAnalysisManager &AM) {
+  const auto &UI = AM.getResult<UniformityInfoAnalysis>(F);
+  if (!runUniformIntrinsicCombine(F, UI))
     return PreservedAnalyses::all();
 
   PreservedAnalyses PA;
   PA.preserve<UniformityInfoAnalysis>();
   return PA;
 }
+
+namespace {
+class AMDGPUUniformIntrinsicCombineLegacy : public FunctionPass {
+public:
+  static char ID;
+  AMDGPUUniformIntrinsicCombineLegacy() : FunctionPass(ID) {
+    initializeAMDGPUUniformIntrinsicCombineLegacyPass(
+        *PassRegistry::getPassRegistry());
+  }
+
+private:
+  bool runOnFunction(Function &F) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<UniformityInfoWrapperPass>();
+    AU.addRequired<TargetPassConfig>();
+  }
+};
+} // namespace
+
+char AMDGPUUniformIntrinsicCombineLegacy::ID = 0;
+char &llvm::AMDGPUUniformIntrinsicCombineLegacyPassID =
+    AMDGPUUniformIntrinsicCombineLegacy::ID;
+
+bool AMDGPUUniformIntrinsicCombineLegacy::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+  const UniformityInfo &UI =
+      getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
+  return runUniformIntrinsicCombine(F, UI);
+}
+
+INITIALIZE_PASS_BEGIN(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE,
+                      "AMDGPU Uniform Intrinsic Combine", false, false)
+INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE,
+                    "AMDGPU Uniform Intrinsic Combine", false, false)
+
+FunctionPass *llvm::createAMDGPUUniformIntrinsicCombineLegacyPass() {
+  return new AMDGPUUniformIntrinsicCombineLegacy();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp
index 61c5dcd5ebada..ded2f5ae1f8af 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp
@@ -54,7 +54,7 @@ class AMDGPUWaitSGPRHazards {
   bool CullSGPRHazardsAtMemWait;
   unsigned CullSGPRHazardsMemWaitThreshold;
 
-  AMDGPUWaitSGPRHazards() {}
+  AMDGPUWaitSGPRHazards() = default;
 
   // Return the numeric ID 0-127 for a given SGPR.
   static std::optional<unsigned> sgprNumber(Register Reg,
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 975781fea9452..f357981ac91de 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -183,7 +183,7 @@ class ScheduleMetrics {
   unsigned BubbleCycles;
 
 public:
-  ScheduleMetrics() {}
+  ScheduleMetrics() = default;
   ScheduleMetrics(unsigned L, unsigned BC)
       : ScheduleLength(L), BubbleCycles(BC) {}
   unsigned getLength() const { return ScheduleLength; }
@@ -217,7 +217,7 @@ class RegionPressureMap {
   bool IsLiveOut;
 
 public:
-  RegionPressureMap() {}
+  RegionPressureMap() = default;
   RegionPressureMap(GCNScheduleDAGMILive *GCNDAG, bool LiveOut)
       : DAG(GCNDAG), IsLiveOut(LiveOut) {}
   // Build the Instr->LiveReg and RegionIdx->Instr maps
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index 013cfeb364048..28b4da8ab9ebb 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -168,7 +168,7 @@ bool AMDGPUMCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr,
 
 void AMDGPUMCInstrAnalysis::updateState(const MCInst &Inst, uint64_t Addr) {
   if (Inst.getOpcode() == AMDGPU::S_SET_VGPR_MSB_gfx12)
-    VgprMSBs = Inst.getOperand(0).getImm();
+    VgprMSBs = Inst.getOperand(0).getImm() & 0xff;
   else if (isTerminator(Inst))
     VgprMSBs = 0;
 }
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 5f6d742d245ec..65dce74a1e894 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -877,69 +877,69 @@ multiclass MIMG_Store <mimgopc op, string asm, bit has_d16, bit mip = 0> {
 }
 
 class MIMG_Atomic_gfx6789_base <bits<8> op, string asm, RegisterOperand data_rc,
-                                RegisterClass addr_rc, string dns="">
-  : MIMG_gfx6789 <op, (outs data_rc:$vdst), dns> {
-  let Constraints = "$vdst = $vdata";
-
+                                RegisterClass addr_rc, bit noRtn, string dns="">
+  : MIMG_gfx6789 <op, !if(noRtn, (outs), (outs data_rc:$vdst)), dns> {
+  let Constraints = !if(noRtn, "", "$vdst = $vdata");
+  let isCodeGenOnly = noRtn;
   let InOperandList = (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256_XNULL:$srsrc,
                            DMask:$dmask, UNorm:$unorm, CPol:$cpol,
                            R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da);
-  let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da";
+  let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da";
 }
 
 class MIMG_Atomic_gfx90a_base <bits<8> op, string asm, RegisterOperand data_rc,
-                               RegisterClass addr_rc, string dns="">
-  : MIMG_gfx90a <op, (outs getAlign2RegOp<data_rc>.ret:$vdst), dns> {
-  let Constraints = "$vdst = $vdata";
-
+                               RegisterClass addr_rc, bit noRtn, string dns="">
+  : MIMG_gfx90a <op, !if(noRtn, (outs), (outs getAlign2RegOp<data_rc>.ret:$vdst)), dns> {
+  let Constraints = !if(noRtn, "", "$vdst = $vdata");
+  let isCodeGenOnly = noRtn;
   let InOperandList = (ins getAlign2RegOp<data_rc>.ret:$vdata,
                            addr_rc:$vaddr, SReg_256_XNULL:$srsrc,
                            DMask:$dmask, UNorm:$unorm, CPol:$cpol,
                            R128A16:$r128, LWE:$lwe, DA:$da);
-  let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da";
+  let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da";
 }
 
 class MIMG_Atomic_si<mimgopc op, string asm, RegisterOperand data_rc,
-                     RegisterClass addr_rc, bit enableDasm = 0>
-  : MIMG_Atomic_gfx6789_base<op.SI, asm, data_rc, addr_rc,
+                     RegisterClass addr_rc, bit noRtn = 0, bit enableDasm = 0>
+  : MIMG_Atomic_gfx6789_base<op.SI, asm, data_rc, addr_rc, noRtn,
                              !if(enableDasm, "GFX6GFX7", "")> {
   let AssemblerPredicate = isGFX6GFX7;
 }
 
 class MIMG_Atomic_vi<mimgopc op, string asm, RegisterOperand data_rc,
-                     RegisterClass addr_rc, bit enableDasm = 0>
-  : MIMG_Atomic_gfx6789_base<op.VI, asm, data_rc, addr_rc, !if(enableDasm, "GFX8", "")> {
+                     RegisterClass addr_rc, bit noRtn = 0, bit enableDasm = 0>
+  : MIMG_Atomic_gfx6789_base<op.VI, asm, data_rc, addr_rc, noRtn, !if(enableDasm, "GFX8", "")> {
   let AssemblerPredicate = isGFX8GFX9NotGFX90A;
   let MIMGEncoding = MIMGEncGfx8;
 }
 
 class MIMG_Atomic_gfx90a<mimgopc op, string asm, RegisterOperand data_rc,
-                         RegisterClass addr_rc, bit enableDasm = 0>
-  : MIMG_Atomic_gfx90a_base<op.VI, asm, data_rc, addr_rc, !if(enableDasm, "GFX90A", "")> {
+                         RegisterClass addr_rc, bit noRtn = 0, bit enableDasm = 0>
+  : MIMG_Atomic_gfx90a_base<op.VI, asm, data_rc, addr_rc, noRtn, !if(enableDasm, "GFX90A", "")> {
   let AssemblerPredicate = isGFX90APlus;
   let MIMGEncoding = MIMGEncGfx90a;
 }
 
 class MIMG_Atomic_gfx10<mimgopc op, string opcode,
                         RegisterOperand DataRC, RegisterClass AddrRC,
-                        bit enableDisasm = 0>
-  : MIMG_gfx10<op.GFX10M, (outs DataRC:$vdst),
+                        bit noRtn = 0, bit enableDisasm = 0>
+  : MIMG_gfx10<op.GFX10M, !if(noRtn, (outs), (outs DataRC:$vdst)),
                !if(enableDisasm, "GFX10", "")> {
-  let Constraints = "$vdst = $vdata";
-
+  let Constraints = !if(noRtn, "", "$vdst = $vdata");
+  let isCodeGenOnly = noRtn;
   let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc,
                            DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol,
                            R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe);
-  let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe";
+  let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe";
 }
 
 class MIMG_Atomic_nsa_gfx10<mimgopc op, string opcode,
                             RegisterOperand DataRC, int num_addrs,
-                            bit enableDisasm = 0>
-  : MIMG_nsa_gfx10<op.GFX10M, (outs DataRC:$vdst), num_addrs,
+                            bit noRtn = 0, bit enableDisasm = 0>
+  : MIMG_nsa_gfx10<op.GFX10M, !if(noRtn, (outs), (outs DataRC:$vdst)), num_addrs,
                    !if(enableDisasm, "GFX10", "")> {
-  let Constraints = "$vdst = $vdata";
-
+  let Constraints = !if(noRtn, "", "$vdst = $vdata");
+  let isCodeGenOnly = noRtn;
   let InOperandList = !con((ins DataRC:$vdata),
                            AddrIns,
                            (ins SReg_256_XNULL:$srsrc, DMask:$dmask,
@@ -950,24 +950,24 @@ class MIMG_Atomic_nsa_gfx10<mimgopc op, string opcode,
 
 class MIMG_Atomic_gfx11<mimgopc op, string opcode,
                         RegisterOperand DataRC, RegisterClass AddrRC,
-                        bit enableDisasm = 0>
-  : MIMG_gfx11<op.GFX11, (outs DataRC:$vdst),
+                        bit noRtn = 0, bit enableDisasm = 0>
+  : MIMG_gfx11<op.GFX11, !if(noRtn, (outs), (outs DataRC:$vdst)),
                !if(enableDisasm, "GFX11", "")> {
-  let Constraints = "$vdst = $vdata";
-
+  let Constraints = !if(noRtn, "", "$vdst = $vdata");
+  let isCodeGenOnly = noRtn;
   let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc,
                            DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol,
                            R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe);
-  let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe";
+  let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe";
 }
 
 class MIMG_Atomic_nsa_gfx11<mimgopc op, string opcode,
                             RegisterOperand DataRC, int num_addrs,
-                            bit enableDisasm = 0>
-  : MIMG_nsa_gfx11<op.GFX11, (outs DataRC:$vdst), num_addrs,
+                            bit noRtn = 0, bit enableDisasm = 0>
+  : MIMG_nsa_gfx11<op.GFX11, !if(noRtn, (outs), (outs DataRC:$vdst)), num_addrs,
                    !if(enableDisasm, "GFX11", "")> {
-  let Constraints = "$vdst = $vdata";
-
+  let Constraints = !if(noRtn, "", "$vdst = $vdata");
+  let isCodeGenOnly = noRtn;
   let InOperandList = !con((ins DataRC:$vdata),
                            AddrIns,
                            (ins SReg_256_XNULL:$srsrc, DMask:$dmask,
@@ -977,11 +977,11 @@ class MIMG_Atomic_nsa_gfx11<mimgopc op, string opcode,
 }
 
 class VIMAGE_Atomic_gfx12<mimgopc op, string opcode, RegisterOperand DataRC,
-                          int num_addrs, string renamed, bit enableDisasm = 0>
-  : VIMAGE_gfx12<op.GFX12, (outs DataRC:$vdst), num_addrs,
+                          int num_addrs, string renamed, bit noRtn = 0, bit enableDisasm = 0>
+  : VIMAGE_gfx12<op.GFX12, !if(noRtn, (outs), (outs DataRC:$vdst)), num_addrs,
                   !if(enableDisasm, "GFX12", "")> {
-  let Constraints = "$vdst = $vdata";
-
+  let Constraints = !if(noRtn, "", "$vdst = $vdata");
+  let isCodeGenOnly = noRtn;
   let InOperandList = !con((ins DataRC:$vdata),
                            AddrIns,
                            (ins SReg_256_XNULL:$rsrc, DMask:$dmask, Dim:$dim,
@@ -994,95 +994,96 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm,
                                       RegisterOperand data_rc,
                                       bit enableDasm = 0,
                                       bit isFP = 0,
+                                      bit noRtn = 0,
                                       string renamed = ""> {
   let hasSideEffects = 1, // FIXME: remove this
       mayLoad = 1, mayStore = 1, hasPostISelHook = 0, DisableWQM = 1,
-      FPAtomic = isFP in {
+      FPAtomic = isFP, IsAtomicNoRet = noRtn in {
     let VAddrDwords = 1 in {
       let ssamp = 0 in {
         if op.HAS_SI then {
-          def _V1_si : MIMG_Atomic_si <op, asm, data_rc, VGPR_32, enableDasm>;
+          def _V1_si : MIMG_Atomic_si <op, asm, data_rc, VGPR_32, noRtn, enableDasm>;
         }
         if op.HAS_VI then {
-          def _V1_vi : MIMG_Atomic_vi <op, asm, data_rc, VGPR_32, enableDasm>;
+          def _V1_vi : MIMG_Atomic_vi <op, asm, data_rc, VGPR_32, noRtn, enableDasm>;
           let hasPostISelHook = 1 in
-          def _V1_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VGPR_32, enableDasm>;
+          def _V1_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VGPR_32, noRtn, enableDasm>;
         }
         if op.HAS_GFX10M then {
-          def _V1_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VGPR_32, enableDasm>;
+          def _V1_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VGPR_32, noRtn, enableDasm>;
         }
         if op.HAS_GFX11 then {
-          def _V1_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VGPR_32, enableDasm>;
+          def _V1_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VGPR_32, noRtn, enableDasm>;
         }
       }
       if op.HAS_GFX12 then {
-        def _V1_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 1, renamed>;
+        def _V1_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 1, renamed, noRtn>;
       }
     }
     let VAddrDwords = 2 in {
       let ssamp = 0 in {
         if op.HAS_SI then {
-          def _V2_si : MIMG_Atomic_si <op, asm, data_rc, VReg_64, 0>;
+          def _V2_si : MIMG_Atomic_si <op, asm, data_rc, VReg_64, noRtn, 0>;
         }
         if op.HAS_VI then {
-          def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, 0>;
-          def _V2_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_64_Align2, 0>;
+          def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, noRtn, 0>;
+          def _V2_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_64_Align2, noRtn, 0>;
         }
         if op.HAS_GFX10M then {
-          def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, 0>;
-          def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 2, 0>;
+          def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, noRtn, 0>;
+          def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 2, noRtn, 0>;
         }
         if op.HAS_GFX11 then {
-          def _V2_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_64, 0>;
-          def _V2_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 2, 0>;
+          def _V2_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_64, noRtn, 0>;
+          def _V2_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 2, noRtn, 0>;
         }
       }
       if op.HAS_GFX12 then {
-        def _V2_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 2, renamed>;
+        def _V2_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 2, renamed, noRtn>;
       }
     }
     let VAddrDwords = 3 in {
       let ssamp = 0 in {
         if op.HAS_SI then {
-          def _V3_si : MIMG_Atomic_si <op, asm, data_rc, VReg_96, 0>;
+          def _V3_si : MIMG_Atomic_si <op, asm, data_rc, VReg_96, noRtn, 0>;
         }
         if op.HAS_VI then {
-          def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, 0>;
-          def _V3_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_96_Align2, 0>;
+          def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, noRtn, 0>;
+          def _V3_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_96_Align2, noRtn, 0>;
         }
         if op.HAS_GFX10M then {
-          def _V3_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_96, 0>;
-          def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 3, 0>;
+          def _V3_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_96, noRtn, 0>;
+          def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 3, noRtn, 0>;
         }
         if op.HAS_GFX11 then {
-          def _V3_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_96, 0>;
-          def _V3_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 3, 0>;
+          def _V3_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_96, noRtn, 0>;
+          def _V3_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 3, noRtn, 0>;
         }
       }
       if op.HAS_GFX12 then {
-        def _V3_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 3, renamed>;
+        def _V3_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 3, renamed, noRtn>;
       }
     }
     let VAddrDwords = 4 in {
       let ssamp = 0 in {
         if op.HAS_SI then {
-          def _V4_si : MIMG_Atomic_si <op, asm, data_rc, VReg_128, 0>;
+          def _V4_si : MIMG_Atomic_si <op, asm, data_rc, VReg_128, noRtn, 0>;
         }
         if op.HAS_VI then {
-          def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, 0>;
-          def _V4_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_128_Align2, 0>;
+          def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, noRtn, 0>;
+          def _V4_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_128_Align2, noRtn, 0>;
         }
         if op.HAS_GFX10M then {
-          def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, 0>;
-          def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 4, enableDasm>;
+          def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, noRtn, 0>;
+          def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 4, noRtn, enableDasm>;
         }
         if op.HAS_GFX11 then {
-          def _V4_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_128, 0>;
-          def _V4_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 4, enableDasm>;
+          def _V4_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_128, noRtn, 0>;
+          def _V4_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 4, noRtn, enableDasm>;
         }
       }
       if op.HAS_GFX12 then {
-        def _V4_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 4, renamed, enableDasm>;
+        def _V4_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 4, renamed, noRtn, enableDasm>;
       }
     }
   }
@@ -1095,12 +1096,13 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm,
     }
 }
 
-multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0,
-                        string renamed = ""> { // 64-bit atomics
-  let IsAtomicRet = 1 in {
+multiclass MIMG_Atomic_Base <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0,
+                        bit noRtn = 0, string renamed = ""> { // 64-bit atomics
+  let IsAtomicRet = !not(noRtn) in {
     def "" : MIMGBaseOpcode {
       let Atomic = 1;
       let AtomicX2 = isCmpSwap;
+      let NoReturn = noRtn;
     }
 
     let BaseOpcode = !cast<MIMGBaseOpcode>(NAME) in {
@@ -1109,22 +1111,28 @@ multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0,
       // Other variants are reconstructed by disassembler using dmask and tfe.
       if !not(isCmpSwap) then {
         let VDataDwords = 1 in
-        defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_32, 1, isFP, renamed>;
+        defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_32, 1, isFP, noRtn, renamed>;
       }
 
       let VDataDwords = 2 in
-      defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_64, isCmpSwap, isFP, renamed>;
+      defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_64, isCmpSwap, isFP, noRtn, renamed>;
       let VDataDwords = 3 in
-      defm _V3 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_96, 0, isFP, renamed>;
+      defm _V3 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_96, 0, isFP, noRtn, renamed>;
 
       if isCmpSwap then {
         let VDataDwords = 4 in
-        defm _V4 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_128, 0, isFP, renamed>;
+        defm _V4 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_128, 0, isFP, noRtn, renamed>;
         let VDataDwords = 5 in
-        defm _V5 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_160, 0, isFP, renamed>;
+        defm _V5 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_160, 0, isFP, noRtn, renamed>;
       }
     }
-  } // End IsAtomicRet = 1
+  }
+}
+
+multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0,
+                        string renamed = ""> {
+  defm "" : MIMG_Atomic_Base <op, asm, isCmpSwap, isFP, /*noRtn=*/0, renamed>;
+  defm "_NORTN" : MIMG_Atomic_Base <op, asm, isCmpSwap, isFP, /*noRtn=*/1, renamed>;
 }
 
 multiclass MIMG_Atomic_Renamed <mimgopc op, string asm, string renamed,
@@ -1820,6 +1828,7 @@ let SubtargetPredicate = isGFX12Plus in {
 class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> {
   Intrinsic Intr = I;
   MIMGBaseOpcode BaseOpcode = !cast<MIMGBaseOpcode>(!strconcat("IMAGE_", I.P.OpMod));
+  MIMGBaseOpcode AtomicNoRetBaseOpcode = BaseOpcode;
   AMDGPUDimProps Dim = I.P.Dim;
   AMDGPUImageDimIntrinsicEval DimEval = AMDGPUImageDimIntrinsicEval<I.P>;
 
@@ -1855,13 +1864,20 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> {
   bits<8> CoordTyArg = !add(GradientTyArg, !if(I.P.Gradients, 1, 0));
 }
 
+class ImageDimAtomicIntrinsicInfo<AMDGPUImageDimIntrinsic I>
+  : ImageDimIntrinsicInfo<I> {
+  MIMGBaseOpcode AtomicNoRetBaseOpcode =
+    !cast<MIMGBaseOpcode>(!strconcat("IMAGE_", I.P.OpMod, "_NORTN"));
+}
+
 def ImageDimIntrinsicTable : GenericTable {
   let FilterClass = "ImageDimIntrinsicInfo";
-  let Fields = ["Intr", "BaseOpcode", "Dim", "NumOffsetArgs", "NumBiasArgs", "NumZCompareArgs", "NumGradients", "NumDmask", "NumData", "NumVAddrs", "NumArgs",
-    "DMaskIndex", "VAddrStart", "OffsetIndex", "BiasIndex", "ZCompareIndex", "GradientStart", "CoordStart", "LodIndex", "MipIndex", "VAddrEnd",
-    "RsrcIndex", "SampIndex", "UnormIndex", "TexFailCtrlIndex", "CachePolicyIndex",
+  let Fields = ["Intr", "BaseOpcode", "AtomicNoRetBaseOpcode", "Dim", "NumOffsetArgs", "NumBiasArgs", "NumZCompareArgs", "NumGradients", "NumDmask", "NumData",
+    "NumVAddrs", "NumArgs", "DMaskIndex", "VAddrStart", "OffsetIndex", "BiasIndex", "ZCompareIndex", "GradientStart", "CoordStart", "LodIndex", "MipIndex",
+    "VAddrEnd", "RsrcIndex", "SampIndex", "UnormIndex", "TexFailCtrlIndex", "CachePolicyIndex",
     "BiasTyArg", "GradientTyArg", "CoordTyArg"];
   string TypeOf_BaseOpcode = "MIMGBaseOpcode";
+  string TypeOf_AtomicNoRetBaseOpcode = "MIMGBaseOpcode";
   string TypeOf_Dim = "MIMGDim";
 
   let PrimaryKey = ["Intr"];
@@ -1874,11 +1890,14 @@ def getImageDimIntrinsicByBaseOpcode : SearchIndex {
   let Key = ["BaseOpcode", "Dim"];
 }
 
-foreach intr = !listconcat(AMDGPUImageDimIntrinsics,
-                           AMDGPUImageDimAtomicIntrinsics) in {
+foreach intr = AMDGPUImageDimIntrinsics in {
   def : ImageDimIntrinsicInfo<intr>;
 }
 
+foreach intr = AMDGPUImageDimAtomicIntrinsics in {
+  def : ImageDimAtomicIntrinsicInfo<intr>;
+}
+
 // L to LZ Optimization Mapping
 def : MIMGLZMapping<IMAGE_SAMPLE_L, IMAGE_SAMPLE_LZ>;
 def : MIMGLZMapping<IMAGE_SAMPLE_C_L, IMAGE_SAMPLE_C_LZ>;
@@ -2097,8 +2116,10 @@ class VIMAGE_TENSOR_Real <bits<8> op, VIMAGE_TENSOR_Pseudo ps, string opName = p
   let vaddr2 = !if(ps.UpTo2D, !cast<int>(SGPR_NULL_gfx11plus.HWEncoding), ?);
   let vaddr3 = !if(ps.UpTo2D, !cast<int>(SGPR_NULL_gfx11plus.HWEncoding), ?);
 
+  // Set VADDR4 to NULL
+  let vaddr4 = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding);
+
   // set to 0 based on SPG.
-  let vaddr4 = 0;
   let rsrc = 0;
   let vdata = 0;
   let d16 = 0;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index be4229155c983..8bb28084159e8 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7035,9 +7035,15 @@ static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
   SDLoc SL(N);
 
   if (Src.getOpcode() == ISD::SETCC) {
+    SDValue Op0 = Src.getOperand(0);
+    SDValue Op1 = Src.getOperand(1);
+    // Need to expand bfloat to float for comparison (setcc).
+    if (Op0.getValueType() == MVT::bf16) {
+      Op0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op0);
+      Op1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op1);
+    }
     // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
-    return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
-                       Src.getOperand(1), Src.getOperand(2));
+    return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Op0, Op1, Src.getOperand(2));
   }
   if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
     // (ballot 0) -> 0
@@ -9134,16 +9140,23 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
   SDLoc DL(Op);
   MachineFunction &MF = DAG.getMachineFunction();
   const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
+  unsigned IntrOpcode = Intr->BaseOpcode;
+  // For image atomic: use no-return opcode if result is unused.
+  if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode &&
+      !Op.getNode()->hasAnyUseOfValue(0))
+    IntrOpcode = Intr->AtomicNoRetBaseOpcode;
   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
-      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
+      AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode);
   const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
-  unsigned IntrOpcode = Intr->BaseOpcode;
   bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
   bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
   bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
 
   SmallVector<EVT, 3> ResultTypes(Op->values());
   SmallVector<EVT, 3> OrigResultTypes(Op->values());
+  if (BaseOpcode->NoReturn && BaseOpcode->Atomic)
+    ResultTypes.erase(&ResultTypes[0]);
+
   bool IsD16 = false;
   bool IsG16 = false;
   bool IsA16 = false;
@@ -9162,8 +9175,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
     VData = Op.getOperand(2);
 
     IsAtomicPacked16Bit =
-        (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
-         Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
+        (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
+         IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
+         IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
+         IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);
 
     bool Is64Bit = VData.getValueSizeInBits() == 64;
     if (BaseOpcode->AtomicX2) {
@@ -9173,7 +9188,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
       if (Is64Bit)
         VData = DAG.getBitcast(MVT::v4i32, VData);
 
-      ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
+      if (!BaseOpcode->NoReturn)
+        ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
+
       DMask = Is64Bit ? 0xf : 0x3;
       NumVDataDwords = Is64Bit ? 4 : 2;
     } else {
@@ -9399,8 +9416,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
   }
 
   unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
-  if (BaseOpcode->Atomic)
-    CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
+  // Keep GLC only when the atomic's result is actually used.
+  if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
+    CPol |= AMDGPU::CPol::GLC;
   if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
                AMDGPU::CPol::VOLATILE))
     return Op;
@@ -9512,13 +9530,20 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
     DAG.setNodeMemRefs(NewNode, {MemRef});
   }
 
+  if (BaseOpcode->NoReturn) {
+    if (BaseOpcode->Atomic)
+      return DAG.getMergeValues(
+          {DAG.getPOISON(OrigResultTypes[0]), SDValue(NewNode, 0)}, DL);
+
+    return SDValue(NewNode, 0);
+  }
+
   if (BaseOpcode->AtomicX2) {
     SmallVector<SDValue, 1> Elt;
     DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
     return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
   }
-  if (BaseOpcode->NoReturn)
-    return SDValue(NewNode, 0);
+
   return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
                            Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
                            NumVDataDwords, IsAtomicPacked16Bit, DL);
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 6dcbced010a5a..b7fa899678ec7 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1288,18 +1288,38 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
 }
 
 void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
+  // On entry to a block with multiple predescessors, there may
+  // be pending SMEM and VMEM events active at the same time.
+  // In such cases, only clear one active event at a time.
+  auto applyPendingXcntGroup = [this](unsigned E) {
+    unsigned LowerBound = getScoreLB(X_CNT);
+    applyWaitcnt(X_CNT, 0);
+    PendingEvents |= (1 << E);
+    setScoreLB(X_CNT, LowerBound);
+  };
+
   // Wait on XCNT is redundant if we are already waiting for a load to complete.
   // SMEM can return out of order, so only omit XCNT wait if we are waiting till
   // zero.
-  if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP))
-    return applyWaitcnt(X_CNT, 0);
+  if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) {
+    if (hasPendingEvent(VMEM_GROUP))
+      applyPendingXcntGroup(VMEM_GROUP);
+    else
+      applyWaitcnt(X_CNT, 0);
+    return;
+  }
 
   // If we have pending store we cannot optimize XCnt because we do not wait for
   // stores. VMEM loads retun in order, so if we only have loads XCnt is
   // decremented to the same number as LOADCnt.
   if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
-      !hasPendingEvent(STORE_CNT))
-    return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
+      !hasPendingEvent(STORE_CNT)) {
+    if (hasPendingEvent(SMEM_GROUP))
+      applyPendingXcntGroup(SMEM_GROUP);
+    else
+      applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
+    return;
+  }
 
   applyWaitcnt(X_CNT, Wait.XCnt);
 }
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d930a21c2d7f5..45f591927b86e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6153,7 +6153,7 @@ bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
   // information.
   if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
       MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
-    constexpr const AMDGPU::OpName OpNames[] = {
+    constexpr AMDGPU::OpName OpNames[] = {
         AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
 
     for (auto [I, OpName] : enumerate(OpNames)) {
@@ -6215,8 +6215,8 @@ bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
 bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
     const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
     const MachineOperand *MO) const {
-  constexpr const unsigned NumOps = 3;
-  constexpr const AMDGPU::OpName OpNames[NumOps * 2] = {
+  constexpr unsigned NumOps = 3;
+  constexpr AMDGPU::OpName OpNames[NumOps * 2] = {
       AMDGPU::OpName::src0,           AMDGPU::OpName::src1,
       AMDGPU::OpName::src2,           AMDGPU::OpName::src0_modifiers,
       AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
@@ -10618,6 +10618,42 @@ bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
   return false;
 }
 
+// SCC is already valid after SCCValid.
+// SCCRedefine will redefine SCC to the same value already available after
+// SCCValid. If there are no intervening SCC conflicts delete SCCRedefine and
+// update kill/dead flags if necessary.
+static bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
+                        const SIRegisterInfo &RI) {
+  MachineInstr *KillsSCC = nullptr;
+  for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()),
+                                     SCCRedefine->getIterator())) {
+    if (MI.modifiesRegister(AMDGPU::SCC, &RI))
+      return false;
+    if (MI.killsRegister(AMDGPU::SCC, &RI))
+      KillsSCC = &MI;
+  }
+  if (MachineOperand *SccDef =
+          SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
+    SccDef->setIsDead(false);
+  if (KillsSCC)
+    KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
+  SCCRedefine->eraseFromParent();
+  return true;
+}
+
+static bool foldableSelect(const MachineInstr &Def) {
+  if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 &&
+      Def.getOpcode() != AMDGPU::S_CSELECT_B64)
+    return false;
+  bool Op1IsNonZeroImm =
+      Def.getOperand(1).isImm() && Def.getOperand(1).getImm() != 0;
+  bool Op2IsZeroImm =
+      Def.getOperand(2).isImm() && Def.getOperand(2).getImm() == 0;
+  if (!Op1IsNonZeroImm || !Op2IsZeroImm)
+    return false;
+  return true;
+}
+
 bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
                                        Register SrcReg2, int64_t CmpMask,
                                        int64_t CmpValue,
@@ -10637,19 +10673,6 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
     if (!Def || Def->getParent() != CmpInstr.getParent())
       return false;
 
-    const auto foldableSelect = [](MachineInstr *Def) -> bool {
-      if (Def->getOpcode() == AMDGPU::S_CSELECT_B32 ||
-          Def->getOpcode() == AMDGPU::S_CSELECT_B64) {
-        bool Op1IsNonZeroImm =
-            Def->getOperand(1).isImm() && Def->getOperand(1).getImm() != 0;
-        bool Op2IsZeroImm =
-            Def->getOperand(2).isImm() && Def->getOperand(2).getImm() == 0;
-        if (Op1IsNonZeroImm && Op2IsZeroImm)
-          return true;
-      }
-      return false;
-    };
-
     // For S_OP that set SCC = DST!=0, do the transformation
     //
     //   s_cmp_lg_* (S_OP ...), 0 => (S_OP ...)
@@ -10660,24 +10683,12 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
     //
     //   s_cmp_lg_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT* (non-zero
     //   imm), 0)
-    if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(Def))
+    if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(*Def))
       return false;
 
-    MachineInstr *KillsSCC = nullptr;
-    for (MachineInstr &MI :
-         make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) {
-      if (MI.modifiesRegister(AMDGPU::SCC, &RI))
-        return false;
-      if (MI.killsRegister(AMDGPU::SCC, &RI))
-        KillsSCC = &MI;
-    }
+    if (!optimizeSCC(Def, &CmpInstr, RI))
+      return false;
 
-    if (MachineOperand *SccDef =
-            Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
-      SccDef->setIsDead(false);
-    if (KillsSCC)
-      KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
-    CmpInstr.eraseFromParent();
     return true;
   };
 
@@ -10755,21 +10766,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
     if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
       return false;
 
-    MachineInstr *KillsSCC = nullptr;
-    for (MachineInstr &MI :
-         make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) {
-      if (MI.modifiesRegister(AMDGPU::SCC, &RI))
-        return false;
-      if (MI.killsRegister(AMDGPU::SCC, &RI))
-        KillsSCC = &MI;
-    }
-
-    MachineOperand *SccDef =
-        Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
-    SccDef->setIsDead(false);
-    if (KillsSCC)
-      KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
-    CmpInstr.eraseFromParent();
+    if (!optimizeSCC(Def, &CmpInstr, RI))
+      return false;
 
     if (!MRI->use_nodbg_empty(DefReg)) {
       assert(!IsReversedCC);
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index d80a6f339c8f6..a6c1af24e13e9 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1823,6 +1823,16 @@ void SIRegisterInfo::buildSpillLoadStore(
       }
     }
 
+    Register FinalValueReg = ValueReg;
+    if (LoadStoreOp == AMDGPU::SCRATCH_LOAD_USHORT_SADDR) {
+      // If we are loading 16-bit value with SRAMECC endabled we need a temp
+      // 32-bit VGPR to load and extract 16-bits into the final register.
+      ValueReg =
+          RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
+      SubReg = ValueReg;
+      IsKill = false;
+    }
+
     MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset);
     MachineMemOperand *NewMMO =
         MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize,
@@ -1863,6 +1873,17 @@ void SIRegisterInfo::buildSpillLoadStore(
       MIB.addImm(0); // swz
     MIB.addMemOperand(NewMMO);
 
+    if (FinalValueReg != ValueReg) {
+      // Extract 16-bit from the loaded 32-bit value.
+      ValueReg = getSubReg(ValueReg, AMDGPU::lo16);
+      MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B16_t16_e64))
+                .addReg(FinalValueReg, getDefRegState(true))
+                .addImm(0)
+                .addReg(ValueReg, getKillRegState(true))
+                .addImm(0);
+      ValueReg = FinalValueReg;
+    }
+
     if (!IsAGPR && NeedSuperRegDef)
       MIB.addReg(ValueReg, RegState::ImplicitDefine);
 
@@ -2505,7 +2526,9 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       unsigned Opc;
       if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_RESTORE) {
         assert(ST.enableFlatScratch() && "Flat Scratch is not enabled!");
-        Opc = AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16;
+        Opc = ST.d16PreservesUnusedBits()
+                  ? AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16
+                  : AMDGPU::SCRATCH_LOAD_USHORT_SADDR;
       } else {
         Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE
                   ? AMDGPU::SCRATCH_LOAD_BLOCK_SADDR
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 313ae3d68fb83..6b0653457cbaf 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -601,10 +601,20 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
     setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
     setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
 
-    if (!Subtarget->hasVFP2Base())
+    if (!Subtarget->hasVFP2Base()) {
       setAllExpand(MVT::f32);
-    if (!Subtarget->hasFP64())
+    } else {
+      for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL,
+                      ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT})
+        setOperationAction(Op, MVT::f32, Legal);
+    }
+    if (!Subtarget->hasFP64()) {
       setAllExpand(MVT::f64);
+    } else {
+      for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL,
+                      ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT})
+        setOperationAction(Op, MVT::f64, Legal);
+    }
   }
 
   if (Subtarget->hasFullFP16()) {
@@ -1281,12 +1291,16 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
     if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
       setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
       setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
+      setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f64, LibCall);
+      setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f64, LibCall);
     }
 
     // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
     if (!Subtarget->hasFP16()) {
       setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
       setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
+      setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f32, LibCall);
+      setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f32, LibCall);
     }
 
     // Strict floating-point comparisons need custom lowering.
@@ -1298,12 +1312,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
     setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
   }
 
-  // Use __sincos_stret if available.
-  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
-      getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
-    setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
-    setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
-  }
+  setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
+  setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
 
   // FP-ARMv8 implements a lot of rounding-like FP operations.
   if (Subtarget->hasFPARMv8Base()) {
@@ -1337,31 +1347,42 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
   }
 
   // FP16 often need to be promoted to call lib functions
+  // clang-format off
   if (Subtarget->hasFullFP16()) {
-    setOperationAction(ISD::FREM, MVT::f16, Promote);
-    setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
-    setOperationAction(ISD::FSIN, MVT::f16, Promote);
-    setOperationAction(ISD::FCOS, MVT::f16, Promote);
-    setOperationAction(ISD::FTAN, MVT::f16, Promote);
-    setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
-    setOperationAction(ISD::FPOWI, MVT::f16, Promote);
-    setOperationAction(ISD::FPOW, MVT::f16, Promote);
-    setOperationAction(ISD::FEXP, MVT::f16, Promote);
-    setOperationAction(ISD::FEXP2, MVT::f16, Promote);
-    setOperationAction(ISD::FEXP10, MVT::f16, Promote);
-    setOperationAction(ISD::FLOG, MVT::f16, Promote);
-    setOperationAction(ISD::FLOG10, MVT::f16, Promote);
-    setOperationAction(ISD::FLOG2, MVT::f16, Promote);
     setOperationAction(ISD::LRINT, MVT::f16, Expand);
     setOperationAction(ISD::LROUND, MVT::f16, Expand);
-
-    setOperationAction(ISD::FROUND, MVT::f16, Legal);
-    setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
-    setOperationAction(ISD::FTRUNC, MVT::f16, Legal);
-    setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal);
-    setOperationAction(ISD::FRINT, MVT::f16, Legal);
-    setOperationAction(ISD::FFLOOR, MVT::f16, Legal);
-    setOperationAction(ISD::FCEIL, MVT::f16, Legal);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
+  
+    for (auto Op : {ISD::FREM,          ISD::FPOW,         ISD::FPOWI,
+                  ISD::FCOS,          ISD::FSIN,         ISD::FSINCOS,
+                  ISD::FSINCOSPI,     ISD::FMODF,        ISD::FACOS,
+                  ISD::FASIN,         ISD::FATAN,        ISD::FATAN2,
+                  ISD::FCOSH,         ISD::FSINH,        ISD::FTANH,
+                  ISD::FTAN,          ISD::FEXP,         ISD::FEXP2,
+                  ISD::FEXP10,        ISD::FLOG,         ISD::FLOG2,
+                  ISD::FLOG10,        ISD::STRICT_FREM,  ISD::STRICT_FPOW,
+                  ISD::STRICT_FPOWI,  ISD::STRICT_FCOS,  ISD::STRICT_FSIN,
+                  ISD::STRICT_FACOS,  ISD::STRICT_FASIN, ISD::STRICT_FATAN,
+                  ISD::STRICT_FATAN2, ISD::STRICT_FCOSH, ISD::STRICT_FSINH,
+                  ISD::STRICT_FTANH,  ISD::STRICT_FEXP,  ISD::STRICT_FEXP2,
+                  ISD::STRICT_FLOG,   ISD::STRICT_FLOG2, ISD::STRICT_FLOG10,
+                  ISD::STRICT_FTAN}) {
+        setOperationAction(Op, MVT::f16, Promote);
+    }
+
+    // Round-to-integer need custom lowering for fp16, as Promote doesn't work
+    // because the result type is integer.
+    for (auto Op : {ISD::STRICT_LROUND, ISD::STRICT_LLROUND, ISD::STRICT_LRINT, ISD::STRICT_LLRINT})
+      setOperationAction(Op, MVT::f16, Custom);
+  
+    for (auto Op : {ISD::FROUND,         ISD::FROUNDEVEN,        ISD::FTRUNC,
+                    ISD::FNEARBYINT,     ISD::FRINT,             ISD::FFLOOR, 
+                    ISD::FCEIL,          ISD::STRICT_FROUND,     ISD::STRICT_FROUNDEVEN,
+                    ISD::STRICT_FTRUNC,  ISD::STRICT_FNEARBYINT, ISD::STRICT_FRINT, 
+                    ISD::STRICT_FFLOOR,  ISD::STRICT_FCEIL}) {
+      setOperationAction(Op, MVT::f16, Legal);
+    }
+    // clang-format on
   }
 
   if (Subtarget->hasNEON()) {
@@ -9835,13 +9856,18 @@ static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
 }
 
 SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
-  assert(Subtarget->isTargetDarwin());
-
   // For iOS, we want to call an alternative entry point: __sincos_stret,
   // return values are passed via sret.
   SDLoc dl(Op);
   SDValue Arg = Op.getOperand(0);
   EVT ArgVT = Arg.getValueType();
+  RTLIB::Libcall LC = RTLIB::getSINCOS_STRET(ArgVT);
+  RTLIB::LibcallImpl SincosStret = getLibcallImpl(LC);
+  if (SincosStret == RTLIB::Unsupported)
+    return SDValue();
+
+  assert(Subtarget->isTargetDarwin());
+
   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
   auto PtrVT = getPointerTy(DAG.getDataLayout());
 
@@ -9871,11 +9897,9 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
 
   Args.emplace_back(Arg, ArgTy);
 
-  RTLIB::Libcall LC =
-      (ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
-  const char *LibcallName = getLibcallName(LC);
-  CallingConv::ID CC = getLibcallCallingConv(LC);
-  SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL));
+  StringRef LibcallName = getLibcallImplName(SincosStret);
+  CallingConv::ID CC = getLibcallImplCallingConv(SincosStret);
+  SDValue Callee = DAG.getExternalSymbol(LibcallName.data(), getPointerTy(DL));
 
   TargetLowering::CallLoweringInfo CLI(DAG);
   CLI.setDebugLoc(dl)
@@ -10726,6 +10750,19 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return LowerCMP(Op, DAG);
   case ISD::ABS:
     return LowerABS(Op, DAG);
+  case ISD::STRICT_LROUND:
+  case ISD::STRICT_LLROUND:
+  case ISD::STRICT_LRINT:
+  case ISD::STRICT_LLRINT: {
+    assert((Op.getOperand(1).getValueType() == MVT::f16 ||
+            Op.getOperand(1).getValueType() == MVT::bf16) &&
+           "Expected custom lowering of rounding operations only for f16");
+    SDLoc DL(Op);
+    SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
+                              {Op.getOperand(0), Op.getOperand(1)});
+    return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
+                       {Ext.getValue(1), Ext.getValue(0)});
+  }
   }
 }
 
@@ -22072,6 +22109,11 @@ bool ARMTargetLowering::isComplexDeinterleavingOperationSupported(
           ScalarTy->isIntegerTy(32));
 }
 
+ArrayRef<MCPhysReg> ARMTargetLowering::getRoundingControlRegisters() const {
+  static const MCPhysReg RCRegs[] = {ARM::FPSCR_RM};
+  return RCRegs;
+}
+
 Value *ARMTargetLowering::createComplexDeinterleavingIR(
     IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
     ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 357d2c5d2fad1..bf3438b0d8803 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -1009,6 +1009,8 @@ class VectorType;
 
     bool isUnsupportedFloatingType(EVT VT) const;
 
+    ArrayRef<MCPhysReg> getRoundingControlRegisters() const override;
+
     SDValue getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal, SDValue TrueVal,
                     SDValue ARMcc, SDValue Flags, SelectionDAG &DAG) const;
     SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td
index 10d4cd5dd96c1..f7176a65d8163 100644
--- a/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -473,15 +473,15 @@ def xor_su : PatFrag<(ops node:$lhs, node:$rhs), (xor node:$lhs, node:$rhs)>;
 
 // An 'fmul' node with a single use.
 let HasOneUse = 1 in
-def fmul_su : PatFrag<(ops node:$lhs, node:$rhs), (fmul node:$lhs, node:$rhs)>;
+def fmul_su : PatFrag<(ops node:$lhs, node:$rhs), (any_fmul node:$lhs, node:$rhs)>;
 
 // An 'fadd' node which checks for single non-hazardous use.
-def fadd_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fadd node:$lhs, node:$rhs),[{
+def fadd_mlx : PatFrag<(ops node:$lhs, node:$rhs),(any_fadd node:$lhs, node:$rhs),[{
   return hasNoVMLxHazardUse(N);
 }]>;
 
 // An 'fsub' node which checks for single non-hazardous use.
-def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fsub node:$lhs, node:$rhs),[{
+def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(any_fsub node:$lhs, node:$rhs),[{
   return hasNoVMLxHazardUse(N);
 }]>;
 
diff --git a/llvm/lib/Target/ARM/ARMInstrVFP.td b/llvm/lib/Target/ARM/ARMInstrVFP.td
index 6771106ef2d89..e2cc97b7b4634 100644
--- a/llvm/lib/Target/ARM/ARMInstrVFP.td
+++ b/llvm/lib/Target/ARM/ARMInstrVFP.td
@@ -439,14 +439,14 @@ let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FP
 def VADDD  : ADbI<0b11100, 0b11, 0, 0,
                   (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
                   IIC_fpALU64, "vadd", ".f64\t$Dd, $Dn, $Dm",
-                  [(set DPR:$Dd, (fadd DPR:$Dn, (f64 DPR:$Dm)))]>,
+                  [(set DPR:$Dd, (any_fadd DPR:$Dn, (f64 DPR:$Dm)))]>,
              Sched<[WriteFPALU64]>;
 
 let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in
 def VADDS  : ASbIn<0b11100, 0b11, 0, 0,
                    (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
                    IIC_fpALU32, "vadd", ".f32\t$Sd, $Sn, $Sm",
-                   [(set SPR:$Sd, (fadd SPR:$Sn, SPR:$Sm))]>,
+                   [(set SPR:$Sd, (any_fadd SPR:$Sn, SPR:$Sm))]>,
              Sched<[WriteFPALU32]> {
   // Some single precision VFP instructions may be executed on both NEON and
   // VFP pipelines on A8.
@@ -457,21 +457,21 @@ let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FP
 def VADDH  : AHbI<0b11100, 0b11, 0, 0,
                   (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
                   IIC_fpALU16, "vadd", ".f16\t$Sd, $Sn, $Sm",
-                  [(set (f16 HPR:$Sd), (fadd (f16 HPR:$Sn), (f16 HPR:$Sm)))]>,
+                  [(set (f16 HPR:$Sd), (any_fadd (f16 HPR:$Sn), (f16 HPR:$Sm)))]>,
              Sched<[WriteFPALU32]>;
 
 let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in
 def VSUBD  : ADbI<0b11100, 0b11, 1, 0,
                   (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
                   IIC_fpALU64, "vsub", ".f64\t$Dd, $Dn, $Dm",
-                  [(set DPR:$Dd, (fsub DPR:$Dn, (f64 DPR:$Dm)))]>,
+                  [(set DPR:$Dd, (any_fsub DPR:$Dn, (f64 DPR:$Dm)))]>,
              Sched<[WriteFPALU64]>;
 
 let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in
 def VSUBS  : ASbIn<0b11100, 0b11, 1, 0,
                    (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
                    IIC_fpALU32, "vsub", ".f32\t$Sd, $Sn, $Sm",
-                   [(set SPR:$Sd, (fsub SPR:$Sn, SPR:$Sm))]>,
+                   [(set SPR:$Sd, (any_fsub SPR:$Sn, SPR:$Sm))]>,
              Sched<[WriteFPALU32]>{
   // Some single precision VFP instructions may be executed on both NEON and
   // VFP pipelines on A8.
@@ -482,42 +482,42 @@ let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FP
 def VSUBH  : AHbI<0b11100, 0b11, 1, 0,
                   (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
                   IIC_fpALU16, "vsub", ".f16\t$Sd, $Sn, $Sm",
-                  [(set (f16 HPR:$Sd), (fsub (f16 HPR:$Sn), (f16 HPR:$Sm)))]>,
+                  [(set (f16 HPR:$Sd), (any_fsub (f16 HPR:$Sn), (f16 HPR:$Sm)))]>,
             Sched<[WriteFPALU32]>;
 
 let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in
 def VDIVD  : ADbI<0b11101, 0b00, 0, 0,
                   (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
                   IIC_fpDIV64, "vdiv", ".f64\t$Dd, $Dn, $Dm",
-                  [(set DPR:$Dd, (fdiv DPR:$Dn, (f64 DPR:$Dm)))]>,
+                  [(set DPR:$Dd, (any_fdiv DPR:$Dn, (f64 DPR:$Dm)))]>,
              Sched<[WriteFPDIV64]>;
 
 let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in
 def VDIVS  : ASbI<0b11101, 0b00, 0, 0,
                   (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
                   IIC_fpDIV32, "vdiv", ".f32\t$Sd, $Sn, $Sm",
-                  [(set SPR:$Sd, (fdiv SPR:$Sn, SPR:$Sm))]>,
+                  [(set SPR:$Sd, (any_fdiv SPR:$Sn, SPR:$Sm))]>,
              Sched<[WriteFPDIV32]>;
 
 let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM]  in
 def VDIVH  : AHbI<0b11101, 0b00, 0, 0,
                   (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
                   IIC_fpDIV16, "vdiv", ".f16\t$Sd, $Sn, $Sm",
-                  [(set (f16 HPR:$Sd), (fdiv (f16 HPR:$Sn), (f16 HPR:$Sm)))]>,
+                  [(set (f16 HPR:$Sd), (any_fdiv (f16 HPR:$Sn), (f16 HPR:$Sm)))]>,
              Sched<[WriteFPDIV32]>;
 
 let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in
 def VMULD  : ADbI<0b11100, 0b10, 0, 0,
                   (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
                   IIC_fpMUL64, "vmul", ".f64\t$Dd, $Dn, $Dm",
-                  [(set DPR:$Dd, (fmul DPR:$Dn, (f64 DPR:$Dm)))]>,
+                  [(set DPR:$Dd, (any_fmul DPR:$Dn, (f64 DPR:$Dm)))]>,
              Sched<[WriteFPMUL64, ReadFPMUL, ReadFPMUL]>;
 
 let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in
 def VMULS  : ASbIn<0b11100, 0b10, 0, 0,
                    (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
                    IIC_fpMUL32, "vmul", ".f32\t$Sd, $Sn, $Sm",
-                   [(set SPR:$Sd, (fmul SPR:$Sn, SPR:$Sm))]>,
+                   [(set SPR:$Sd, (any_fmul SPR:$Sn, SPR:$Sm))]>,
             Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]> {
   // Some single precision VFP instructions may be executed on both NEON and
   // VFP pipelines on A8.
@@ -528,21 +528,21 @@ let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FP
 def VMULH  : AHbI<0b11100, 0b10, 0, 0,
                   (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
                   IIC_fpMUL16, "vmul", ".f16\t$Sd, $Sn, $Sm",
-                  [(set (f16 HPR:$Sd), (fmul (f16 HPR:$Sn), (f16 HPR:$Sm)))]>,
+                  [(set (f16 HPR:$Sd), (any_fmul (f16 HPR:$Sn), (f16 HPR:$Sm)))]>,
              Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]>;
 
 let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in
 def VNMULD : ADbI<0b11100, 0b10, 1, 0,
                   (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
                   IIC_fpMUL64, "vnmul", ".f64\t$Dd, $Dn, $Dm",
-                  [(set DPR:$Dd, (fneg (fmul DPR:$Dn, (f64 DPR:$Dm))))]>,
+                  [(set DPR:$Dd, (fneg (any_fmul DPR:$Dn, (f64 DPR:$Dm))))]>,
              Sched<[WriteFPMUL64, ReadFPMUL, ReadFPMUL]>;
 
 let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in
 def VNMULS : ASbI<0b11100, 0b10, 1, 0,
                   (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
                   IIC_fpMUL32, "vnmul", ".f32\t$Sd, $Sn, $Sm",
-                  [(set SPR:$Sd, (fneg (fmul SPR:$Sn, SPR:$Sm)))]>,
+                  [(set SPR:$Sd, (fneg (any_fmul SPR:$Sn, SPR:$Sm)))]>,
             Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]> {
   // Some single precision VFP instructions may be executed on both NEON and
   // VFP pipelines on A8.
@@ -553,7 +553,7 @@ let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FP
 def VNMULH : AHbI<0b11100, 0b10, 1, 0,
                   (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
                   IIC_fpMUL16, "vnmul", ".f16\t$Sd, $Sn, $Sm",
-                  [(set (f16 HPR:$Sd), (fneg (fmul (f16 HPR:$Sn), (f16 HPR:$Sm))))]>,
+                  [(set (f16 HPR:$Sd), (fneg (any_fmul (f16 HPR:$Sn), (f16 HPR:$Sm))))]>,
              Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]>;
 
 multiclass vsel_inst<string op, bits<2> opc, int CC> {
@@ -587,7 +587,7 @@ defm VSELGE : vsel_inst<"ge", 0b10, 10>;
 defm VSELEQ : vsel_inst<"eq", 0b00, 0>;
 defm VSELVS : vsel_inst<"vs", 0b01, 6>;
 
-multiclass vmaxmin_inst<string op, bit opc, SDNode SD> {
+multiclass vmaxmin_inst<string op, bit opc, PatFrags SD> {
   let DecoderNamespace = "VFPV8", PostEncoderMethod = "",
       isUnpredicable = 1, mayRaiseFPException = 1 in {
     def H : AHbInp<0b11101, 0b00, opc,
@@ -610,8 +610,8 @@ multiclass vmaxmin_inst<string op, bit opc, SDNode SD> {
   }
 }
 
-defm VFP_VMAXNM : vmaxmin_inst<"vmaxnm", 0, fmaxnum>;
-defm VFP_VMINNM : vmaxmin_inst<"vminnm", 1, fminnum>;
+defm VFP_VMAXNM : vmaxmin_inst<"vmaxnm", 0, any_fmaxnum>;
+defm VFP_VMINNM : vmaxmin_inst<"vminnm", 1, any_fminnum>;
 
 // Match reassociated forms only if not sign dependent rounding.
 def : Pat<(fmul (fneg DPR:$a), (f64 DPR:$b)),
@@ -746,7 +746,7 @@ let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
 def VCVTDS  : ASuI<0b11101, 0b11, 0b0111, 0b11, 0,
                    (outs DPR:$Dd), (ins SPR:$Sm),
                    IIC_fpCVTDS, "vcvt", ".f64.f32\t$Dd, $Sm", "",
-                   [(set DPR:$Dd, (fpextend SPR:$Sm))]>,
+                   [(set DPR:$Dd, (any_fpextend SPR:$Sm))]>,
              Sched<[WriteFPCVT]> {
   // Instruction operands.
   bits<5> Dd;
@@ -766,7 +766,7 @@ def VCVTDS  : ASuI<0b11101, 0b11, 0b0111, 0b11, 0,
 let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
 def VCVTSD  : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm,
                     IIC_fpCVTSD, "vcvt", ".f32.f64\t$Sd, $Dm", "",
-                    [(set SPR:$Sd, (fpround DPR:$Dm))]>,
+                    [(set SPR:$Sd, (any_fpround DPR:$Dm))]>,
               Sched<[WriteFPCVT]> {
   // Instruction operands.
   bits<5> Sd;
@@ -796,7 +796,7 @@ def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
                  Requires<[HasFP16]>,
              Sched<[WriteFPCVT]>;
 
-def : FP16Pat<(f32 (fpextend (f16 HPR:$Sm))),
+def : FP16Pat<(f32 (any_fpextend (f16 HPR:$Sm))),
               (VCVTBHS (COPY_TO_REGCLASS (f16 HPR:$Sm), SPR))>;
 def : FP16Pat<(f16_to_fp GPR:$a),
               (VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>;
@@ -808,16 +808,16 @@ def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sda,
                  Requires<[HasFP16]>,
              Sched<[WriteFPCVT]>;
 
-def : FP16Pat<(f16 (fpround SPR:$Sm)),
+def : FP16Pat<(f16 (any_fpround SPR:$Sm)),
               (COPY_TO_REGCLASS (VCVTBSH (IMPLICIT_DEF), SPR:$Sm), HPR)>;
 def : FP16Pat<(fp_to_f16 SPR:$a),
               (i32 (COPY_TO_REGCLASS (VCVTBSH (IMPLICIT_DEF), SPR:$a), GPR))>;
-def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_even:$lane),
+def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (any_fpround (f32 SPR:$src2))), imm_even:$lane),
               (v8f16 (INSERT_SUBREG (v8f16 MQPR:$src1),
                                     (VCVTBSH (EXTRACT_SUBREG (v8f16 MQPR:$src1), (SSubReg_f16_reg imm:$lane)),
                                              SPR:$src2),
                                     (SSubReg_f16_reg imm:$lane)))>;
-def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_even:$lane),
+def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (any_fpround (f32 SPR:$src2))), imm_even:$lane),
               (v4f16 (INSERT_SUBREG (v4f16 DPR:$src1),
                                     (VCVTBSH (EXTRACT_SUBREG (v4f16 DPR:$src1), (SSubReg_f16_reg imm:$lane)),
                                              SPR:$src2),
@@ -830,9 +830,9 @@ def VCVTTHS: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
                  Requires<[HasFP16]>,
              Sched<[WriteFPCVT]>;
 
-def : FP16Pat<(f32 (fpextend (extractelt (v8f16 MQPR:$src), imm_odd:$lane))),
+def : FP16Pat<(f32 (any_fpextend (extractelt (v8f16 MQPR:$src), imm_odd:$lane))),
               (VCVTTHS (EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_odd:$lane)))>;
-def : FP16Pat<(f32 (fpextend (extractelt (v4f16 DPR:$src), imm_odd:$lane))),
+def : FP16Pat<(f32 (any_fpextend (extractelt (v4f16 DPR:$src), imm_odd:$lane))),
               (VCVTTHS (EXTRACT_SUBREG
                 (v2f32 (COPY_TO_REGCLASS (v4f16 DPR:$src), DPR_VFP2)),
                 (SSubReg_f16_reg imm_odd:$lane)))>;
@@ -844,12 +844,12 @@ def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sda,
                  Requires<[HasFP16]>,
             Sched<[WriteFPCVT]>;
 
-def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_odd:$lane),
+def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (any_fpround (f32 SPR:$src2))), imm_odd:$lane),
               (v8f16 (INSERT_SUBREG (v8f16 MQPR:$src1),
                                     (VCVTTSH (EXTRACT_SUBREG (v8f16 MQPR:$src1), (SSubReg_f16_reg imm:$lane)),
                                              SPR:$src2),
                                     (SSubReg_f16_reg imm:$lane)))>;
-def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_odd:$lane),
+def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (any_fpround (f32 SPR:$src2))), imm_odd:$lane),
               (v4f16 (INSERT_SUBREG (v4f16 DPR:$src1),
                                     (VCVTTSH (EXTRACT_SUBREG (v4f16 DPR:$src1), (SSubReg_f16_reg imm:$lane)),
                                              SPR:$src2),
@@ -872,7 +872,7 @@ def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0,
   let hasSideEffects = 0;
 }
 
-def : FullFP16Pat<(f64 (fpextend (f16 HPR:$Sm))),
+def : FullFP16Pat<(f64 (any_fpextend (f16 HPR:$Sm))),
                   (VCVTBHD (COPY_TO_REGCLASS (f16 HPR:$Sm), SPR))>,
                   Requires<[HasFPARMv8, HasDPVFP]>;
 def : FP16Pat<(f64 (f16_to_fp GPR:$a)),
@@ -898,7 +898,7 @@ def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0,
   let hasSideEffects = 0;
 }
 
-def : FullFP16Pat<(f16 (fpround DPR:$Dm)),
+def : FullFP16Pat<(f16 (any_fpround DPR:$Dm)),
                   (COPY_TO_REGCLASS (VCVTBDH (IMPLICIT_DEF), DPR:$Dm), HPR)>,
                   Requires<[HasFPARMv8, HasDPVFP]>;
 def : FP16Pat<(fp_to_f16 (f64 DPR:$a)),
@@ -1007,41 +1007,41 @@ multiclass vcvt_inst<string opc, bits<2> rm,
 
   let Predicates = [HasFPARMv8] in {
     let Predicates = [HasFullFP16] in {
-    def : Pat<(i32 (fp_to_sint (node (f16 HPR:$a)))),
+    def : Pat<(i32 (any_fp_to_sint (node (f16 HPR:$a)))),
               (COPY_TO_REGCLASS
                 (!cast<Instruction>(NAME#"SH") (f16 HPR:$a)),
                 GPR)>;
 
-    def : Pat<(i32 (fp_to_uint (node (f16 HPR:$a)))),
+    def : Pat<(i32 (any_fp_to_uint (node (f16 HPR:$a)))),
               (COPY_TO_REGCLASS
                 (!cast<Instruction>(NAME#"UH") (f16 HPR:$a)),
                 GPR)>;
     }
-    def : Pat<(i32 (fp_to_sint (node SPR:$a))),
+    def : Pat<(i32 (any_fp_to_sint (node SPR:$a))),
               (COPY_TO_REGCLASS
                 (!cast<Instruction>(NAME#"SS") SPR:$a),
                 GPR)>;
-    def : Pat<(i32 (fp_to_uint (node SPR:$a))),
+    def : Pat<(i32 (any_fp_to_uint (node SPR:$a))),
               (COPY_TO_REGCLASS
                 (!cast<Instruction>(NAME#"US") SPR:$a),
                 GPR)>;
   }
   let Predicates = [HasFPARMv8, HasDPVFP] in {
-    def : Pat<(i32 (fp_to_sint (node (f64 DPR:$a)))),
+    def : Pat<(i32 (any_fp_to_sint (node (f64 DPR:$a)))),
               (COPY_TO_REGCLASS
                 (!cast<Instruction>(NAME#"SD") DPR:$a),
                 GPR)>;
-    def : Pat<(i32 (fp_to_uint (node (f64 DPR:$a)))),
+    def : Pat<(i32 (any_fp_to_uint (node (f64 DPR:$a)))),
               (COPY_TO_REGCLASS
                 (!cast<Instruction>(NAME#"UD") DPR:$a),
                 GPR)>;
   }
 }
 
-defm VCVTA : vcvt_inst<"a", 0b00, fround>;
+defm VCVTA : vcvt_inst<"a", 0b00, any_fround>;
 defm VCVTN : vcvt_inst<"n", 0b01>;
-defm VCVTP : vcvt_inst<"p", 0b10, fceil>;
-defm VCVTM : vcvt_inst<"m", 0b11, ffloor>;
+defm VCVTP : vcvt_inst<"p", 0b10, any_fceil>;
+defm VCVTM : vcvt_inst<"m", 0b11, any_ffloor>;
 
 def VNEGD  : ADuI<0b11101, 0b11, 0b0001, 0b01, 0,
                   (outs DPR:$Dd), (ins DPR:$Dm),
@@ -1103,9 +1103,9 @@ multiclass vrint_inst_zrx<string opc, bit op, bit op2, SDPatternOperator node,
         Requires<[HasFPARMv8,HasDPVFP]>;
 }
 
-defm VRINTZ : vrint_inst_zrx<"z", 0, 1, ftrunc, [], 0>;
-defm VRINTR : vrint_inst_zrx<"r", 0, 0, fnearbyint, [FPSCR_RM], 0>;
-defm VRINTX : vrint_inst_zrx<"x", 1, 0, frint, [FPSCR_RM], 1>;
+defm VRINTZ : vrint_inst_zrx<"z", 0, 1, any_ftrunc, [], 0>;
+defm VRINTR : vrint_inst_zrx<"r", 0, 0, any_fnearbyint, [FPSCR_RM], 0>;
+defm VRINTX : vrint_inst_zrx<"x", 1, 0, any_frint, [FPSCR_RM], 1>;
 
 multiclass vrint_inst_anpm<string opc, bits<2> rm,
                            SDPatternOperator node = null_frag> {
@@ -1145,30 +1145,31 @@ multiclass vrint_inst_anpm<string opc, bits<2> rm,
         Requires<[HasFPARMv8,HasDPVFP]>;
 }
 
-defm VRINTA : vrint_inst_anpm<"a", 0b00, fround>;
-defm VRINTN : vrint_inst_anpm<"n", 0b01, froundeven>;
-defm VRINTP : vrint_inst_anpm<"p", 0b10, fceil>;
-defm VRINTM : vrint_inst_anpm<"m", 0b11, ffloor>;
+defm VRINTA : vrint_inst_anpm<"a", 0b00, any_fround>;
+defm VRINTN : vrint_inst_anpm<"n", 0b01, any_froundeven>;
+defm VRINTP : vrint_inst_anpm<"p", 0b10, any_fceil>;
+defm VRINTM : vrint_inst_anpm<"m", 0b11, any_ffloor>;
+
 
 let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
 def VSQRTD : ADuI<0b11101, 0b11, 0b0001, 0b11, 0,
                   (outs DPR:$Dd), (ins DPR:$Dm),
                   IIC_fpSQRT64, "vsqrt", ".f64\t$Dd, $Dm", "",
-                  [(set DPR:$Dd, (fsqrt (f64 DPR:$Dm)))]>,
+                  [(set DPR:$Dd, (any_fsqrt (f64 DPR:$Dm)))]>,
              Sched<[WriteFPSQRT64]>;
 
 let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
 def VSQRTS : ASuI<0b11101, 0b11, 0b0001, 0b11, 0,
                   (outs SPR:$Sd), (ins SPR:$Sm),
                   IIC_fpSQRT32, "vsqrt", ".f32\t$Sd, $Sm", "",
-                  [(set SPR:$Sd, (fsqrt SPR:$Sm))]>,
+                  [(set SPR:$Sd, (any_fsqrt SPR:$Sm))]>,
              Sched<[WriteFPSQRT32]>;
 
 let mayRaiseFPException = 1, Uses = [FPSCR_RM] in
 def VSQRTH : AHuI<0b11101, 0b11, 0b0001, 0b11, 0,
                   (outs HPR:$Sd), (ins HPR:$Sm),
                   IIC_fpSQRT16, "vsqrt", ".f16\t$Sd, $Sm",
-                  [(set (f16 HPR:$Sd), (fsqrt (f16 HPR:$Sm)))]>;
+                  [(set (f16 HPR:$Sd), (any_fsqrt (f16 HPR:$Sm)))]>;
 
 let hasSideEffects = 0 in {
 let isMoveReg = 1 in {
@@ -1509,10 +1510,10 @@ def VSITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011,
 }
 
 let Predicates=[HasVFP2, HasDPVFP] in {
-  def : VFPPat<(f64 (sint_to_fp GPR:$a)),
+  def : VFPPat<(f64 (any_sint_to_fp GPR:$a)),
                (VSITOD (COPY_TO_REGCLASS GPR:$a, SPR))>;
 
-  def : VFPPat<(f64 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))),
+  def : VFPPat<(f64 (any_sint_to_fp (i32 (alignedload32 addrmode5:$a)))),
                (VSITOD (VLDRS addrmode5:$a))>;
 }
 
@@ -1529,10 +1530,10 @@ def VSITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010,
   let D = VFPNeonA8Domain;
 }
 
-def : VFPNoNEONPat<(f32 (sint_to_fp GPR:$a)),
+def : VFPNoNEONPat<(f32 (any_sint_to_fp GPR:$a)),
                    (VSITOS (COPY_TO_REGCLASS GPR:$a, SPR))>;
 
-def : VFPNoNEONPat<(f32 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))),
+def : VFPNoNEONPat<(f32 (any_sint_to_fp (i32 (alignedload32 addrmode5:$a)))),
                    (VSITOS (VLDRS addrmode5:$a))>;
 
 let mayRaiseFPException = 1 in 
@@ -1545,7 +1546,7 @@ def VSITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001,
   let isUnpredicable = 1;
 }
 
-def : VFPNoNEONPat<(f16 (sint_to_fp GPR:$a)),
+def : VFPNoNEONPat<(f16 (any_sint_to_fp GPR:$a)),
                    (VSITOH (COPY_TO_REGCLASS GPR:$a, SPR))>;
 
 let mayRaiseFPException = 1 in 
@@ -1558,10 +1559,10 @@ def VUITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011,
 }
 
 let Predicates=[HasVFP2, HasDPVFP] in {
-  def : VFPPat<(f64 (uint_to_fp GPR:$a)),
+  def : VFPPat<(f64 (any_uint_to_fp GPR:$a)),
                (VUITOD (COPY_TO_REGCLASS GPR:$a, SPR))>;
 
-  def : VFPPat<(f64 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))),
+  def : VFPPat<(f64 (any_uint_to_fp (i32 (alignedload32 addrmode5:$a)))),
                (VUITOD (VLDRS addrmode5:$a))>;
 }
 
@@ -1578,10 +1579,10 @@ def VUITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010,
   let D = VFPNeonA8Domain;
 }
 
-def : VFPNoNEONPat<(f32 (uint_to_fp GPR:$a)),
+def : VFPNoNEONPat<(f32 (any_uint_to_fp GPR:$a)),
                    (VUITOS (COPY_TO_REGCLASS GPR:$a, SPR))>;
 
-def : VFPNoNEONPat<(f32 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))),
+def : VFPNoNEONPat<(f32 (any_uint_to_fp (i32 (alignedload32 addrmode5:$a)))),
                    (VUITOS (VLDRS addrmode5:$a))>;
 
 let mayRaiseFPException = 1 in 
@@ -1594,7 +1595,7 @@ def VUITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001,
   let isUnpredicable = 1;
 }
 
-def : VFPNoNEONPat<(f16 (uint_to_fp GPR:$a)),
+def : VFPNoNEONPat<(f16 (any_uint_to_fp GPR:$a)),
                    (VUITOH (COPY_TO_REGCLASS GPR:$a, SPR))>;
 
 // FP -> Int:
@@ -1669,12 +1670,12 @@ def VTOSIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011,
 }
 
 let Predicates=[HasVFP2, HasDPVFP] in {
-  def : VFPPat<(i32 (fp_to_sint (f64 DPR:$a))),
+  def : VFPPat<(i32 (any_fp_to_sint (f64 DPR:$a))),
                (COPY_TO_REGCLASS (VTOSIZD DPR:$a), GPR)>;
   def : VFPPat<(i32 (fp_to_sint_sat (f64 DPR:$a), i32)),
                (COPY_TO_REGCLASS (VTOSIZD DPR:$a), GPR)>;
 
-  def : VFPPat<(alignedstore32 (i32 (fp_to_sint (f64 DPR:$a))), addrmode5:$ptr),
+  def : VFPPat<(alignedstore32 (i32 (any_fp_to_sint (f64 DPR:$a))), addrmode5:$ptr),
                (VSTRS (VTOSIZD DPR:$a), addrmode5:$ptr)>;
   def : VFPPat<(alignedstore32 (i32 (fp_to_sint_sat (f64 DPR:$a), i32)), addrmode5:$ptr),
                (VSTRS (VTOSIZD DPR:$a), addrmode5:$ptr)>;
@@ -1693,12 +1694,12 @@ def VTOSIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010,
   let D = VFPNeonA8Domain;
 }
 
-def : VFPNoNEONPat<(i32 (fp_to_sint SPR:$a)),
+def : VFPNoNEONPat<(i32 (any_fp_to_sint SPR:$a)),
                    (COPY_TO_REGCLASS (VTOSIZS SPR:$a), GPR)>;
 def : VFPPat<(i32 (fp_to_sint_sat SPR:$a, i32)),
              (COPY_TO_REGCLASS (VTOSIZS SPR:$a), GPR)>;
 
-def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_sint (f32 SPR:$a))),
+def : VFPNoNEONPat<(alignedstore32 (i32 (any_fp_to_sint (f32 SPR:$a))),
                                    addrmode5:$ptr),
                    (VSTRS (VTOSIZS SPR:$a), addrmode5:$ptr)>;
 def : VFPPat<(alignedstore32 (i32 (fp_to_sint_sat (f32 SPR:$a), i32)),
@@ -1715,7 +1716,7 @@ def VTOSIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1101, 0b1001,
   let isUnpredicable = 1;
 }
 
-def : VFPNoNEONPat<(i32 (fp_to_sint (f16 HPR:$a))),
+def : VFPNoNEONPat<(i32 (any_fp_to_sint (f16 HPR:$a))),
                    (COPY_TO_REGCLASS (VTOSIZH (f16 HPR:$a)), GPR)>;
 def : VFPPat<(i32 (fp_to_sint_sat (f16 HPR:$a), i32)),
              (COPY_TO_REGCLASS (VTOSIZH (f16 HPR:$a)), GPR)>;
@@ -1730,12 +1731,12 @@ def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011,
 }
 
 let Predicates=[HasVFP2, HasDPVFP] in {
-  def : VFPPat<(i32 (fp_to_uint (f64 DPR:$a))),
+  def : VFPPat<(i32 (any_fp_to_uint (f64 DPR:$a))),
                (COPY_TO_REGCLASS (VTOUIZD DPR:$a), GPR)>;
   def : VFPPat<(i32 (fp_to_uint_sat (f64 DPR:$a), i32)),
                (COPY_TO_REGCLASS (VTOUIZD DPR:$a), GPR)>;
 
-  def : VFPPat<(alignedstore32 (i32 (fp_to_uint (f64 DPR:$a))), addrmode5:$ptr),
+  def : VFPPat<(alignedstore32 (i32 (any_fp_to_uint (f64 DPR:$a))), addrmode5:$ptr),
                (VSTRS (VTOUIZD DPR:$a), addrmode5:$ptr)>;
   def : VFPPat<(alignedstore32 (i32 (fp_to_uint_sat (f64 DPR:$a), i32)), addrmode5:$ptr),
                (VSTRS (VTOUIZD DPR:$a), addrmode5:$ptr)>;
@@ -1754,12 +1755,12 @@ def VTOUIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010,
   let D = VFPNeonA8Domain;
 }
 
-def : VFPNoNEONPat<(i32 (fp_to_uint SPR:$a)),
+def : VFPNoNEONPat<(i32 (any_fp_to_uint SPR:$a)),
                    (COPY_TO_REGCLASS (VTOUIZS SPR:$a), GPR)>;
 def : VFPPat<(i32 (fp_to_uint_sat SPR:$a, i32)),
              (COPY_TO_REGCLASS (VTOUIZS SPR:$a), GPR)>;
 
-def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_uint (f32 SPR:$a))),
+def : VFPNoNEONPat<(alignedstore32 (i32 (any_fp_to_uint (f32 SPR:$a))),
                                    addrmode5:$ptr),
                   (VSTRS (VTOUIZS SPR:$a), addrmode5:$ptr)>;
 def : VFPPat<(alignedstore32 (i32 (fp_to_uint_sat (f32 SPR:$a), i32)),
@@ -1776,7 +1777,7 @@ def VTOUIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001,
   let isUnpredicable = 1;
 }
 
-def : VFPNoNEONPat<(i32 (fp_to_uint (f16 HPR:$a))),
+def : VFPNoNEONPat<(i32 (any_fp_to_uint (f16 HPR:$a))),
                    (COPY_TO_REGCLASS (VTOUIZH (f16 HPR:$a)), GPR)>;
 def : VFPPat<(i32 (fp_to_uint_sat (f16 HPR:$a), i32)),
              (COPY_TO_REGCLASS (VTOUIZH (f16 HPR:$a)), GPR)>;
@@ -2320,13 +2321,13 @@ def : Pat<(fadd_mlx HPR:$dstin, (fmul_su (f16 HPR:$a), HPR:$b)),
 
 // Match @llvm.fma.* intrinsics
 // (fma x, y, z) -> (vfms z, x, y)
-def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, DPR:$Ddin)),
+def : Pat<(f64 (any_fma DPR:$Dn, DPR:$Dm, DPR:$Ddin)),
           (VFMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
       Requires<[HasVFP4,HasDPVFP]>;
-def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, SPR:$Sdin)),
+def : Pat<(f32 (any_fma SPR:$Sn, SPR:$Sm, SPR:$Sdin)),
           (VFMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
       Requires<[HasVFP4]>;
-def : Pat<(f16 (fma HPR:$Sn, HPR:$Sm, (f16 HPR:$Sdin))),
+def : Pat<(f16 (any_fma HPR:$Sn, HPR:$Sm, (f16 HPR:$Sdin))),
           (VFMAH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>,
       Requires<[HasFullFP16]>;
 
@@ -2375,13 +2376,13 @@ def : Pat<(fsub_mlx HPR:$dstin, (fmul_su (f16 HPR:$a), HPR:$b)),
 
 // Match @llvm.fma.* intrinsics
 // (fma (fneg x), y, z) -> (vfms z, x, y)
-def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin)),
+def : Pat<(f64 (any_fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin)),
           (VFMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
       Requires<[HasVFP4,HasDPVFP]>;
-def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin)),
+def : Pat<(f32 (any_fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin)),
           (VFMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
       Requires<[HasVFP4]>;
-def : Pat<(f16 (fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (f16 HPR:$Sdin))),
+def : Pat<(f16 (any_fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (f16 HPR:$Sdin))),
           (VFMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>,
       Requires<[HasFullFP16]>;
 
@@ -2427,23 +2428,23 @@ def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin),
 
 // Match @llvm.fma.* intrinsics
 // (fneg (fma x, y, z)) -> (vfnma z, x, y)
-def : Pat<(fneg (fma (f64 DPR:$Dn), (f64 DPR:$Dm), (f64 DPR:$Ddin))),
+def : Pat<(fneg (any_fma (f64 DPR:$Dn), (f64 DPR:$Dm), (f64 DPR:$Ddin))),
           (VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
       Requires<[HasVFP4,HasDPVFP]>;
-def : Pat<(fneg (fma (f32 SPR:$Sn), (f32 SPR:$Sm), (f32 SPR:$Sdin))),
+def : Pat<(fneg (any_fma (f32 SPR:$Sn), (f32 SPR:$Sm), (f32 SPR:$Sdin))),
           (VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
       Requires<[HasVFP4]>;
-def : Pat<(fneg (fma (f16 HPR:$Sn), (f16 HPR:$Sm), (f16 (f16 HPR:$Sdin)))),
+def : Pat<(fneg (any_fma (f16 HPR:$Sn), (f16 HPR:$Sm), (f16 (f16 HPR:$Sdin)))),
           (VFNMAH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>,
       Requires<[HasFullFP16]>;
 // (fma (fneg x), y, (fneg z)) -> (vfnma z, x, y)
-def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, (fneg DPR:$Ddin))),
+def : Pat<(f64 (any_fma (fneg DPR:$Dn), DPR:$Dm, (fneg DPR:$Ddin))),
           (VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
       Requires<[HasVFP4,HasDPVFP]>;
-def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, (fneg SPR:$Sdin))),
+def : Pat<(f32 (any_fma (fneg SPR:$Sn), SPR:$Sm, (fneg SPR:$Sdin))),
           (VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
       Requires<[HasVFP4]>;
-def : Pat<(f16 (fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (fneg (f16 HPR:$Sdin)))),
+def : Pat<(f16 (any_fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (fneg (f16 HPR:$Sdin)))),
           (VFNMAH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>,
       Requires<[HasFullFP16]>;
 
@@ -2488,23 +2489,23 @@ def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin),
 // Match @llvm.fma.* intrinsics
 
 // (fma x, y, (fneg z)) -> (vfnms z, x, y))
-def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, (fneg DPR:$Ddin))),
+def : Pat<(f64 (any_fma DPR:$Dn, DPR:$Dm, (fneg DPR:$Ddin))),
           (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
       Requires<[HasVFP4,HasDPVFP]>;
-def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, (fneg SPR:$Sdin))),
+def : Pat<(f32 (any_fma SPR:$Sn, SPR:$Sm, (fneg SPR:$Sdin))),
           (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
       Requires<[HasVFP4]>;
-def : Pat<(f16 (fma (f16 HPR:$Sn), (f16 HPR:$Sm), (fneg (f16 HPR:$Sdin)))),
+def : Pat<(f16 (any_fma (f16 HPR:$Sn), (f16 HPR:$Sm), (fneg (f16 HPR:$Sdin)))),
           (VFNMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>,
       Requires<[HasFullFP16]>;
 // (fneg (fma (fneg x), y, z)) -> (vfnms z, x, y)
-def : Pat<(fneg (f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin))),
+def : Pat<(fneg (f64 (any_fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin))),
           (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
       Requires<[HasVFP4,HasDPVFP]>;
-def : Pat<(fneg (f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin))),
+def : Pat<(fneg (f32 (any_fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin))),
           (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
       Requires<[HasVFP4]>;
-def : Pat<(fneg (f16 (fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (f16 HPR:$Sdin)))),
+def : Pat<(fneg (f16 (any_fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (f16 HPR:$Sdin)))),
           (VFNMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>,
       Requires<[HasFullFP16]>;
 
diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index ebfa593fbe9e6..bf7c962f02efc 100644
--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -47,9 +47,7 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
 
   // Only use a specialized AEABI function if the default version of this
   // Libcall is an AEABI function.
-  if (std::strncmp(TLI->getLibcallName(LC), "__aeabi", 7) != 0)
-    return SDValue();
-
+  //
   // Translate RTLIB::Libcall to AEABILibcall. We only do this in order to be
   // able to translate memset to memclr and use the value to index the function
   // name array.
@@ -61,12 +59,21 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
   } AEABILibcall;
   switch (LC) {
   case RTLIB::MEMCPY:
+    if (TLI->getLibcallImpl(LC) != RTLIB::impl___aeabi_memcpy)
+      return SDValue();
+
     AEABILibcall = AEABI_MEMCPY;
     break;
   case RTLIB::MEMMOVE:
+    if (TLI->getLibcallImpl(LC) != RTLIB::impl___aeabi_memmove)
+      return SDValue();
+
     AEABILibcall = AEABI_MEMMOVE;
     break;
   case RTLIB::MEMSET:
+    if (TLI->getLibcallImpl(LC) != RTLIB::impl___aeabi_memset)
+      return SDValue();
+
     AEABILibcall = AEABI_MEMSET;
     if (isNullConstant(Src))
       AEABILibcall = AEABI_MEMCLR;
diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index f60660b12baca..1bb670d195a98 100644
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -426,15 +426,15 @@ class ARMAsmParser : public MCTargetAsmParser {
       VPTState.CurPosition = ~0U;
   }
 
-  void Note(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt) {
+  void Note(SMLoc L, const Twine &Msg, SMRange Range = {}) {
     return getParser().Note(L, Msg, Range);
   }
 
-  bool Warning(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt) {
+  bool Warning(SMLoc L, const Twine &Msg, SMRange Range = {}) {
     return getParser().Warning(L, Msg, Range);
   }
 
-  bool Error(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt) {
+  bool Error(SMLoc L, const Twine &Msg, SMRange Range = {}) {
     return getParser().Error(L, Msg, Range);
   }
 
diff --git a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp
index e5b4f6eeb7b73..08f196b248029 100644
--- a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp
+++ b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp
@@ -884,13 +884,13 @@ CSKYTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                .Case("{t4}", CSKY::R20)
                                .Case("{t5}", CSKY::R21)
                                .Case("{t6}", CSKY::R22)
-                               .Cases("{t7}", "{fp}", CSKY::R23)
-                               .Cases("{t8}", "{top}", CSKY::R24)
-                               .Cases("{t9}", "{bsp}", CSKY::R25)
+                               .Cases({"{t7}", "{fp}"}, CSKY::R23)
+                               .Cases({"{t8}", "{top}"}, CSKY::R24)
+                               .Cases({"{t9}", "{bsp}"}, CSKY::R25)
                                .Case("{r26}", CSKY::R26)
                                .Case("{r27}", CSKY::R27)
-                               .Cases("{gb}", "{rgb}", "{rdb}", CSKY::R28)
-                               .Cases("{tb}", "{rtb}", CSKY::R29)
+                               .Cases({"{gb}", "{rgb}", "{rdb}"}, CSKY::R28)
+                               .Cases({"{tb}", "{rtb}"}, CSKY::R29)
                                .Case("{svbr}", CSKY::R30)
                                .Case("{tls}", CSKY::R31)
                                .Default(CSKY::NoRegister);
@@ -907,38 +907,38 @@ CSKYTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
   // use the ABI names in register constraint lists.
   if (Subtarget.useHardFloat()) {
     unsigned FReg = StringSwitch<unsigned>(Constraint.lower())
-                        .Cases("{fr0}", "{vr0}", CSKY::F0_32)
-                        .Cases("{fr1}", "{vr1}", CSKY::F1_32)
-                        .Cases("{fr2}", "{vr2}", CSKY::F2_32)
-                        .Cases("{fr3}", "{vr3}", CSKY::F3_32)
-                        .Cases("{fr4}", "{vr4}", CSKY::F4_32)
-                        .Cases("{fr5}", "{vr5}", CSKY::F5_32)
-                        .Cases("{fr6}", "{vr6}", CSKY::F6_32)
-                        .Cases("{fr7}", "{vr7}", CSKY::F7_32)
-                        .Cases("{fr8}", "{vr8}", CSKY::F8_32)
-                        .Cases("{fr9}", "{vr9}", CSKY::F9_32)
-                        .Cases("{fr10}", "{vr10}", CSKY::F10_32)
-                        .Cases("{fr11}", "{vr11}", CSKY::F11_32)
-                        .Cases("{fr12}", "{vr12}", CSKY::F12_32)
-                        .Cases("{fr13}", "{vr13}", CSKY::F13_32)
-                        .Cases("{fr14}", "{vr14}", CSKY::F14_32)
-                        .Cases("{fr15}", "{vr15}", CSKY::F15_32)
-                        .Cases("{fr16}", "{vr16}", CSKY::F16_32)
-                        .Cases("{fr17}", "{vr17}", CSKY::F17_32)
-                        .Cases("{fr18}", "{vr18}", CSKY::F18_32)
-                        .Cases("{fr19}", "{vr19}", CSKY::F19_32)
-                        .Cases("{fr20}", "{vr20}", CSKY::F20_32)
-                        .Cases("{fr21}", "{vr21}", CSKY::F21_32)
-                        .Cases("{fr22}", "{vr22}", CSKY::F22_32)
-                        .Cases("{fr23}", "{vr23}", CSKY::F23_32)
-                        .Cases("{fr24}", "{vr24}", CSKY::F24_32)
-                        .Cases("{fr25}", "{vr25}", CSKY::F25_32)
-                        .Cases("{fr26}", "{vr26}", CSKY::F26_32)
-                        .Cases("{fr27}", "{vr27}", CSKY::F27_32)
-                        .Cases("{fr28}", "{vr28}", CSKY::F28_32)
-                        .Cases("{fr29}", "{vr29}", CSKY::F29_32)
-                        .Cases("{fr30}", "{vr30}", CSKY::F30_32)
-                        .Cases("{fr31}", "{vr31}", CSKY::F31_32)
+                        .Cases({"{fr0}", "{vr0}"}, CSKY::F0_32)
+                        .Cases({"{fr1}", "{vr1}"}, CSKY::F1_32)
+                        .Cases({"{fr2}", "{vr2}"}, CSKY::F2_32)
+                        .Cases({"{fr3}", "{vr3}"}, CSKY::F3_32)
+                        .Cases({"{fr4}", "{vr4}"}, CSKY::F4_32)
+                        .Cases({"{fr5}", "{vr5}"}, CSKY::F5_32)
+                        .Cases({"{fr6}", "{vr6}"}, CSKY::F6_32)
+                        .Cases({"{fr7}", "{vr7}"}, CSKY::F7_32)
+                        .Cases({"{fr8}", "{vr8}"}, CSKY::F8_32)
+                        .Cases({"{fr9}", "{vr9}"}, CSKY::F9_32)
+                        .Cases({"{fr10}", "{vr10}"}, CSKY::F10_32)
+                        .Cases({"{fr11}", "{vr11}"}, CSKY::F11_32)
+                        .Cases({"{fr12}", "{vr12}"}, CSKY::F12_32)
+                        .Cases({"{fr13}", "{vr13}"}, CSKY::F13_32)
+                        .Cases({"{fr14}", "{vr14}"}, CSKY::F14_32)
+                        .Cases({"{fr15}", "{vr15}"}, CSKY::F15_32)
+                        .Cases({"{fr16}", "{vr16}"}, CSKY::F16_32)
+                        .Cases({"{fr17}", "{vr17}"}, CSKY::F17_32)
+                        .Cases({"{fr18}", "{vr18}"}, CSKY::F18_32)
+                        .Cases({"{fr19}", "{vr19}"}, CSKY::F19_32)
+                        .Cases({"{fr20}", "{vr20}"}, CSKY::F20_32)
+                        .Cases({"{fr21}", "{vr21}"}, CSKY::F21_32)
+                        .Cases({"{fr22}", "{vr22}"}, CSKY::F22_32)
+                        .Cases({"{fr23}", "{vr23}"}, CSKY::F23_32)
+                        .Cases({"{fr24}", "{vr24}"}, CSKY::F24_32)
+                        .Cases({"{fr25}", "{vr25}"}, CSKY::F25_32)
+                        .Cases({"{fr26}", "{vr26}"}, CSKY::F26_32)
+                        .Cases({"{fr27}", "{vr27}"}, CSKY::F27_32)
+                        .Cases({"{fr28}", "{vr28}"}, CSKY::F28_32)
+                        .Cases({"{fr29}", "{vr29}"}, CSKY::F29_32)
+                        .Cases({"{fr30}", "{vr30}"}, CSKY::F30_32)
+                        .Cases({"{fr31}", "{vr31}"}, CSKY::F31_32)
                         .Default(CSKY::NoRegister);
     if (FReg != CSKY::NoRegister) {
       assert(CSKY::F0_32 <= FReg && FReg <= CSKY::F31_32 && "Unknown fp-reg");
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
index 44c48305f2832..7ae500a55b92d 100644
--- a/llvm/lib/Target/DirectX/DXIL.td
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -1058,6 +1058,16 @@ def WaveActiveOp : DXILOp<119, waveActiveOp> {
                    IntrinArgIndex<0>, IntrinArgI8<WaveOpKind_Max>,
                    IntrinArgI8<SignedOpKind_Unsigned>
                  ]>,
+    IntrinSelect<int_dx_wave_reduce_min,
+                 [
+                   IntrinArgIndex<0>, IntrinArgI8<WaveOpKind_Min>,
+                   IntrinArgI8<SignedOpKind_Signed>
+                 ]>,
+    IntrinSelect<int_dx_wave_reduce_umin,
+                 [
+                   IntrinArgIndex<0>, IntrinArgI8<WaveOpKind_Min>,
+                   IntrinArgI8<SignedOpKind_Unsigned>
+                 ]>,
   ];
 
   let arguments = [OverloadTy, Int8Ty, Int8Ty];
diff --git a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp
index e7e7f2ce66ae8..ce6e8121b9d94 100644
--- a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp
+++ b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp
@@ -94,6 +94,8 @@ static bool checkWaveOps(Intrinsic::ID IID) {
   case Intrinsic::dx_wave_reduce_usum:
   case Intrinsic::dx_wave_reduce_max:
   case Intrinsic::dx_wave_reduce_umax:
+  case Intrinsic::dx_wave_reduce_min:
+  case Intrinsic::dx_wave_reduce_umin:
     return true;
   }
 }
diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
index 1e4797bbd05aa..cf8b833b3e42e 100644
--- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
+++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
@@ -36,9 +36,10 @@ using namespace llvm;
 using namespace llvm::dxil;
 
 namespace {
-/// A simple Wrapper DiagnosticInfo that generates Module-level diagnostic
-/// for TranslateMetadata pass
-class DiagnosticInfoTranslateMD : public DiagnosticInfo {
+
+/// A simple wrapper of DiagnosticInfo that generates module-level diagnostic
+/// for the DXILValidateMetadata pass
+class DiagnosticInfoValidateMD : public DiagnosticInfo {
 private:
   const Twine &Msg;
   const Module &Mod;
@@ -47,9 +48,9 @@ class DiagnosticInfoTranslateMD : public DiagnosticInfo {
   /// \p M is the module for which the diagnostic is being emitted. \p Msg is
   /// the message to show. Note that this class does not copy this message, so
   /// this reference must be valid for the whole life time of the diagnostic.
-  DiagnosticInfoTranslateMD(const Module &M,
-                            const Twine &Msg LLVM_LIFETIME_BOUND,
-                            DiagnosticSeverity Severity = DS_Error)
+  DiagnosticInfoValidateMD(const Module &M,
+                           const Twine &Msg LLVM_LIFETIME_BOUND,
+                           DiagnosticSeverity Severity = DS_Error)
       : DiagnosticInfo(DK_Unsupported, Severity), Msg(Msg), Mod(M) {}
 
   void print(DiagnosticPrinter &DP) const override {
@@ -57,6 +58,16 @@ class DiagnosticInfoTranslateMD : public DiagnosticInfo {
   }
 };
 
+static void reportError(Module &M, Twine Message,
+                        DiagnosticSeverity Severity = DS_Error) {
+  M.getContext().diagnose(DiagnosticInfoValidateMD(M, Message, Severity));
+}
+
+static void reportLoopError(Module &M, Twine Message,
+                            DiagnosticSeverity Severity = DS_Error) {
+  reportError(M, Twine("Invalid \"llvm.loop\" metadata: ") + Message, Severity);
+}
+
 enum class EntryPropsTag {
   ShaderFlags = 0,
   GSState,
@@ -314,25 +325,122 @@ static void translateBranchMetadata(Module &M, Instruction *BBTerminatorInst) {
   BBTerminatorInst->setMetadata("hlsl.controlflow.hint", nullptr);
 }
 
-static std::array<unsigned, 6> getCompatibleInstructionMDs(llvm::Module &M) {
+// Determines if the metadata node will be compatible with DXIL's loop metadata
+// representation.
+//
+// Reports an error for compatible metadata that is ill-formed.
+static bool isLoopMDCompatible(Module &M, Metadata *MD) {
+  // DXIL only accepts the following loop hints:
+  std::array<StringLiteral, 3> ValidHintNames = {"llvm.loop.unroll.count",
+                                                 "llvm.loop.unroll.disable",
+                                                 "llvm.loop.unroll.full"};
+
+  MDNode *HintMD = dyn_cast<MDNode>(MD);
+  if (!HintMD || HintMD->getNumOperands() == 0)
+    return false;
+
+  auto *HintStr = dyn_cast<MDString>(HintMD->getOperand(0));
+  if (!HintStr)
+    return false;
+
+  if (!llvm::is_contained(ValidHintNames, HintStr->getString()))
+    return false;
+
+  auto ValidCountNode = [](MDNode *CountMD) -> bool {
+    if (CountMD->getNumOperands() == 2)
+      if (auto *Count = dyn_cast<ConstantAsMetadata>(CountMD->getOperand(1)))
+        if (isa<ConstantInt>(Count->getValue()))
+          return true;
+    return false;
+  };
+
+  if (HintStr->getString() == "llvm.loop.unroll.count") {
+    if (!ValidCountNode(HintMD)) {
+      reportLoopError(M, "\"llvm.loop.unroll.count\" must have 2 operands and "
+                         "the second must be a constant integer");
+      return false;
+    }
+  } else if (HintMD->getNumOperands() != 1) {
+    reportLoopError(
+        M, "\"llvm.loop.unroll.disable\" and \"llvm.loop.unroll.full\" "
+           "must be provided as a single operand");
+    return false;
+  }
+
+  return true;
+}
+
+static void translateLoopMetadata(Module &M, Instruction *I, MDNode *BaseMD) {
+  // A distinct node has the self-referential form: !0 = !{ !0, ... }
+  auto IsDistinctNode = [](MDNode *Node) -> bool {
+    return Node && Node->getNumOperands() != 0 && Node == Node->getOperand(0);
+  };
+
+  // Set metadata to null to remove empty/ill-formed metadata from instruction
+  if (BaseMD->getNumOperands() == 0 || !IsDistinctNode(BaseMD))
+    return I->setMetadata("llvm.loop", nullptr);
+
+  // It is valid to have a chain of self-refential loop metadata nodes, as
+  // below. We will collapse these into just one when we reconstruct the
+  // metadata.
+  //
+  // Eg:
+  // !0 = !{!0, !1}
+  // !1 = !{!1, !2}
+  // !2 = !{!"llvm.loop.unroll.disable"}
+  //
+  // So, traverse down a potential self-referential chain
+  while (1 < BaseMD->getNumOperands() &&
+         IsDistinctNode(dyn_cast<MDNode>(BaseMD->getOperand(1))))
+    BaseMD = dyn_cast<MDNode>(BaseMD->getOperand(1));
+
+  // To reconstruct a distinct node we create a temporary node that we will
+  // then update to create a self-reference.
+  llvm::TempMDTuple TempNode = llvm::MDNode::getTemporary(M.getContext(), {});
+  SmallVector<Metadata *> CompatibleOperands = {TempNode.get()};
+
+  // Iterate and reconstruct the metadata nodes that contains any hints,
+  // stripping any unrecognized metadata.
+  ArrayRef<MDOperand> Operands = BaseMD->operands();
+  for (auto &Op : Operands.drop_front())
+    if (isLoopMDCompatible(M, Op.get()))
+      CompatibleOperands.push_back(Op.get());
+
+  if (2 < CompatibleOperands.size())
+    reportLoopError(M, "Provided conflicting hints");
+
+  MDNode *CompatibleLoopMD = MDNode::get(M.getContext(), CompatibleOperands);
+  TempNode->replaceAllUsesWith(CompatibleLoopMD);
+
+  I->setMetadata("llvm.loop", CompatibleLoopMD);
+}
+
+using InstructionMDList = std::array<unsigned, 7>;
+
+static InstructionMDList getCompatibleInstructionMDs(llvm::Module &M) {
   return {
       M.getMDKindID("dx.nonuniform"),    M.getMDKindID("dx.controlflow.hints"),
       M.getMDKindID("dx.precise"),       llvm::LLVMContext::MD_range,
-      llvm::LLVMContext::MD_alias_scope, llvm::LLVMContext::MD_noalias};
+      llvm::LLVMContext::MD_alias_scope, llvm::LLVMContext::MD_noalias,
+      M.getMDKindID("llvm.loop")};
 }
 
 static void translateInstructionMetadata(Module &M) {
   // construct allowlist of valid metadata node kinds
-  std::array<unsigned, 6> DXILCompatibleMDs = getCompatibleInstructionMDs(M);
+  InstructionMDList DXILCompatibleMDs = getCompatibleInstructionMDs(M);
+  unsigned char MDLoopKind = M.getContext().getMDKindID("llvm.loop");
 
   for (Function &F : M) {
     for (BasicBlock &BB : F) {
       // This needs to be done first so that "hlsl.controlflow.hints" isn't
-      // removed in the whitelist below
+      // removed in the allow-list below
       if (auto *I = BB.getTerminator())
         translateBranchMetadata(M, I);
 
       for (auto &I : make_early_inc_range(BB)) {
+        if (isa<BranchInst>(I))
+          if (MDNode *LoopMD = I.getMetadata(MDLoopKind))
+            translateLoopMetadata(M, &I, LoopMD);
         I.dropUnknownNonDebugMetadata(DXILCompatibleMDs);
       }
     }
@@ -364,6 +472,16 @@ static void cleanModuleFlags(Module &M) {
     M.addModuleFlag(Flag.Behavior, Flag.Key->getString(), Flag.Val);
 }
 
+using GlobalMDList = std::array<StringLiteral, 7>;
+
+// The following are compatible with DXIL but not emit with clang, they can
+// be added when applicable:
+// dx.typeAnnotations, dx.viewIDState, dx.dxrPayloadAnnotations
+static GlobalMDList CompatibleNamedModuleMDs = {
+    "llvm.ident",     "llvm.module.flags", "dx.resources",   "dx.valver",
+    "dx.shaderModel", "dx.version",        "dx.entryPoints",
+};
+
 static void translateGlobalMetadata(Module &M, DXILResourceMap &DRM,
                                     DXILResourceTypeMap &DRTM,
                                     const ModuleShaderFlags &ShaderFlags,
@@ -389,31 +507,23 @@ static void translateGlobalMetadata(Module &M, DXILResourceMap &DRM,
     uint64_t CombinedMask = ShaderFlags.getCombinedFlags();
     EntryFnMDNodes.emplace_back(
         emitTopLevelLibraryNode(M, ResourceMD, CombinedMask));
-  } else if (MMDI.EntryPropertyVec.size() > 1) {
-    M.getContext().diagnose(DiagnosticInfoTranslateMD(
-        M, "Non-library shader: One and only one entry expected"));
-  }
+  } else if (1 < MMDI.EntryPropertyVec.size())
+    reportError(M, "Non-library shader: One and only one entry expected");
 
   for (const EntryProperties &EntryProp : MMDI.EntryPropertyVec) {
-    const ComputedShaderFlags &EntrySFMask =
-        ShaderFlags.getFunctionFlags(EntryProp.Entry);
-
-    // If ShaderProfile is Library, mask is already consolidated in the
-    // top-level library node. Hence it is not emitted.
     uint64_t EntryShaderFlags = 0;
     if (MMDI.ShaderProfile != Triple::EnvironmentType::Library) {
-      EntryShaderFlags = EntrySFMask;
-      if (EntryProp.ShaderStage != MMDI.ShaderProfile) {
-        M.getContext().diagnose(DiagnosticInfoTranslateMD(
-            M,
-            "Shader stage '" +
-                Twine(getShortShaderStage(EntryProp.ShaderStage) +
-                      "' for entry '" + Twine(EntryProp.Entry->getName()) +
-                      "' different from specified target profile '" +
-                      Twine(Triple::getEnvironmentTypeName(MMDI.ShaderProfile) +
-                            "'"))));
-      }
+      EntryShaderFlags = ShaderFlags.getFunctionFlags(EntryProp.Entry);
+      if (EntryProp.ShaderStage != MMDI.ShaderProfile)
+        reportError(
+            M, "Shader stage '" +
+                   Twine(getShortShaderStage(EntryProp.ShaderStage)) +
+                   "' for entry '" + Twine(EntryProp.Entry->getName()) +
+                   "' different from specified target profile '" +
+                   Twine(Triple::getEnvironmentTypeName(MMDI.ShaderProfile) +
+                         "'"));
     }
+
     EntryFnMDNodes.emplace_back(emitEntryMD(EntryProp, Signatures, ResourceMD,
                                             EntryShaderFlags,
                                             MMDI.ShaderProfile));
@@ -426,19 +536,17 @@ static void translateGlobalMetadata(Module &M, DXILResourceMap &DRM,
 
   cleanModuleFlags(M);
 
-  // dx.rootsignatures will have been parsed from its metadata form as its
-  // binary form as part of the RootSignatureAnalysisWrapper, so safely
-  // remove it as it is not recognized in DXIL
-  if (NamedMDNode *RootSignature = M.getNamedMetadata("dx.rootsignatures"))
-    RootSignature->eraseFromParent();
+  // Finally, strip all module metadata that is not explicitly specified in the
+  // allow-list
+  SmallVector<NamedMDNode *> ToStrip;
 
-  // llvm.errno.tbaa was recently added but is not supported in LLVM 3.7 and
-  // causes all tests using the DXIL Validator to fail.
-  //
-  // This is a temporary fix and should be replaced with a allowlist once
-  // we have determined all metadata that the DXIL Validator allows
-  if (NamedMDNode *ErrNo = M.getNamedMetadata("llvm.errno.tbaa"))
-    ErrNo->eraseFromParent();
+  for (NamedMDNode &NamedMD : M.named_metadata())
+    if (!NamedMD.getName().starts_with("llvm.dbg.") &&
+        !llvm::is_contained(CompatibleNamedModuleMDs, NamedMD.getName()))
+      ToStrip.push_back(&NamedMD);
+
+  for (NamedMDNode *NamedMD : ToStrip)
+    NamedMD->eraseFromParent();
 }
 
 PreservedAnalyses DXILTranslateMetadata::run(Module &M,
@@ -454,45 +562,34 @@ PreservedAnalyses DXILTranslateMetadata::run(Module &M,
   return PreservedAnalyses::all();
 }
 
-namespace {
-class DXILTranslateMetadataLegacy : public ModulePass {
-public:
-  static char ID; // Pass identification, replacement for typeid
-  explicit DXILTranslateMetadataLegacy() : ModulePass(ID) {}
-
-  StringRef getPassName() const override { return "DXIL Translate Metadata"; }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<DXILResourceTypeWrapperPass>();
-    AU.addRequired<DXILResourceWrapperPass>();
-    AU.addRequired<ShaderFlagsAnalysisWrapper>();
-    AU.addRequired<DXILMetadataAnalysisWrapperPass>();
-    AU.addRequired<RootSignatureAnalysisWrapper>();
-
-    AU.addPreserved<DXILMetadataAnalysisWrapperPass>();
-    AU.addPreserved<DXILResourceBindingWrapperPass>();
-    AU.addPreserved<DXILResourceWrapperPass>();
-    AU.addPreserved<RootSignatureAnalysisWrapper>();
-    AU.addPreserved<ShaderFlagsAnalysisWrapper>();
-  }
+void DXILTranslateMetadataLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<DXILResourceTypeWrapperPass>();
+  AU.addRequired<DXILResourceWrapperPass>();
+  AU.addRequired<ShaderFlagsAnalysisWrapper>();
+  AU.addRequired<DXILMetadataAnalysisWrapperPass>();
+  AU.addRequired<RootSignatureAnalysisWrapper>();
+
+  AU.addPreserved<DXILMetadataAnalysisWrapperPass>();
+  AU.addPreserved<DXILResourceBindingWrapperPass>();
+  AU.addPreserved<DXILResourceWrapperPass>();
+  AU.addPreserved<RootSignatureAnalysisWrapper>();
+  AU.addPreserved<ShaderFlagsAnalysisWrapper>();
+}
 
-  bool runOnModule(Module &M) override {
-    DXILResourceMap &DRM =
-        getAnalysis<DXILResourceWrapperPass>().getResourceMap();
-    DXILResourceTypeMap &DRTM =
-        getAnalysis<DXILResourceTypeWrapperPass>().getResourceTypeMap();
-    const ModuleShaderFlags &ShaderFlags =
-        getAnalysis<ShaderFlagsAnalysisWrapper>().getShaderFlags();
-    dxil::ModuleMetadataInfo MMDI =
-        getAnalysis<DXILMetadataAnalysisWrapperPass>().getModuleMetadata();
-
-    translateGlobalMetadata(M, DRM, DRTM, ShaderFlags, MMDI);
-    translateInstructionMetadata(M);
-    return true;
-  }
-};
+bool DXILTranslateMetadataLegacy::runOnModule(Module &M) {
+  DXILResourceMap &DRM =
+      getAnalysis<DXILResourceWrapperPass>().getResourceMap();
+  DXILResourceTypeMap &DRTM =
+      getAnalysis<DXILResourceTypeWrapperPass>().getResourceTypeMap();
+  const ModuleShaderFlags &ShaderFlags =
+      getAnalysis<ShaderFlagsAnalysisWrapper>().getShaderFlags();
+  dxil::ModuleMetadataInfo MMDI =
+      getAnalysis<DXILMetadataAnalysisWrapperPass>().getModuleMetadata();
 
-} // namespace
+  translateGlobalMetadata(M, DRM, DRTM, ShaderFlags, MMDI);
+  translateInstructionMetadata(M);
+  return true;
+}
 
 char DXILTranslateMetadataLegacy::ID = 0;
 
diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.h b/llvm/lib/Target/DirectX/DXILTranslateMetadata.h
index 4c1ffac1781e6..cfb8aaa8f98b5 100644
--- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.h
+++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.h
@@ -10,6 +10,7 @@
 #define LLVM_TARGET_DIRECTX_DXILTRANSLATEMETADATA_H
 
 #include "llvm/IR/PassManager.h"
+#include "llvm/Pass.h"
 
 namespace llvm {
 
@@ -20,6 +21,22 @@ class DXILTranslateMetadata : public PassInfoMixin<DXILTranslateMetadata> {
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &);
 };
 
+/// Wrapper pass for the legacy pass manager.
+///
+/// This is required because the passes that will depend on this are codegen
+/// passes which run through the legacy pass manager.
+class DXILTranslateMetadataLegacy : public ModulePass {
+public:
+  static char ID; // Pass identification, replacement for typeid
+  explicit DXILTranslateMetadataLegacy() : ModulePass(ID) {}
+
+  StringRef getPassName() const override { return "DXIL Translate Metadata"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+  bool runOnModule(Module &M) override;
+};
+
 } // namespace llvm
 
 #endif // LLVM_TARGET_DIRECTX_DXILTRANSLATEMETADATA_H
diff --git a/llvm/lib/Target/DirectX/DirectXAsmPrinter.cpp b/llvm/lib/Target/DirectX/DirectXAsmPrinter.cpp
index 15def3637c5a7..b6bbb201f5c5d 100644
--- a/llvm/lib/Target/DirectX/DirectXAsmPrinter.cpp
+++ b/llvm/lib/Target/DirectX/DirectXAsmPrinter.cpp
@@ -52,6 +52,7 @@ void DXILAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
   emitGlobalConstant(GV->getDataLayout(), GV->getInitializer());
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXAsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeDirectXAsmPrinter() {
   RegisterAsmPrinter<DXILAsmPrinter> X(getTheDirectXTarget());
 }
diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
index bcf84403b2c0d..84b1a313df2ea 100644
--- a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
+++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
@@ -53,7 +53,8 @@
 
 using namespace llvm;
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTarget() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeDirectXTarget() {
   RegisterTargetMachine<DirectXTargetMachine> X(getTheDirectXTarget());
   auto *PR = PassRegistry::getPassRegistry();
   initializeDXILIntrinsicExpansionLegacyPass(*PR);
diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
index 68fd3e0bc74c7..60dfd9650937c 100644
--- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
@@ -55,8 +55,10 @@ bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable(
   case Intrinsic::dx_splitdouble:
   case Intrinsic::dx_wave_readlane:
   case Intrinsic::dx_wave_reduce_max:
+  case Intrinsic::dx_wave_reduce_min:
   case Intrinsic::dx_wave_reduce_sum:
   case Intrinsic::dx_wave_reduce_umax:
+  case Intrinsic::dx_wave_reduce_umin:
   case Intrinsic::dx_wave_reduce_usum:
   case Intrinsic::dx_imad:
   case Intrinsic::dx_umad:
diff --git a/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp b/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp
index 9a14c01f62ae7..62ad014f3739f 100644
--- a/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp
+++ b/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp
@@ -132,7 +132,8 @@ static MCRegisterInfo *createDirectXMCRegisterInfo(const Triple &Triple) {
 
 static MCInstrInfo *createDirectXMCInstrInfo() { return new MCInstrInfo(); }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeDirectXTargetMC() {
   Target &T = getTheDirectXTarget();
   RegisterMCAsmInfo<DirectXMCAsmInfo> X(T);
   TargetRegistry::RegisterMCInstrInfo(T, createDirectXMCInstrInfo);
diff --git a/llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.cpp b/llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.cpp
index ae01626e5229d..934bd1b0e8adb 100644
--- a/llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.cpp
+++ b/llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.cpp
@@ -24,7 +24,8 @@ Target &getTheDirectXTarget() {
 
 using namespace llvm;
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeDirectXTargetInfo() {
   RegisterTarget<Triple::dxil, /*HasJIT=*/false> X(
       getTheDirectXTarget(), "dxil", "DirectX Intermediate Language", "DXIL");
 }
diff --git a/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp b/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp
index 3b810d0b65fab..79863e1c3cb74 100644
--- a/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp
@@ -34,7 +34,7 @@ class HexagonCopyHoisting : public MachineFunctionPass {
 
 public:
   static char ID;
-  HexagonCopyHoisting() : MachineFunctionPass(ID), MFN(nullptr), MRI(nullptr) {}
+  HexagonCopyHoisting() : MachineFunctionPass(ID) {}
 
   StringRef getPassName() const override { return "Hexagon Copy Hoisting"; }
 
@@ -56,8 +56,8 @@ class HexagonCopyHoisting : public MachineFunctionPass {
   void moveCopyInstr(MachineBasicBlock *DestBB,
                      std::pair<Register, Register> Key, MachineInstr *MI);
 
-  MachineFunction *MFN;
-  MachineRegisterInfo *MRI;
+  MachineFunction *MFN = nullptr;
+  MachineRegisterInfo *MRI = nullptr;
   std::vector<DenseMap<std::pair<Register, Register>, MachineInstr *>>
       CopyMIList;
 };
diff --git a/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td b/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td
index f4e36fa7dc767..e661c94690729 100644
--- a/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td
+++ b/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td
@@ -26,6 +26,7 @@ def tc_20a4bbec : InstrItinClass;
 def tc_227864f7 : InstrItinClass;
 def tc_257f6f7c : InstrItinClass;
 def tc_26a377fe : InstrItinClass;
+def tc_2a698a03 : InstrItinClass;
 def tc_2b4c548e : InstrItinClass;
 def tc_2c745bb8 : InstrItinClass;
 def tc_2d4051cd : InstrItinClass;
@@ -52,6 +53,7 @@ def tc_561aaa58 : InstrItinClass;
 def tc_56c4f9fe : InstrItinClass;
 def tc_56e64202 : InstrItinClass;
 def tc_58d21193 : InstrItinClass;
+def tc_57a4709c : InstrItinClass;
 def tc_5bf8afbb : InstrItinClass;
 def tc_5cdf8c84 : InstrItinClass;
 def tc_61bf7c03 : InstrItinClass;
@@ -220,6 +222,11 @@ class DepHVXItinV55 {
        InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
       [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
 
+    InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5],
+      [HVX_FWD, HVX_FWD]>,
+
     InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/
       [InstrStage<1, [SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
@@ -356,6 +363,11 @@ class DepHVXItinV55 {
        InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7],
       [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
 
+    InstrItinData <tc_57a4709c, /*SLOT0123,VA*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
     InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_XLANE]>], [9, 2],
@@ -812,6 +824,11 @@ class DepHVXItinV60 {
        InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
       [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
 
+    InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5],
+      [HVX_FWD, HVX_FWD]>,
+
     InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/
       [InstrStage<1, [SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
@@ -948,6 +965,11 @@ class DepHVXItinV60 {
        InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7],
       [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
 
+    InstrItinData <tc_57a4709c, /*SLOT0123,VA*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
     InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_XLANE]>], [9, 2],
@@ -1404,6 +1426,11 @@ class DepHVXItinV62 {
        InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
       [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
 
+    InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5],
+      [HVX_FWD, HVX_FWD]>,
+
     InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/
       [InstrStage<1, [SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
@@ -1540,6 +1567,11 @@ class DepHVXItinV62 {
        InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7],
       [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
 
+    InstrItinData <tc_57a4709c, /*SLOT0123,VA*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
     InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_XLANE]>], [9, 2],
@@ -1996,6 +2028,11 @@ class DepHVXItinV65 {
        InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
       [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
 
+    InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5],
+      [HVX_FWD, HVX_FWD]>,
+
     InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/
       [InstrStage<1, [SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
@@ -2132,6 +2169,11 @@ class DepHVXItinV65 {
        InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7],
       [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
 
+    InstrItinData <tc_57a4709c, /*SLOT0123,VA*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
     InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_XLANE]>], [9, 2],
@@ -2588,6 +2630,11 @@ class DepHVXItinV66 {
        InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
       [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
 
+    InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5],
+      [HVX_FWD, HVX_FWD]>,
+
     InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/
       [InstrStage<1, [SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
@@ -2724,6 +2771,11 @@ class DepHVXItinV66 {
        InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7],
       [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
 
+    InstrItinData <tc_57a4709c, /*SLOT0123,VA*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
     InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_XLANE]>], [9, 2],
@@ -3180,6 +3232,11 @@ class DepHVXItinV67 {
        InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
       [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
 
+    InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5],
+      [HVX_FWD, HVX_FWD]>,
+
     InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/
       [InstrStage<1, [SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
@@ -3316,6 +3373,11 @@ class DepHVXItinV67 {
        InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7],
       [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
 
+    InstrItinData <tc_57a4709c, /*SLOT0123,VA*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
     InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_XLANE]>], [9, 2],
@@ -3772,6 +3834,11 @@ class DepHVXItinV68 {
        InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
       [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
 
+    InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5],
+      [HVX_FWD, HVX_FWD]>,
+
     InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/
       [InstrStage<1, [SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
@@ -3908,6 +3975,11 @@ class DepHVXItinV68 {
        InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7],
       [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
 
+    InstrItinData <tc_57a4709c, /*SLOT0123,VA*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
     InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_XLANE]>], [9, 2],
@@ -4364,6 +4436,11 @@ class DepHVXItinV69 {
        InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
       [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
 
+    InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5],
+      [HVX_FWD, HVX_FWD]>,
+
     InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/
       [InstrStage<1, [SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
@@ -4500,6 +4577,11 @@ class DepHVXItinV69 {
        InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7],
       [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
 
+    InstrItinData <tc_57a4709c, /*SLOT0123,VA*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
     InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_XLANE]>], [9, 2],
@@ -4956,6 +5038,11 @@ class DepHVXItinV71 {
        InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
       [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
 
+    InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5],
+      [HVX_FWD, HVX_FWD]>,
+
     InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/
       [InstrStage<1, [SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
@@ -5092,6 +5179,11 @@ class DepHVXItinV71 {
        InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7],
       [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
 
+    InstrItinData <tc_57a4709c, /*SLOT0123,VA*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
     InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_XLANE]>], [9, 2],
@@ -5548,6 +5640,11 @@ class DepHVXItinV73 {
        InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
       [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
 
+    InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5],
+      [HVX_FWD, HVX_FWD]>,
+
     InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/
       [InstrStage<1, [SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
@@ -5684,6 +5781,11 @@ class DepHVXItinV73 {
        InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7],
       [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
 
+    InstrItinData <tc_57a4709c, /*SLOT0123,VA*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
     InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_XLANE]>], [9, 2],
@@ -6140,6 +6242,11 @@ class DepHVXItinV75 {
        InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
       [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
 
+    InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5],
+      [HVX_FWD, HVX_FWD]>,
+
     InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/
       [InstrStage<1, [SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
@@ -6276,6 +6383,11 @@ class DepHVXItinV75 {
        InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7],
       [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
 
+    InstrItinData <tc_57a4709c, /*SLOT0123,VA*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
     InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_XLANE]>], [9, 2],
@@ -6732,6 +6844,11 @@ class DepHVXItinV79 {
        InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
       [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
 
+    InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5],
+      [HVX_FWD, HVX_FWD]>,
+
     InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/
       [InstrStage<1, [SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
@@ -6868,6 +6985,11 @@ class DepHVXItinV79 {
        InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7],
       [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
 
+    InstrItinData <tc_57a4709c, /*SLOT0123,VA*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
     InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_XLANE]>], [9, 2],
@@ -7324,6 +7446,11 @@ class DepHVXItinV81 {
        InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
       [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
 
+    InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5],
+      [HVX_FWD, HVX_FWD]>,
+
     InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/
       [InstrStage<1, [SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
@@ -7460,6 +7587,11 @@ class DepHVXItinV81 {
        InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7],
       [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
 
+    InstrItinData <tc_57a4709c, /*SLOT0123,VA*/
+      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2],
+      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
     InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/
       [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
        InstrStage<1, [CVI_XLANE]>], [9, 2],
diff --git a/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td b/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td
index f8f1c2ad07b75..b188134d60d39 100644
--- a/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td
+++ b/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td
@@ -29939,6 +29939,58 @@ let opNewValue = 0;
 let isCVI = 1;
 let DecoderNamespace = "EXT_mmvec";
 }
+def V6_vabs_qf16_hf : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32),
+"$Vd32.qf16 = vabs($Vu32.hf)",
+tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000001110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabs_qf16_qf16 : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32),
+"$Vd32.qf16 = vabs($Vu32.qf16)",
+tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000001110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabs_qf32_qf32 : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32),
+"$Vd32.qf32 = vabs($Vu32.qf32)",
+tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000001110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabs_qf32_sf : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32),
+"$Vd32.qf32 = vabs($Vu32.sf)",
+tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000001110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
 def V6_vabs_sf : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32),
@@ -31302,6 +31354,21 @@ let isPseudo = 1;
 let isCodeGenOnly = 1;
 let DecoderNamespace = "EXT_mmvec";
 }
+def V6_valign4 : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32 = valign4($Vu32,$Vv32,$Rt8)",
+tc_57a4709c, TypeCVI_VA>, Enc_a30110, Requires<[UseHVXV81]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let isHVXALU = 1;
+let isHVXALU2SRC = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
 def V6_valignb : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
@@ -32583,6 +32650,32 @@ let isCVI = 1;
 let hasHvxTmp = 1;
 let DecoderNamespace = "EXT_mmvec";
 }
+def V6_vconv_bf_qf32 : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxWR:$Vuu32),
+"$Vd32.bf = $Vuu32.qf32",
+tc_2a698a03, TypeCVI_VS>, Enc_a33d04, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000000110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vconv_f8_qf16 : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32),
+"$Vd32.f8 = $Vu32.qf16",
+tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
 def V6_vconv_h_hf : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32),
@@ -32596,6 +32689,19 @@ let opNewValue = 0;
 let isCVI = 1;
 let DecoderNamespace = "EXT_mmvec";
 }
+def V6_vconv_h_hf_rnd : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32),
+"$Vd32.h = $Vu32.hf:rnd",
+tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000000110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
 def V6_vconv_hf_h : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32),
@@ -32635,6 +32741,71 @@ let opNewValue = 0;
 let isCVI = 1;
 let DecoderNamespace = "EXT_mmvec";
 }
+def V6_vconv_qf16_f8 : HInst<
+(outs HvxWR:$Vdd32),
+(ins HvxVR:$Vu32),
+"$Vdd32.qf16 = $Vu32.f8",
+tc_04da405a, TypeCVI_VP_VS>, Enc_dd766a, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vconv_qf16_hf : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32),
+"$Vd32.qf16 = $Vu32.hf",
+tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vconv_qf16_qf16 : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32),
+"$Vd32.qf16 = $Vu32.qf16",
+tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vconv_qf32_qf32 : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32),
+"$Vd32.qf32 = $Vu32.qf32",
+tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vconv_qf32_sf : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32),
+"$Vd32.qf32 = $Vu32.sf",
+tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
 def V6_vconv_sf_qf32 : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32),
@@ -33720,6 +33891,122 @@ let isHVXALU2SRC = 1;
 let DecoderNamespace = "EXT_mmvec";
 let Constraints = "$Qx4 = $Qx4in";
 }
+def V6_veqhf : HInst<
+(outs HvxQR:$Qd4),
+(ins HvxVR:$Vu32, HvxVR:$Vv32),
+"$Qd4 = vcmp.eq($Vu32.hf,$Vv32.hf)",
+tc_56c4f9fe, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-2} = 0b000111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let isHVXALU = 1;
+let isHVXALU2SRC = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_veqhf_and : HInst<
+(outs HvxQR:$Qx4),
+(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
+"$Qx4 &= vcmp.eq($Vu32.hf,$Vv32.hf)",
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-2} = 0b000111;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let isCVI = 1;
+let isHVXALU = 1;
+let isHVXALU2SRC = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqhf_or : HInst<
+(outs HvxQR:$Qx4),
+(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
+"$Qx4 |= vcmp.eq($Vu32.hf,$Vv32.hf)",
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-2} = 0b010111;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let isAccumulator = 1;
+let isCVI = 1;
+let isHVXALU = 1;
+let isHVXALU2SRC = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqhf_xor : HInst<
+(outs HvxQR:$Qx4),
+(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
+"$Qx4 ^= vcmp.eq($Vu32.hf,$Vv32.hf)",
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-2} = 0b100111;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let isCVI = 1;
+let isHVXALU = 1;
+let isHVXALU2SRC = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqsf : HInst<
+(outs HvxQR:$Qd4),
+(ins HvxVR:$Vu32, HvxVR:$Vv32),
+"$Qd4 = vcmp.eq($Vu32.sf,$Vv32.sf)",
+tc_56c4f9fe, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-2} = 0b000011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let isHVXALU = 1;
+let isHVXALU2SRC = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_veqsf_and : HInst<
+(outs HvxQR:$Qx4),
+(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
+"$Qx4 &= vcmp.eq($Vu32.sf,$Vv32.sf)",
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-2} = 0b000011;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let isCVI = 1;
+let isHVXALU = 1;
+let isHVXALU2SRC = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqsf_or : HInst<
+(outs HvxQR:$Qx4),
+(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
+"$Qx4 |= vcmp.eq($Vu32.sf,$Vv32.sf)",
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-2} = 0b010011;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let isAccumulator = 1;
+let isCVI = 1;
+let isHVXALU = 1;
+let isHVXALU2SRC = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqsf_xor : HInst<
+(outs HvxQR:$Qx4),
+(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
+"$Qx4 ^= vcmp.eq($Vu32.sf,$Vv32.sf)",
+tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-2} = 0b100011;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let isCVI = 1;
+let isHVXALU = 1;
+let isHVXALU2SRC = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
 def V6_veqw : HInst<
 (outs HvxQR:$Qd4),
 (ins HvxVR:$Vu32, HvxVR:$Vv32),
@@ -34538,6 +34825,58 @@ let Inst{31-24} = 0b00011110;
 let isCVI = 1;
 let DecoderNamespace = "EXT_mmvec";
 }
+def V6_vilog2_hf : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32),
+"$Vd32.w = vilog2($Vu32.hf)",
+tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vilog2_qf16 : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32),
+"$Vd32.w = vilog2($Vu32.qf16)",
+tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vilog2_qf32 : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32),
+"$Vd32.w = vilog2($Vu32.qf32)",
+tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vilog2_sf : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32),
+"$Vd32.w = vilog2($Vu32.sf)",
+tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
 def V6_vinsertwr : HInst<
 (outs HvxVR:$Vx32),
 (ins HvxVR:$Vx32in, IntRegs:$Rt32),
@@ -37170,6 +37509,58 @@ let isCVI = 1;
 let isHVXALU = 1;
 let DecoderNamespace = "EXT_mmvec";
 }
+def V6_vneg_qf16_hf : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32),
+"$Vd32.qf16 = vneg($Vu32.hf)",
+tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000001110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vneg_qf16_qf16 : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32),
+"$Vd32.qf16 = vneg($Vu32.qf16)",
+tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000001110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vneg_qf32_qf32 : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32),
+"$Vd32.qf32 = vneg($Vu32.qf32)",
+tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000001110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vneg_qf32_sf : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32),
+"$Vd32.qf32 = vneg($Vu32.sf)",
+tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000001110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
 def V6_vnormamth : HInst<
 (outs HvxVR:$Vd32),
 (ins HvxVR:$Vu32),
diff --git a/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td b/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td
index 23f4b3aef7d10..c11483b961cc3 100644
--- a/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td
+++ b/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td
@@ -3830,6 +3830,122 @@ def: Pat<(int_hexagon_V6_vsub_hf_f8_128B HvxVR:$src1, HvxVR:$src2),
 
 // V81 HVX Instructions.
 
+def: Pat<(int_hexagon_V6_vabs_qf16_hf HvxVR:$src1),
+         (V6_vabs_qf16_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vabs_qf16_hf_128B HvxVR:$src1),
+         (V6_vabs_qf16_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vabs_qf16_qf16 HvxVR:$src1),
+         (V6_vabs_qf16_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vabs_qf16_qf16_128B HvxVR:$src1),
+         (V6_vabs_qf16_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vabs_qf32_qf32 HvxVR:$src1),
+         (V6_vabs_qf32_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vabs_qf32_qf32_128B HvxVR:$src1),
+         (V6_vabs_qf32_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vabs_qf32_sf HvxVR:$src1),
+         (V6_vabs_qf32_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vabs_qf32_sf_128B HvxVR:$src1),
+         (V6_vabs_qf32_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_valign4 HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_valign4 HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[UseHVXV81, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_valign4_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+         (V6_valign4 HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[UseHVXV81, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vconv_bf_qf32 HvxWR:$src1),
+         (V6_vconv_bf_qf32 HvxWR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vconv_bf_qf32_128B HvxWR:$src1),
+         (V6_vconv_bf_qf32 HvxWR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vconv_f8_qf16 HvxVR:$src1),
+         (V6_vconv_f8_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vconv_f8_qf16_128B HvxVR:$src1),
+         (V6_vconv_f8_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vconv_h_hf_rnd HvxVR:$src1),
+         (V6_vconv_h_hf_rnd HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vconv_h_hf_rnd_128B HvxVR:$src1),
+         (V6_vconv_h_hf_rnd HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vconv_qf16_f8 HvxVR:$src1),
+         (V6_vconv_qf16_f8 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vconv_qf16_f8_128B HvxVR:$src1),
+         (V6_vconv_qf16_f8 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vconv_qf16_hf HvxVR:$src1),
+         (V6_vconv_qf16_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vconv_qf16_hf_128B HvxVR:$src1),
+         (V6_vconv_qf16_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vconv_qf16_qf16 HvxVR:$src1),
+         (V6_vconv_qf16_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vconv_qf16_qf16_128B HvxVR:$src1),
+         (V6_vconv_qf16_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vconv_qf32_qf32 HvxVR:$src1),
+         (V6_vconv_qf32_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vconv_qf32_qf32_128B HvxVR:$src1),
+         (V6_vconv_qf32_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vconv_qf32_sf HvxVR:$src1),
+         (V6_vconv_qf32_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vconv_qf32_sf_128B HvxVR:$src1),
+         (V6_vconv_qf32_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_veqhf HvxVR:$src1, HvxVR:$src2),
+         (V6_veqhf HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_veqhf_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_veqhf HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_veqhf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_veqhf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_veqhf_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_veqhf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_veqhf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_veqhf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_veqhf_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_veqhf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_veqhf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_veqhf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_veqhf_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_veqhf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_veqsf HvxVR:$src1, HvxVR:$src2),
+         (V6_veqsf HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_veqsf_128B HvxVR:$src1, HvxVR:$src2),
+         (V6_veqsf HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_veqsf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_veqsf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_veqsf_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_veqsf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_veqsf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_veqsf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_veqsf_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_veqsf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_veqsf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_veqsf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_veqsf_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+         (V6_veqsf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vilog2_hf HvxVR:$src1),
+         (V6_vilog2_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vilog2_hf_128B HvxVR:$src1),
+         (V6_vilog2_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vilog2_qf16 HvxVR:$src1),
+         (V6_vilog2_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vilog2_qf16_128B HvxVR:$src1),
+         (V6_vilog2_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vilog2_qf32 HvxVR:$src1),
+         (V6_vilog2_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vilog2_qf32_128B HvxVR:$src1),
+         (V6_vilog2_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vilog2_sf HvxVR:$src1),
+         (V6_vilog2_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vilog2_sf_128B HvxVR:$src1),
+         (V6_vilog2_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vneg_qf16_hf HvxVR:$src1),
+         (V6_vneg_qf16_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vneg_qf16_hf_128B HvxVR:$src1),
+         (V6_vneg_qf16_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vneg_qf16_qf16 HvxVR:$src1),
+         (V6_vneg_qf16_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vneg_qf16_qf16_128B HvxVR:$src1),
+         (V6_vneg_qf16_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vneg_qf32_qf32 HvxVR:$src1),
+         (V6_vneg_qf32_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vneg_qf32_qf32_128B HvxVR:$src1),
+         (V6_vneg_qf32_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vneg_qf32_sf HvxVR:$src1),
+         (V6_vneg_qf32_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vneg_qf32_sf_128B HvxVR:$src1),
+         (V6_vneg_qf32_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
 def: Pat<(int_hexagon_V6_vsub_hf_mix HvxVR:$src1, HvxVR:$src2),
          (V6_vsub_hf_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
 def: Pat<(int_hexagon_V6_vsub_hf_mix_128B HvxVR:$src1, HvxVR:$src2),
diff --git a/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp b/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp
index 93418f7e15e8d..a10c93704a85b 100644
--- a/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp
@@ -34,13 +34,13 @@ STATISTIC(HexagonNumStoreAbsConversions,
 namespace {
 
 class HexagonGenMemAbsolute : public MachineFunctionPass {
-  const HexagonInstrInfo *TII;
-  MachineRegisterInfo *MRI;
-  const TargetRegisterInfo *TRI;
+  const HexagonInstrInfo *TII = nullptr;
+  MachineRegisterInfo *MRI = nullptr;
+  const TargetRegisterInfo *TRI = nullptr;
 
 public:
   static char ID;
-  HexagonGenMemAbsolute() : MachineFunctionPass(ID), TII(0), MRI(0), TRI(0) {}
+  HexagonGenMemAbsolute() : MachineFunctionPass(ID) {}
 
   StringRef getPassName() const override {
     return "Hexagon Generate Load/Store Set Absolute Address Instruction";
diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index 7ee280d8fc8b0..eadf02043841e 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -1815,7 +1815,7 @@ struct WeightedLeaf {
   int Weight;
   int InsertionOrder;
 
-  WeightedLeaf() {}
+  WeightedLeaf() = default;
 
   WeightedLeaf(SDValue Value, int Weight, int InsertionOrder) :
     Value(Value), Weight(Weight), InsertionOrder(InsertionOrder) {
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
index 54c89721bc1f0..0573f64084d6f 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
@@ -1061,8 +1061,11 @@ HexagonTargetLowering::createHvxPrefixPred(SDValue PredV, const SDLoc &dl,
   SDValue W0 = isUndef(PredV)
                   ? DAG.getUNDEF(MVT::i64)
                   : DAG.getNode(HexagonISD::P2D, dl, MVT::i64, PredV);
-  Words[IdxW].push_back(HiHalf(W0, DAG));
-  Words[IdxW].push_back(LoHalf(W0, DAG));
+  if (Bytes < BitBytes) {
+    Words[IdxW].push_back(HiHalf(W0, DAG));
+    Words[IdxW].push_back(LoHalf(W0, DAG));
+  } else
+    Words[IdxW].push_back(W0);
 
   while (Bytes < BitBytes) {
     IdxW ^= 1;
@@ -1083,7 +1086,26 @@ HexagonTargetLowering::createHvxPrefixPred(SDValue PredV, const SDLoc &dl,
     Bytes *= 2;
   }
 
+  while (Bytes > BitBytes) {
+    IdxW ^= 1;
+    Words[IdxW].clear();
+
+    if (Bytes <= 4) {
+      for (const SDValue &W : Words[IdxW ^ 1]) {
+        SDValue T = contractPredicate(W, dl, DAG);
+        Words[IdxW].push_back(T);
+      }
+    } else {
+      for (const SDValue &W : Words[IdxW ^ 1]) {
+        Words[IdxW].push_back(W);
+      }
+    }
+    Bytes /= 2;
+  }
+
   assert(Bytes == BitBytes);
+  if (BitBytes == 1 && PredTy == MVT::v2i1)
+    ByteTy = MVT::getVectorVT(MVT::i16, HwLen);
 
   SDValue Vec = ZeroFill ? getZero(dl, ByteTy, DAG) : DAG.getUNDEF(ByteTy);
   SDValue S4 = DAG.getConstant(HwLen-4, dl, MVT::i32);
@@ -3157,6 +3179,9 @@ SDValue
 HexagonTargetLowering::SplitHvxMemOp(SDValue Op, SelectionDAG &DAG) const {
   auto *MemN = cast<MemSDNode>(Op.getNode());
 
+  if (!MemN->getMemoryVT().isSimple())
+    return Op;
+
   MVT MemTy = MemN->getMemoryVT().getSimpleVT();
   if (!isHvxPairTy(MemTy))
     return Op;
diff --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td
index 85ce9447c2028..e40dbd251b5b7 100644
--- a/llvm/lib/Target/Hexagon/HexagonPatterns.td
+++ b/llvm/lib/Target/Hexagon/HexagonPatterns.td
@@ -3434,6 +3434,19 @@ let AddedComplexity = 100 in {
            (C2_not (S4_stored_locked I32:$Rs, I64:$Rt))>;
 }
 
+multiclass FloatClass<SDPatternOperator IntOp, InstHexagon MI,
+                      PatFrag RegPred> {
+  let AddedComplexity = 100 in {
+    def: Pat<(i1 (seteq (IntOp RegPred:$Rs, u5_0ImmPred_timm:$u5), 0)),
+             (C2_not (MI RegPred:$Rs, u5_0ImmPred_timm:$u5))>;
+    def: Pat<(i1 (setne (IntOp RegPred:$Rs, u5_0ImmPred_timm:$u5), 0)),
+             (MI RegPred:$Rs, u5_0ImmPred_timm:$u5)>;
+  }
+}
+
+defm : FloatClass<int_hexagon_F2_sfclass, F2_sfclass, F32>;
+defm : FloatClass<int_hexagon_F2_dfclass, F2_dfclass, F64>;
+
 def: Pat<(int_hexagon_instrprof_custom (HexagonAtPcrel tglobaladdr:$addr), u32_0ImmPred:$I),
          (PS_call_instrprof_custom tglobaladdr:$addr, imm:$I)>;
 
diff --git a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td
index 1637b91f1fa12..d19920cfc9ea0 100644
--- a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td
+++ b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td
@@ -612,6 +612,9 @@ let Predicates = [UseHVX] in {
            (V6_vandvrt HvxVR:$Vs, (ToI32 0x01010101))>;
   def: Pat<(VecQ32 (trunc HVI32:$Vs)),
            (V6_vandvrt HvxVR:$Vs, (ToI32 0x01010101))>;
+  def: Pat<(VecQ16 (trunc HWI32:$Vss)),
+           (Combineq(VecQ32(V6_vandvrt (HiVec $Vss), (ToI32 0x01010101))),
+           (VecQ32 (V6_vandvrt (LoVec $Vss), (ToI32 0x01010101))))>;
 }
 
 let Predicates = [UseHVX] in {
diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
index b9cdd6a2a3767..ce2de752f3b3a 100644
--- a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -544,7 +544,7 @@ int HexagonSubtarget::updateLatency(MachineInstr &SrcInst,
   if (!hasV60Ops())
     return Latency;
 
-  auto &QII = static_cast<const HexagonInstrInfo &>(*getInstrInfo());
+  const HexagonInstrInfo &QII = *getInstrInfo();
   // BSB scheduling.
   if (QII.isHVXVec(SrcInst) || useBSBScheduling())
     Latency = (Latency + 1) >> 1;
diff --git a/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp b/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp
index 71bdfc6657c57..5a85f348fdaf7 100644
--- a/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp
@@ -43,7 +43,7 @@ namespace {
 class HexagonTfrCleanup : public MachineFunctionPass {
 public:
   static char ID;
-  HexagonTfrCleanup() : MachineFunctionPass(ID), HII(0), TRI(0) {}
+  HexagonTfrCleanup() : MachineFunctionPass(ID) {}
   StringRef getPassName() const override { return "Hexagon TFR Cleanup"; }
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesAll();
@@ -52,8 +52,8 @@ class HexagonTfrCleanup : public MachineFunctionPass {
   bool runOnMachineFunction(MachineFunction &MF) override;
 
 private:
-  const HexagonInstrInfo *HII;
-  const TargetRegisterInfo *TRI;
+  const HexagonInstrInfo *HII = nullptr;
+  const TargetRegisterInfo *TRI = nullptr;
 
   typedef DenseMap<unsigned, uint64_t> ImmediateMap;
 
diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
index 690dd73014e57..e86b21cf849cb 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
@@ -365,6 +365,7 @@ def : Pat<(f32 (uint_to_fp (i64 (sexti32 (i64 GPR:$src))))),
 // FP Rounding
 let Predicates = [HasBasicF, IsLA64] in {
 def : PatFpr<frint, FRINT_S, FPR32>;
+def : PatFpr<flog2, FLOGB_S, FPR32>;
 } // Predicates = [HasBasicF, IsLA64]
 
 let Predicates = [HasBasicF, IsLA32] in {
diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
index daefbaa52d42a..2e88254aab4d5 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
@@ -348,6 +348,7 @@ def : Pat<(bitconvert FPR64:$src), (MOVFR2GR_D FPR64:$src)>;
 // FP Rounding
 let Predicates = [HasBasicD, IsLA64] in {
 def : PatFpr<frint, FRINT_D, FPR64>;
+def : PatFpr<flog2, FLOGB_D, FPR64>;
 } // Predicates = [HasBasicD, IsLA64]
 
 /// Pseudo-instructions needed for the soft-float ABI with LA32D
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 80c96c6dc8eb6..fe700e17d341b 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -244,8 +244,10 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FP_TO_BF16, MVT::f32,
                        Subtarget.isSoftFPABI() ? LibCall : Custom);
 
-    if (Subtarget.is64Bit())
+    if (Subtarget.is64Bit()) {
       setOperationAction(ISD::FRINT, MVT::f32, Legal);
+      setOperationAction(ISD::FLOG2, MVT::f32, Legal);
+    }
 
     if (!Subtarget.hasBasicD()) {
       setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
@@ -291,8 +293,10 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FP_TO_BF16, MVT::f64,
                        Subtarget.isSoftFPABI() ? LibCall : Custom);
 
-    if (Subtarget.is64Bit())
+    if (Subtarget.is64Bit()) {
       setOperationAction(ISD::FRINT, MVT::f64, Legal);
+      setOperationAction(ISD::FLOG2, MVT::f64, Legal);
+    }
   }
 
   // Set operations for 'LSX' feature.
@@ -362,10 +366,17 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::FMA, VT, Legal);
       setOperationAction(ISD::FSQRT, VT, Legal);
       setOperationAction(ISD::FNEG, VT, Legal);
+      setOperationAction(ISD::FLOG2, VT, Legal);
       setCondCodeAction({ISD::SETGE, ISD::SETGT, ISD::SETOGE, ISD::SETOGT,
                          ISD::SETUGE, ISD::SETUGT},
                         VT, Expand);
       setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);
+      setOperationAction(ISD::FCEIL, VT, Legal);
+      setOperationAction(ISD::FFLOOR, VT, Legal);
+      setOperationAction(ISD::FTRUNC, VT, Legal);
+      setOperationAction(ISD::FROUNDEVEN, VT, Legal);
+      setOperationAction(ISD::FMINNUM, VT, Legal);
+      setOperationAction(ISD::FMAXNUM, VT, Legal);
     }
     setOperationAction(ISD::CTPOP, GRLenVT, Legal);
     setOperationAction(ISD::FCEIL, {MVT::f32, MVT::f64}, Legal);
@@ -443,10 +454,17 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::FMA, VT, Legal);
       setOperationAction(ISD::FSQRT, VT, Legal);
       setOperationAction(ISD::FNEG, VT, Legal);
+      setOperationAction(ISD::FLOG2, VT, Legal);
       setCondCodeAction({ISD::SETGE, ISD::SETGT, ISD::SETOGE, ISD::SETOGT,
                          ISD::SETUGE, ISD::SETUGT},
                         VT, Expand);
       setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);
+      setOperationAction(ISD::FCEIL, VT, Legal);
+      setOperationAction(ISD::FFLOOR, VT, Legal);
+      setOperationAction(ISD::FTRUNC, VT, Legal);
+      setOperationAction(ISD::FROUNDEVEN, VT, Legal);
+      setOperationAction(ISD::FMINNUM, VT, Legal);
+      setOperationAction(ISD::FMAXNUM, VT, Legal);
     }
   }
 
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index 613dea6093f5f..b502b056c4cdf 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -1558,6 +1558,10 @@ defm : PatXrXrF<fmul, "XVFMUL">;
 // XVFDIV_{S/D}
 defm : PatXrXrF<fdiv, "XVFDIV">;
 
+// XVFMAX_{S/D}, XVFMIN_{S/D}
+defm : PatXrXrF<fmaxnum, "XVFMAX">;
+defm : PatXrXrF<fminnum, "XVFMIN">;
+
 // XVFMADD_{S/D}
 def : Pat<(fma v8f32:$xj, v8f32:$xk, v8f32:$xa),
           (XVFMADD_S v8f32:$xj, v8f32:$xk, v8f32:$xa)>;
@@ -1593,6 +1597,9 @@ def : Pat<(fma_nsz (fneg v4f64:$xj), v4f64:$xk, v4f64:$xa),
 // XVFSQRT_{S/D}
 defm : PatXrF<fsqrt, "XVFSQRT">;
 
+// XVFLOGB_{S/D}
+defm : PatXrF<flog2, "XVFLOGB">;
+
 // XVRECIP_{S/D}
 def : Pat<(fdiv vsplatf32_fpimm_eq_1, v8f32:$xj),
           (XVFRECIP_S v8f32:$xj)>;
@@ -2024,6 +2031,24 @@ def : Pat<(v4i32(fp_to_uint v4f64:$vj)),
                (XVFTINTRZ_LU_D v4f64:$vj)),
               sub_128)>;
 
+// XVAVG_{B/H/W/D/BU/HU/WU/DU}, XVAVGR_{B/H/W/D/BU/HU/WU/DU}
+defm : VAvgPat<sra, "XVAVG_B", v32i8>;
+defm : VAvgPat<sra, "XVAVG_H", v16i16>;
+defm : VAvgPat<sra, "XVAVG_W", v8i32>;
+defm : VAvgPat<sra, "XVAVG_D", v4i64>;
+defm : VAvgPat<srl, "XVAVG_BU", v32i8>;
+defm : VAvgPat<srl, "XVAVG_HU", v16i16>;
+defm : VAvgPat<srl, "XVAVG_WU", v8i32>;
+defm : VAvgPat<srl, "XVAVG_DU", v4i64>;
+defm : VAvgrPat<sra, "XVAVGR_B", v32i8>;
+defm : VAvgrPat<sra, "XVAVGR_H", v16i16>;
+defm : VAvgrPat<sra, "XVAVGR_W", v8i32>;
+defm : VAvgrPat<sra, "XVAVGR_D", v4i64>;
+defm : VAvgrPat<srl, "XVAVGR_BU", v32i8>;
+defm : VAvgrPat<srl, "XVAVGR_HU", v16i16>;
+defm : VAvgrPat<srl, "XVAVGR_WU", v8i32>;
+defm : VAvgrPat<srl, "XVAVGR_DU", v4i64>;
+
 // abs
 def : Pat<(abs v32i8:$xj), (XVSIGNCOV_B v32i8:$xj, v32i8:$xj)>;
 def : Pat<(abs v16i16:$xj), (XVSIGNCOV_H v16i16:$xj, v16i16:$xj)>;
@@ -2403,6 +2428,12 @@ def : Pat<(int_loongarch_lasx_xvpickve_w_f v8f32:$xj, timm:$imm),
 def : Pat<(int_loongarch_lasx_xvpickve_d_f v4f64:$xj, timm:$imm),
           (XVPICKVE_D v4f64:$xj, (to_valid_timm timm:$imm))>;
 
+// Vector floating-point conversion
+defm : PatXrF<fceil, "XVFRINTRP">;
+defm : PatXrF<ffloor, "XVFRINTRM">;
+defm : PatXrF<ftrunc, "XVFRINTRZ">;
+defm : PatXrF<froundeven, "XVFRINTRNE">;
+
 // load
 def : Pat<(int_loongarch_lasx_xvld GPR:$rj, timm:$imm),
           (XVLD GPR:$rj, (to_valid_timm timm:$imm))>;
diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
index 4619c6bd248a6..6b74a4b5e5f6f 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
@@ -1518,6 +1518,18 @@ multiclass InsertExtractPatV2<ValueType vecty, ValueType elemty> {
   }
 }
 
+multiclass VAvgPat<SDPatternOperator OpNode, string Inst, ValueType vt> {
+  def : Pat<(OpNode (vt (add vt:$vj, vt:$vk)), (vt (vsplat_imm_eq_1))),
+            (!cast<LAInst>(Inst) vt:$vj, vt:$vk)>;
+}
+
+multiclass VAvgrPat<SDPatternOperator OpNode, string Inst, ValueType vt> {
+  def : Pat<(OpNode (vt (add (vt (add vt:$vj, vt:$vk)),
+                             (vt (vsplat_imm_eq_1)))),
+                    (vt (vsplat_imm_eq_1))),
+            (!cast<LAInst>(Inst) vt:$vj, vt:$vk)>;
+}
+
 let Predicates = [HasExtLSX] in {
 
 // VADD_{B/H/W/D}
@@ -1748,6 +1760,10 @@ defm : PatVrVrF<fmul, "VFMUL">;
 // VFDIV_{S/D}
 defm : PatVrVrF<fdiv, "VFDIV">;
 
+// VFMAX_{S/D}, VFMIN_{S/D}
+defm : PatVrVrF<fmaxnum, "VFMAX">;
+defm : PatVrVrF<fminnum, "VFMIN">;
+
 // VFMADD_{S/D}
 def : Pat<(fma v4f32:$vj, v4f32:$vk, v4f32:$va),
           (VFMADD_S v4f32:$vj, v4f32:$vk, v4f32:$va)>;
@@ -1783,6 +1799,9 @@ def : Pat<(fma_nsz (fneg v2f64:$vj), v2f64:$vk, v2f64:$va),
 // VFSQRT_{S/D}
 defm : PatVrF<fsqrt, "VFSQRT">;
 
+// VFLOGB_{S/D}
+defm : PatVrF<flog2, "VFLOGB">;
+
 // VFRECIP_{S/D}
 def : Pat<(fdiv vsplatf32_fpimm_eq_1, v4f32:$vj),
           (VFRECIP_S v4f32:$vj)>;
@@ -2154,6 +2173,24 @@ def : Pat<(f32 f32imm_vldi:$in),
 def : Pat<(f64 f64imm_vldi:$in),
           (f64 (EXTRACT_SUBREG (VLDI (to_f64imm_vldi f64imm_vldi:$in)), sub_64))>;
 
+// VAVG_{B/H/W/D/BU/HU/WU/DU}, VAVGR_{B/H/W/D/BU/HU/WU/DU}
+defm : VAvgPat<sra, "VAVG_B", v16i8>;
+defm : VAvgPat<sra, "VAVG_H", v8i16>;
+defm : VAvgPat<sra, "VAVG_W", v4i32>;
+defm : VAvgPat<sra, "VAVG_D", v2i64>;
+defm : VAvgPat<srl, "VAVG_BU", v16i8>;
+defm : VAvgPat<srl, "VAVG_HU", v8i16>;
+defm : VAvgPat<srl, "VAVG_WU", v4i32>;
+defm : VAvgPat<srl, "VAVG_DU", v2i64>;
+defm : VAvgrPat<sra, "VAVGR_B", v16i8>;
+defm : VAvgrPat<sra, "VAVGR_H", v8i16>;
+defm : VAvgrPat<sra, "VAVGR_W", v4i32>;
+defm : VAvgrPat<sra, "VAVGR_D", v2i64>;
+defm : VAvgrPat<srl, "VAVGR_BU", v16i8>;
+defm : VAvgrPat<srl, "VAVGR_HU", v8i16>;
+defm : VAvgrPat<srl, "VAVGR_WU", v4i32>;
+defm : VAvgrPat<srl, "VAVGR_DU", v2i64>;
+
 // abs
 def : Pat<(abs v16i8:$vj), (VSIGNCOV_B v16i8:$vj, v16i8:$vj)>;
 def : Pat<(abs v8i16:$vj), (VSIGNCOV_H v8i16:$vj, v8i16:$vj)>;
@@ -2519,6 +2556,11 @@ def : Pat<(f64 (froundeven FPR64:$fj)),
           (f64 (EXTRACT_SUBREG (VFRINTRNE_D (VREPLVEI_D
                (SUBREG_TO_REG (i64 0), FPR64:$fj, sub_64), 0)), sub_64))>;
 
+defm : PatVrF<fceil, "VFRINTRP">;
+defm : PatVrF<ffloor, "VFRINTRM">;
+defm : PatVrF<ftrunc, "VFRINTRZ">;
+defm : PatVrF<froundeven, "VFRINTRNE">;
+
 // load
 def : Pat<(int_loongarch_lsx_vld GPR:$rj, timm:$imm),
           (VLD GPR:$rj, (to_valid_timm timm:$imm))>;
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp
index 7d5456555045b..6d69af5938e79 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp
@@ -39,7 +39,7 @@ LoongArchELFObjectWriter::LoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit)
     : MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_LOONGARCH,
                               /*HasRelocationAddend=*/true) {}
 
-LoongArchELFObjectWriter::~LoongArchELFObjectWriter() {}
+LoongArchELFObjectWriter::~LoongArchELFObjectWriter() = default;
 
 unsigned LoongArchELFObjectWriter::getRelocType(const MCFixup &Fixup,
                                                 const MCValue &Target,
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
index f0e2bc4855187..08fa51d333346 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
@@ -38,7 +38,7 @@ class LoongArchMCCodeEmitter : public MCCodeEmitter {
   LoongArchMCCodeEmitter(MCContext &ctx, MCInstrInfo const &MCII)
       : Ctx(ctx), MCII(MCII) {}
 
-  ~LoongArchMCCodeEmitter() override {}
+  ~LoongArchMCCodeEmitter() override = default;
 
   void encodeInstruction(const MCInst &MI, SmallVectorImpl<char> &CB,
                          SmallVectorImpl<MCFixup> &Fixups,
diff --git a/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp b/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp
index e37f3a66fe11f..fb5cd5c29d7dc 100644
--- a/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp
+++ b/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp
@@ -690,9 +690,9 @@ bool M68kAsmParser::parseRegisterName(MCRegister &RegNo, SMLoc Loc,
     } else {
       // Floating point control register.
       RegNo = StringSwitch<unsigned>(RegisterNameLower)
-                  .Cases("fpc", "fpcr", M68k::FPC)
-                  .Cases("fps", "fpsr", M68k::FPS)
-                  .Cases("fpi", "fpiar", M68k::FPIAR)
+                  .Cases({"fpc", "fpcr"}, M68k::FPC)
+                  .Cases({"fps", "fpsr"}, M68k::FPS)
+                  .Cases({"fpi", "fpiar"}, M68k::FPIAR)
                   .Default(M68k::NoRegister);
       assert(RegNo != M68k::NoRegister &&
              "Unrecognized FP control register name");
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp
index fe83dc6e1abfb..51bafe4a4c56c 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp
@@ -49,7 +49,7 @@ class M68kAsmBackend : public MCAsmBackend {
   M68kAsmBackend(const Target &T, const MCSubtargetInfo &STI)
       : MCAsmBackend(llvm::endianness::big),
         Allows32BitBranch(llvm::StringSwitch<bool>(STI.getCPU())
-                              .CasesLower("m68020", "m68030", "m68040", true)
+                              .CasesLower({"m68020", "m68030", "m68040"}, true)
                               .Default(false)) {}
 
   void applyFixup(const MCFragment &, const MCFixup &, const MCValue &,
diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index 97379d78ae4ae..f588e56f2ea18 100644
--- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -6176,7 +6176,7 @@ int MipsAsmParser::matchCPURegisterName(StringRef Name) {
 
   CC = StringSwitch<unsigned>(Name)
            .Case("zero", 0)
-           .Cases("at", "AT", 1)
+           .Cases({"at", "AT"}, 1)
            .Case("a0", 4)
            .Case("a1", 5)
            .Case("a2", 6)
diff --git a/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h b/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h
index caef8fe790adb..b832b82cbc30c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h
+++ b/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h
@@ -20,7 +20,7 @@ class MemoryLocation;
 
 class NVPTXAAResult : public AAResultBase {
 public:
-  NVPTXAAResult() {}
+  NVPTXAAResult() = default;
   NVPTXAAResult(NVPTXAAResult &&Arg) : AAResultBase(std::move(Arg)) {}
 
   /// Handle invalidation events from the new pass manager.
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 7e7ee754c250d..c667a09f95dbb 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1871,17 +1871,6 @@ bool NVPTXScopes::empty() const { return Scopes.size() == 0; }
   (is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, _CH))          \
          : (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, )))
 
-#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode, is_mc, is_ch, is_s32)   \
-  [&]() -> auto {                                                              \
-    if (is_mc && is_ch)                                                        \
-      return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC_CH);      \
-    if (is_ch)                                                                 \
-      return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _CH);         \
-    if (is_mc)                                                                 \
-      return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC);         \
-    return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, );              \
-  }()
-
 static unsigned GetCpAsyncBulkTensorS2GReductionOpcode(size_t Dim,
                                                        bool IsShared32,
                                                        bool IsCacheHint,
@@ -1925,112 +1914,6 @@ static unsigned GetCpAsyncBulkTensorS2GReductionOpcode(size_t Dim,
   }
 }
 
-static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32,
-                                              bool IsMultiCast,
-                                              bool IsCacheHint, bool IsIm2Col) {
-  if (IsIm2Col) {
-    switch (Dim) {
-    case 3:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, IM2COL, IsMultiCast,
-                                                 IsCacheHint, IsShared32);
-    case 4:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, IM2COL, IsMultiCast,
-                                                 IsCacheHint, IsShared32);
-    case 5:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, IM2COL, IsMultiCast,
-                                                 IsCacheHint, IsShared32);
-    default:
-      llvm_unreachable("Invalid Dimension in im2col mode for "
-                       "GetCpAsyncBulkTensorG2SOpcode.");
-    }
-  } else {
-    switch (Dim) {
-    case 1:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(1D, TILE, IsMultiCast,
-                                                 IsCacheHint, IsShared32);
-    case 2:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(2D, TILE, IsMultiCast,
-                                                 IsCacheHint, IsShared32);
-    case 3:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, TILE, IsMultiCast,
-                                                 IsCacheHint, IsShared32);
-    case 4:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, TILE, IsMultiCast,
-                                                 IsCacheHint, IsShared32);
-    case 5:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, TILE, IsMultiCast,
-                                                 IsCacheHint, IsShared32);
-    default:
-      llvm_unreachable(
-          "Invalid Dimension in tile mode for GetCpAsyncBulkTensorG2SOpcode.");
-    }
-  }
-}
-
-static size_t GetDimsFromIntrinsic(unsigned IID) {
-  switch (IID) {
-  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d:
-    return 3;
-  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d:
-    return 4;
-  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
-    return 5;
-  default:
-    llvm_unreachable("Invalid im2col intrinsic in GetDimsFromIntrinsic.");
-  }
-}
-
-void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N,
-                                                         bool IsIm2Col) {
-  // We have {Chain, Intrinsic-ID} followed by the actual intrisic args:
-  // {dst, mbar, src, dims{d0...dN}, im2col_offsets{dims-2}
-  // multicast, cache_hint,
-  // multicast_flag, cache_hint_flag, cta_group_flag}
-  // NumOperands = {Chain, IID} + {Actual intrinsic args}
-  //             = {2}          + {8 + dims + im2col_offsets}
-  size_t NumOps = N->getNumOperands();
-  size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(N->getConstantOperandVal(1))
-                            : (NumOps - 10);
-  // Offsets is always 'NumDims - 2' and only for im2col mode
-  size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0;
-  bool IsCacheHint = N->getConstantOperandVal(NumOps - 2) == 1;
-  bool IsMultiCast = N->getConstantOperandVal(NumOps - 3) == 1;
-  size_t NumBaseArgs = NumDims + NumOffsets + 3; // for {dst, mbar, src}
-  size_t MultiCastIdx = NumBaseArgs + 2;         // for Chain and IID
-
-  unsigned CTAGroupVal = N->getConstantOperandVal(NumOps - 1);
-  if ((CTAGroupVal > 0) && !Subtarget->hasCpAsyncBulkTensorCTAGroupSupport())
-    report_fatal_error(
-        formatv("CpAsyncBulkTensorG2S cta_group::1/2 is not supported on sm_{}",
-                Subtarget->getSmVersion()));
-
-  SDLoc DL(N);
-  SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumBaseArgs));
-
-  // Push MultiCast operand, if available
-  if (IsMultiCast)
-    Ops.push_back(N->getOperand(MultiCastIdx));
-
-  // Push CacheHint operand, if available
-  if (IsCacheHint)
-    Ops.push_back(N->getOperand(MultiCastIdx + 1));
-
-  // Flag for CTA Group
-  Ops.push_back(getI32Imm(CTAGroupVal, DL));
-
-  // Finally, the chain operand
-  Ops.push_back(N->getOperand(0));
-
-  bool IsShared32 =
-      CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
-  unsigned Opcode = GetCpAsyncBulkTensorG2SOpcode(
-      NumDims, IsShared32, IsMultiCast, IsCacheHint, IsIm2Col);
-  ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
-}
-
 void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
                                                             unsigned RedOp,
                                                             bool IsIm2Col) {
@@ -2175,18 +2058,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
   switch (IID) {
   default:
     return false;
-  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d:
-    SelectCpAsyncBulkTensorG2SCommon(N);
-    return true;
-  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
-    SelectCpAsyncBulkTensorG2SCommon(N, /*IsIm2Col=*/true);
-    return true;
   case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d:
   case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d:
   case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index c912e709d0aa0..1cb579bd96730 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -86,7 +86,6 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
   bool tryEXTRACT_VECTOR_ELEMENT(SDNode *N);
   void SelectV2I64toI128(SDNode *N);
   void SelectI128toV2I64(SDNode *N);
-  void SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col = false);
   void SelectCpAsyncBulkTensorReduceCommon(SDNode *N, unsigned RedOp,
                                            bool IsIm2Col = false);
   void SelectTcgen05Ld(SDNode *N, bool hasOffset = false);
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index dfde0cca0f00c..b26022184708c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -139,7 +139,6 @@ def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;
 def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">;
 def hasTcgen05Instructions : Predicate<"Subtarget->hasTcgen05Instructions()">;
 def hasTcgen05MMAScaleInputDImm : Predicate<"Subtarget->hasTcgen05MMAScaleInputDImm()">;
-def hasTMACTAGroupSupport  : Predicate<"Subtarget->hasCpAsyncBulkTensorCTAGroupSupport()">;
 def hasF32x2Instructions : Predicate<"Subtarget->hasF32x2Instructions()">;
 
 class hasPTX<int version>: Predicate<"Subtarget->getPTXVersion() >= " # version>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index c923f0ec907e7..50827bd548ad5 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -599,75 +599,15 @@ class TMA_IM2COL_UTIL<int dim, string mode> {
   string base_str = !interleave(!foreach(i, !range(offsets), "$im2col" # i), ", ");
 }
 
-// From Global to Shared memory (G2S)
-class G2S_STRINGS<int dim, string mode, bit mc, bit ch, bit is_shared32 = 0> {
-  string prefix = "cp.async.bulk.tensor";
-  string dir = "shared::cluster.global";
-  string completion = "mbarrier::complete_tx::bytes";
-  string inst_name = prefix
-                     # "." # dim # "d"
-                     # "." # dir
-                     # "." # mode
-                     # "." # completion
-                     # !if(mc, ".multicast::cluster", "")
-                     # !if(ch, ".L2::cache_hint", "");
-  string intr_name = "CP_ASYNC_BULK_TENSOR_G2S_"
-                     # dim # "D"
-                     # !if(is_shared32, "_SHARED32", "")
-                     # !if(!eq(mode, "tile"), "_TILE", "_IM2COL");
-}
-
 def CTAGroupFlags : Operand<i32> {
   let PrintMethod = "printCTAGroup";
 }
 
-multiclass CP_ASYNC_BULK_TENSOR_G2S_INTR<int dim, bit is_shared32, string mode> {
-  defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag;
-  defvar dims_str = TMA_DIMS_UTIL<dim>.base_str;
-  defvar asm_str_default = "$cg [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]";
-  defvar rc = !if(is_shared32, B32, B64);
-
-  defvar num_im2col = !if(!ge(dim, 3), !add(dim, -2), 0);
-  defvar im2col_dag = !if(!eq(mode, "im2col"),
-    !dag(ins, !listsplat(B16, num_im2col), !foreach(i, !range(num_im2col), "im2col" # i)),
-    (ins));
-  defvar im2col_str = !interleave(!foreach(i, !range(num_im2col), "$im2col" # i), ", ");
-  defvar im2col_asm_str = ", {{" # im2col_str # "}}";
-
-  defvar asm_str = !if(!eq(mode, "im2col"),
-    !strconcat(asm_str_default, im2col_asm_str), asm_str_default);
+def tma_cta_group_imm0 : TImmLeaf<i32, [{return Imm == 0;}]>;
+def tma_cta_group_imm_any : TImmLeaf<i32, [{return Imm >= 0;}]>;
 
-  def "" : NVPTXInst<(outs),
-            !con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag, (ins CTAGroupFlags:$cg)),
-            !strconcat(G2S_STRINGS<dim, mode, 0, 0>.inst_name, asm_str, ";")>,
-            Requires<[hasPTX<80>, hasSM<90>]>;
-  def _MC : NVPTXInst<(outs),
-                  !con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag,
-                       (ins B16:$mc, CTAGroupFlags:$cg)),
-                  !strconcat(G2S_STRINGS<dim, mode, 1, 0>.inst_name, asm_str, ", $mc;")>,
-                  Requires<[hasPTX<80>, hasSM<90>]>;
-  def _CH : NVPTXInst<(outs),
-                  !con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag,
-                       (ins B64:$ch, CTAGroupFlags:$cg)),
-                  !strconcat(G2S_STRINGS<dim, mode, 0, 1>.inst_name, asm_str, ", $ch;")>,
-                  Requires<[hasPTX<80>, hasSM<90>]>;
-  def _MC_CH : NVPTXInst<(outs),
-                     !con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag,
-                          (ins B16:$mc, B64:$ch, CTAGroupFlags:$cg)),
-                     !strconcat(G2S_STRINGS<dim, mode, 1, 1>.inst_name, asm_str, ", $mc, $ch;")>,
-                     Requires<[hasPTX<80>, hasSM<90>]>;
-}
-
-foreach dim = [1, 2, 3, 4, 5] in {
-  foreach shared32 = [true, false] in {
-    foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in {
-      defm G2S_STRINGS<dim, mode, 0, 0, shared32>.intr_name :
-        CP_ASYNC_BULK_TENSOR_G2S_INTR<dim, shared32, mode>;
-    }
-  }
-}
-
-multiclass TMA_TENSOR_G2S_INTR<int dim, string mode, list<Predicate> pred = []> {
+multiclass TMA_TENSOR_G2S_INTR<int dim, string mode, list<Predicate> pred,
+                               TImmLeaf cta_group_type = tma_cta_group_imm_any> {
   defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag;
   defvar dims_str = TMA_DIMS_UTIL<dim>.base_str;
   defvar asm_str_base = "$cg [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]";
@@ -697,10 +637,10 @@ multiclass TMA_TENSOR_G2S_INTR<int dim, string mode, list<Predicate> pred = []>
                          !setdagop(dims_dag, intr),
                          !setdagop(im2col_dag, intr),
                          (intr B16:$mc, B64:$ch));
-  defvar intr_dag_no_hints   = !con(intr_dag_base, (intr 0,  0,  timm:$cg));
-  defvar intr_dag_with_mc    = !con(intr_dag_base, (intr -1, 0,  timm:$cg));
-  defvar intr_dag_with_ch    = !con(intr_dag_base, (intr 0, -1,  timm:$cg));
-  defvar intr_dag_with_mc_ch = !con(intr_dag_base, (intr -1, -1, timm:$cg));
+  defvar intr_dag_no_hints   = !con(intr_dag_base, (intr 0,  0,  cta_group_type:$cg));
+  defvar intr_dag_with_mc    = !con(intr_dag_base, (intr -1, 0,  cta_group_type:$cg));
+  defvar intr_dag_with_ch    = !con(intr_dag_base, (intr 0, -1,  cta_group_type:$cg));
+  defvar intr_dag_with_mc_ch = !con(intr_dag_base, (intr -1, -1, cta_group_type:$cg));
 
   def "" : NVPTXInst<(outs), ins_dag,
              inst_name # asm_str # ";",
@@ -719,14 +659,30 @@ multiclass TMA_TENSOR_G2S_INTR<int dim, string mode, list<Predicate> pred = []>
                  [intr_dag_with_mc_ch]>,
                  Requires<pred>;
 }
+
+foreach dim = 1...5 in {
+  defm TMA_G2S_TILE_CG0_ # dim # "D"
+      : TMA_TENSOR_G2S_INTR<dim, "tile", [hasPTX<80>, hasSM<90>],
+                            tma_cta_group_imm0>;
+  defm TMA_G2S_TILE_ # dim # "D"
+      : TMA_TENSOR_G2S_INTR<dim, "tile",
+                            [callSubtarget<"hasTMABlackwellSupport">]>;
+}
 foreach dim = 3...5 in {
+  defm TMA_G2S_IM2COL_CG0_ # dim # "D"
+      : TMA_TENSOR_G2S_INTR<dim, "im2col", [hasPTX<80>, hasSM<90>],
+                            tma_cta_group_imm0>;
+  defm TMA_G2S_IM2COL_ # dim # "D"
+      : TMA_TENSOR_G2S_INTR<dim, "im2col",
+                            [callSubtarget<"hasTMABlackwellSupport">]>;
   foreach mode = ["im2col_w", "im2col_w_128"] in {
     defm TMA_G2S_ # !toupper(mode) # "_" # dim # "D"
-      : TMA_TENSOR_G2S_INTR<dim, mode, [hasTMACTAGroupSupport]>;
+        : TMA_TENSOR_G2S_INTR<dim, mode,
+                              [callSubtarget<"hasTMABlackwellSupport">]>;
   }
 }
 defm TMA_G2S_TILE_GATHER4_2D : TMA_TENSOR_G2S_INTR<5, "tile_gather4",
-                               [hasTMACTAGroupSupport]>;
+                               [callSubtarget<"hasTMABlackwellSupport">]>;
 
 multiclass TMA_TENSOR_G2S_CTA_INTR<int dim, string mode, list<Predicate> pred = []> {
   defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag;
@@ -784,7 +740,8 @@ foreach dim = 3...5 in {
     : TMA_TENSOR_G2S_CTA_INTR<dim, "im2col_w", [hasPTX<86>, hasSM<100>]>;
 
   defm TMA_G2S_CTA_IM2COL_W_128_ # dim # "D"
-    : TMA_TENSOR_G2S_CTA_INTR<dim, "im2col_w_128", [hasTMACTAGroupSupport]>;
+    : TMA_TENSOR_G2S_CTA_INTR<dim, "im2col_w_128",
+                              [callSubtarget<"hasTMABlackwellSupport">]>;
 }
 defm TMA_G2S_CTA_TILE_GATHER4_2D : TMA_TENSOR_G2S_CTA_INTR<5, "tile_gather4",
                                    [hasPTX<86>, hasSM<100>]>;
@@ -835,7 +792,7 @@ foreach dim = 1...5 in {
   }
 }
 defm TMA_S2G_TILE_SCATTER4_2D : TMA_TENSOR_S2G_INTR<5, "tile_scatter4",
-                                [hasTMACTAGroupSupport]>;
+                                [callSubtarget<"hasTMABlackwellSupport">]>;
 
 def TMAReductionFlags : Operand<i32> {
   let PrintMethod = "printTmaReductionMode";
@@ -930,11 +887,11 @@ foreach dim = 3...5 in {
   foreach mode = ["im2col_w", "im2col_w_128"] in {
     defvar suffix = !toupper(mode) # "_" # dim # "D";
     defm TMA_TENSOR_PF_ # suffix : TMA_TENSOR_PREFETCH_INTR<dim, mode,
-                                   [hasTMACTAGroupSupport]>;
+                                   [callSubtarget<"hasTMABlackwellSupport">]>;
   }
 }
 defm TMA_TENSOR_PF_TILE_GATHER4_2D : TMA_TENSOR_PREFETCH_INTR<5, "tile_gather4",
-                                     [hasTMACTAGroupSupport]>;
+                                     [callSubtarget<"hasTMABlackwellSupport">]>;
 
 //Prefetchu and Prefetch
 
@@ -1605,12 +1562,17 @@ def : Pat<(int_nvvm_saturate_d f64:$a),     (CVT_f64_f64 $a, CvtSAT)>;
 // Exp2  Log2
 //
 
-def : Pat<(int_nvvm_ex2_approx_ftz_f f32:$a), (EX2_APPROX_f32 $a, FTZ)>;
-def : Pat<(int_nvvm_ex2_approx_f f32:$a), (EX2_APPROX_f32 $a, NoFTZ)>;
+def : Pat<(f32 (int_nvvm_ex2_approx_ftz f32:$a)), (EX2_APPROX_f32 $a, FTZ)>;
+def : Pat<(f32 (int_nvvm_ex2_approx f32:$a)), (EX2_APPROX_f32 $a, NoFTZ)>;
 
 let Predicates = [hasPTX<70>, hasSM<75>] in {
-  def : Pat<(int_nvvm_ex2_approx_f16 f16:$a), (EX2_APPROX_f16 $a)>;
-  def : Pat<(int_nvvm_ex2_approx_f16x2 v2f16:$a), (EX2_APPROX_f16x2 $a)>;
+  def : Pat<(f16 (int_nvvm_ex2_approx f16:$a)), (EX2_APPROX_f16 $a)>;
+  def : Pat<(v2f16 (int_nvvm_ex2_approx v2f16:$a)), (EX2_APPROX_f16x2 $a)>;
+}
+
+let Predicates = [hasPTX<78>, hasSM<90>] in {
+  def : Pat<(bf16 (int_nvvm_ex2_approx_ftz bf16:$a)), (EX2_APPROX_bf16 $a)>;
+  def : Pat<(v2bf16 (int_nvvm_ex2_approx_ftz v2bf16:$a)), (EX2_APPROX_bf16x2 $a)>;
 }
 
 def LG2_APPROX_f32 :
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 194dbdc061a96..021b1f6d0bf57 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -166,18 +166,15 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
   // f32x2 instructions in Blackwell family
   bool hasF32x2Instructions() const;
 
-  // TMA G2S copy with cta_group::1/2 support
-  bool hasCpAsyncBulkTensorCTAGroupSupport() const {
-    // TODO: Update/tidy-up after the family-conditional support arrives
-    switch (FullSmVersion) {
-    case 1003:
-    case 1013:
-      return PTXVersion >= 86;
-    case 1033:
-      return PTXVersion >= 88;
-    default:
-      return false;
-    }
+  // Checks support for following in TMA:
+  //  - cta_group::1/2 support
+  //  - im2col_w/w_128 mode support
+  //  - tile_gather4 mode support
+  //  - tile_scatter4 mode support
+  bool hasTMABlackwellSupport() const {
+    return hasPTXWithFamilySMs(90, {100, 110}) ||
+           hasPTXWithFamilySMs(88, {100, 101}) ||
+           hasPTXWithAccelSMs(86, {100, 101});
   }
 
   // Prior to CUDA 12.3 ptxas did not recognize that the trap instruction
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index 4029e143ae2a4..64593e6439184 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -318,7 +318,7 @@ static Instruction *convertNvvmIntrinsicToLlvm(InstCombiner &IC,
       // answer. These include:
       //
       //   - nvvm_cos_approx_{f,ftz_f}
-      //   - nvvm_ex2_approx_{d,f,ftz_f}
+      //   - nvvm_ex2_approx(_ftz)
       //   - nvvm_lg2_approx_{d,f,ftz_f}
       //   - nvvm_sin_approx_{f,ftz_f}
       //   - nvvm_sqrt_approx_{f,ftz_f}
@@ -493,7 +493,7 @@ NVPTXTTIImpl::getInstructionCost(const User *U,
             // predicate ("@").
             return !AsmInst.empty() &&
                    (AsmInst[0] == '@' || isAlpha(AsmInst[0]) ||
-                    AsmInst.find(".pragma") != StringRef::npos);
+                    AsmInst.contains(".pragma"));
           });
       return InstCount * TargetTransformInfo::TCC_Basic;
     }
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index bcb3f507e98d6..780e124bd2c14 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -2702,7 +2702,7 @@ static bool isSpecialLLVMGlobalArrayToSkip(const GlobalVariable *GV) {
 
 static bool isSpecialLLVMGlobalArrayForStaticInit(const GlobalVariable *GV) {
   return StringSwitch<bool>(GV->getName())
-      .Cases("llvm.global_ctors", "llvm.global_dtors", true)
+      .Cases({"llvm.global_ctors", "llvm.global_dtors"}, true)
       .Default(false);
 }
 
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 17f04d0fd05e8..20fc849ea4aa5 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -138,6 +138,11 @@ static cl::opt<unsigned> PPCMinimumJumpTableEntries(
     "ppc-min-jump-table-entries", cl::init(64), cl::Hidden,
     cl::desc("Set minimum number of entries to use a jump table on PPC"));
 
+static cl::opt<unsigned> PPCMinimumBitTestCmps(
+    "ppc-min-bit-test-cmps", cl::init(3), cl::Hidden,
+    cl::desc("Set minimum of largest number of comparisons to use bit test for "
+             "switch on PPC."));
+
 static cl::opt<unsigned> PPCGatherAllAliasesMaxDepth(
     "ppc-gather-alias-max-depth", cl::init(18), cl::Hidden,
     cl::desc("max depth when checking alias info in GatherAllAliases()"));
@@ -1436,6 +1441,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
   // Re-evaluate this value on future HWs that can do better with mtctr.
   setMinimumJumpTableEntries(PPCMinimumJumpTableEntries);
 
+  // The default minimum of largest number in a BitTest cluster is 3.
+  setMinimumBitTestCmps(PPCMinimumBitTestCmps);
+
   setMinFunctionAlignment(Align(4));
   setMinCmpXchgSizeInBits(Subtarget.hasPartwordAtomics() ? 8 : 32);
 
diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
index b0bed71c6755f..da3efdc15f1e1 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
@@ -194,6 +194,22 @@ class XX3Form_XTAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
   let Inst{31} = XT{5};
 }
 
+class XForm_RBS5<bits<6> opCode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                 InstrItinClass itin, list<dag> pattern>
+    : I<opCode, OOL, IOL, asmstr, itin> {
+
+  bits<5> RB;
+  bits<5> RS;
+
+  let Pattern = pattern;
+
+  let Inst{6...10} = RS;
+  let Inst{11...15} = 0;
+  let Inst{16...20} = RB;
+  let Inst{21...30} = xo;
+  let Inst{31} = 0;
+}
+
 class XX3Form_XTAB6_S<bits<5> xo, dag OOL, dag IOL, string asmstr,
                        list<dag> pattern>
     : I<59, OOL, IOL, asmstr, NoItinerary> {
@@ -317,12 +333,16 @@ let Predicates = [IsISAFuture] in {
   def TLBIEIO
       : XForm_RSB5_UIMM2<31, 18, (outs), (ins g8rc:$RB, g8rc:$RS, u2imm:$RIC),
                          "tlbieio $RB, $RS, $RIC", []>;
+  def MTLPL : XForm_RBS5<31, 275, (outs), (ins gprc:$RB, gprc:$RS),
+                         "mtlpl $RB, $RS", IIC_SprMTSPR, []>;
   let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
     def TLBIEP8
         : XForm_RSB5_UIMM2_2UIMM1<31, 50, (outs),
                                   (ins g8rc:$RB, g8rc:$RS, u2imm:$RIC,
                                       u1imm:$PRS, u1imm:$R),
                                   "tlbiep $RB, $RS, $RIC, $PRS, $R", []>;
+    def MTLPL8 : XForm_RBS5<31, 275, (outs), (ins g8rc:$RB, g8rc:$RS),
+                            "mtlpl $RB, $RS", IIC_SprMTSPR, []>, isPPC64;
   }
 }
 
diff --git a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
index 3640d2545b5ac..70df59d01d6c7 100644
--- a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
+++ b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
@@ -1316,7 +1316,7 @@ bool PPCLoopInstrFormPrep::runOnLoop(Loop *L) {
     // useless and possible to break some original well-form addressing mode
     // to make this pre-inc prep for it.
     if (PointerElementType->isIntegerTy(64)) {
-      const SCEV *LSCEV = SE->getSCEVAtScope(const_cast<Value *>(PtrValue), L);
+      const SCEV *LSCEV = SE->getSCEVAtScope(PtrValue, L);
       const SCEVAddRecExpr *LARSCEV = dyn_cast<SCEVAddRecExpr>(LSCEV);
       if (!LARSCEV || LARSCEV->getLoop() != L)
         return false;
diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
index 000d29610678f..4ff489d482fa5 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -296,8 +296,9 @@ PPCTargetMachine::PPCTargetMachine(const Target &T, const Triple &TT,
                                    std::optional<Reloc::Model> RM,
                                    std::optional<CodeModel::Model> CM,
                                    CodeGenOptLevel OL, bool JIT)
-    : CodeGenTargetMachineImpl(T, TT.computeDataLayout(), TT, CPU,
-                               computeFSAdditions(FS, OL, TT), Options,
+    : CodeGenTargetMachineImpl(T,
+                               TT.computeDataLayout(Options.MCOptions.ABIName),
+                               TT, CPU, computeFSAdditions(FS, OL, TT), Options,
                                getEffectiveRelocModel(TT, RM),
                                getEffectivePPCCodeModel(TT, CM, JIT), OL),
       TLOF(createTLOF(getTargetTriple())),
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index 2fba090f2d501..b04e8874f58ad 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -912,7 +912,7 @@ bool PPCTTIImpl::areInlineCompatible(const Function *Caller,
 
 bool PPCTTIImpl::areTypesABICompatible(const Function *Caller,
                                        const Function *Callee,
-                                       const ArrayRef<Type *> &Types) const {
+                                       ArrayRef<Type *> Types) const {
 
   // We need to ensure that argument promotion does not
   // attempt to promote pointers to MMA types (__vector_pair
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 475472ac3720f..8d7f25539332e 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -147,7 +147,7 @@ class PPCTTIImpl final : public BasicTTIImplBase<PPCTTIImpl> {
   bool areInlineCompatible(const Function *Caller,
                            const Function *Callee) const override;
   bool areTypesABICompatible(const Function *Caller, const Function *Callee,
-                             const ArrayRef<Type *> &Types) const override;
+                             ArrayRef<Type *> Types) const override;
   bool supportsTailCallFor(const CallBase *CB) const override;
 
 private:
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
index 81981732ee080..282cf5d681685 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
@@ -92,6 +92,10 @@ class RISCVInstructionSelector : public InstructionSelector {
   void emitFence(AtomicOrdering FenceOrdering, SyncScope::ID FenceSSID,
                  MachineIRBuilder &MIB) const;
   bool selectUnmergeValues(MachineInstr &MI, MachineIRBuilder &MIB) const;
+  void addVectorLoadStoreOperands(MachineInstr &I,
+                                  SmallVectorImpl<SrcOp> &SrcOps,
+                                  unsigned &CurOp, bool IsMasked,
+                                  bool IsStrided) const;
   bool selectIntrinsicWithSideEffects(MachineInstr &I,
                                       MachineIRBuilder &MIB) const;
 
@@ -716,6 +720,26 @@ static unsigned selectRegImmLoadStoreOp(unsigned GenericOpc, unsigned OpSize) {
   return GenericOpc;
 }
 
+void RISCVInstructionSelector::addVectorLoadStoreOperands(
+    MachineInstr &I, SmallVectorImpl<SrcOp> &SrcOps, unsigned &CurOp,
+    bool IsMasked, bool IsStrided) const {
+  // Base Pointer
+  auto PtrReg = I.getOperand(CurOp++).getReg();
+  SrcOps.push_back(PtrReg);
+
+  // Stride
+  if (IsStrided) {
+    auto StrideReg = I.getOperand(CurOp++).getReg();
+    SrcOps.push_back(StrideReg);
+  }
+
+  // Mask
+  if (IsMasked) {
+    auto MaskReg = I.getOperand(CurOp++).getReg();
+    SrcOps.push_back(MaskReg);
+  }
+}
+
 bool RISCVInstructionSelector::selectIntrinsicWithSideEffects(
     MachineInstr &I, MachineIRBuilder &MIB) const {
   // Find the intrinsic ID.
@@ -752,21 +776,7 @@ bool RISCVInstructionSelector::selectIntrinsicWithSideEffects(
       SrcOps.push_back(Register(RISCV::NoRegister));
     }
 
-    // Base Pointer
-    auto PtrReg = I.getOperand(CurOp++).getReg();
-    SrcOps.push_back(PtrReg);
-
-    // Stride
-    if (IsStrided) {
-      auto StrideReg = I.getOperand(CurOp++).getReg();
-      SrcOps.push_back(StrideReg);
-    }
-
-    // Mask
-    if (IsMasked) {
-      auto MaskReg = I.getOperand(CurOp++).getReg();
-      SrcOps.push_back(MaskReg);
-    }
+    addVectorLoadStoreOperands(I, SrcOps, CurOp, IsMasked, IsStrided);
 
     RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(getMVTForLLT(VT));
     const RISCV::VLEPseudo *P =
@@ -795,6 +805,48 @@ bool RISCVInstructionSelector::selectIntrinsicWithSideEffects(
     I.eraseFromParent();
     return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI);
   }
+  case Intrinsic::riscv_vsm:
+  case Intrinsic::riscv_vse:
+  case Intrinsic::riscv_vse_mask:
+  case Intrinsic::riscv_vsse:
+  case Intrinsic::riscv_vsse_mask: {
+    bool IsMasked = IntrinID == Intrinsic::riscv_vse_mask ||
+                    IntrinID == Intrinsic::riscv_vsse_mask;
+    bool IsStrided = IntrinID == Intrinsic::riscv_vsse ||
+                     IntrinID == Intrinsic::riscv_vsse_mask;
+    LLT VT = MRI->getType(I.getOperand(1).getReg());
+    unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits());
+
+    // Sources
+    unsigned CurOp = 1;
+    SmallVector<SrcOp, 4> SrcOps; // Source registers.
+
+    // Store value
+    auto PassthruReg = I.getOperand(CurOp++).getReg();
+    SrcOps.push_back(PassthruReg);
+
+    addVectorLoadStoreOperands(I, SrcOps, CurOp, IsMasked, IsStrided);
+
+    RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(getMVTForLLT(VT));
+    const RISCV::VSEPseudo *P = RISCV::getVSEPseudo(
+        IsMasked, IsStrided, Log2SEW, static_cast<unsigned>(LMUL));
+
+    auto PseudoMI = MIB.buildInstr(P->Pseudo, {}, SrcOps);
+
+    // Select VL
+    auto VLOpFn = renderVLOp(I.getOperand(CurOp++));
+    for (auto &RenderFn : *VLOpFn)
+      RenderFn(PseudoMI);
+
+    // SEW
+    PseudoMI.addImm(Log2SEW);
+
+    // Memref
+    PseudoMI.cloneMemRefs(I);
+
+    I.eraseFromParent();
+    return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI);
+  }
   }
 }
 
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
index 41a9c92cf99c3..96e8afca0680e 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -823,6 +823,7 @@ static bool relaxableFixupNeedsRelocation(const MCFixupKind Kind) {
     break;
   case RISCV::fixup_riscv_rvc_jump:
   case RISCV::fixup_riscv_rvc_branch:
+  case RISCV::fixup_riscv_rvc_imm:
   case RISCV::fixup_riscv_jal:
     return false;
   }
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
index 6d587e6f167fc..5934c91cb4b9a 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
@@ -688,6 +688,7 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
       // the `jal` again in the assembler.
     } else if (MIFrm == RISCVII::InstFormatCI) {
       FixupKind = RISCV::fixup_riscv_rvc_imm;
+      AsmRelaxToLinkerRelaxableWithFeature(RISCV::FeatureVendorXqcili);
     } else if (MIFrm == RISCVII::InstFormatI) {
       FixupKind = RISCV::fixup_riscv_12_i;
     } else if (MIFrm == RISCVII::InstFormatQC_EB) {
diff --git a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp
index 98b636e8e0e55..9bd66a43717e7 100644
--- a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp
+++ b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp
@@ -373,6 +373,26 @@ static void doAtomicBinOpExpansion(const RISCVInstrInfo *TII, MachineInstr &MI,
         .addReg(ScratchReg)
         .addImm(-1);
     break;
+  case AtomicRMWInst::Max:
+    BuildMI(LoopMBB, DL, TII->get(RISCV::MAX), ScratchReg)
+        .addReg(DestReg)
+        .addReg(IncrReg);
+    break;
+  case AtomicRMWInst::Min:
+    BuildMI(LoopMBB, DL, TII->get(RISCV::MIN), ScratchReg)
+        .addReg(DestReg)
+        .addReg(IncrReg);
+    break;
+  case AtomicRMWInst::UMax:
+    BuildMI(LoopMBB, DL, TII->get(RISCV::MAXU), ScratchReg)
+        .addReg(DestReg)
+        .addReg(IncrReg);
+    break;
+  case AtomicRMWInst::UMin:
+    BuildMI(LoopMBB, DL, TII->get(RISCV::MINU), ScratchReg)
+        .addReg(DestReg)
+        .addReg(IncrReg);
+    break;
   }
   BuildMI(LoopMBB, DL, TII->get(getSCForRMW(Ordering, Width, STI)), ScratchReg)
       .addReg(ScratchReg)
@@ -682,6 +702,9 @@ bool RISCVExpandAtomicPseudo::expandAtomicMinMaxOp(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
     AtomicRMWInst::BinOp BinOp, bool IsMasked, int Width,
     MachineBasicBlock::iterator &NextMBBI) {
+  // Using MIN(U)/MAX(U) is preferrable if permitted
+  if (STI->hasPermissiveZalrsc() && STI->hasStdExtZbb() && !IsMasked)
+    return expandAtomicBinOp(MBB, MBBI, BinOp, IsMasked, Width, NextMBBI);
 
   MachineInstr &MI = *MBBI;
   DebugLoc DL = MI.getDebugLoc();
diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
index 410561855e181..526675a682d86 100644
--- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
@@ -127,6 +127,10 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB,
   case RISCV::PseudoCCAND:
   case RISCV::PseudoCCOR:
   case RISCV::PseudoCCXOR:
+  case RISCV::PseudoCCMAX:
+  case RISCV::PseudoCCMAXU:
+  case RISCV::PseudoCCMIN:
+  case RISCV::PseudoCCMINU:
   case RISCV::PseudoCCADDW:
   case RISCV::PseudoCCSUBW:
   case RISCV::PseudoCCSLL:
@@ -217,6 +221,7 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB,
         .addImm(0);
   } else {
     unsigned NewOpc;
+    // clang-format off
     switch (MI.getOpcode()) {
     default:
       llvm_unreachable("Unexpected opcode!");
@@ -228,6 +233,10 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB,
     case RISCV::PseudoCCAND:   NewOpc = RISCV::AND;   break;
     case RISCV::PseudoCCOR:    NewOpc = RISCV::OR;    break;
     case RISCV::PseudoCCXOR:   NewOpc = RISCV::XOR;   break;
+    case RISCV::PseudoCCMAX:   NewOpc = RISCV::MAX;   break;
+    case RISCV::PseudoCCMIN:   NewOpc = RISCV::MIN;   break;
+    case RISCV::PseudoCCMAXU:  NewOpc = RISCV::MAXU;  break;
+    case RISCV::PseudoCCMINU:  NewOpc = RISCV::MINU;  break;
     case RISCV::PseudoCCADDI:  NewOpc = RISCV::ADDI;  break;
     case RISCV::PseudoCCSLLI:  NewOpc = RISCV::SLLI;  break;
     case RISCV::PseudoCCSRLI:  NewOpc = RISCV::SRLI;  break;
@@ -250,6 +259,7 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB,
     case RISCV::PseudoCCNDS_BFOS: NewOpc = RISCV::NDS_BFOS; break;
     case RISCV::PseudoCCNDS_BFOZ: NewOpc = RISCV::NDS_BFOZ; break;
     }
+    // clang-format on
 
     if (NewOpc == RISCV::NDS_BFOZ || NewOpc == RISCV::NDS_BFOS) {
       BuildMI(TrueBB, DL, TII->get(NewOpc), DestReg)
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 2754d789b9899..cfee6ab22d4ff 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1851,6 +1851,11 @@ def TuneShortForwardBranchOpt
 def HasShortForwardBranchOpt : Predicate<"Subtarget->hasShortForwardBranchOpt()">;
 def NoShortForwardBranchOpt : Predicate<"!Subtarget->hasShortForwardBranchOpt()">;
 
+def TuneShortForwardBranchIMinMax
+    : SubtargetFeature<"short-forward-branch-i-minmax", "HasShortForwardBranchIMinMax",
+                       "true", "Enable short forward branch optimization for min,max instructions in Zbb",
+                       [TuneShortForwardBranchOpt]>;
+
 // Some subtargets require a S2V transfer buffer to move scalars into vectors.
 // FIXME: Forming .vx/.vf/.wx/.wf can reduce register pressure.
 def TuneNoSinkSplatOperands
@@ -1906,6 +1911,25 @@ def FeatureForcedAtomics : SubtargetFeature<
 def HasAtomicLdSt
     : Predicate<"Subtarget->hasStdExtZalrsc() || Subtarget->hasForcedAtomics()">;
 
+// The RISC-V Unprivileged Architecture - ISA Volume 1 (Version: 20250508)
+// [https://docs.riscv.org/reference/isa/_attachments/riscv-unprivileged.pdf]
+// in section 13.3. Eventual Success of Store-Conditional Instructions, defines
+// _constrained_ LR/SC loops:
+//   The dynamic code executed between the LR and SC instructions can only
+//   contain instructions from the base ''I'' instruction set, excluding loads,
+//   stores, backward jumps, taken backward branches, JALR, FENCE, and SYSTEM
+//   instructions. Compressed forms of the aforementioned ''I'' instructions in
+//   the Zca and Zcb extensions are also permitted.
+// LR/SC loops that do not adhere to the above are _unconstrained_ LR/SC loops,
+// and success is implementation specific. For implementations which know that
+// non-base instructions (such as the ''B'' extension) will not violate any
+// forward progress guarantees, using these instructions to reduce the LR/SC
+// sequence length is desirable.
+def FeaturePermissiveZalrsc
+    : SubtargetFeature<
+          "permissive-zalrsc", "HasPermissiveZalrsc", "true",
+          "Implementation permits non-base instructions between LR/SC pairs">;
+
 def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals",
     "AllowTaggedGlobals",
     "true", "Use an instruction sequence for taking the address of a global "
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 9a6afa1cd4ea2..b25a05400fe31 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -3995,6 +3995,7 @@ bool RISCVDAGToDAGISel::hasAllNBitUsers(SDNode *Node, unsigned Bits,
     case RISCV::CTZW:
     case RISCV::CPOPW:
     case RISCV::SLLI_UW:
+    case RISCV::ABSW:
     case RISCV::FMV_W_X:
     case RISCV::FCVT_H_W:
     case RISCV::FCVT_H_W_INX:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 1c930acd9c4a0..e0cf739f67d9b 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -433,6 +433,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
   if (Subtarget.hasStdExtP() ||
       (Subtarget.hasVendorXCValu() && !Subtarget.is64Bit())) {
     setOperationAction(ISD::ABS, XLenVT, Legal);
+    if (Subtarget.is64Bit())
+      setOperationAction(ISD::ABS, MVT::i32, Custom);
   } else if (Subtarget.hasShortForwardBranchOpt()) {
     // We can use PseudoCCSUB to implement ABS.
     setOperationAction(ISD::ABS, XLenVT, Legal);
@@ -14816,8 +14818,16 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
     assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
            "Unexpected custom legalisation");
 
+    if (Subtarget.hasStdExtP()) {
+      SDValue Src =
+          DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0));
+      SDValue Abs = DAG.getNode(RISCVISD::ABSW, DL, MVT::i64, Src);
+      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Abs));
+      return;
+    }
+
     if (Subtarget.hasStdExtZbb()) {
-      // Emit a special ABSW node that will be expanded to NEGW+MAX at isel.
+      // Emit a special node that will be expanded to NEGW+MAX at isel.
       // This allows us to remember that the result is sign extended. Expanding
       // to NEGW+MAX here requires a Freeze which breaks ComputeNumSignBits.
       SDValue Src = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64,
@@ -19784,7 +19794,9 @@ legalizeScatterGatherIndexType(SDLoc DL, SDValue &Index,
     // LLVM's legalization take care of the splitting.
     // FIXME: LLVM can't split VP_GATHER or VP_SCATTER yet.
     Index = DAG.getNode(ISD::SIGN_EXTEND, DL,
-                        IndexVT.changeVectorElementType(XLenVT), Index);
+                        EVT::getVectorVT(*DAG.getContext(), XLenVT,
+                                         IndexVT.getVectorElementCount()),
+                        Index);
   }
   IndexType = ISD::UNSIGNED_SCALED;
   return true;
@@ -20290,6 +20302,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
 
     break;
   }
+  case RISCVISD::ABSW:
   case RISCVISD::CLZW:
   case RISCVISD::CTZW: {
     // Only the lower 32 bits of the first operand are read
@@ -21862,6 +21875,7 @@ unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode(
   case RISCVISD::REMUW:
   case RISCVISD::ROLW:
   case RISCVISD::RORW:
+  case RISCVISD::ABSW:
   case RISCVISD::FCVT_W_RV64:
   case RISCVISD::FCVT_WU_RV64:
   case RISCVISD::STRICT_FCVT_W_RV64:
@@ -23932,7 +23946,7 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                .Case("{t0}", RISCV::X5)
                                .Case("{t1}", RISCV::X6)
                                .Case("{t2}", RISCV::X7)
-                               .Cases("{s0}", "{fp}", RISCV::X8)
+                               .Cases({"{s0}", "{fp}"}, RISCV::X8)
                                .Case("{s1}", RISCV::X9)
                                .Case("{a0}", RISCV::X10)
                                .Case("{a1}", RISCV::X11)
@@ -23969,38 +23983,38 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
   // use the ABI names in register constraint lists.
   if (Subtarget.hasStdExtF()) {
     unsigned FReg = StringSwitch<unsigned>(Constraint.lower())
-                        .Cases("{f0}", "{ft0}", RISCV::F0_F)
-                        .Cases("{f1}", "{ft1}", RISCV::F1_F)
-                        .Cases("{f2}", "{ft2}", RISCV::F2_F)
-                        .Cases("{f3}", "{ft3}", RISCV::F3_F)
-                        .Cases("{f4}", "{ft4}", RISCV::F4_F)
-                        .Cases("{f5}", "{ft5}", RISCV::F5_F)
-                        .Cases("{f6}", "{ft6}", RISCV::F6_F)
-                        .Cases("{f7}", "{ft7}", RISCV::F7_F)
-                        .Cases("{f8}", "{fs0}", RISCV::F8_F)
-                        .Cases("{f9}", "{fs1}", RISCV::F9_F)
-                        .Cases("{f10}", "{fa0}", RISCV::F10_F)
-                        .Cases("{f11}", "{fa1}", RISCV::F11_F)
-                        .Cases("{f12}", "{fa2}", RISCV::F12_F)
-                        .Cases("{f13}", "{fa3}", RISCV::F13_F)
-                        .Cases("{f14}", "{fa4}", RISCV::F14_F)
-                        .Cases("{f15}", "{fa5}", RISCV::F15_F)
-                        .Cases("{f16}", "{fa6}", RISCV::F16_F)
-                        .Cases("{f17}", "{fa7}", RISCV::F17_F)
-                        .Cases("{f18}", "{fs2}", RISCV::F18_F)
-                        .Cases("{f19}", "{fs3}", RISCV::F19_F)
-                        .Cases("{f20}", "{fs4}", RISCV::F20_F)
-                        .Cases("{f21}", "{fs5}", RISCV::F21_F)
-                        .Cases("{f22}", "{fs6}", RISCV::F22_F)
-                        .Cases("{f23}", "{fs7}", RISCV::F23_F)
-                        .Cases("{f24}", "{fs8}", RISCV::F24_F)
-                        .Cases("{f25}", "{fs9}", RISCV::F25_F)
-                        .Cases("{f26}", "{fs10}", RISCV::F26_F)
-                        .Cases("{f27}", "{fs11}", RISCV::F27_F)
-                        .Cases("{f28}", "{ft8}", RISCV::F28_F)
-                        .Cases("{f29}", "{ft9}", RISCV::F29_F)
-                        .Cases("{f30}", "{ft10}", RISCV::F30_F)
-                        .Cases("{f31}", "{ft11}", RISCV::F31_F)
+                        .Cases({"{f0}", "{ft0}"}, RISCV::F0_F)
+                        .Cases({"{f1}", "{ft1}"}, RISCV::F1_F)
+                        .Cases({"{f2}", "{ft2}"}, RISCV::F2_F)
+                        .Cases({"{f3}", "{ft3}"}, RISCV::F3_F)
+                        .Cases({"{f4}", "{ft4}"}, RISCV::F4_F)
+                        .Cases({"{f5}", "{ft5}"}, RISCV::F5_F)
+                        .Cases({"{f6}", "{ft6}"}, RISCV::F6_F)
+                        .Cases({"{f7}", "{ft7}"}, RISCV::F7_F)
+                        .Cases({"{f8}", "{fs0}"}, RISCV::F8_F)
+                        .Cases({"{f9}", "{fs1}"}, RISCV::F9_F)
+                        .Cases({"{f10}", "{fa0}"}, RISCV::F10_F)
+                        .Cases({"{f11}", "{fa1}"}, RISCV::F11_F)
+                        .Cases({"{f12}", "{fa2}"}, RISCV::F12_F)
+                        .Cases({"{f13}", "{fa3}"}, RISCV::F13_F)
+                        .Cases({"{f14}", "{fa4}"}, RISCV::F14_F)
+                        .Cases({"{f15}", "{fa5}"}, RISCV::F15_F)
+                        .Cases({"{f16}", "{fa6}"}, RISCV::F16_F)
+                        .Cases({"{f17}", "{fa7}"}, RISCV::F17_F)
+                        .Cases({"{f18}", "{fs2}"}, RISCV::F18_F)
+                        .Cases({"{f19}", "{fs3}"}, RISCV::F19_F)
+                        .Cases({"{f20}", "{fs4}"}, RISCV::F20_F)
+                        .Cases({"{f21}", "{fs5}"}, RISCV::F21_F)
+                        .Cases({"{f22}", "{fs6}"}, RISCV::F22_F)
+                        .Cases({"{f23}", "{fs7}"}, RISCV::F23_F)
+                        .Cases({"{f24}", "{fs8}"}, RISCV::F24_F)
+                        .Cases({"{f25}", "{fs9}"}, RISCV::F25_F)
+                        .Cases({"{f26}", "{fs10}"}, RISCV::F26_F)
+                        .Cases({"{f27}", "{fs11}"}, RISCV::F27_F)
+                        .Cases({"{f28}", "{ft8}"}, RISCV::F28_F)
+                        .Cases({"{f29}", "{ft9}"}, RISCV::F29_F)
+                        .Cases({"{f30}", "{ft10}"}, RISCV::F30_F)
+                        .Cases({"{f31}", "{ft11}"}, RISCV::F31_F)
                         .Default(RISCV::NoRegister);
     if (FReg != RISCV::NoRegister) {
       assert(RISCV::F0_F <= FReg && FReg <= RISCV::F31_F && "Unknown fp-reg");
diff --git a/llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp b/llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp
index a1c8e23793b92..c58a5c07a34f7 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp
@@ -48,7 +48,7 @@ class VXRMInfo {
   } State = Uninitialized;
 
 public:
-  VXRMInfo() {}
+  VXRMInfo() = default;
 
   static VXRMInfo getUnknown() {
     VXRMInfo Info;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 912b82d294f44..c9df787e0012d 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -869,7 +869,7 @@ std::optional<unsigned> getFoldedOpcode(MachineFunction &MF, MachineInstr &MI,
   }
 }
 
-// This is the version used during inline spilling
+// This is the version used during InlineSpiller::spillAroundUses
 MachineInstr *RISCVInstrInfo::foldMemoryOperandImpl(
     MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
     MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
@@ -1699,6 +1699,10 @@ unsigned getPredicatedOpcode(unsigned Opcode) {
   case RISCV::AND:   return RISCV::PseudoCCAND;
   case RISCV::OR:    return RISCV::PseudoCCOR;
   case RISCV::XOR:   return RISCV::PseudoCCXOR;
+  case RISCV::MAX:   return RISCV::PseudoCCMAX;
+  case RISCV::MAXU:  return RISCV::PseudoCCMAXU;
+  case RISCV::MIN:   return RISCV::PseudoCCMIN;
+  case RISCV::MINU:  return RISCV::PseudoCCMINU;
 
   case RISCV::ADDI:  return RISCV::PseudoCCADDI;
   case RISCV::SLLI:  return RISCV::PseudoCCSLLI;
@@ -1735,7 +1739,8 @@ unsigned getPredicatedOpcode(unsigned Opcode) {
 /// return the defining instruction.
 static MachineInstr *canFoldAsPredicatedOp(Register Reg,
                                            const MachineRegisterInfo &MRI,
-                                           const TargetInstrInfo *TII) {
+                                           const TargetInstrInfo *TII,
+                                           const RISCVSubtarget &STI) {
   if (!Reg.isVirtual())
     return nullptr;
   if (!MRI.hasOneNonDBGUse(Reg))
@@ -1743,6 +1748,12 @@ static MachineInstr *canFoldAsPredicatedOp(Register Reg,
   MachineInstr *MI = MRI.getVRegDef(Reg);
   if (!MI)
     return nullptr;
+
+  if (!STI.hasShortForwardBranchIMinMax() &&
+      (MI->getOpcode() == RISCV::MAX || MI->getOpcode() == RISCV::MIN ||
+       MI->getOpcode() == RISCV::MINU || MI->getOpcode() == RISCV::MAXU))
+    return nullptr;
+
   // Check if MI can be predicated and folded into the CCMOV.
   if (getPredicatedOpcode(MI->getOpcode()) == RISCV::INSTRUCTION_LIST_END)
     return nullptr;
@@ -1806,10 +1817,10 @@ RISCVInstrInfo::optimizeSelect(MachineInstr &MI,
 
   MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
   MachineInstr *DefMI =
-      canFoldAsPredicatedOp(MI.getOperand(5).getReg(), MRI, this);
+      canFoldAsPredicatedOp(MI.getOperand(5).getReg(), MRI, this, STI);
   bool Invert = !DefMI;
   if (!DefMI)
-    DefMI = canFoldAsPredicatedOp(MI.getOperand(4).getReg(), MRI, this);
+    DefMI = canFoldAsPredicatedOp(MI.getOperand(4).getReg(), MRI, this, STI);
   if (!DefMI)
     return nullptr;
 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index 7c89686ebfb3c..9cb53fb27a2d2 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -768,7 +768,7 @@ def BGE  : BranchCC_rri<0b101, "bge">;
 def BLTU : BranchCC_rri<0b110, "bltu">;
 def BGEU : BranchCC_rri<0b111, "bgeu">;
 
-let IsSignExtendingOpW = 1 in {
+let IsSignExtendingOpW = 1, canFoldAsLoad = 1 in {
 def LB  : Load_ri<0b000, "lb">, Sched<[WriteLDB, ReadMemBase]>;
 def LH  : Load_ri<0b001, "lh">, Sched<[WriteLDH, ReadMemBase]>;
 def LW  : Load_ri<0b010, "lw">, Sched<[WriteLDW, ReadMemBase]>;
@@ -889,8 +889,10 @@ def CSRRCI : CSR_ii<0b111, "csrrci">;
 /// RV64I instructions
 
 let Predicates = [IsRV64] in {
+let canFoldAsLoad = 1 in {
 def LWU   : Load_ri<0b110, "lwu">, Sched<[WriteLDW, ReadMemBase]>;
 def LD    : Load_ri<0b011, "ld">, Sched<[WriteLDD, ReadMemBase]>;
+}
 def SD    : Store_rri<0b011, "sd">, Sched<[WriteSTD, ReadStoreData, ReadMemBase]>;
 
 let IsSignExtendingOpW = 1 in {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
index afac37d6337d4..4ffe3e62ac501 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
@@ -71,6 +71,7 @@ defvar DExtsRV64 = [DExt, ZdinxExt];
 //===----------------------------------------------------------------------===//
 
 let Predicates = [HasStdExtD] in {
+let canFoldAsLoad = 1 in
 def FLD : FPLoad_r<0b011, "fld", FPR64, WriteFLD64>;
 
 // Operands for stores are in the order srcreg, base, offset rather than
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td
index 6571d998246a7..b30f8ec820c15 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td
@@ -330,6 +330,7 @@ class PseudoFROUND<DAGOperand Ty, ValueType vt, ValueType intvt = XLenVT>
 //===----------------------------------------------------------------------===//
 
 let Predicates = [HasStdExtF] in {
+let canFoldAsLoad = 1 in
 def FLW : FPLoad_r<0b010, "flw", FPR32, WriteFLD32>;
 
 // Operands for stores are in the order srcreg, base, offset rather than
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
index cc085bb6c9fd7..4cbbba3aa68cb 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
@@ -1461,5 +1461,10 @@ let Predicates = [HasStdExtP, IsRV32] in {
 // Codegen patterns
 //===----------------------------------------------------------------------===//
 
+def riscv_absw : RVSDNode<"ABSW", SDTIntUnaryOp>;
+
 let Predicates = [HasStdExtP] in
 def : PatGpr<abs, ABS>;
+
+let Predicates = [HasStdExtP, IsRV64] in
+def : PatGpr<riscv_absw, ABSW>;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td
index 0114fbdc56302..5a67a5aaba293 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td
@@ -106,6 +106,10 @@ def PseudoCCSRA : SFBALU_rr;
 def PseudoCCAND : SFBALU_rr;
 def PseudoCCOR  : SFBALU_rr;
 def PseudoCCXOR : SFBALU_rr;
+def PseudoCCMAX : SFBALU_rr;
+def PseudoCCMIN : SFBALU_rr;
+def PseudoCCMAXU : SFBALU_rr;
+def PseudoCCMINU : SFBALU_rr;
 
 def PseudoCCADDI : SFBALU_ri;
 def PseudoCCANDI : SFBALU_ri;
diff --git a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp
index d08115b72977f..ea98cdb4a1e67 100644
--- a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp
+++ b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp
@@ -172,6 +172,7 @@ static bool hasAllNBitUsers(const MachineInstr &OrigMI,
       case RISCV::CTZW:
       case RISCV::CPOPW:
       case RISCV::SLLI_UW:
+      case RISCV::ABSW:
       case RISCV::FMV_W_X:
       case RISCV::FCVT_H_W:
       case RISCV::FCVT_H_W_INX:
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index e9f43b9a71648..84bb29433fb3b 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -438,18 +438,19 @@ void RISCVRegisterInfo::lowerSegmentSpillReload(MachineBasicBlock::iterator II,
   TypeSize VRegSize = OldLoc.getValue().divideCoefficientBy(NumRegs);
 
   Register VLENB = 0;
-  unsigned PreHandledNum = 0;
+  unsigned VLENBShift = 0;
+  unsigned PrevHandledNum = 0;
   unsigned I = 0;
   while (I != NumRegs) {
     auto [LMulHandled, RegClass, Opcode] =
         getSpillReloadInfo(NumRegs - I, RegEncoding, IsSpill);
     auto [RegNumHandled, _] = RISCVVType::decodeVLMUL(LMulHandled);
     bool IsLast = I + RegNumHandled == NumRegs;
-    if (PreHandledNum) {
+    if (PrevHandledNum) {
       Register Step;
       // Optimize for constant VLEN.
       if (auto VLEN = STI.getRealVLen()) {
-        int64_t Offset = *VLEN / 8 * PreHandledNum;
+        int64_t Offset = *VLEN / 8 * PrevHandledNum;
         Step = MRI.createVirtualRegister(&RISCV::GPRRegClass);
         STI.getInstrInfo()->movImm(MBB, II, DL, Step, Offset);
       } else {
@@ -457,15 +458,21 @@ void RISCVRegisterInfo::lowerSegmentSpillReload(MachineBasicBlock::iterator II,
           VLENB = MRI.createVirtualRegister(&RISCV::GPRRegClass);
           BuildMI(MBB, II, DL, TII->get(RISCV::PseudoReadVLENB), VLENB);
         }
-        uint32_t ShiftAmount = Log2_32(PreHandledNum);
-        if (ShiftAmount == 0)
-          Step = VLENB;
-        else {
-          Step = MRI.createVirtualRegister(&RISCV::GPRRegClass);
-          BuildMI(MBB, II, DL, TII->get(RISCV::SLLI), Step)
-              .addReg(VLENB, getKillRegState(IsLast))
-              .addImm(ShiftAmount);
+        uint32_t ShiftAmount = Log2_32(PrevHandledNum);
+        // To avoid using an extra register, we shift the VLENB register and
+        // remember how much it has been shifted. We can then use relative
+        // shifts to adjust to the desired shift amount.
+        if (VLENBShift > ShiftAmount) {
+          BuildMI(MBB, II, DL, TII->get(RISCV::SRLI), VLENB)
+              .addReg(VLENB, RegState::Kill)
+              .addImm(VLENBShift - ShiftAmount);
+        } else if (VLENBShift < ShiftAmount) {
+          BuildMI(MBB, II, DL, TII->get(RISCV::SLLI), VLENB)
+              .addReg(VLENB, RegState::Kill)
+              .addImm(ShiftAmount - VLENBShift);
         }
+        VLENBShift = ShiftAmount;
+        Step = VLENB;
       }
 
       BuildMI(MBB, II, DL, TII->get(RISCV::ADD), NewBase)
@@ -489,7 +496,7 @@ void RISCVRegisterInfo::lowerSegmentSpillReload(MachineBasicBlock::iterator II,
     if (IsSpill)
       MIB.addReg(Reg, RegState::Implicit);
 
-    PreHandledNum = RegNumHandled;
+    PrevHandledNum = RegNumHandled;
     RegEncoding += RegNumHandled;
     I += RegNumHandled;
   }
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVTargetStreamer.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVTargetStreamer.cpp
index 0a318e0e01e59..ed6d355670cbd 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVTargetStreamer.cpp
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVTargetStreamer.cpp
@@ -15,4 +15,4 @@
 using namespace llvm;
 
 SPIRVTargetStreamer::SPIRVTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
-SPIRVTargetStreamer::~SPIRVTargetStreamer() {}
+SPIRVTargetStreamer::~SPIRVTargetStreamer() = default;
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
index 6181abb281cc6..47022b3f89a8b 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
@@ -745,7 +745,7 @@ Register SPIRVGlobalRegistry::buildGlobalVariable(
                  .addDef(ResVReg)
                  .addUse(getSPIRVTypeID(BaseType))
                  .addImm(static_cast<uint32_t>(Storage));
-  if (Init != 0)
+  if (Init)
     MIB.addUse(Init->getOperand(0).getReg());
   // ISel may introduce a new register on this step, so we need to add it to
   // DT and correct its type avoiding fails on the next stage.
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 021353ab716f7..3f0424f436c72 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -222,6 +222,9 @@ class SPIRVInstructionSelector : public InstructionSelector {
   bool selectWaveReduceMax(Register ResVReg, const SPIRVType *ResType,
                            MachineInstr &I, bool IsUnsigned) const;
 
+  bool selectWaveReduceMin(Register ResVReg, const SPIRVType *ResType,
+                           MachineInstr &I, bool IsUnsigned) const;
+
   bool selectWaveReduceSum(Register ResVReg, const SPIRVType *ResType,
                            MachineInstr &I) const;
 
@@ -2456,6 +2459,35 @@ bool SPIRVInstructionSelector::selectWaveReduceMax(Register ResVReg,
       .constrainAllUses(TII, TRI, RBI);
 }
 
+bool SPIRVInstructionSelector::selectWaveReduceMin(Register ResVReg,
+                                                   const SPIRVType *ResType,
+                                                   MachineInstr &I,
+                                                   bool IsUnsigned) const {
+  assert(I.getNumOperands() == 3);
+  assert(I.getOperand(2).isReg());
+  MachineBasicBlock &BB = *I.getParent();
+  Register InputRegister = I.getOperand(2).getReg();
+  SPIRVType *InputType = GR.getSPIRVTypeForVReg(InputRegister);
+
+  if (!InputType)
+    report_fatal_error("Input Type could not be determined.");
+
+  SPIRVType *IntTy = GR.getOrCreateSPIRVIntegerType(32, I, TII);
+  // Retreive the operation to use based on input type
+  bool IsFloatTy = GR.isScalarOrVectorOfType(InputRegister, SPIRV::OpTypeFloat);
+  auto IntegerOpcodeType =
+      IsUnsigned ? SPIRV::OpGroupNonUniformUMin : SPIRV::OpGroupNonUniformSMin;
+  auto Opcode = IsFloatTy ? SPIRV::OpGroupNonUniformFMin : IntegerOpcodeType;
+  return BuildMI(BB, I, I.getDebugLoc(), TII.get(Opcode))
+      .addDef(ResVReg)
+      .addUse(GR.getSPIRVTypeID(ResType))
+      .addUse(GR.getOrCreateConstInt(SPIRV::Scope::Subgroup, I, IntTy, TII,
+                                     !STI.isShader()))
+      .addImm(SPIRV::GroupOperation::Reduce)
+      .addUse(I.getOperand(2).getReg())
+      .constrainAllUses(TII, TRI, RBI);
+}
+
 bool SPIRVInstructionSelector::selectWaveReduceSum(Register ResVReg,
                                                    const SPIRVType *ResType,
                                                    MachineInstr &I) const {
@@ -3119,6 +3151,14 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
     return selectInsertElt(ResVReg, ResType, I);
   case Intrinsic::spv_gep:
     return selectGEP(ResVReg, ResType, I);
+  case Intrinsic::spv_bitcast: {
+    Register OpReg = I.getOperand(2).getReg();
+    SPIRVType *OpType =
+        OpReg.isValid() ? GR.getSPIRVTypeForVReg(OpReg) : nullptr;
+    if (!GR.isBitcastCompatible(ResType, OpType))
+      report_fatal_error("incompatible result and operand types in a bitcast");
+    return selectOpWithSrcs(ResVReg, ResType, I, {OpReg}, SPIRV::OpBitcast);
+  }
   case Intrinsic::spv_unref_global:
   case Intrinsic::spv_init_global: {
     MachineInstr *MI = MRI->getVRegDef(I.getOperand(1).getReg());
@@ -3431,6 +3471,10 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
     return selectWaveReduceMax(ResVReg, ResType, I, /*IsUnsigned*/ true);
   case Intrinsic::spv_wave_reduce_max:
     return selectWaveReduceMax(ResVReg, ResType, I, /*IsUnsigned*/ false);
+  case Intrinsic::spv_wave_reduce_umin:
+    return selectWaveReduceMin(ResVReg, ResType, I, /*IsUnsigned*/ true);
+  case Intrinsic::spv_wave_reduce_min:
+    return selectWaveReduceMin(ResVReg, ResType, I, /*IsUnsigned*/ false);
   case Intrinsic::spv_wave_reduce_sum:
     return selectWaveReduceSum(ResVReg, ResType, I);
   case Intrinsic::spv_wave_readlane:
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp
index 6e444c98de8da..65dffc7908b78 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp
@@ -73,16 +73,23 @@ class SPIRVLegalizePointerCast : public FunctionPass {
   // Returns the loaded value.
   Value *loadVectorFromVector(IRBuilder<> &B, FixedVectorType *SourceType,
                               FixedVectorType *TargetType, Value *Source) {
-    assert(TargetType->getNumElements() <= SourceType->getNumElements());
     LoadInst *NewLoad = B.CreateLoad(SourceType, Source);
     buildAssignType(B, SourceType, NewLoad);
     Value *AssignValue = NewLoad;
     if (TargetType->getElementType() != SourceType->getElementType()) {
+      const DataLayout &DL = B.GetInsertBlock()->getModule()->getDataLayout();
+      [[maybe_unused]] TypeSize TargetTypeSize =
+          DL.getTypeSizeInBits(TargetType);
+      [[maybe_unused]] TypeSize SourceTypeSize =
+          DL.getTypeSizeInBits(SourceType);
+      assert(TargetTypeSize == SourceTypeSize);
       AssignValue = B.CreateIntrinsic(Intrinsic::spv_bitcast,
                                       {TargetType, SourceType}, {NewLoad});
       buildAssignType(B, TargetType, AssignValue);
+      return AssignValue;
     }
 
+    assert(TargetType->getNumElements() < SourceType->getNumElements());
     SmallVector<int> Mask(/* Size= */ TargetType->getNumElements());
     for (unsigned I = 0; I < TargetType->getNumElements(); ++I)
       Mask[I] = I;
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index f7cdfcb65623b..db036a55ee6c6 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -613,8 +613,7 @@ static void collectOtherInstr(MachineInstr &MI, SPIRV::ModuleAnalysisInfo &MAI,
               << FinalFlags << "\n";
           MachineInstr *OrigMINonConst = const_cast<MachineInstr *>(OrigMI);
           MachineOperand &OrigFlagsOp = OrigMINonConst->getOperand(2);
-          OrigFlagsOp =
-              MachineOperand::CreateImm(static_cast<unsigned>(FinalFlags));
+          OrigFlagsOp = MachineOperand::CreateImm(FinalFlags);
           return; // Merge done, so we found a duplicate; don't add it to MAI.MS
         }
       }
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
index 2d19f6de604e4..44b6c66d361bf 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
@@ -81,7 +81,7 @@ struct RequirementHandler {
   void initAvailableCapabilitiesForVulkan(const SPIRVSubtarget &ST);
 
 public:
-  RequirementHandler() {}
+  RequirementHandler() = default;
   void clear() {
     MinimalCaps.clear();
     AllCaps.clear();
diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
index db6f2d61e8f29..d538009f0ecbe 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
@@ -192,31 +192,43 @@ static void buildOpBitcast(SPIRVGlobalRegistry *GR, MachineIRBuilder &MIB,
         .addUse(OpReg);
 }
 
-// We do instruction selections early instead of calling MIB.buildBitcast()
-// generating the general op code G_BITCAST. When MachineVerifier validates
-// G_BITCAST we see a check of a kind: if Source Type is equal to Destination
-// Type then report error "bitcast must change the type". This doesn't take into
-// account the notion of a typed pointer that is important for SPIR-V where a
-// user may and should use bitcast between pointers with different pointee types
-// (https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#OpBitcast).
-// It's important for correct lowering in SPIR-V, because interpretation of the
-// data type is not left to instructions that utilize the pointer, but encoded
-// by the pointer declaration, and the SPIRV target can and must handle the
-// declaration and use of pointers that specify the type of data they point to.
-// It's not feasible to improve validation of G_BITCAST using just information
-// provided by low level types of source and destination. Therefore we don't
-// produce G_BITCAST as the general op code with semantics different from
-// OpBitcast, but rather lower to OpBitcast immediately. As for now, the only
-// difference would be that CombinerHelper couldn't transform known patterns
-// around G_BUILD_VECTOR. See discussion
-// in https://github.com/llvm/llvm-project/pull/110270 for even more context.
-static void selectOpBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR,
-                             MachineIRBuilder MIB) {
+// We lower G_BITCAST to OpBitcast here to avoid a MachineVerifier error.
+// The verifier checks if the source and destination LLTs of a G_BITCAST are
+// different, but this check is too strict for SPIR-V's typed pointers, which
+// may have the same LLT but different SPIRVType (e.g. pointers to different
+// pointee types). By lowering to OpBitcast here, we bypass the verifier's
+// check. See discussion in https://github.com/llvm/llvm-project/pull/110270
+// for more context.
+//
+// We also handle the llvm.spv.bitcast intrinsic here. If the source and
+// destination SPIR-V types are the same, we lower it to a COPY to enable
+// further optimizations like copy propagation.
+static void lowerBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR,
+                          MachineIRBuilder MIB) {
   SmallVector<MachineInstr *, 16> ToErase;
   for (MachineBasicBlock &MBB : MF) {
     for (MachineInstr &MI : MBB) {
+      if (isSpvIntrinsic(MI, Intrinsic::spv_bitcast)) {
+        Register DstReg = MI.getOperand(0).getReg();
+        Register SrcReg = MI.getOperand(2).getReg();
+        SPIRVType *DstType = GR->getSPIRVTypeForVReg(DstReg);
+        assert(
+            DstType &&
+            "Expected destination SPIR-V type to have been assigned already.");
+        SPIRVType *SrcType = GR->getSPIRVTypeForVReg(SrcReg);
+        assert(SrcType &&
+               "Expected source SPIR-V type to have been assigned already.");
+        if (DstType == SrcType) {
+          MIB.setInsertPt(*MI.getParent(), MI);
+          MIB.buildCopy(DstReg, SrcReg);
+          ToErase.push_back(&MI);
+          continue;
+        }
+      }
+
       if (MI.getOpcode() != TargetOpcode::G_BITCAST)
         continue;
+
       MIB.setInsertPt(*MI.getParent(), MI);
       buildOpBitcast(GR, MIB, MI.getOperand(0).getReg(),
                      MI.getOperand(1).getReg());
@@ -237,16 +249,11 @@ static void insertBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR,
   SmallVector<MachineInstr *, 10> ToErase;
   for (MachineBasicBlock &MBB : MF) {
     for (MachineInstr &MI : MBB) {
-      if (!isSpvIntrinsic(MI, Intrinsic::spv_bitcast) &&
-          !isSpvIntrinsic(MI, Intrinsic::spv_ptrcast))
+      if (!isSpvIntrinsic(MI, Intrinsic::spv_ptrcast))
         continue;
       assert(MI.getOperand(2).isReg());
       MIB.setInsertPt(*MI.getParent(), MI);
       ToErase.push_back(&MI);
-      if (isSpvIntrinsic(MI, Intrinsic::spv_bitcast)) {
-        MIB.buildBitcast(MI.getOperand(0).getReg(), MI.getOperand(2).getReg());
-        continue;
-      }
       Register Def = MI.getOperand(0).getReg();
       Register Source = MI.getOperand(2).getReg();
       Type *ElemTy = getMDOperandAsType(MI.getOperand(3).getMetadata(), 0);
@@ -1089,7 +1096,7 @@ bool SPIRVPreLegalizer::runOnMachineFunction(MachineFunction &MF) {
   removeImplicitFallthroughs(MF, MIB);
   insertSpirvDecorations(MF, GR, MIB);
   insertInlineAsm(MF, GR, ST, MIB);
-  selectOpBitcasts(MF, GR, MIB);
+  lowerBitcasts(MF, GR, MIB);
 
   return true;
 }
diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
index 7dd0b95cd9763..5ba035682238b 100644
--- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
@@ -69,7 +69,7 @@ static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
 }
 
 // Pin SPIRVTargetObjectFile's vtables to this file.
-SPIRVTargetObjectFile::~SPIRVTargetObjectFile() {}
+SPIRVTargetObjectFile::~SPIRVTargetObjectFile() = default;
 
 SPIRVTargetMachine::SPIRVTargetMachine(const Target &T, const Triple &TT,
                                        StringRef CPU, StringRef FS,
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 3da720f54e6ab..58109acc92015 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -8973,8 +8973,7 @@ SystemZTargetLowering::getJumpConditionMergingParams(Instruction::BinaryOps Opc,
         if (const auto *CB = dyn_cast<CallBase>(RHSVal)) {
           if (CB->isInlineAsm()) {
             const InlineAsm *IA = cast<InlineAsm>(CB->getCalledOperand());
-            return IA &&
-                   IA->getConstraintString().find("{@cc}") != std::string::npos;
+            return IA && IA->getConstraintString().contains("{@cc}");
           }
         }
       }
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetObjectFile.h b/llvm/lib/Target/SystemZ/SystemZTargetObjectFile.h
index 9d0adbb81d86d..87ec2564edcfb 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetObjectFile.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetObjectFile.h
@@ -16,7 +16,7 @@ namespace llvm {
 /// This implementation is used for SystemZ ELF targets.
 class SystemZELFTargetObjectFile : public TargetLoweringObjectFileELF {
 public:
-  SystemZELFTargetObjectFile() {}
+  SystemZELFTargetObjectFile() = default;
 
   /// Describe a TLS variable address within debug info.
   const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h
index 7845cdfaebec7..1bfc61f0ab611 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h
@@ -76,7 +76,7 @@ class WebAssemblyException {
     BlockSet.insert(MBB);
   }
   ArrayRef<MachineBasicBlock *> getBlocks() const { return Blocks; }
-  using block_iterator = typename ArrayRef<MachineBasicBlock *>::const_iterator;
+  using block_iterator = ArrayRef<MachineBasicBlock *>::const_iterator;
   block_iterator block_begin() const { return getBlocks().begin(); }
   block_iterator block_end() const { return getBlocks().end(); }
   inline iterator_range<block_iterator> blocks() const {
@@ -96,7 +96,7 @@ class WebAssemblyException {
   void addSubException(std::unique_ptr<WebAssemblyException> E) {
     SubExceptions.push_back(std::move(E));
   }
-  using iterator = typename decltype(SubExceptions)::const_iterator;
+  using iterator = decltype(SubExceptions)::const_iterator;
   iterator begin() const { return SubExceptions.begin(); }
   iterator end() const { return SubExceptions.end(); }
 
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
index 27f7e1ada1250..5a1779c2c80fb 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
@@ -81,7 +81,7 @@ WebAssemblyFrameLowering::getLocalForStackObject(MachineFunction &MF,
   // Abuse object size to record number of WebAssembly locals allocated to
   // this object.
   MFI.setObjectSize(FrameIndex, ValueVTs.size());
-  return static_cast<unsigned>(Local);
+  return Local;
 }
 
 /// We need a base pointer in the case of having items on the stack that
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
index ff4d64693284a..ee575e3527673 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
@@ -207,8 +207,7 @@ template <> struct MappingTraits<WebAssemblyFunctionInfo> {
 template <> struct CustomMappingTraits<BBNumberMap> {
   static void inputOne(IO &YamlIO, StringRef Key,
                        BBNumberMap &SrcToUnwindDest) {
-    YamlIO.mapRequired(Key.str().c_str(),
-                       SrcToUnwindDest[std::atoi(Key.str().c_str())]);
+    YamlIO.mapRequired(Key, SrcToUnwindDest[std::atoi(Key.str().c_str())]);
   }
 
   static void output(IO &YamlIO, BBNumberMap &SrcToUnwindDest) {
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySortRegion.h b/llvm/lib/Target/WebAssembly/WebAssemblySortRegion.h
index e92bf17641854..96b8a4e33cbb7 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblySortRegion.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblySortRegion.h
@@ -35,7 +35,7 @@ class SortRegion {
   virtual MachineBasicBlock *getHeader() const = 0;
   virtual bool contains(const MachineBasicBlock *MBB) const = 0;
   virtual unsigned getNumBlocks() const = 0;
-  using block_iterator = typename ArrayRef<MachineBasicBlock *>::const_iterator;
+  using block_iterator = ArrayRef<MachineBasicBlock *>::const_iterator;
   virtual iterator_range<block_iterator> blocks() const = 0;
   virtual bool isLoop() const = 0;
 };
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 127ee67517aea..bac3692aebf83 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -1121,7 +1121,7 @@ class X86AsmParser : public MCTargetAsmParser {
     void setTypeInfo(AsmTypeInfo Type) { CurType = Type; }
   };
 
-  bool Error(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt,
+  bool Error(SMLoc L, const Twine &Msg, SMRange Range = {},
              bool MatchingInlineAsm = false) {
     MCAsmParser &Parser = getParser();
     if (MatchingInlineAsm) {
@@ -2470,10 +2470,10 @@ bool X86AsmParser::ParseIntelOffsetOperator(const MCExpr *&Val, StringRef &ID,
 // Report back its kind, or IOK_INVALID if does not evaluated as a known one
 unsigned X86AsmParser::IdentifyIntelInlineAsmOperator(StringRef Name) {
   return StringSwitch<unsigned>(Name)
-    .Cases("TYPE","type",IOK_TYPE)
-    .Cases("SIZE","size",IOK_SIZE)
-    .Cases("LENGTH","length",IOK_LENGTH)
-    .Default(IOK_INVALID);
+      .Cases({"TYPE", "type"}, IOK_TYPE)
+      .Cases({"SIZE", "size"}, IOK_SIZE)
+      .Cases({"LENGTH", "length"}, IOK_LENGTH)
+      .Default(IOK_INVALID);
 }
 
 /// Parse the 'LENGTH', 'TYPE' and 'SIZE' operators.  The LENGTH operator
@@ -2516,8 +2516,8 @@ unsigned X86AsmParser::ParseIntelInlineAsmOperator(unsigned OpKind) {
 unsigned X86AsmParser::IdentifyMasmOperator(StringRef Name) {
   return StringSwitch<unsigned>(Name.lower())
       .Case("type", MOK_TYPE)
-      .Cases("size", "sizeof", MOK_SIZEOF)
-      .Cases("length", "lengthof", MOK_LENGTHOF)
+      .Cases({"size", "sizeof"}, MOK_SIZEOF)
+      .Cases({"length", "lengthof"}, MOK_LENGTHOF)
       .Default(MOK_INVALID);
 }
 
@@ -2581,21 +2581,21 @@ bool X86AsmParser::ParseMasmOperator(unsigned OpKind, int64_t &Val) {
 bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size,
                                                StringRef *SizeStr) {
   Size = StringSwitch<unsigned>(getTok().getString())
-    .Cases("BYTE", "byte", 8)
-    .Cases("WORD", "word", 16)
-    .Cases("DWORD", "dword", 32)
-    .Cases("FLOAT", "float", 32)
-    .Cases("LONG", "long", 32)
-    .Cases("FWORD", "fword", 48)
-    .Cases("DOUBLE", "double", 64)
-    .Cases("QWORD", "qword", 64)
-    .Cases("MMWORD","mmword", 64)
-    .Cases("XWORD", "xword", 80)
-    .Cases("TBYTE", "tbyte", 80)
-    .Cases("XMMWORD", "xmmword", 128)
-    .Cases("YMMWORD", "ymmword", 256)
-    .Cases("ZMMWORD", "zmmword", 512)
-    .Default(0);
+             .Cases({"BYTE", "byte"}, 8)
+             .Cases({"WORD", "word"}, 16)
+             .Cases({"DWORD", "dword"}, 32)
+             .Cases({"FLOAT", "float"}, 32)
+             .Cases({"LONG", "long"}, 32)
+             .Cases({"FWORD", "fword"}, 48)
+             .Cases({"DOUBLE", "double"}, 64)
+             .Cases({"QWORD", "qword"}, 64)
+             .Cases({"MMWORD", "mmword"}, 64)
+             .Cases({"XWORD", "xword"}, 80)
+             .Cases({"TBYTE", "tbyte"}, 80)
+             .Cases({"XMMWORD", "xmmword"}, 128)
+             .Cases({"YMMWORD", "ymmword"}, 256)
+             .Cases({"ZMMWORD", "zmmword"}, 512)
+             .Default(0);
   if (Size) {
     if (SizeStr)
       *SizeStr = getTok().getString();
@@ -2886,22 +2886,22 @@ bool X86AsmParser::parseATTOperand(OperandVector &Operands) {
 // otherwise the EFLAGS Condition Code enumerator.
 X86::CondCode X86AsmParser::ParseConditionCode(StringRef CC) {
   return StringSwitch<X86::CondCode>(CC)
-      .Case("o", X86::COND_O)          // Overflow
-      .Case("no", X86::COND_NO)        // No Overflow
-      .Cases("b", "nae", X86::COND_B)  // Below/Neither Above nor Equal
-      .Cases("ae", "nb", X86::COND_AE) // Above or Equal/Not Below
-      .Cases("e", "z", X86::COND_E)    // Equal/Zero
-      .Cases("ne", "nz", X86::COND_NE) // Not Equal/Not Zero
-      .Cases("be", "na", X86::COND_BE) // Below or Equal/Not Above
-      .Cases("a", "nbe", X86::COND_A)  // Above/Neither Below nor Equal
-      .Case("s", X86::COND_S)          // Sign
-      .Case("ns", X86::COND_NS)        // No Sign
-      .Cases("p", "pe", X86::COND_P)   // Parity/Parity Even
-      .Cases("np", "po", X86::COND_NP) // No Parity/Parity Odd
-      .Cases("l", "nge", X86::COND_L)  // Less/Neither Greater nor Equal
-      .Cases("ge", "nl", X86::COND_GE) // Greater or Equal/Not Less
-      .Cases("le", "ng", X86::COND_LE) // Less or Equal/Not Greater
-      .Cases("g", "nle", X86::COND_G)  // Greater/Neither Less nor Equal
+      .Case("o", X86::COND_O)            // Overflow
+      .Case("no", X86::COND_NO)          // No Overflow
+      .Cases({"b", "nae"}, X86::COND_B)  // Below/Neither Above nor Equal
+      .Cases({"ae", "nb"}, X86::COND_AE) // Above or Equal/Not Below
+      .Cases({"e", "z"}, X86::COND_E)    // Equal/Zero
+      .Cases({"ne", "nz"}, X86::COND_NE) // Not Equal/Not Zero
+      .Cases({"be", "na"}, X86::COND_BE) // Below or Equal/Not Above
+      .Cases({"a", "nbe"}, X86::COND_A)  // Above/Neither Below nor Equal
+      .Case("s", X86::COND_S)            // Sign
+      .Case("ns", X86::COND_NS)          // No Sign
+      .Cases({"p", "pe"}, X86::COND_P)   // Parity/Parity Even
+      .Cases({"np", "po"}, X86::COND_NP) // No Parity/Parity Odd
+      .Cases({"l", "nge"}, X86::COND_L)  // Less/Neither Greater nor Equal
+      .Cases({"ge", "nl"}, X86::COND_GE) // Greater or Equal/Not Less
+      .Cases({"le", "ng"}, X86::COND_LE) // Less or Equal/Not Greater
+      .Cases({"g", "nle"}, X86::COND_G)  // Greater/Neither Less nor Equal
       .Default(X86::COND_INVALID);
 }
 
@@ -4322,7 +4322,7 @@ bool X86AsmParser::matchAndEmitATTInstruction(
     SMLoc IDLoc, unsigned &Opcode, MCInst &Inst, OperandVector &Operands,
     MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) {
   X86Operand &Op = static_cast<X86Operand &>(*Operands[0]);
-  SMRange EmptyRange = std::nullopt;
+  SMRange EmptyRange;
   // In 16-bit mode, if data32 is specified, temporarily switch to 32-bit mode
   // when matching the instruction.
   if (ForcedDataPrefix == X86::Is32Bit)
@@ -4548,7 +4548,7 @@ bool X86AsmParser::matchAndEmitIntelInstruction(
     SMLoc IDLoc, unsigned &Opcode, MCInst &Inst, OperandVector &Operands,
     MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) {
   X86Operand &Op = static_cast<X86Operand &>(*Operands[0]);
-  SMRange EmptyRange = std::nullopt;
+  SMRange EmptyRange;
   // Find one unsized memory operand, if present.
   X86Operand *UnsizedMemOp = nullptr;
   for (const auto &Op : Operands) {
diff --git a/llvm/lib/Target/X86/AsmParser/X86Operand.h b/llvm/lib/Target/X86/AsmParser/X86Operand.h
index 89ac53e0ecac9..a92272573bacd 100644
--- a/llvm/lib/Target/X86/AsmParser/X86Operand.h
+++ b/llvm/lib/Target/X86/AsmParser/X86Operand.h
@@ -620,37 +620,6 @@ struct X86Operand final : public MCParsedAsmOperand {
     Inst.addOperand(MCOperand::createReg(Reg));
   }
 
-  bool isTILEPair() const {
-    return Kind == Register &&
-           X86MCRegisterClasses[X86::TILERegClassID].contains(getReg());
-  }
-
-  void addTILEPairOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    MCRegister Reg = getReg();
-    switch (Reg.id()) {
-    default:
-      llvm_unreachable("Invalid tile register!");
-    case X86::TMM0:
-    case X86::TMM1:
-      Reg = X86::TMM0_TMM1;
-      break;
-    case X86::TMM2:
-    case X86::TMM3:
-      Reg = X86::TMM2_TMM3;
-      break;
-    case X86::TMM4:
-    case X86::TMM5:
-      Reg = X86::TMM4_TMM5;
-      break;
-    case X86::TMM6:
-    case X86::TMM7:
-      Reg = X86::TMM6_TMM7;
-      break;
-    }
-    Inst.addOperand(MCOperand::createReg(Reg));
-  }
-
   void addMemOperands(MCInst &Inst, unsigned N) const {
     assert((N == 5) && "Invalid number of operands!");
     if (getMemBaseReg())
diff --git a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
index 4927b453458ef..7d2b5eb900133 100644
--- a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -810,10 +810,6 @@ static int readModRM(struct InternalInstruction *insn) {
       if (index > 7)                                                           \
         *valid = 0;                                                            \
       return prefix##_TMM0 + index;                                            \
-    case TYPE_TMM_PAIR:                                                        \
-      if (index > 7)                                                           \
-        *valid = 0;                                                            \
-      return prefix##_TMM0_TMM1 + (index / 2);                                 \
     case TYPE_VK:                                                              \
       index &= 0xf;                                                            \
       if (index > 7)                                                           \
@@ -2323,7 +2319,6 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand,
   case TYPE_YMM:
   case TYPE_ZMM:
   case TYPE_TMM:
-  case TYPE_TMM_PAIR:
   case TYPE_VK_PAIR:
   case TYPE_VK:
   case TYPE_DEBUGREG:
diff --git a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
index dc9af2caa77b1..b0aa70be12d83 100644
--- a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
+++ b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -535,12 +535,6 @@ namespace X86Disassembler {
   ENTRY(TMM6)                                                                  \
   ENTRY(TMM7)
 
-#define REGS_TMM_PAIRS                                                         \
-  ENTRY(TMM0_TMM1)                                                             \
-  ENTRY(TMM2_TMM3)                                                             \
-  ENTRY(TMM4_TMM5)                                                             \
-  ENTRY(TMM6_TMM7)
-
 #define ALL_EA_BASES                                                           \
   EA_BASES_16BIT                                                               \
   EA_BASES_32BIT                                                               \
@@ -565,7 +559,6 @@ namespace X86Disassembler {
   REGS_DEBUG                                                                   \
   REGS_CONTROL                                                                 \
   REGS_TMM                                                                     \
-  REGS_TMM_PAIRS                                                               \
   ENTRY(RIP)
 
 /// All possible values of the base field for effective-address
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
index 1c5f1663d4f52..759d95e5a18ea 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
@@ -467,22 +467,3 @@ void X86InstPrinterCommon::printVKPair(const MCInst *MI, unsigned OpNo,
   }
   llvm_unreachable("Unknown mask pair register name");
 }
-
-void X86InstPrinterCommon::printTILEPair(const MCInst *MI, unsigned OpNo,
-                                         raw_ostream &OS) {
-  switch (MI->getOperand(OpNo).getReg()) {
-  case X86::TMM0_TMM1:
-    printRegName(OS, X86::TMM0);
-    return;
-  case X86::TMM2_TMM3:
-    printRegName(OS, X86::TMM2);
-    return;
-  case X86::TMM4_TMM5:
-    printRegName(OS, X86::TMM4);
-    return;
-  case X86::TMM6_TMM7:
-    printRegName(OS, X86::TMM6);
-    return;
-  }
-  llvm_unreachable("Unknown mask pair register name");
-}
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h
index 2c9467ca7c615..cb55f2f0019b5 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h
@@ -40,7 +40,6 @@ class X86InstPrinterCommon : public MCInstPrinter {
                       const MCSubtargetInfo &STI);
   void printOptionalSegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printVKPair(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
-  void printTILEPair(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index a1fd366e59444..9e291a6ae431f 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -274,9 +274,6 @@ def FeatureAMXFP8 : SubtargetFeature<"amx-fp8", "HasAMXFP8", "true",
 def FeatureAMXMOVRS : SubtargetFeature<"amx-movrs", "HasAMXMOVRS", "true",
                                        "Support AMX-MOVRS instructions",
                                        [FeatureAMXTILE]>;
-def FeatureAMXTRANSPOSE : SubtargetFeature<"amx-transpose", "HasAMXTRANSPOSE", "true",
-                                           "Support AMX amx-transpose instructions",
-                                           [FeatureAMXTILE]>;
 def FeatureAMXAVX512 : SubtargetFeature<"amx-avx512",
                                         "HasAMXAVX512", "true",
                                         "Support AMX-AVX512 instructions",
@@ -1177,8 +1174,7 @@ def ProcessorFeatures {
                                                   FeatureAMXMOVRS,
                                                   FeatureAMXAVX512,
                                                   FeatureAMXFP8,
-                                                  FeatureAMXTF32,
-                                                  FeatureAMXTRANSPOSE];
+                                                  FeatureAMXTF32];
   list<SubtargetFeature> DMRFeatures =
     !listconcat(GNRDFeatures, DMRAdditionalFeatures);
 
diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index 4a9b824b0db14..e3c44c048f7bf 100644
--- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -649,149 +649,6 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
     MI.setDesc(TII->get(Opc));
     return true;
   }
-  // TILEPAIRLOAD is just for TILEPair spill, we don't have corresponding
-  // AMX instruction to support it. So, split it to 2 load instructions:
-  // "TILEPAIRLOAD TMM0:TMM1, Base, Scale, Index, Offset, Segment" -->
-  // "TILELOAD TMM0, Base, Scale, Index, Offset, Segment" +
-  // "TILELOAD TMM1, Base, Scale, Index, Offset + TMM_SIZE, Segment"
-  case X86::PTILEPAIRLOAD: {
-    int64_t Disp = MBBI->getOperand(1 + X86::AddrDisp).getImm();
-    Register TReg = MBBI->getOperand(0).getReg();
-    bool DstIsDead = MBBI->getOperand(0).isDead();
-    Register TReg0 = TRI->getSubReg(TReg, X86::sub_t0);
-    Register TReg1 = TRI->getSubReg(TReg, X86::sub_t1);
-    unsigned TmmSize = TRI->getRegSizeInBits(X86::TILERegClass) / 8;
-
-    MachineInstrBuilder MIBLo =
-        BuildMI(MBB, MBBI, DL, TII->get(X86::TILELOADD))
-            .addReg(TReg0, RegState::Define | getDeadRegState(DstIsDead));
-    MachineInstrBuilder MIBHi =
-        BuildMI(MBB, MBBI, DL, TII->get(X86::TILELOADD))
-            .addReg(TReg1, RegState::Define | getDeadRegState(DstIsDead));
-
-    for (int i = 0; i < X86::AddrNumOperands; ++i) {
-      MIBLo.add(MBBI->getOperand(1 + i));
-      if (i == X86::AddrDisp)
-        MIBHi.addImm(Disp + TmmSize);
-      else
-        MIBHi.add(MBBI->getOperand(1 + i));
-    }
-
-    // Make sure the first stride reg used in first tileload is alive.
-    MachineOperand &Stride =
-        MIBLo.getInstr()->getOperand(1 + X86::AddrIndexReg);
-    Stride.setIsKill(false);
-
-    // Split the memory operand, adjusting the offset and size for the halves.
-    MachineMemOperand *OldMMO = MBBI->memoperands().front();
-    MachineFunction *MF = MBB.getParent();
-    MachineMemOperand *MMOLo = MF->getMachineMemOperand(OldMMO, 0, TmmSize);
-    MachineMemOperand *MMOHi =
-        MF->getMachineMemOperand(OldMMO, TmmSize, TmmSize);
-
-    MIBLo.setMemRefs(MMOLo);
-    MIBHi.setMemRefs(MMOHi);
-
-    // Delete the pseudo.
-    MBB.erase(MBBI);
-    return true;
-  }
-  // Similar with TILEPAIRLOAD, TILEPAIRSTORE is just for TILEPair spill, no
-  // corresponding AMX instruction to support it. So, split it too:
-  // "TILEPAIRSTORE Base, Scale, Index, Offset, Segment, TMM0:TMM1" -->
-  // "TILESTORE Base, Scale, Index, Offset, Segment, TMM0" +
-  // "TILESTORE Base, Scale, Index, Offset + TMM_SIZE, Segment, TMM1"
-  case X86::PTILEPAIRSTORE: {
-    int64_t Disp = MBBI->getOperand(X86::AddrDisp).getImm();
-    Register TReg = MBBI->getOperand(X86::AddrNumOperands).getReg();
-    bool SrcIsKill = MBBI->getOperand(X86::AddrNumOperands).isKill();
-    Register TReg0 = TRI->getSubReg(TReg, X86::sub_t0);
-    Register TReg1 = TRI->getSubReg(TReg, X86::sub_t1);
-    unsigned TmmSize = TRI->getRegSizeInBits(X86::TILERegClass) / 8;
-
-    MachineInstrBuilder MIBLo =
-        BuildMI(MBB, MBBI, DL, TII->get(X86::TILESTORED));
-    MachineInstrBuilder MIBHi =
-        BuildMI(MBB, MBBI, DL, TII->get(X86::TILESTORED));
-
-    for (int i = 0; i < X86::AddrNumOperands; ++i) {
-      MIBLo.add(MBBI->getOperand(i));
-      if (i == X86::AddrDisp)
-        MIBHi.addImm(Disp + TmmSize);
-      else
-        MIBHi.add(MBBI->getOperand(i));
-    }
-    MIBLo.addReg(TReg0, getKillRegState(SrcIsKill));
-    MIBHi.addReg(TReg1, getKillRegState(SrcIsKill));
-
-    // Make sure the first stride reg used in first tilestore is alive.
-    MachineOperand &Stride = MIBLo.getInstr()->getOperand(X86::AddrIndexReg);
-    Stride.setIsKill(false);
-
-    // Split the memory operand, adjusting the offset and size for the halves.
-    MachineMemOperand *OldMMO = MBBI->memoperands().front();
-    MachineFunction *MF = MBB.getParent();
-    MachineMemOperand *MMOLo = MF->getMachineMemOperand(OldMMO, 0, TmmSize);
-    MachineMemOperand *MMOHi =
-        MF->getMachineMemOperand(OldMMO, TmmSize, TmmSize);
-
-    MIBLo.setMemRefs(MMOLo);
-    MIBHi.setMemRefs(MMOHi);
-
-    // Delete the pseudo.
-    MBB.erase(MBBI);
-    return true;
-  }
-  case X86::PT2RPNTLVWZ0V:
-  case X86::PT2RPNTLVWZ0T1V:
-  case X86::PT2RPNTLVWZ1V:
-  case X86::PT2RPNTLVWZ1T1V:
-  case X86::PT2RPNTLVWZ0RSV:
-  case X86::PT2RPNTLVWZ0RST1V:
-  case X86::PT2RPNTLVWZ1RSV:
-  case X86::PT2RPNTLVWZ1RST1V: {
-    for (unsigned i = 3; i > 0; --i)
-      MI.removeOperand(i);
-    unsigned Opc;
-    switch (Opcode) {
-    case X86::PT2RPNTLVWZ0V:
-      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0);
-      break;
-    case X86::PT2RPNTLVWZ0T1V:
-      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1);
-      break;
-    case X86::PT2RPNTLVWZ1V:
-      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1);
-      break;
-    case X86::PT2RPNTLVWZ1T1V:
-      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1);
-      break;
-    case X86::PT2RPNTLVWZ0RSV:
-      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS);
-      break;
-    case X86::PT2RPNTLVWZ0RST1V:
-      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1);
-      break;
-    case X86::PT2RPNTLVWZ1RSV:
-      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS);
-      break;
-    case X86::PT2RPNTLVWZ1RST1V:
-      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1);
-      break;
-    default:
-      llvm_unreachable("Impossible Opcode!");
-    }
-    MI.setDesc(TII->get(Opc));
-    return true;
-  }
-  case X86::PTTRANSPOSEDV:
-  case X86::PTCONJTFP16V: {
-    for (int i = 2; i > 0; --i)
-      MI.removeOperand(i);
-    MI.setDesc(TII->get(Opcode == X86::PTTRANSPOSEDV ? X86::TTRANSPOSED
-                                                     : X86::TCONJTFP16));
-    return true;
-  }
   case X86::PTCMMIMFP16PSV:
   case X86::PTCMMRLFP16PSV:
   case X86::PTDPBSSDV:
@@ -800,13 +657,7 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
   case X86::PTDPBUUDV:
   case X86::PTDPBF16PSV:
   case X86::PTDPFP16PSV:
-  case X86::PTTDPBF16PSV:
-  case X86::PTTDPFP16PSV:
-  case X86::PTTCMMIMFP16PSV:
-  case X86::PTTCMMRLFP16PSV:
-  case X86::PTCONJTCMMIMFP16PSV:
   case X86::PTMMULTF32PSV:
-  case X86::PTTMMULTF32PSV:
   case X86::PTDPBF8PSV:
   case X86::PTDPBHF8PSV:
   case X86::PTDPHBF8PSV:
@@ -816,6 +667,7 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
       MI.removeOperand(i);
     unsigned Opc;
     switch (Opcode) {
+      // clang-format off
     case X86::PTCMMIMFP16PSV:  Opc = X86::TCMMIMFP16PS; break;
     case X86::PTCMMRLFP16PSV:  Opc = X86::TCMMRLFP16PS; break;
     case X86::PTDPBSSDV:   Opc = X86::TDPBSSD; break;
@@ -824,40 +676,12 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
     case X86::PTDPBUUDV:   Opc = X86::TDPBUUD; break;
     case X86::PTDPBF16PSV: Opc = X86::TDPBF16PS; break;
     case X86::PTDPFP16PSV: Opc = X86::TDPFP16PS; break;
-    case X86::PTTDPBF16PSV:
-      Opc = X86::TTDPBF16PS;
-      break;
-    case X86::PTTDPFP16PSV:
-      Opc = X86::TTDPFP16PS;
-      break;
-    case X86::PTTCMMIMFP16PSV:
-      Opc = X86::TTCMMIMFP16PS;
-      break;
-    case X86::PTTCMMRLFP16PSV:
-      Opc = X86::TTCMMRLFP16PS;
-      break;
-    case X86::PTCONJTCMMIMFP16PSV:
-      Opc = X86::TCONJTCMMIMFP16PS;
-      break;
-    case X86::PTMMULTF32PSV:
-      Opc = X86::TMMULTF32PS;
-      break;
-    case X86::PTTMMULTF32PSV:
-      Opc = X86::TTMMULTF32PS;
-      break;
-    case X86::PTDPBF8PSV:
-      Opc = X86::TDPBF8PS;
-      break;
-    case X86::PTDPBHF8PSV:
-      Opc = X86::TDPBHF8PS;
-      break;
-    case X86::PTDPHBF8PSV:
-      Opc = X86::TDPHBF8PS;
-      break;
-    case X86::PTDPHF8PSV:
-      Opc = X86::TDPHF8PS;
-      break;
-
+    case X86::PTMMULTF32PSV: Opc = X86::TMMULTF32PS; break;
+    case X86::PTDPBF8PSV: Opc = X86::TDPBF8PS; break;
+    case X86::PTDPBHF8PSV: Opc = X86::TDPBHF8PS; break;
+    case X86::PTDPHBF8PSV: Opc = X86::TDPHBF8PS; break;
+    case X86::PTDPHF8PSV: Opc = X86::TDPHF8PS; break;
+    // clang-format on
     default:
       llvm_unreachable("Unexpected Opcode");
     }
diff --git a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
index 787b71d425cb3..06f729a7e0cdc 100644
--- a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
@@ -267,24 +267,16 @@ void X86FastPreTileConfig::reload(MachineBasicBlock::iterator UseMI,
                     << printReg(TileReg, TRI) << '\n');
 }
 
-static unsigned getTileDefNum(MachineRegisterInfo *MRI, Register Reg) {
-  if (Reg.isVirtual()) {
-    unsigned RegClassID = MRI->getRegClass(Reg)->getID();
-    if (RegClassID == X86::TILERegClassID)
-      return 1;
-    if (RegClassID == X86::TILEPAIRRegClassID)
-      return 2;
-  } else {
-    if (Reg >= X86::TMM0 && Reg <= X86::TMM7)
-      return 1;
-    if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7)
-      return 2;
+static bool isTileRegister(MachineRegisterInfo *MRI, Register Reg) {
+  if (Reg.isVirtual() &&
+      (MRI->getRegClass(Reg)->getID() == X86::TILERegClassID)) {
+    return true;
   }
-  return 0;
-}
 
-static bool isTileRegister(MachineRegisterInfo *MRI, Register VirtReg) {
-  return getTileDefNum(MRI, VirtReg) > 0;
+  if (Reg >= X86::TMM0 && Reg <= X86::TMM7)
+    return true;
+
+  return false;
 }
 
 static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) {
@@ -296,7 +288,7 @@ static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) {
   if (!MO.isReg())
     return false;
 
-  return getTileDefNum(MRI, MO.getReg()) > 0;
+  return isTileRegister(MRI, MO.getReg());
 }
 
 static ShapeT getShape(MachineRegisterInfo *MRI, Register TileReg) {
@@ -636,19 +628,7 @@ bool X86FastPreTileConfig::configBasicBlock(MachineBasicBlock &MBB) {
       else if (dominates(MBB, LastShapeMI, ColMI))
         LastShapeMI = ColMI;
     }
-    unsigned TileDefNum = getTileDefNum(MRI, MI.getOperand(0).getReg());
-    if (TileDefNum > 1) {
-      for (unsigned I = 1; I < TileDefNum; I++) {
-        MachineOperand *ColxMO = &MI.getOperand(2 + I);
-        MachineInstr *ColxMI = MRI->getVRegDef(ColxMO->getReg());
-        if (ColxMI->getParent() == &MBB) {
-          if (!LastShapeMI)
-            LastShapeMI = ColxMI;
-          else if (dominates(MBB, LastShapeMI, ColxMI))
-            LastShapeMI = ColxMI;
-        }
-      }
-    }
+
     // If there is user live out of the tilecfg, spill it and reload in
     // before the user.
     Register TileReg = MI.getOperand(0).getReg();
diff --git a/llvm/lib/Target/X86/X86FastTileConfig.cpp b/llvm/lib/Target/X86/X86FastTileConfig.cpp
index 11d331b11737f..d86ae36aa2a67 100644
--- a/llvm/lib/Target/X86/X86FastTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86FastTileConfig.cpp
@@ -77,14 +77,14 @@ INITIALIZE_PASS_BEGIN(X86FastTileConfig, DEBUG_TYPE,
 INITIALIZE_PASS_END(X86FastTileConfig, DEBUG_TYPE,
                     "Fast Tile Register Configure", false, false)
 
-static unsigned getNumDefTiles(MachineRegisterInfo *MRI, MachineInstr &MI) {
+static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) {
   // There is no phi instruction after register allocation.
   assert(MI.isPHI() == false);
   // The instruction must have 3 operands: tile def, row, col.
   // It should be AMX pseudo instruction that have shape operand.
   if (MI.isDebugInstr() || MI.isCopy() || MI.getNumOperands() < 3 ||
       !MI.isPseudo())
-    return 0;
+    return false;
   MachineOperand &MO = MI.getOperand(0);
 
   if (MO.isReg()) {
@@ -93,24 +93,18 @@ static unsigned getNumDefTiles(MachineRegisterInfo *MRI, MachineInstr &MI) {
     // register is not rewritten yet.
     if (Reg.isVirtual()) {
       if (MRI->getRegClass(Reg)->getID() == X86::TILERegClassID)
-        return 1;
-      if (MRI->getRegClass(Reg)->getID() == X86::TILEPAIRRegClassID)
-        return 2;
+        return true;
     }
     if (Reg >= X86::TMM0 && Reg <= X86::TMM7)
-      return 1;
-    if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7)
-      return 2;
+      return true;
   }
 
-  return 0;
+  return false;
 }
 
 static unsigned getTMMIndex(Register Reg) {
   if (Reg >= X86::TMM0 && Reg <= X86::TMM7)
     return Reg - X86::TMM0;
-  if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7)
-    return (Reg - X86::TMM0_TMM1) * 2;
   llvm_unreachable("Invalid Tmm Reg!");
 }
 
@@ -120,17 +114,14 @@ bool X86FastTileConfig::configBasicBlock(MachineBasicBlock &MBB) {
   bool Change = false;
   SmallVector<std::pair<unsigned, ShapeT>, 6> ShapeInfos;
   for (MachineInstr &MI : reverse(MBB)) {
-    unsigned DefNum = getNumDefTiles(MRI, MI);
-    if (DefNum == 0 && MI.getOpcode() != X86::PLDTILECFGV)
+    if (!isTileDef(MRI, MI) && MI.getOpcode() != X86::PLDTILECFGV)
       continue;
     // AMX instructions that define tile register.
     if (MI.getOpcode() != X86::PLDTILECFGV) {
       MachineOperand &Row = MI.getOperand(1);
       unsigned TMMIdx = getTMMIndex(MI.getOperand(0).getReg());
-      for (unsigned I = 0; I < DefNum; I++) {
-        MachineOperand &Col = MI.getOperand(2 + I);
-        ShapeInfos.push_back({TMMIdx + I, ShapeT(&Row, &Col)});
-      }
+      MachineOperand &Col = MI.getOperand(2);
+      ShapeInfos.push_back({TMMIdx, ShapeT(&Row, &Col)});
     } else { // PLDTILECFGV
       // Rewrite the shape information to memory. Stack slot should have
       // been initialized to zero in pre config.
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 4393f6ecaa033..d4418c8563780 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -337,23 +337,8 @@ namespace {
     // lowering but before ISEL.
     bool isAMXSDNode(SDNode *N) const {
       // Check if N is AMX SDNode:
-      // 1. check specific opcode since these carry MVT::Untyped instead of
-      // x86amx_type;
-      // 2. check result type;
-      // 3. check operand type;
-      switch (N->getOpcode()) {
-      default:
-        break;
-      case X86::PT2RPNTLVWZ0V:
-      case X86::PT2RPNTLVWZ0T1V:
-      case X86::PT2RPNTLVWZ1V:
-      case X86::PT2RPNTLVWZ1T1V:
-      case X86::PT2RPNTLVWZ0RSV:
-      case X86::PT2RPNTLVWZ0RST1V:
-      case X86::PT2RPNTLVWZ1RSV:
-      case X86::PT2RPNTLVWZ1RST1V:
-        return true;
-      }
+      // 1. check result type;
+      // 2. check operand type;
       for (unsigned Idx = 0, E = N->getNumValues(); Idx != E; ++Idx) {
         if (N->getValueType(Idx) == MVT::x86amx)
           return true;
@@ -5398,65 +5383,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
       ReplaceNode(Node, CNode);
       return;
     }
-    case Intrinsic::x86_t2rpntlvwz0rs:
-    case Intrinsic::x86_t2rpntlvwz0rst1:
-    case Intrinsic::x86_t2rpntlvwz1rs:
-    case Intrinsic::x86_t2rpntlvwz1rst1:
-      if (!Subtarget->hasAMXMOVRS())
-        break;
-      [[fallthrough]];
-    case Intrinsic::x86_t2rpntlvwz0:
-    case Intrinsic::x86_t2rpntlvwz0t1:
-    case Intrinsic::x86_t2rpntlvwz1:
-    case Intrinsic::x86_t2rpntlvwz1t1: {
-      if (!Subtarget->hasAMXTRANSPOSE())
-        break;
-      auto *MFI =
-          CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
-      MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
-      unsigned Opc;
-      switch (IntNo) {
-      default:
-        llvm_unreachable("Unexpected intrinsic!");
-      case Intrinsic::x86_t2rpntlvwz0:
-        Opc = X86::PT2RPNTLVWZ0;
-        break;
-      case Intrinsic::x86_t2rpntlvwz0t1:
-        Opc = X86::PT2RPNTLVWZ0T1;
-        break;
-      case Intrinsic::x86_t2rpntlvwz1:
-        Opc = X86::PT2RPNTLVWZ1;
-        break;
-      case Intrinsic::x86_t2rpntlvwz1t1:
-        Opc = X86::PT2RPNTLVWZ1T1;
-        break;
-      case Intrinsic::x86_t2rpntlvwz0rs:
-        Opc = X86::PT2RPNTLVWZ0RS;
-        break;
-      case Intrinsic::x86_t2rpntlvwz0rst1:
-        Opc = X86::PT2RPNTLVWZ0RST1;
-        break;
-      case Intrinsic::x86_t2rpntlvwz1rs:
-        Opc = X86::PT2RPNTLVWZ1RS;
-        break;
-      case Intrinsic::x86_t2rpntlvwz1rst1:
-        Opc = X86::PT2RPNTLVWZ1RST1;
-        break;
-      }
-      // FIXME: Match displacement and scale.
-      unsigned TIndex = Node->getConstantOperandVal(2);
-      SDValue TReg = getI8Imm(TIndex, dl);
-      SDValue Base = Node->getOperand(3);
-      SDValue Scale = getI8Imm(1, dl);
-      SDValue Index = Node->getOperand(4);
-      SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
-      SDValue Segment = CurDAG->getRegister(0, MVT::i16);
-      SDValue Chain = Node->getOperand(0);
-      SDValue Ops[] = {TReg, Base, Scale, Index, Disp, Segment, Chain};
-      MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
-      ReplaceNode(Node, CNode);
-      return;
-    }
     }
     break;
   }
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 410f20edc6281..133406bd8e0d7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2572,11 +2572,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   }
 
   // Combine sin / cos into _sincos_stret if it is available.
-  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
-      getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
-    setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
-    setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
-  }
+  setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
+  setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
 
   if (Subtarget.isTargetWin64()) {
     setOperationAction(ISD::SDIV, MVT::i128, Custom);
@@ -12216,7 +12213,7 @@ static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
     MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
     ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
                         : MVT::getVectorVT(ShiftSVT, Size / Scale);
-    return (int)ShiftAmt;
+    return ShiftAmt;
   };
 
   // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
@@ -22864,6 +22861,13 @@ static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
   if (!OpVT.isScalarInteger() || OpSize < 128)
     return SDValue();
 
+  // Don't do this if we're not supposed to use the FPU.
+  bool NoImplicitFloatOps =
+      DAG.getMachineFunction().getFunction().hasFnAttribute(
+          Attribute::NoImplicitFloat);
+  if (Subtarget.useSoftFloat() || NoImplicitFloatOps)
+    return SDValue();
+
   // Ignore a comparison with zero because that gets special treatment in
   // EmitTest(). But make an exception for the special case of a pair of
   // logically-combined vector-sized operands compared to zero. This pattern may
@@ -22886,13 +22890,9 @@ static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
   // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
   // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
   // Otherwise use PCMPEQ (plus AND) and mask testing.
-  bool NoImplicitFloatOps =
-      DAG.getMachineFunction().getFunction().hasFnAttribute(
-          Attribute::NoImplicitFloat);
-  if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
-      ((OpSize == 128 && Subtarget.hasSSE2()) ||
-       (OpSize == 256 && Subtarget.hasAVX()) ||
-       (OpSize == 512 && Subtarget.useAVX512Regs()))) {
+  if ((OpSize == 128 && Subtarget.hasSSE2()) ||
+      (OpSize == 256 && Subtarget.hasAVX()) ||
+      (OpSize == 512 && Subtarget.useAVX512Regs())) {
     bool HasPT = Subtarget.hasSSE41();
 
     // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
@@ -27949,67 +27949,6 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
       return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
                          Operation.getValue(1));
     }
-    case Intrinsic::x86_t2rpntlvwz0rs_internal:
-    case Intrinsic::x86_t2rpntlvwz0rst1_internal:
-    case Intrinsic::x86_t2rpntlvwz1rs_internal:
-    case Intrinsic::x86_t2rpntlvwz1rst1_internal:
-    case Intrinsic::x86_t2rpntlvwz0_internal:
-    case Intrinsic::x86_t2rpntlvwz0t1_internal:
-    case Intrinsic::x86_t2rpntlvwz1_internal:
-    case Intrinsic::x86_t2rpntlvwz1t1_internal: {
-      auto *X86MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
-      X86MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
-      unsigned IntNo = Op.getConstantOperandVal(1);
-      unsigned Opc = 0;
-      switch (IntNo) {
-      default:
-        llvm_unreachable("Unexpected intrinsic!");
-      case Intrinsic::x86_t2rpntlvwz0_internal:
-        Opc = X86::PT2RPNTLVWZ0V;
-        break;
-      case Intrinsic::x86_t2rpntlvwz0t1_internal:
-        Opc = X86::PT2RPNTLVWZ0T1V;
-        break;
-      case Intrinsic::x86_t2rpntlvwz1_internal:
-        Opc = X86::PT2RPNTLVWZ1V;
-        break;
-      case Intrinsic::x86_t2rpntlvwz1t1_internal:
-        Opc = X86::PT2RPNTLVWZ1T1V;
-        break;
-      case Intrinsic::x86_t2rpntlvwz0rs_internal:
-        Opc = X86::PT2RPNTLVWZ0RSV;
-        break;
-      case Intrinsic::x86_t2rpntlvwz0rst1_internal:
-        Opc = X86::PT2RPNTLVWZ0RST1V;
-        break;
-      case Intrinsic::x86_t2rpntlvwz1rs_internal:
-        Opc = X86::PT2RPNTLVWZ1RSV;
-        break;
-      case Intrinsic::x86_t2rpntlvwz1rst1_internal:
-        Opc = X86::PT2RPNTLVWZ1RST1V;
-        break;
-      }
-
-      SDLoc DL(Op);
-      SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
-
-      SDValue Ops[] = {Op.getOperand(2),                       // Row
-                       Op.getOperand(3),                       // Col0
-                       Op.getOperand(4),                       // Col1
-                       Op.getOperand(5),                       // Base
-                       DAG.getTargetConstant(1, DL, MVT::i8),  // Scale
-                       Op.getOperand(6),                       // Index
-                       DAG.getTargetConstant(0, DL, MVT::i32), // Disp
-                       DAG.getRegister(0, MVT::i16),           // Segment
-                       Op.getOperand(0)};                      // Chain
-
-      MachineSDNode *Res = DAG.getMachineNode(Opc, DL, VTs, Ops);
-      SDValue Res0 = DAG.getTargetExtractSubreg(X86::sub_t0, DL, MVT::x86amx,
-                                                SDValue(Res, 0));
-      SDValue Res1 = DAG.getTargetExtractSubreg(X86::sub_t1, DL, MVT::x86amx,
-                                                SDValue(Res, 0));
-      return DAG.getMergeValues({Res0, Res1, SDValue(Res, 1)}, DL);
-    }
     case Intrinsic::x86_atomic_bts_rm:
     case Intrinsic::x86_atomic_btc_rm:
     case Intrinsic::x86_atomic_btr_rm: {
@@ -33067,26 +33006,30 @@ static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
 
 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG) {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  SDValue Arg = Op.getOperand(0);
+  EVT ArgVT = Arg.getValueType();
+  bool isF64 = ArgVT == MVT::f64;
+
+  RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
+  const char *LibcallName = TLI.getLibcallName(LC);
+  if (!LibcallName)
+    return SDValue();
+
   assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
 
   // For MacOSX, we want to call an alternative entry point: __sincos_stret,
   // which returns the values as { float, float } (in XMM0) or
   // { double, double } (which is returned in XMM0, XMM1).
   SDLoc dl(Op);
-  SDValue Arg = Op.getOperand(0);
-  EVT ArgVT = Arg.getValueType();
   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
 
   TargetLowering::ArgListTy Args;
   Args.emplace_back(Arg, ArgTy);
 
-  bool isF64 = ArgVT == MVT::f64;
   // Only optimize x86_64 for now. i386 is a bit messy. For f32,
   // the small struct {f32, f32} is returned in (eax, edx). For f64,
   // the results are returned via SRet in memory.
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
-  const char *LibcallName = TLI.getLibcallName(LC);
   SDValue Callee =
       DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
 
@@ -37744,10 +37687,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     assert (Imm < 8 && "Illegal tmm index");
     return X86::TMM0 + Imm;
   };
-  auto TMMImmToTMMPair = [](unsigned Imm) {
-    assert(Imm < 8 && "Illegal tmm pair index.");
-    return X86::TMM0_TMM1 + Imm / 2;
-  };
   switch (MI.getOpcode()) {
   default:
     llvm_unreachable("Unexpected instr type to insert");
@@ -38128,53 +38067,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   case X86::PTDPBHF8PS:
   case X86::PTDPHBF8PS:
   case X86::PTDPHF8PS:
-  case X86::PTTDPBF16PS:
-  case X86::PTTDPFP16PS:
-  case X86::PTTCMMIMFP16PS:
-  case X86::PTTCMMRLFP16PS:
-  case X86::PTCONJTCMMIMFP16PS:
-  case X86::PTMMULTF32PS:
-  case X86::PTTMMULTF32PS: {
+  case X86::PTMMULTF32PS: {
     unsigned Opc;
     switch (MI.getOpcode()) {
     default: llvm_unreachable("illegal opcode!");
+      // clang-format off
     case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
     case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
     case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
     case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
     case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
     case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
-    case X86::PTCMMIMFP16PS:
-      Opc = X86::TCMMIMFP16PS;
-      break;
-    case X86::PTCMMRLFP16PS:
-      Opc = X86::TCMMRLFP16PS;
-      break;
+    case X86::PTCMMIMFP16PS: Opc = X86::TCMMIMFP16PS; break;
+    case X86::PTCMMRLFP16PS: Opc = X86::TCMMRLFP16PS; break;
     case X86::PTDPBF8PS: Opc = X86::TDPBF8PS; break;
     case X86::PTDPBHF8PS: Opc = X86::TDPBHF8PS; break;
     case X86::PTDPHBF8PS: Opc = X86::TDPHBF8PS; break;
     case X86::PTDPHF8PS: Opc = X86::TDPHF8PS; break;
-    case X86::PTTDPBF16PS:
-      Opc = X86::TTDPBF16PS;
-      break;
-    case X86::PTTDPFP16PS:
-      Opc = X86::TTDPFP16PS;
-      break;
-    case X86::PTTCMMIMFP16PS:
-      Opc = X86::TTCMMIMFP16PS;
-      break;
-    case X86::PTTCMMRLFP16PS:
-      Opc = X86::TTCMMRLFP16PS;
-      break;
-    case X86::PTCONJTCMMIMFP16PS:
-      Opc = X86::TCONJTCMMIMFP16PS;
-      break;
-    case X86::PTMMULTF32PS:
-      Opc = X86::TMMULTF32PS;
-      break;
-    case X86::PTTMMULTF32PS:
-      Opc = X86::TTMMULTF32PS;
-      break;
+    case X86::PTMMULTF32PS: Opc = X86::TMMULTF32PS; break;
+      // clang-format on
     }
 
     MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
@@ -38245,70 +38156,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     MI.eraseFromParent(); // The pseudo is gone now.
     return BB;
   }
-  case X86::PT2RPNTLVWZ0:
-  case X86::PT2RPNTLVWZ0T1:
-  case X86::PT2RPNTLVWZ1:
-  case X86::PT2RPNTLVWZ1T1:
-  case X86::PT2RPNTLVWZ0RS:
-  case X86::PT2RPNTLVWZ0RST1:
-  case X86::PT2RPNTLVWZ1RS:
-  case X86::PT2RPNTLVWZ1RST1: {
-    const DebugLoc &DL = MI.getDebugLoc();
-    unsigned Opc;
-#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
-    switch (MI.getOpcode()) {
-    default:
-      llvm_unreachable("Unexpected instruction!");
-    case X86::PT2RPNTLVWZ0:
-      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0);
-      break;
-    case X86::PT2RPNTLVWZ0T1:
-      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1);
-      break;
-    case X86::PT2RPNTLVWZ1:
-      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1);
-      break;
-    case X86::PT2RPNTLVWZ1T1:
-      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1);
-      break;
-    case X86::PT2RPNTLVWZ0RS:
-      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS);
-      break;
-    case X86::PT2RPNTLVWZ0RST1:
-      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1);
-      break;
-    case X86::PT2RPNTLVWZ1RS:
-      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS);
-      break;
-    case X86::PT2RPNTLVWZ1RST1:
-      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1);
-      break;
-    }
-#undef GET_EGPR_IF_ENABLED
-    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
-    MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define);
-
-    MIB.add(MI.getOperand(1)); // base
-    MIB.add(MI.getOperand(2)); // scale
-    MIB.add(MI.getOperand(3)); // index
-    MIB.add(MI.getOperand(4)); // displacement
-    MIB.add(MI.getOperand(5)); // segment
-    MI.eraseFromParent();      // The pseudo is gone now.
-    return BB;
-  }
-  case X86::PTTRANSPOSED:
-  case X86::PTCONJTFP16: {
-    const DebugLoc &DL = MI.getDebugLoc();
-    unsigned Opc = MI.getOpcode() == X86::PTTRANSPOSED ? X86::TTRANSPOSED
-                                                       : X86::TCONJTFP16;
-
-    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
-    MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
-    MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
-
-    MI.eraseFromParent(); // The pseudo is gone now.
-    return BB;
-  }
   case X86::PTCVTROWPS2BF16Hrri:
   case X86::PTCVTROWPS2BF16Lrri:
   case X86::PTCVTROWPS2PHHrri:
@@ -48777,15 +48624,19 @@ static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
       SDValue BC0 = peekThroughBitcasts(Op0);
       if (BC0.getOpcode() == X86ISD::PCMPEQ &&
           ISD::isBuildVectorAllZeros(BC0.getOperand(1).getNode())) {
-        SDLoc DL(EFLAGS);
         CC = (CC == X86::COND_B ? X86::COND_E : X86::COND_NE);
-        SDValue X = DAG.getBitcast(OpVT, BC0.getOperand(0));
-        return DAG.getNode(EFLAGS.getOpcode(), DL, VT, X, X);
+        SDValue X = DAG.getBitcast(OpVT, DAG.getFreeze(BC0.getOperand(0)));
+        return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, X, X);
       }
     }
   }
 
   if (CC == X86::COND_E || CC == X86::COND_NE) {
+    // Canonicalize constant to RHS if we're just using ZF.
+    if (Op0 != Op1 && DAG.isConstantIntBuildVectorOrConstantInt(Op0) &&
+        !DAG.isConstantIntBuildVectorOrConstantInt(Op1))
+      return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op0);
+
     // TESTZ(X,~Y) == TESTC(Y,X)
     if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
       CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
@@ -48831,7 +48682,7 @@ static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
               MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
               MVT FloatVT =
                   MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
-              Res = DAG.getBitcast(FloatVT, Res);
+              Res = DAG.getBitcast(FloatVT, DAG.getFreeze(Res));
               return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res);
             } else if (EltBits == 16) {
               MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
@@ -48849,13 +48700,31 @@ static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
       }
     }
 
-    // TESTZ(-1,X) == TESTZ(X,X)
-    if (ISD::isBuildVectorAllOnes(Op0.getNode()))
-      return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
-
     // TESTZ(X,-1) == TESTZ(X,X)
-    if (ISD::isBuildVectorAllOnes(Op1.getNode()))
+    if (ISD::isBuildVectorAllOnes(Op1.getNode())) {
+      Op0 = DAG.getFreeze(Op0);
       return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
+    }
+
+    // Attempt to convert PTESTZ(X,SIGNMASK) -> VTESTPD/PSZ(X,X) on AVX targets.
+    if (EFLAGS.getOpcode() == X86ISD::PTEST && Subtarget.hasAVX()) {
+      KnownBits KnownOp1 = DAG.computeKnownBits(Op1);
+      assert(KnownOp1.getBitWidth() == 64 &&
+             "Illegal PTEST vector element width");
+      if (KnownOp1.isConstant()) {
+        const APInt &Mask = KnownOp1.getConstant();
+        if (Mask.isSignMask()) {
+          MVT FpVT = MVT::getVectorVT(MVT::f64, OpVT.getSizeInBits() / 64);
+          Op0 = DAG.getBitcast(FpVT, DAG.getFreeze(Op0));
+          return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Op0, Op0);
+        }
+        if (Mask.isSplat(32) && Mask.trunc(32).isSignMask()) {
+          MVT FpVT = MVT::getVectorVT(MVT::f32, OpVT.getSizeInBits() / 32);
+          Op0 = DAG.getBitcast(FpVT, DAG.getFreeze(Op0));
+          return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Op0, Op0);
+        }
+      }
+    }
 
     // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
     // TODO: Add COND_NE handling?
@@ -54491,6 +54360,7 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
 static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget,
                                const SDLoc &DL) {
+  using namespace SDPatternMatch;
   if (!VT.isVector() || !Subtarget.hasSSSE3())
     return SDValue();
 
@@ -54500,42 +54370,19 @@ static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
     return SDValue();
 
   SDValue SSatVal = detectSSatPattern(In, VT);
-  if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
-    return SDValue();
-
-  // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
-  // of multiplies from even/odd elements.
-  SDValue N0 = SSatVal.getOperand(0);
-  SDValue N1 = SSatVal.getOperand(1);
-
-  if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
+  if (!SSatVal)
     return SDValue();
 
-  SDValue N00 = N0.getOperand(0);
-  SDValue N01 = N0.getOperand(1);
-  SDValue N10 = N1.getOperand(0);
-  SDValue N11 = N1.getOperand(1);
-
+  // See if this is a signed saturation of an ADD, adding pairs of multiplies
+  // from even/odd elements, from zero_extend/sign_extend operands.
+  //
   // TODO: Handle constant vectors and use knownbits/computenumsignbits?
-  // Canonicalize zero_extend to LHS.
-  if (N01.getOpcode() == ISD::ZERO_EXTEND)
-    std::swap(N00, N01);
-  if (N11.getOpcode() == ISD::ZERO_EXTEND)
-    std::swap(N10, N11);
-
-  // Ensure we have a zero_extend and a sign_extend.
-  if (N00.getOpcode() != ISD::ZERO_EXTEND ||
-      N01.getOpcode() != ISD::SIGN_EXTEND ||
-      N10.getOpcode() != ISD::ZERO_EXTEND ||
-      N11.getOpcode() != ISD::SIGN_EXTEND)
+  SDValue N00, N01, N10, N11;
+  if (!sd_match(SSatVal,
+                m_Add(m_Mul(m_ZExt(m_Value(N00)), m_SExt(m_Value(N01))),
+                      m_Mul(m_ZExt(m_Value(N10)), m_SExt(m_Value(N11))))))
     return SDValue();
 
-  // Peek through the extends.
-  N00 = N00.getOperand(0);
-  N01 = N01.getOperand(0);
-  N10 = N10.getOperand(0);
-  N11 = N11.getOperand(0);
-
   // Ensure the extend is from vXi8.
   if (N00.getValueType().getVectorElementType() != MVT::i8 ||
       N01.getValueType().getVectorElementType() != MVT::i8 ||
@@ -54634,6 +54481,7 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {
   EVT VT = N->getValueType(0);
   SDValue Src = N->getOperand(0);
+  EVT SrcVT = Src.getValueType();
   SDLoc DL(N);
 
   // Attempt to pre-truncate inputs to arithmetic ops instead.
@@ -54652,6 +54500,42 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
   if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
     return V;
 
+  // Fold trunc(srl(load(p),amt)) -> load(p+amt/8)
+  // If we're shifting down byte aligned bit chunks from a larger load for
+  // truncation, see if we can convert the shift into a pointer offset instead.
+  // Limit this to normal (non-ext) scalar integer loads.
+  if (SrcVT.isScalarInteger() && Src.getOpcode() == ISD::SRL &&
+      Src.hasOneUse() && Src.getOperand(0).hasOneUse() &&
+      ISD::isNormalLoad(Src.getOperand(0).getNode())) {
+    auto *Ld = cast<LoadSDNode>(Src.getOperand(0));
+    if (Ld->isSimple() && VT.isByteSized() &&
+        isPowerOf2_64(VT.getSizeInBits())) {
+      SDValue ShAmt = Src.getOperand(1);
+      KnownBits KnownAmt = DAG.computeKnownBits(ShAmt);
+      // Check the shift amount is byte aligned.
+      // Check the truncation doesn't use any shifted in (zero) top bits.
+      // Check the shift amount doesn't depend on the original load.
+      if (KnownAmt.countMinTrailingZeros() >= 3 &&
+          KnownAmt.getMaxValue().ule(SrcVT.getSizeInBits() -
+                                     VT.getSizeInBits()) &&
+          !Ld->isPredecessorOf(ShAmt.getNode())) {
+        EVT PtrVT = Ld->getBasePtr().getValueType();
+        SDValue PtrBitOfs = DAG.getZExtOrTrunc(ShAmt, DL, PtrVT);
+        SDValue PtrByteOfs =
+            DAG.getNode(ISD::SRL, DL, PtrVT, PtrBitOfs,
+                        DAG.getShiftAmountConstant(3, PtrVT, DL));
+        SDValue NewPtr = DAG.getMemBasePlusOffset(
+            Ld->getBasePtr(), PtrByteOfs, DL, SDNodeFlags::NoUnsignedWrap);
+        SDValue NewLoad =
+            DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, Ld->getPointerInfo(),
+                        Align(), Ld->getMemOperand()->getFlags());
+        DAG.ReplaceAllUsesOfValueWith(Src.getOperand(0).getValue(1),
+                                      NewLoad.getValue(1));
+        return NewLoad;
+      }
+    }
+  }
+
   // The bitcast source is a direct mmx result.
   // Detect bitcasts between i32 to x86mmx
   if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td
index 69a5115201ef2..522782abd710f 100644
--- a/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -338,188 +338,6 @@ let Predicates = [HasAMXFP8, In64BitMode] in {
   }
 }
 
-let Predicates = [HasAMXTILE, In64BitMode], isPseudo = true, SchedRW = [WriteSystem] in {
-  let mayStore = 1 in
-  def PTILEPAIRSTORE : PseudoI<(outs), (ins opaquemem:$src1, TILEPair:$src2), []>;
-  let mayLoad = 1 in
-  def PTILEPAIRLOAD : PseudoI<(outs TILEPair:$dst), (ins opaquemem:$src), []>;
-}
-
-multiclass T2RPNTLVW_Base<bits<8> op1, bits<8> op2, string rs, string suffix> {
-  def Z0#rs#suffix    : I<op1, MRMSrcMemFSIB, (outs TILEPair:$dst), (ins sibmem:$src),
-                          "t2rpntlvwz0" #!tolower(rs)# "\t{$src, $dst|$dst, $src}", []>, PS;
-  def Z0#rs#T1#suffix : I<op2, MRMSrcMemFSIB, (outs TILEPair:$dst), (ins sibmem:$src),
-                          "t2rpntlvwz0" #!tolower(rs)# "t1\t{$src, $dst|$dst, $src}", []>, PS;
-  def Z1#rs#suffix    : I<op1, MRMSrcMemFSIB, (outs TILEPair:$dst), (ins sibmem:$src),
-                          "t2rpntlvwz1" #!tolower(rs)# "\t{$src, $dst|$dst, $src}", []>, PD;
-  def Z1#rs#T1#suffix : I<op2, MRMSrcMemFSIB, (outs TILEPair:$dst), (ins sibmem:$src),
-                          "t2rpntlvwz1" #!tolower(rs)# "t1\t{$src, $dst|$dst, $src}", []>, PD;
-}
-
-let Predicates = [HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in
-  defm T2RPNTLVW : T2RPNTLVW_Base<0x6e, 0x6f, "", "">, T8, VEX;
-
-let Predicates = [HasAMXTRANSPOSE, HasEGPR, In64BitMode], SchedRW = [WriteSystem] in
-  defm T2RPNTLVW : T2RPNTLVW_Base<0x6e, 0x6f, "", "_EVEX">, T8, EVEX, NoCD8;
-
-let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in
-  defm T2RPNTLVW : T2RPNTLVW_Base<0xf8, 0xf9, "RS", "">, T_MAP5, VEX;
-
-let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, HasEGPR, In64BitMode], SchedRW = [WriteSystem] in
-  defm T2RPNTLVW : T2RPNTLVW_Base<0xf8, 0xf9, "RS", "_EVEX">, T_MAP5, EVEX, NoCD8;
-
-let Predicates = [HasAMXTRANSPOSE, In64BitMode] in {
-  let SchedRW = [WriteSystem] in {
-    def TTRANSPOSED : I<0x5f, MRMSrcReg, (outs TILE:$dst), (ins TILE:$src),
-                        "ttransposed\t{$src, $dst|$dst, $src}", []>, VEX, T8, XS;
-    let isPseudo = true in {
-      def PT2RPNTLVWZ0V : PseudoI<(outs TILEPair:$dst),
-                                  (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4),
-                                  []>;
-      def PT2RPNTLVWZ0T1V : PseudoI<(outs TILEPair:$dst),
-                                  (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4),
-                                  []>;
-      def PT2RPNTLVWZ1V : PseudoI<(outs TILEPair:$dst),
-                                  (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4),
-                                  []>;
-      def PT2RPNTLVWZ1T1V : PseudoI<(outs TILEPair:$dst),
-                                  (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4),
-                                  []>;
-    }
-
-    def PTTRANSPOSEDV : PseudoI<(outs TILE:$dst),
-                                (ins GR16:$src1, GR16:$src2, TILE:$src),
-                                [(set TILE: $dst,
-                                 (int_x86_ttransposed_internal GR16:$src1, GR16:$src2,
-                                  TILE:$src))]>;
-
-    let usesCustomInserter = 1 in {
-      def PT2RPNTLVWZ0 : PseudoI<(outs), (ins u8imm:$dst,
-                                 sibmem:$src1), []>;
-      def PT2RPNTLVWZ0T1 : PseudoI<(outs), (ins u8imm:$dst,
-                                   sibmem:$src1), []>;
-      def PT2RPNTLVWZ1 : PseudoI<(outs), (ins u8imm:$dst,
-                                 sibmem:$src1), []>;
-      def PT2RPNTLVWZ1T1 : PseudoI<(outs), (ins u8imm:$dst,
-                                   sibmem:$src1), []>;
-      def PTTRANSPOSED : PseudoI<(outs), (ins u8imm:$dst, u8imm:$src),
-                                 [(int_x86_ttransposed timm:$dst, timm:$src)]>;
-    }
-  }
-} // HasAMXTILE, HasAMXTRANSPOSE
-
-let Predicates = [HasAMXBF16, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in {
-  let Constraints = "$src1 = $dst" in
-    def TTDPBF16PS : I<0x6c, MRMSrcReg4VOp3, (outs TILE:$dst),
-                       (ins TILE:$src1, TILE:$src2, TILE:$src3),
-                       "ttdpbf16ps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
-                       []>, VEX, VVVV, T8,XS;
-  let Constraints = "$src4 = $dst" in
-    def PTTDPBF16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
-                                GR16:$src2, GR16:$src3, TILE:$src4,
-                                TILE:$src5, TILE:$src6),
-                                [(set TILE: $dst,
-                                  (int_x86_ttdpbf16ps_internal GR16:$src1, GR16:$src2,
-                                   GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>;
-  let usesCustomInserter = 1 in
-    def PTTDPBF16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3),
-                              [(int_x86_ttdpbf16ps timm:$src1, timm:$src2, timm:$src3)]>;
-}
-
-let Predicates = [HasAMXFP16, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in {
-  let Constraints = "$src1 = $dst" in
-    def TTDPFP16PS : I<0x6c, MRMSrcReg4VOp3, (outs TILE:$dst),
-                       (ins TILE:$src1, TILE:$src2, TILE:$src3),
-                       "ttdpfp16ps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
-                       []>, VEX, VVVV, T8,XD;
-  let Constraints = "$src4 = $dst" in
-    def PTTDPFP16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
-                                GR16:$src2, GR16:$src3, TILE:$src4,
-                                TILE:$src5, TILE:$src6),
-                                [(set TILE: $dst,
-                                  (int_x86_ttdpfp16ps_internal GR16:$src1, GR16:$src2,
-                                   GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>;
-  let usesCustomInserter = 1 in
-    def PTTDPFP16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3),
-                              [(int_x86_ttdpfp16ps timm:$src1, timm:$src2, timm:$src3)]>;
-}
-
-let Predicates = [HasAMXCOMPLEX, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in {
-  let Constraints = "$src1 = $dst" in {
-    def TTCMMIMFP16PS : I<0x6b, MRMSrcReg4VOp3, (outs TILE:$dst),
-                          (ins TILE:$src1, TILE:$src2, TILE:$src3),
-                          "ttcmmimfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}",
-                          []>, VEX, VVVV, T8,XD;
-    def TTCMMRLFP16PS: I<0x6b, MRMSrcReg4VOp3, (outs TILE:$dst),
-                         (ins TILE:$src1, TILE:$src2, TILE:$src3),
-                         "ttcmmrlfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}",
-                         []>, VEX, VVVV, T8,XS;
-    def TCONJTCMMIMFP16PS : I<0x6b, MRMSrcReg4VOp3, (outs TILE:$dst),
-                          (ins TILE:$src1, TILE:$src2, TILE:$src3),
-                          "tconjtcmmimfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}",
-                          []>, VEX, VVVV, WIG, T8,PS;
-  }
-  def TCONJTFP16 : I<0x6b, MRMSrcReg, (outs TILE:$dst), (ins TILE:$src),
-                     "tconjtfp16\t{$src, $dst|$dst, $src}", []>, VEX, T8,PD;
-
-  let Constraints = "$src4 = $dst" in {
-    def PTTCMMIMFP16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
-                                  GR16:$src2, GR16:$src3, TILE:$src4,
-                                  TILE:$src5, TILE:$src6),
-                                  [(set TILE: $dst,
-                                    (int_x86_ttcmmimfp16ps_internal GR16:$src1, GR16:$src2,
-                                     GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>;
-    def PTTCMMRLFP16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
-                                  GR16:$src2, GR16:$src3, TILE:$src4,
-                                  TILE:$src5, TILE:$src6),
-                                  [(set TILE: $dst,
-                                    (int_x86_ttcmmrlfp16ps_internal GR16:$src1, GR16:$src2,
-                                     GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>;
-    def PTCONJTCMMIMFP16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
-                                      GR16:$src2, GR16:$src3, TILE:$src4,
-                                      TILE:$src5, TILE:$src6),
-                                      [(set TILE: $dst,
-                                        (int_x86_tconjtcmmimfp16ps_internal GR16:$src1, GR16:$src2,
-                                         GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>;
-  }
-  def PTCONJTFP16V : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2, TILE:$src3),
-                             [(set TILE: $dst, (int_x86_tconjtfp16_internal GR16:$src1, GR16:$src2, TILE:$src3))]>;
-
-  let usesCustomInserter = 1 in {
-    def PTTCMMIMFP16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3),
-                                 [(int_x86_ttcmmimfp16ps timm:$src1, timm:$src2, timm:$src3)]>;
-    def PTTCMMRLFP16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3),
-                                 [(int_x86_ttcmmrlfp16ps timm:$src1, timm:$src2, timm:$src3)]>;
-    def PTCONJTCMMIMFP16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3),
-                                     [(int_x86_tconjtcmmimfp16ps timm:$src1, timm:$src2, timm:$src3)]>;
-    def PTCONJTFP16 : PseudoI<(outs), (ins u8imm:$dst, u8imm:$src),
-                              [(int_x86_tconjtfp16 timm:$dst, timm:$src)]>;
-  }
-}
-
-let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in {
-  let isPseudo = true in {
-    def PT2RPNTLVWZ0RSV   : PseudoI<(outs TILEPair:$dst),
-                              (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4),
-                              []>;
-    def PT2RPNTLVWZ0RST1V : PseudoI<(outs TILEPair:$dst),
-                              (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4),
-                              []>;
-    def PT2RPNTLVWZ1RSV   : PseudoI<(outs TILEPair:$dst),
-                              (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4),
-                              []>;
-    def PT2RPNTLVWZ1RST1V : PseudoI<(outs TILEPair:$dst),
-                              (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4),
-                              []>;
-  }
-  let  usesCustomInserter = 1 in {
-    def PT2RPNTLVWZ0RS   : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>;
-    def PT2RPNTLVWZ0RST1 : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>;
-    def PT2RPNTLVWZ1RS   : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>;
-    def PT2RPNTLVWZ1RST1 : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>;
-  }
-} // HasAMXMOVRS, HasAMXTRANSPOSE
-
 multiclass TILELOADDRS_Base<string suffix> {
   def suffix    : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), (ins sibmem:$src1),
                     "tileloaddrs\t{$src1, $dst|$dst, $src1}", []>, T8, XD;
@@ -721,29 +539,3 @@ let Predicates = [HasAMXTF32, In64BitMode] in {
     }
   } // SchedRW = [WriteSystem]
 } // HasAMXTF32
-
-let Predicates = [HasAMXTF32, HasAMXTRANSPOSE, In64BitMode] in {
-  let SchedRW = [WriteSystem] in {
-    let Constraints = "$src1 = $dst" in {
-      def TTMMULTF32PS: I<0x48, MRMSrcReg4VOp3, (outs TILE:$dst),
-                         (ins TILE:$src1, TILE:$src2, TILE:$src3),
-                         "ttmmultf32ps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
-                         []>, VEX, VVVV, T8, PS;
-    }
-    let Constraints = "$src4 = $dst" in {
-      def PTTMMULTF32PSV : PseudoI<(outs TILE:$dst),
-                                   (ins GR16:$src1, GR16:$src2, GR16:$src3,
-                                    TILE:$src4, TILE:$src5, TILE:$src6),
-                                   [(set TILE:$dst,
-                                     (int_x86_ttmmultf32ps_internal GR16:$src1,
-                                      GR16:$src2, GR16:$src3, TILE:$src4,
-                                      TILE:$src5, TILE:$src6))]>;
-    }
-    let usesCustomInserter = 1 in {
-      def PTTMMULTF32PS : PseudoI<(outs),
-                                  (ins u8imm:$src1, u8imm:$src2, u8imm:$src3),
-                                  [(int_x86_ttmmultf32ps timm:$src1, timm:$src2,
-                                    timm:$src3)]>;
-    }
-  } // SchedRW = [WriteSystem]
-} // HasAMXTF32, HasAMXTRANSPOSE
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 5c23f917d0530..6b2a7a4ec3583 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -4544,11 +4544,6 @@ static unsigned getLoadStoreRegOpcode(Register Reg,
     return Load ? GET_EGPR_IF_ENABLED(X86::TILELOADD)
                 : GET_EGPR_IF_ENABLED(X86::TILESTORED);
 #undef GET_EGPR_IF_ENABLED
-  case 2048:
-    assert(X86::TILEPAIRRegClass.hasSubClassEq(RC) &&
-           "Unknown 2048-byte regclass");
-    assert(STI.hasAMXTILE() && "Using 2048-bit register requires AMX-TILE");
-    return Load ? X86::PTILEPAIRLOAD : X86::PTILEPAIRSTORE;
   }
 }
 
@@ -4743,8 +4738,6 @@ static bool isAMXOpcode(unsigned Opc) {
   case X86::TILESTORED:
   case X86::TILELOADD_EVEX:
   case X86::TILESTORED_EVEX:
-  case X86::PTILEPAIRLOAD:
-  case X86::PTILEPAIRSTORE:
     return true;
   }
 }
@@ -4757,8 +4750,7 @@ void X86InstrInfo::loadStoreTileReg(MachineBasicBlock &MBB,
   default:
     llvm_unreachable("Unexpected special opcode!");
   case X86::TILESTORED:
-  case X86::TILESTORED_EVEX:
-  case X86::PTILEPAIRSTORE: {
+  case X86::TILESTORED_EVEX: {
     // tilestored %tmm, (%sp, %idx)
     MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
     Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
@@ -4772,8 +4764,7 @@ void X86InstrInfo::loadStoreTileReg(MachineBasicBlock &MBB,
     break;
   }
   case X86::TILELOADD:
-  case X86::TILELOADD_EVEX:
-  case X86::PTILEPAIRLOAD: {
+  case X86::TILELOADD_EVEX: {
     // tileloadd (%sp, %idx), %tmm
     MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
     Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
diff --git a/llvm/lib/Target/X86/X86InstrOperands.td b/llvm/lib/Target/X86/X86InstrOperands.td
index 5207ecad127a2..6ba07f74d74c5 100644
--- a/llvm/lib/Target/X86/X86InstrOperands.td
+++ b/llvm/lib/Target/X86/X86InstrOperands.td
@@ -536,10 +536,3 @@ def VK8Pair : RegisterOperand<VK8PAIR, "printVKPair"> {
 def VK16Pair : RegisterOperand<VK16PAIR, "printVKPair"> {
   let ParserMatchClass = VK16PairAsmOperand;
 }
-
-let RenderMethod = "addTILEPairOperands" in
-  def TILEPairAsmOperand : AsmOperandClass { let Name = "TILEPair"; }
-
-def TILEPair : RegisterOperand<TILEPAIR, "printTILEPair"> {
-  let ParserMatchClass = TILEPairAsmOperand;
-}
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index c20bb05018b4d..98104a6fad1a9 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -183,7 +183,6 @@ def HasAMXINT8   : Predicate<"Subtarget->hasAMXINT8()">;
 def HasAMXCOMPLEX : Predicate<"Subtarget->hasAMXCOMPLEX()">;
 def HasAMXFP8    : Predicate<"Subtarget->hasAMXFP8()">;
 def HasAMXMOVRS  : Predicate<"Subtarget->hasAMXMOVRS()">;
-def HasAMXTRANSPOSE : Predicate<"Subtarget->hasAMXTRANSPOSE()">;
 def HasAMXAVX512 : Predicate<"Subtarget->hasAMXAVX512()">;
 def HasAMXTF32   : Predicate<"Subtarget->hasAMXTF32()">;
 def HasUINTR     : Predicate<"Subtarget->hasUINTR()">;
diff --git a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
index 090060eaa65e1..3b96e706fb607 100644
--- a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
+++ b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
@@ -115,9 +115,9 @@ struct MachineGadgetGraph : ImmutableGraph<MachineInstr *, int> {
   static constexpr MachineInstr *const ArgNodeSentinel = nullptr;
 
   using GraphT = ImmutableGraph<MachineInstr *, int>;
-  using Node = typename GraphT::Node;
-  using Edge = typename GraphT::Edge;
-  using size_type = typename GraphT::size_type;
+  using Node = GraphT::Node;
+  using Edge = GraphT::Edge;
+  using size_type = GraphT::size_type;
   MachineGadgetGraph(std::unique_ptr<Node[]> Nodes,
                      std::unique_ptr<Edge[]> Edges, size_type NodesSize,
                      size_type EdgesSize, int NumFences = 0, int NumGadgets = 0)
@@ -191,10 +191,10 @@ template <>
 struct DOTGraphTraits<MachineGadgetGraph *> : DefaultDOTGraphTraits {
   using GraphType = MachineGadgetGraph;
   using Traits = llvm::GraphTraits<GraphType *>;
-  using NodeRef = typename Traits::NodeRef;
-  using EdgeRef = typename Traits::EdgeRef;
-  using ChildIteratorType = typename Traits::ChildIteratorType;
-  using ChildEdgeIteratorType = typename Traits::ChildEdgeIteratorType;
+  using NodeRef = Traits::NodeRef;
+  using EdgeRef = Traits::EdgeRef;
+  using ChildIteratorType = Traits::ChildIteratorType;
+  using ChildEdgeIteratorType = Traits::ChildEdgeIteratorType;
 
   DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
 
@@ -227,9 +227,6 @@ struct DOTGraphTraits<MachineGadgetGraph *> : DefaultDOTGraphTraits {
 
 } // end namespace llvm
 
-constexpr MachineInstr *MachineGadgetGraph::ArgNodeSentinel;
-constexpr int MachineGadgetGraph::GadgetEdgeSentinel;
-
 char X86LoadValueInjectionLoadHardeningPass::ID = 0;
 
 void X86LoadValueInjectionLoadHardeningPass::getAnalysisUsage(
@@ -335,7 +332,7 @@ X86LoadValueInjectionLoadHardeningPass::getGadgetGraph(
   L.computePhiInfo();
 
   GraphBuilder Builder;
-  using GraphIter = typename GraphBuilder::BuilderNodeRef;
+  using GraphIter = GraphBuilder::BuilderNodeRef;
   DenseMap<MachineInstr *, GraphIter> NodeMap;
   int FenceCount = 0, GadgetCount = 0;
   auto MaybeAddNode = [&NodeMap, &Builder](MachineInstr *MI) {
diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp
index 8ffd454f4f73e..2fc5d38ef5055 100644
--- a/llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -74,22 +74,6 @@ static bool isAMXCast(Instruction *II) {
          match(II, m_Intrinsic<Intrinsic::x86_cast_tile_to_vector>(m_Value()));
 }
 
-// Some instructions may return more than one tiles.
-// e.g: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal
-static unsigned getNumDefTiles(IntrinsicInst *II) {
-  Type *Ty = II->getType();
-  if (Ty->isX86_AMXTy())
-    return 1;
-
-  unsigned Num = 0;
-  for (unsigned i = 0; i < Ty->getNumContainedTypes(); i++) {
-    Type *STy = Ty->getContainedType(i);
-    if (STy->isX86_AMXTy())
-      Num++;
-  }
-  return Num;
-}
-
 static bool isAMXIntrinsic(Value *I) {
   auto *II = dyn_cast<IntrinsicInst>(I);
   if (!II)
@@ -98,7 +82,7 @@ static bool isAMXIntrinsic(Value *I) {
     return false;
   // Check if return type or parameter is x86_amx. If it is x86_amx
   // the intrinsic must be x86 amx intrinsics.
-  if (getNumDefTiles(II) > 0)
+  if (II->getType()->isX86_AMXTy())
     return true;
   for (Value *V : II->args()) {
     if (V->getType()->isX86_AMXTy())
@@ -137,27 +121,7 @@ static Instruction *getFirstNonAllocaInTheEntryBlock(Function &F) {
   llvm_unreachable("No terminator in the entry block!");
 }
 
-class ShapeCalculator {
-private:
-  const TargetMachine *TM = nullptr;
-
-  // In AMX intrinsics we let Shape = {Row, Col}, but the
-  // RealCol = Col / ElementSize. We may use the RealCol
-  // as a new Row for other new created AMX intrinsics.
-  std::map<Value *, Value *> Col2Row, Row2Col;
-
-public:
-  ShapeCalculator(const TargetMachine *TargetM) : TM(TargetM) {}
-  std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo);
-  std::pair<Value *, Value *> getShape(PHINode *Phi);
-  Value *getRowFromCol(Instruction *II, Value *V, unsigned Granularity);
-  Value *getColFromRow(Instruction *II, Value *V, unsigned Granularity);
-};
-
-Value *ShapeCalculator::getRowFromCol(Instruction *II, Value *V,
-                                      unsigned Granularity) {
-  if (auto It = Col2Row.find(V); It != Col2Row.end())
-    return It->second;
+static Value *getRowFromCol(Instruction *II, Value *V, unsigned Granularity) {
   IRBuilder<> Builder(II);
   Value *RealRow = nullptr;
   if (isa<ConstantInt>(V))
@@ -186,47 +150,16 @@ Value *ShapeCalculator::getRowFromCol(Instruction *II, Value *V,
         getFirstNonAllocaInTheEntryBlock(*II->getFunction()));
     RealRow = NewBuilder.CreateUDiv(V, NewBuilder.getInt16(Granularity));
   }
-  Col2Row[V] = RealRow;
   return RealRow;
 }
 
-Value *ShapeCalculator::getColFromRow(Instruction *II, Value *V,
-                                      unsigned Granularity) {
-  if (auto It = Row2Col.find(V); It != Row2Col.end())
-    return It->second;
-  IRBuilder<> Builder(II);
-  Value *RealCol = nullptr;
-  if (isa<ConstantInt>(V))
-    RealCol =
-        Builder.getInt16((cast<ConstantInt>(V)->getSExtValue()) * Granularity);
-  else if (isa<Instruction>(V)) {
-    Builder.SetInsertPoint(cast<Instruction>(V));
-    RealCol = Builder.CreateNUWMul(V, Builder.getInt16(Granularity));
-    cast<Instruction>(RealCol)->moveAfter(cast<Instruction>(V));
-  } else {
-    // When it is not a const value and it is a function argument, we create
-    // Row at the entry bb.
-    IRBuilder<> NewBuilder(
-        getFirstNonAllocaInTheEntryBlock(*II->getFunction()));
-    RealCol = NewBuilder.CreateNUWMul(V, NewBuilder.getInt16(Granularity));
-  }
-  Row2Col[V] = RealCol;
-  return RealCol;
-}
-
 // TODO: Refine the row and col-in-bytes of tile to row and col of matrix.
-std::pair<Value *, Value *> ShapeCalculator::getShape(IntrinsicInst *II,
-                                                      unsigned OpNo) {
-  (void)TM;
+std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo) {
   IRBuilder<> Builder(II);
   Value *Row = nullptr, *Col = nullptr;
   switch (II->getIntrinsicID()) {
   default:
     llvm_unreachable("Expect amx intrinsics");
-  case Intrinsic::x86_t2rpntlvwz0_internal:
-  case Intrinsic::x86_t2rpntlvwz0t1_internal:
-  case Intrinsic::x86_t2rpntlvwz1_internal:
-  case Intrinsic::x86_t2rpntlvwz1t1_internal:
   case Intrinsic::x86_tileloadd64_internal:
   case Intrinsic::x86_tileloaddt164_internal:
   case Intrinsic::x86_tilestored64_internal:
@@ -271,13 +204,6 @@ std::pair<Value *, Value *> ShapeCalculator::getShape(IntrinsicInst *II,
     }
     break;
   }
-  case Intrinsic::x86_ttransposed_internal:
-  case Intrinsic::x86_tconjtfp16_internal: {
-    assert((OpNo == 2) && "Illegal Operand Number.");
-    Row = getRowFromCol(II, II->getArgOperand(1), 4);
-    Col = getColFromRow(II, II->getArgOperand(0), 4);
-    break;
-  }
   case Intrinsic::x86_tcvtrowd2ps_internal:
   case Intrinsic::x86_tcvtrowps2bf16h_internal:
   case Intrinsic::x86_tcvtrowps2bf16l_internal:
@@ -289,34 +215,12 @@ std::pair<Value *, Value *> ShapeCalculator::getShape(IntrinsicInst *II,
     Col = II->getArgOperand(1);
     break;
   }
-  case Intrinsic::x86_ttdpbf16ps_internal:
-  case Intrinsic::x86_ttdpfp16ps_internal:
-  case Intrinsic::x86_ttcmmimfp16ps_internal:
-  case Intrinsic::x86_ttcmmrlfp16ps_internal:
-  case Intrinsic::x86_tconjtcmmimfp16ps_internal:
-  case Intrinsic::x86_ttmmultf32ps_internal: {
-    switch (OpNo) {
-    case 3:
-      Row = II->getArgOperand(0);
-      Col = II->getArgOperand(1);
-      break;
-    case 4:
-      Row = getRowFromCol(II, II->getArgOperand(2), 4);
-      Col = getColFromRow(II, II->getArgOperand(0), 4);
-      break;
-    case 5:
-      Row = getRowFromCol(II, II->getArgOperand(2), 4);
-      Col = II->getArgOperand(1);
-      break;
-    }
-    break;
-  }
   }
 
   return std::make_pair(Row, Col);
 }
 
-std::pair<Value *, Value *> ShapeCalculator::getShape(PHINode *Phi) {
+static std::pair<Value *, Value *> getShape(PHINode *Phi) {
   Use &U = *(Phi->use_begin());
   unsigned OpNo = U.getOperandNo();
   User *V = U.getUser();
@@ -349,15 +253,14 @@ std::pair<Value *, Value *> ShapeCalculator::getShape(PHINode *Phi) {
 namespace {
 class X86LowerAMXType {
   Function &Func;
-  ShapeCalculator *SC;
 
   // In AMX intrinsics we let Shape = {Row, Col}, but the
   // RealCol = Col / ElementSize. We may use the RealCol
   // as a new Row for other new created AMX intrinsics.
-  std::map<Value *, Value *> Col2Row, Row2Col;
+  std::map<Value *, Value *> Col2Row;
 
 public:
-  X86LowerAMXType(Function &F, ShapeCalculator *ShapeC) : Func(F), SC(ShapeC) {}
+  X86LowerAMXType(Function &F) : Func(F) {}
   bool visit();
   void combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast);
   void combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST);
@@ -374,7 +277,7 @@ void X86LowerAMXType::combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast) {
   Use &U = *(Bitcast->use_begin());
   unsigned OpNo = U.getOperandNo();
   auto *II = cast<IntrinsicInst>(U.getUser());
-  std::tie(Row, Col) = SC->getShape(II, OpNo);
+  std::tie(Row, Col) = getShape(II, OpNo);
   IRBuilder<> Builder(Bitcast);
   // Use the maximun column as stride.
   Value *Stride = Builder.getInt64(64);
@@ -454,7 +357,7 @@ bool X86LowerAMXType::transformBitcast(BitCastInst *Bitcast) {
     Builder.CreateStore(Src, AllocaAddr);
     // TODO we can pick an constant operand for the shape.
     Value *Row = nullptr, *Col = nullptr;
-    std::tie(Row, Col) = SC->getShape(II, OpNo);
+    std::tie(Row, Col) = getShape(II, OpNo);
     std::array<Value *, 4> Args = {Row, Col, I8Ptr, Stride};
     Value *NewInst =
         Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, Args);
@@ -594,18 +497,11 @@ static Value *getAllocaPos(BasicBlock *BB) {
 
 static Instruction *createTileStore(Instruction *TileDef, Value *Ptr) {
   assert(TileDef->getType()->isX86_AMXTy() && "Not define tile!");
-  auto *II = dyn_cast<IntrinsicInst>(TileDef);
-  unsigned Idx = 0;
-  // Extract tile from multiple tiles' def.
-  if (auto *Extr = dyn_cast<ExtractValueInst>(TileDef)) {
-    assert(Extr->hasIndices() && "Tile extract miss index!");
-    Idx = Extr->getIndices()[0];
-    II = cast<IntrinsicInst>(Extr->getOperand(0));
-  }
+  auto *II = cast<IntrinsicInst>(TileDef);
 
   assert(II && "Not tile intrinsic!");
-  Value *Row = II->getOperand(Idx);
-  Value *Col = II->getOperand(Idx + 1);
+  Value *Row = II->getOperand(0);
+  Value *Col = II->getOperand(1);
 
   BasicBlock *BB = TileDef->getParent();
   BasicBlock::iterator Iter = TileDef->getIterator();
@@ -624,20 +520,14 @@ static void replaceWithTileLoad(Use &U, Value *Ptr, bool IsPHI = false) {
 
   // Get tile shape.
   IntrinsicInst *II = nullptr;
-  unsigned Idx = 0;
   if (IsPHI) {
     Value *PhiOp = cast<PHINode>(V)->getIncomingValue(0);
     II = cast<IntrinsicInst>(PhiOp);
-  } else if (auto *Extr = dyn_cast<ExtractValueInst>(V)) {
-    // Extract tile from multiple tiles' def.
-    assert(Extr->hasIndices() && "Tile extract miss index!");
-    Idx = Extr->getIndices()[0];
-    II = cast<IntrinsicInst>(Extr->getOperand(0));
   } else {
     II = cast<IntrinsicInst>(V);
   }
-  Value *Row = II->getOperand(Idx);
-  Value *Col = II->getOperand(Idx + 1);
+  Value *Row = II->getOperand(0);
+  Value *Col = II->getOperand(1);
 
   Instruction *UserI = cast<Instruction>(U.getUser());
   IRBuilder<> Builder(UserI);
@@ -848,12 +738,10 @@ namespace {
 
 class X86LowerAMXCast {
   Function &Func;
-  ShapeCalculator *SC;
   std::unique_ptr<DominatorTree> DT;
 
 public:
-  X86LowerAMXCast(Function &F, ShapeCalculator *ShapeC)
-      : Func(F), SC(ShapeC), DT(nullptr) {}
+  X86LowerAMXCast(Function &F) : Func(F), DT(nullptr) {}
   bool combineCastStore(IntrinsicInst *Cast, StoreInst *ST);
   bool combineLoadCast(IntrinsicInst *Cast, LoadInst *LD);
   bool combineTilezero(IntrinsicInst *Cast);
@@ -932,7 +820,7 @@ bool X86LowerAMXCast::optimizeAMXCastFromPhi(
         if (!isa<UndefValue>(IncValue) && !IncConst->isZeroValue())
           return false;
         Value *Row = nullptr, *Col = nullptr;
-        std::tie(Row, Col) = SC->getShape(OldPN);
+        std::tie(Row, Col) = getShape(OldPN);
         // TODO: If it is not constant the Row and Col must domoniate tilezero
         // that we are going to create.
         if (!Row || !Col || !isa<Constant>(Row) || !isa<Constant>(Col))
@@ -1063,19 +951,6 @@ bool X86LowerAMXCast::optimizeAMXCastFromPhi(
   return true;
 }
 
-static Value *getShapeFromAMXIntrinsic(Value *Inst, unsigned ShapeIdx,
-                                       bool IsRow) {
-  if (!isAMXIntrinsic(Inst))
-    return nullptr;
-
-  auto *II = cast<IntrinsicInst>(Inst);
-  if (IsRow)
-    return II->getOperand(0);
-
-  assert(ShapeIdx < 2 && "Currently 2 shapes in 1 instruction at most!");
-  return II->getOperand(ShapeIdx + 1);
-}
-
 // %43 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %42)
 // store <256 x i32> %43, <256 x i32>* %p, align 64
 // -->
@@ -1090,38 +965,13 @@ bool X86LowerAMXCast::combineCastStore(IntrinsicInst *Cast, StoreInst *ST) {
   if (!Tile->hasOneUse())
     return false;
 
-  // We don't fetch shape from tilestore, we only get shape from tiledef,
-  // so we can set the max tile shape to tilestore for special cases.
+  auto *II = cast<IntrinsicInst>(Tile);
+  // Tile is output from AMX intrinsic. The first operand of the
+  // intrinsic is row, the second operand of the intrinsic is column.
+  Value *Row = II->getOperand(0);
+  Value *Col = II->getOperand(1);
+
   IRBuilder<> Builder(ST);
-  Value *Row = nullptr;
-  Value *Col = nullptr;
-
-  if (isAMXIntrinsic(Tile)) {
-    auto *II = cast<IntrinsicInst>(Tile);
-    // Tile is output from AMX intrinsic. The first operand of the
-    // intrinsic is row, the second operand of the intrinsic is column.
-    Row = II->getOperand(0);
-    Col = II->getOperand(1);
-  } else {
-    // Now we supported multi-tiles value in structure, so we may get tile
-    // from extracting multi-tiles structure.
-    // For example:
-    // %6 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %1,
-    //      i16 %2, i16 %3, i8* %4, i64 %5)
-    // %7 = extractvalue { x86_amx, x86_amx } %6, 0
-    // %8 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %7)
-    // store <256 x i32> %8, <256 x i32>* %0, align 1024
-    //
-    // TODO: Currently we only handle extractvalue case, enhance me for other
-    // cases if possible.
-    auto *II = cast<ExtractValueInst>(Tile);
-    assert(II && "We meet unhandle source in fetching tile value!");
-    unsigned ShapeIdx = II->getIndices()[0];
-    Value *Tiles = II->getOperand(0);
-    Row = getShapeFromAMXIntrinsic(Tiles, ShapeIdx, true);
-    Col = getShapeFromAMXIntrinsic(Tiles, ShapeIdx, false);
-  }
-  assert(Row && Col && "Shape got failed!");
 
   // Stride should be equal to col(measured by bytes)
   Value *Stride = Builder.CreateSExt(Col, Builder.getInt64Ty());
@@ -1146,7 +996,7 @@ bool X86LowerAMXCast::combineLoadCast(IntrinsicInst *Cast, LoadInst *LD) {
   // shape information through def-use chain.
   if (!isAMXIntrinsic(II))
     return false;
-  std::tie(Row, Col) = SC->getShape(II, OpNo);
+  std::tie(Row, Col) = getShape(II, OpNo);
   IRBuilder<> Builder(LD);
   // Stride should be equal to col(measured by bytes)
   Value *Stride = Builder.CreateSExt(Col, Builder.getInt64Ty());
@@ -1189,7 +1039,7 @@ bool X86LowerAMXCast::combineTilezero(IntrinsicInst *Cast) {
   if (!isAMXIntrinsic(II))
     return false;
 
-  std::tie(Row, Col) = SC->getShape(II, OpNo);
+  std::tie(Row, Col) = getShape(II, OpNo);
 
   IRBuilder<> Builder(Cast);
   Value *NewInst =
@@ -1384,7 +1234,7 @@ bool X86LowerAMXCast::transformAMXCast(IntrinsicInst *AMXCast) {
     Builder.CreateStore(Src, AllocaAddr);
     // TODO we can pick an constant operand for the shape.
     Value *Row = nullptr, *Col = nullptr;
-    std::tie(Row, Col) = SC->getShape(II, OpNo);
+    std::tie(Row, Col) = getShape(II, OpNo);
     std::array<Value *, 4> Args = {
         Row, Col, I8Ptr, Builder.CreateSExt(Col, Builder.getInt64Ty())};
     Value *NewInst =
@@ -1445,14 +1295,13 @@ bool lowerAmxType(Function &F, const TargetMachine *TM,
     return false;
 
   bool C = false;
-  ShapeCalculator SC(TM);
-  X86LowerAMXCast LAC(F, &SC);
+  X86LowerAMXCast LAC(F);
   C |= LAC.combineAMXcast(TLI);
   // There might be remaining AMXcast after combineAMXcast and they should be
   // handled elegantly.
   C |= LAC.transformAllAMXCast();
 
-  X86LowerAMXType LAT(F, &SC);
+  X86LowerAMXType LAT(F);
   C |= LAT.visit();
 
   // Prepare for fast register allocation at O0.
diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp
index 2a1c49957bf7a..8a1d00d2f6427 100644
--- a/llvm/lib/Target/X86/X86PreTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp
@@ -141,15 +141,10 @@ class X86PreTileConfig : public MachineFunctionPass {
     if (!MO.isReg() || !MO.getReg().isVirtual())
       return false;
 
-    unsigned Shapes = 0;
-    if (MRI->getRegClass(MO.getReg())->getID() == X86::TILERegClassID)
-      Shapes = 1;
-    if (MRI->getRegClass(MO.getReg())->getID() == X86::TILEPAIRRegClassID)
-      Shapes = 2;
-    if (!Shapes)
+    if (MRI->getRegClass(MO.getReg())->getID() != X86::TILERegClassID)
       return false;
 
-    collectShapeInfo(MI, Shapes);
+    collectShapeInfo(MI);
     return true;
   }
 
@@ -165,7 +160,7 @@ class X86PreTileConfig : public MachineFunctionPass {
   }
 
   /// Collect the shape def information for later use.
-  void collectShapeInfo(MachineInstr &MI, unsigned Shapes);
+  void collectShapeInfo(MachineInstr &MI);
 
   /// Try to hoist shapes definded below AMX instructions.
   bool hoistShapesInBB(MachineBasicBlock *MBB, SmallVectorImpl<MIRef> &Shapes) {
@@ -231,7 +226,7 @@ INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
 INITIALIZE_PASS_END(X86PreTileConfig, "tilepreconfig",
                     "Tile Register Pre-configure", false, false)
 
-void X86PreTileConfig::collectShapeInfo(MachineInstr &MI, unsigned Shapes) {
+void X86PreTileConfig::collectShapeInfo(MachineInstr &MI) {
   auto RecordShape = [&](MachineInstr *MI, MachineBasicBlock *MBB) {
     MIRef MIR(MI, MBB);
     auto &Refs = ShapeBBs[MBB];
@@ -240,10 +235,8 @@ void X86PreTileConfig::collectShapeInfo(MachineInstr &MI, unsigned Shapes) {
       Refs.insert(I, MIR);
   };
 
-  // All shapes have same row in multi-tile operand.
-  SmallVector<Register, 8> WorkList;
-  for (unsigned I = 1; I < Shapes + 2; ++I)
-    WorkList.push_back(MI.getOperand(I).getReg());
+  SmallVector<Register, 8> WorkList(
+      {MI.getOperand(1).getReg(), MI.getOperand(2).getReg()});
   while (!WorkList.empty()) {
     Register R = WorkList.pop_back_val();
     MachineInstr *DefMI = MRI->getVRegDef(R);
@@ -252,13 +245,6 @@ void X86PreTileConfig::collectShapeInfo(MachineInstr &MI, unsigned Shapes) {
     if (DefMI->isMoveImmediate() || !DefVisited.insert(DefMI).second)
       continue;
 
-    // This happens when column = 0 in multi-tile operand.
-    if (DefMI->getOpcode() == X86::COPY) {
-      MachineInstr *MI = MRI->getVRegDef(DefMI->getOperand(1).getReg());
-      if (MI && MI->isMoveImmediate())
-        continue;
-    }
-
     if (DefMI->isPHI()) {
       for (unsigned I = 1; I < DefMI->getNumOperands(); I += 2)
         if (isLoopBackEdge(DefMBB, DefMI->getOperand(I + 1).getMBB()))
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index 76979e37c4618..72f38133e21ff 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -597,10 +597,6 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
       Reserved.set(*AI);
   }
 
-  // Reserve low half pair registers in case they are used by RA aggressively.
-  Reserved.set(X86::TMM0_TMM1);
-  Reserved.set(X86::TMM2_TMM3);
-
   assert(checkAllSuperRegsMarked(Reserved,
                                  {X86::SIL, X86::DIL, X86::BPL, X86::SPL,
                                   X86::SIH, X86::DIH, X86::BPH, X86::SPH}));
@@ -621,7 +617,7 @@ unsigned X86RegisterInfo::getNumSupportedRegs(const MachineFunction &MF) const {
   // and try to return the minimum number of registers supported by the target.
   static_assert((X86::R15WH + 1 == X86::YMM0) && (X86::YMM15 + 1 == X86::K0) &&
                     (X86::K6_K7 + 1 == X86::TMMCFG) &&
-                    (X86::TMM6_TMM7 + 1 == X86::R16) &&
+                    (X86::TMM7 + 1 == X86::R16) &&
                     (X86::R31WH + 1 == X86::NUM_TARGET_REGS),
                 "Register number may be incorrect");
 
@@ -694,8 +690,7 @@ bool X86RegisterInfo::isFixedRegister(const MachineFunction &MF,
 }
 
 bool X86RegisterInfo::isTileRegisterClass(const TargetRegisterClass *RC) const {
-  return RC->getID() == X86::TILERegClassID ||
-         RC->getID() == X86::TILEPAIRRegClassID;
+  return RC->getID() == X86::TILERegClassID;
 }
 
 void X86RegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const {
@@ -1062,17 +1057,9 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM,
   case X86::PTDPFP16PSV:
   case X86::PTCMMIMFP16PSV:
   case X86::PTCMMRLFP16PSV:
-  case X86::PTTRANSPOSEDV:
-  case X86::PTTDPBF16PSV:
-  case X86::PTTDPFP16PSV:
-  case X86::PTTCMMIMFP16PSV:
-  case X86::PTTCMMRLFP16PSV:
-  case X86::PTCONJTCMMIMFP16PSV:
-  case X86::PTCONJTFP16V:
   case X86::PTILELOADDRSV:
   case X86::PTILELOADDRST1V:
   case X86::PTMMULTF32PSV:
-  case X86::PTTMMULTF32PSV:
   case X86::PTDPBF8PSV:
   case X86::PTDPBHF8PSV:
   case X86::PTDPHBF8PSV:
@@ -1083,56 +1070,7 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM,
     VRM->assignVirt2Shape(VirtReg, Shape);
     return Shape;
   }
-  case X86::PT2RPNTLVWZ0V:
-  case X86::PT2RPNTLVWZ0T1V:
-  case X86::PT2RPNTLVWZ1V:
-  case X86::PT2RPNTLVWZ1T1V:
-  case X86::PT2RPNTLVWZ0RSV:
-  case X86::PT2RPNTLVWZ0RST1V:
-  case X86::PT2RPNTLVWZ1RSV:
-  case X86::PT2RPNTLVWZ1RST1V: {
-    MachineOperand &MO1 = MI->getOperand(1);
-    MachineOperand &MO2 = MI->getOperand(2);
-    MachineOperand &MO3 = MI->getOperand(3);
-    ShapeT Shape({&MO1, &MO2, &MO1, &MO3}, MRI);
-    VRM->assignVirt2Shape(VirtReg, Shape);
-    return Shape;
-  }
-  }
-}
-
-static bool canHintShape(ShapeT &PhysShape, ShapeT &VirtShape) {
-  unsigned PhysShapeNum = PhysShape.getShapeNum();
-  unsigned VirtShapeNum = VirtShape.getShapeNum();
-
-  if (PhysShapeNum < VirtShapeNum)
-    return false;
-
-  if (PhysShapeNum == VirtShapeNum) {
-    if (PhysShapeNum == 1)
-      return PhysShape == VirtShape;
-
-    for (unsigned I = 0; I < PhysShapeNum; I++) {
-      ShapeT PShape(PhysShape.getRow(I), PhysShape.getCol(I));
-      ShapeT VShape(VirtShape.getRow(I), VirtShape.getCol(I));
-      if (VShape != PShape)
-        return false;
-    }
-    return true;
-  }
-
-  // Hint subreg of mult-tile reg to single tile reg.
-  if (VirtShapeNum == 1) {
-    for (unsigned I = 0; I < PhysShapeNum; I++) {
-      ShapeT PShape(PhysShape.getRow(I), PhysShape.getCol(I));
-      if (VirtShape == PShape)
-        return true;
-    }
   }
-
-  // Note: Currently we have no requirement for case of
-  // (VirtShapeNum > 1 and PhysShapeNum > VirtShapeNum)
-  return false;
 }
 
 bool X86RegisterInfo::getRegAllocationHints(Register VirtReg,
@@ -1153,7 +1091,7 @@ bool X86RegisterInfo::getRegAllocationHints(Register VirtReg,
   if (!VRM)
     return BaseImplRetVal;
 
-  if (ID != X86::TILERegClassID && ID != X86::TILEPAIRRegClassID) {
+  if (ID != X86::TILERegClassID) {
     if (DisableRegAllocNDDHints || !ST.hasNDD() ||
         !TRI.isGeneralPurposeRegisterClass(&RC))
       return BaseImplRetVal;
@@ -1204,7 +1142,7 @@ bool X86RegisterInfo::getRegAllocationHints(Register VirtReg,
       return;
     }
     ShapeT PhysShape = getTileShape(VReg, const_cast<VirtRegMap *>(VRM), MRI);
-    if (canHintShape(PhysShape, VirtShape))
+    if (PhysShape == VirtShape)
       Hints.push_back(PhysReg);
   };
 
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td
index 99b7910131dc5..692e42ae5e752 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.td
+++ b/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -30,8 +30,6 @@ let Namespace = "X86" in {
   def sub_ymm      : SubRegIndex<256>;
   def sub_mask_0   : SubRegIndex<-1>;
   def sub_mask_1   : SubRegIndex<-1, -1>;
-  def sub_t0       : SubRegIndex<8192>;
-  def sub_t1       : SubRegIndex<8192, 8192>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -432,10 +430,6 @@ def TMM4:  X86Reg<"tmm4",   4>;
 def TMM5:  X86Reg<"tmm5",   5>;
 def TMM6:  X86Reg<"tmm6",   6>;
 def TMM7:  X86Reg<"tmm7",   7>;
-// TMM register pairs
-def TPAIRS : RegisterTuples<[sub_t0, sub_t1],
-                            [(add TMM0, TMM2, TMM4, TMM6),
-                             (add TMM1, TMM3, TMM5, TMM7)]>;
 }
 
 // Floating point stack registers. These don't map one-to-one to the FP
@@ -862,9 +856,6 @@ def VK64WM  : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;}
 let CopyCost = -1 in // Don't allow copying of tile registers
 def TILE : RegisterClass<"X86", [x86amx], 8192,
                          (sequence "TMM%u", 0, 7)> {let Size = 8192;}
-// Need check alignment 3rd operand size=1024*2*8
-let isAllocatable = 1 in
-def TILEPAIR : RegisterClass<"X86", [untyped], 512, (add TPAIRS)> {let Size = 16384;}
 
 //===----------------------------------------------------------------------===//
 // Register categories.
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 3d8d0a236a3c1..0b1430e373fc7 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -6562,7 +6562,7 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller,
 
 bool X86TTIImpl::areTypesABICompatible(const Function *Caller,
                                        const Function *Callee,
-                                       const ArrayRef<Type *> &Types) const {
+                                       ArrayRef<Type *> Types) const {
   if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
     return false;
 
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 133b3668a46c8..de5e1c297b1e4 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -296,7 +296,7 @@ class X86TTIImpl final : public BasicTTIImplBase<X86TTIImpl> {
   bool areInlineCompatible(const Function *Caller,
                            const Function *Callee) const override;
   bool areTypesABICompatible(const Function *Caller, const Function *Callee,
-                             const ArrayRef<Type *> &Type) const override;
+                             ArrayRef<Type *> Type) const override;
 
   uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override {
     return ST->getMaxInlineSizeThreshold();
diff --git a/llvm/lib/Target/X86/X86TileConfig.cpp b/llvm/lib/Target/X86/X86TileConfig.cpp
index 17a44dde6480f..09ef8fbc12de9 100644
--- a/llvm/lib/Target/X86/X86TileConfig.cpp
+++ b/llvm/lib/Target/X86/X86TileConfig.cpp
@@ -74,63 +74,6 @@ INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy)
 INITIALIZE_PASS_END(X86TileConfig, DEBUG_TYPE, "Tile Register Configure", false,
                     false)
 
-unsigned getAMXRegNum(MachineRegisterInfo *MRI, Register Reg) {
-  if (Reg.isVirtual()) {
-    unsigned RegClassID = MRI->getRegClass(Reg)->getID();
-    if (RegClassID == X86::TILERegClassID)
-      return 1;
-    if (RegClassID == X86::TILEPAIRRegClassID)
-      return 2;
-  } else {
-    if (Reg >= X86::TMM0 && Reg <= X86::TMM7)
-      return 1;
-    if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7)
-      return 2;
-  }
-  return 0;
-}
-
-static void collectVirtRegShapes(MachineRegisterInfo *MRI, VirtRegMap &VRM,
-                                 Register VirtReg,
-                                 SmallVector<ShapeT, 8> &Phys2Shapes) {
-  unsigned Num = getAMXRegNum(MRI, VirtReg);
-  MCRegister PhysReg = VRM.getPhys(VirtReg);
-  if (!PhysReg)
-    return;
-
-  if (Num == 1) {
-    unsigned Index = PhysReg - X86::TMM0;
-    if (!Phys2Shapes[Index].isValid()) {
-      ShapeT Shape = VRM.getShape(VirtReg);
-      Phys2Shapes[Index] = std::move(Shape);
-      return;
-    }
-  }
-  // Split tile pair shape info to 2 single tile shape info. e.g:
-  // Put TMM0_TMM1's Shape to TMM0's shape + TMM1's Shape in Phys2Shapes.
-  if (Num == 2) {
-    unsigned Index0 = (PhysReg - X86::TMM0_TMM1) * 2;
-    unsigned Index1 = (PhysReg - X86::TMM0_TMM1) * 2 + 1;
-
-    ShapeT Shape = VRM.getShape(VirtReg);
-    assert(Shape.getShapeNum() == 2 && "Unexpected shape number!");
-
-    if (!Phys2Shapes[Index0].isValid()) {
-      ShapeT Shape0(Shape.getRow(0), Shape.getCol(0), MRI);
-      Phys2Shapes[Index0] = std::move(Shape0);
-    }
-
-    if (!Phys2Shapes[Index1].isValid()) {
-      ShapeT Shape1(Shape.getRow(1), Shape.getCol(1), MRI);
-      Phys2Shapes[Index1] = std::move(Shape1);
-    }
-  }
-}
-
-static bool isAMXRegClass(MachineRegisterInfo *MRI, Register Reg) {
-  return getAMXRegNum(MRI, Reg) > 0;
-}
-
 bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) {
   X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
   // Early exit in the common case of non-AMX code.
@@ -138,7 +81,7 @@ bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) {
     return false;
 
   const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
-  const TargetRegisterInfo *TRI = ST.getRegisterInfo();
+  const X86RegisterInfo *TRI = ST.getRegisterInfo();
   const TargetInstrInfo *TII = ST.getInstrInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   LiveIntervals &LIS = getAnalysis<LiveIntervalsWrapperPass>().getLIS();
@@ -176,24 +119,29 @@ bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) {
   assert(ConstMI && "Cannot find an insertion point");
 
   unsigned AMXRegNum = TRI->getRegClass(X86::TILERegClassID)->getNumRegs();
-  SmallVector<ShapeT, 8> Phys2Shapes(AMXRegNum, ShapeT());
+  SmallVector<Register, 8> Phys2Virt(AMXRegNum, 0);
   for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
     Register VirtReg = Register::index2VirtReg(I);
     if (MRI.reg_nodbg_empty(VirtReg))
       continue;
-    if (!isAMXRegClass(&MRI, VirtReg))
+    if (!TRI->isTileRegisterClass(MRI.getRegClass(VirtReg)))
+      continue;
+    MCRegister PhysReg = VRM.getPhys(VirtReg);
+    if (!PhysReg)
       continue;
-    collectVirtRegShapes(&MRI, VRM, VirtReg, Phys2Shapes);
+    unsigned Index = PhysReg - X86::TMM0;
+    if (!Phys2Virt[Index])
+      Phys2Virt[Index] = VirtReg;
   }
 
   // Fill in the shape of each tile physical register.
   for (unsigned I = 0; I < AMXRegNum; ++I) {
-    ShapeT Shape = Phys2Shapes[I];
-    if (!Shape.isValid())
+    if (!Phys2Virt[I])
       continue;
     DebugLoc DL;
     bool IsRow = true;
     MachineInstr *NewMI = nullptr;
+    ShapeT Shape = VRM.getShape(Phys2Virt[I]);
     for (auto &R : {Shape.getRow()->getReg(), Shape.getCol()->getReg()}) {
       // Here is the data format for the tile config.
       // 0      palette
@@ -222,14 +170,7 @@ bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) {
                    "Cannot initialize with different shapes");
             continue;
           }
-          if (DefMI.getOperand(1).isImm()) {
-            Imm = DefMI.getOperand(1).getImm();
-          } else {
-            assert(DefMI.getOpcode() == X86::MOV32r0 &&
-                   "The opcode is assumed to be MOV32r0 if the operand is not "
-                   "immediate.");
-            Imm = 0;
-          }
+          Imm = DefMI.getOperand(1).getImm();
 
           NewMI = addFrameReference(
                       BuildMI(MF.front(), ++ConstMI->getIterator(), DL,
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index c8d193887d92f..c164762de2966 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -1179,7 +1179,7 @@ static const char *getAMDProcessorTypeAndSubtype(unsigned Family,
                                                  const unsigned *Features,
                                                  unsigned *Type,
                                                  unsigned *Subtype) {
-  const char *CPU = 0;
+  const char *CPU = nullptr;
 
   switch (Family) {
   case 4:
@@ -2192,7 +2192,6 @@ StringMap<bool> sys::getHostCPUFeatures() {
   bool HasLeaf1E = MaxLevel >= 0x1e &&
                    !getX86CpuIDAndInfoEx(0x1e, 0x1, &EAX, &EBX, &ECX, &EDX);
   Features["amx-fp8"] = HasLeaf1E && ((EAX >> 4) & 1) && HasAMXSave;
-  Features["amx-transpose"] = HasLeaf1E && ((EAX >> 5) & 1) && HasAMXSave;
   Features["amx-tf32"] = HasLeaf1E && ((EAX >> 6) & 1) && HasAMXSave;
   Features["amx-avx512"] = HasLeaf1E && ((EAX >> 7) & 1) && HasAMXSave;
   Features["amx-movrs"] = HasLeaf1E && ((EAX >> 8) & 1) && HasAMXSave;
diff --git a/llvm/lib/TargetParser/PPCTargetParser.cpp b/llvm/lib/TargetParser/PPCTargetParser.cpp
index d51044529a49d..f74d670df4306 100644
--- a/llvm/lib/TargetParser/PPCTargetParser.cpp
+++ b/llvm/lib/TargetParser/PPCTargetParser.cpp
@@ -48,9 +48,9 @@ StringRef normalizeCPUName(StringRef CPUName) {
   // accepting it. Clang has always ignored it and passed the
   // generic CPU ID to the back end.
   return StringSwitch<StringRef>(CPUName)
-      .Cases("common", "405", "generic")
-      .Cases("ppc440", "440fp", "440")
-      .Cases("630", "power3", "pwr3")
+      .Cases({"common", "405"}, "generic")
+      .Cases({"ppc440", "440fp"}, "440")
+      .Cases({"630", "power3"}, "pwr3")
       .Case("G3", "g3")
       .Case("G4", "g4")
       .Case("G4+", "g4+")
@@ -69,7 +69,7 @@ StringRef normalizeCPUName(StringRef CPUName) {
       .Case("power9", "pwr9")
       .Case("power10", "pwr10")
       .Case("power11", "pwr11")
-      .Cases("powerpc", "powerpc32", "ppc")
+      .Cases({"powerpc", "powerpc32"}, "ppc")
       .Case("powerpc64", "ppc64")
       .Case("powerpc64le", "ppc64le")
       .Default(CPUName);
diff --git a/llvm/lib/TargetParser/TargetDataLayout.cpp b/llvm/lib/TargetParser/TargetDataLayout.cpp
index d765d9ccb284d..d7359234b02f7 100644
--- a/llvm/lib/TargetParser/TargetDataLayout.cpp
+++ b/llvm/lib/TargetParser/TargetDataLayout.cpp
@@ -208,7 +208,7 @@ static std::string computeMipsDataLayout(const Triple &TT, StringRef ABIName) {
   return Ret;
 }
 
-static std::string computePowerDataLayout(const Triple &T) {
+static std::string computePowerDataLayout(const Triple &T, StringRef ABIName) {
   bool is64Bit = T.isPPC64();
   std::string Ret;
 
@@ -228,7 +228,8 @@ static std::string computePowerDataLayout(const Triple &T) {
   // If the target ABI uses function descriptors, then the alignment of function
   // pointers depends on the alignment used to emit the descriptor. Otherwise,
   // function pointers are aligned to 32 bits because the instructions must be.
-  if ((T.getArch() == Triple::ppc64 && !T.isPPC64ELFv2ABI())) {
+  if ((T.getArch() == Triple::ppc64 &&
+       (!T.isPPC64ELFv2ABI() && ABIName != "elfv2"))) {
     Ret += "-Fi64";
   } else if (T.isOSAIX()) {
     Ret += is64Bit ? "-Fi64" : "-Fi32";
@@ -573,7 +574,7 @@ std::string Triple::computeDataLayout(StringRef ABIName) const {
   case Triple::ppcle:
   case Triple::ppc64:
   case Triple::ppc64le:
-    return computePowerDataLayout(*this);
+    return computePowerDataLayout(*this, ABIName);
   case Triple::r600:
   case Triple::amdgcn:
     return computeAMDDataLayout(*this);
diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp
index b13c795c1649c..37e8ad986aa55 100644
--- a/llvm/lib/TargetParser/X86TargetParser.cpp
+++ b/llvm/lib/TargetParser/X86TargetParser.cpp
@@ -143,7 +143,7 @@ constexpr FeatureBitset FeaturesDiamondRapids =
     FeatureAVXVNNIINT8 | FeatureAVXVNNIINT16 | FeatureSHA512 | FeatureSM3 |
     FeatureSM4 | FeatureEGPR | FeatureZU | FeatureCCMP | FeaturePush2Pop2 |
     FeaturePPX | FeatureNDD | FeatureNF | FeatureMOVRS | FeatureAMX_MOVRS |
-    FeatureAMX_AVX512 | FeatureAMX_FP8 | FeatureAMX_TF32 | FeatureAMX_TRANSPOSE;
+    FeatureAMX_AVX512 | FeatureAMX_FP8 | FeatureAMX_TF32;
 
 // Intel Atom processors.
 // Bonnell has feature parity with Core2 and adds MOVBE.
@@ -615,7 +615,6 @@ constexpr FeatureBitset ImpliedFeaturesAMX_FP16 = FeatureAMX_TILE;
 constexpr FeatureBitset ImpliedFeaturesAMX_INT8 = FeatureAMX_TILE;
 constexpr FeatureBitset ImpliedFeaturesAMX_COMPLEX = FeatureAMX_TILE;
 constexpr FeatureBitset ImpliedFeaturesAMX_FP8 = FeatureAMX_TILE;
-constexpr FeatureBitset ImpliedFeaturesAMX_TRANSPOSE = FeatureAMX_TILE;
 constexpr FeatureBitset ImpliedFeaturesAMX_MOVRS = FeatureAMX_TILE;
 constexpr FeatureBitset ImpliedFeaturesAMX_AVX512 =
     FeatureAMX_TILE | FeatureAVX10_2;
diff --git a/llvm/lib/TextAPI/BinaryReader/DylibReader.cpp b/llvm/lib/TextAPI/BinaryReader/DylibReader.cpp
index cda07e81faf1e..f55bc9c1a28c2 100644
--- a/llvm/lib/TextAPI/BinaryReader/DylibReader.cpp
+++ b/llvm/lib/TextAPI/BinaryReader/DylibReader.cpp
@@ -32,7 +32,7 @@ using namespace llvm::MachO;
 using namespace llvm::MachO::DylibReader;
 
 using TripleVec = std::vector<Triple>;
-static typename TripleVec::iterator emplace(TripleVec &Container, Triple &&T) {
+static TripleVec::iterator emplace(TripleVec &Container, Triple &&T) {
   auto I = partition_point(Container, [=](const Triple &CT) {
     return std::forward_as_tuple(CT.getArch(), CT.getOS(),
                                  CT.getEnvironment()) <
diff --git a/llvm/lib/TextAPI/RecordVisitor.cpp b/llvm/lib/TextAPI/RecordVisitor.cpp
index d333b33092263..24971a70f2ddf 100644
--- a/llvm/lib/TextAPI/RecordVisitor.cpp
+++ b/llvm/lib/TextAPI/RecordVisitor.cpp
@@ -15,7 +15,7 @@
 using namespace llvm;
 using namespace llvm::MachO;
 
-RecordVisitor::~RecordVisitor() {}
+RecordVisitor::~RecordVisitor() = default;
 void RecordVisitor::visitObjCInterface(const ObjCInterfaceRecord &) {}
 void RecordVisitor::visitObjCCategory(const ObjCCategoryRecord &) {}
 
diff --git a/llvm/lib/Transforms/Coroutines/CoroCloner.h b/llvm/lib/Transforms/Coroutines/CoroCloner.h
index e05fe28cb91f5..1e549f122b6ba 100644
--- a/llvm/lib/Transforms/Coroutines/CoroCloner.h
+++ b/llvm/lib/Transforms/Coroutines/CoroCloner.h
@@ -77,7 +77,7 @@ class BaseCloner {
       : OrigF(OrigF), Suffix(Suffix), Shape(Shape), FKind(FKind),
         Builder(OrigF.getContext()), TTI(TTI) {}
 
-  virtual ~BaseCloner() {}
+  virtual ~BaseCloner() = default;
 
   /// Create a clone for a continuation lowering.
   static Function *createClone(Function &OrigF, const Twine &Suffix,
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 50485615a9d4c..5ed47aec08b25 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -3619,7 +3619,7 @@ struct AAIntraFnReachabilityFunction final
       return true;
 
     RQITy StackRQI(A, From, To, ExclusionSet, false);
-    typename RQITy::Reachable Result;
+    RQITy::Reachable Result;
     if (!NonConstThis->checkQueryCache(A, StackRQI, Result))
       return NonConstThis->isReachableImpl(A, StackRQI,
                                            /*IsTemporaryRQI=*/true);
@@ -10701,7 +10701,7 @@ struct AAInterFnReachabilityFunction
     auto *NonConstThis = const_cast<AAInterFnReachabilityFunction *>(this);
 
     RQITy StackRQI(A, From, To, ExclusionSet, false);
-    typename RQITy::Reachable Result;
+    RQITy::Reachable Result;
     if (!NonConstThis->checkQueryCache(A, StackRQI, Result))
       return NonConstThis->isReachableImpl(A, StackRQI,
                                            /*IsTemporaryRQI=*/true);
diff --git a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
index 042578d26818a..6a11aec6c5cb0 100644
--- a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
+++ b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
@@ -380,7 +380,7 @@ bool ExpandVariadics::runOnModule(Module &M) {
           if (CB->isIndirectCall()) {
             FunctionType *FTy = CB->getFunctionType();
             if (FTy->isVarArg())
-              Changed |= expandCall(M, Builder, CB, FTy, 0);
+              Changed |= expandCall(M, Builder, CB, FTy, /*NF=*/nullptr);
           }
         }
       }
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 894d83fa530b1..d35ae4730a9f3 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -1034,11 +1034,11 @@ class IndexCallsiteContextGraph
 } // namespace
 
 template <>
-struct llvm::DenseMapInfo<typename CallsiteContextGraph<
+struct llvm::DenseMapInfo<CallsiteContextGraph<
     ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
     : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
 template <>
-struct llvm::DenseMapInfo<typename CallsiteContextGraph<
+struct llvm::DenseMapInfo<CallsiteContextGraph<
     IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
     : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
 template <>
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index d7eb745c81317..2a87a0f9aaa99 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -208,7 +208,7 @@ namespace KernelInfo {
 // };
 
 #define KERNEL_ENVIRONMENT_IDX(MEMBER, IDX)                                    \
-  constexpr const unsigned MEMBER##Idx = IDX;
+  constexpr unsigned MEMBER##Idx = IDX;
 
 KERNEL_ENVIRONMENT_IDX(Configuration, 0)
 KERNEL_ENVIRONMENT_IDX(Ident, 1)
@@ -216,7 +216,7 @@ KERNEL_ENVIRONMENT_IDX(Ident, 1)
 #undef KERNEL_ENVIRONMENT_IDX
 
 #define KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MEMBER, IDX)                      \
-  constexpr const unsigned MEMBER##Idx = IDX;
+  constexpr unsigned MEMBER##Idx = IDX;
 
 KERNEL_ENVIRONMENT_CONFIGURATION_IDX(UseGenericStateMachine, 0)
 KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MayUseNestedParallelism, 1)
@@ -258,7 +258,7 @@ KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MaxTeams)
 
 GlobalVariable *
 getKernelEnvironementGVFromKernelInitCB(CallBase *KernelInitCB) {
-  constexpr const int InitKernelEnvironmentArgNo = 0;
+  constexpr int InitKernelEnvironmentArgNo = 0;
   return cast<GlobalVariable>(
       KernelInitCB->getArgOperand(InitKernelEnvironmentArgNo)
           ->stripPointerCasts());
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 3ddf182149e57..cbaff294819a2 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -3997,6 +3997,27 @@ static Value *foldOrUnsignedUMulOverflowICmp(BinaryOperator &I,
   return nullptr;
 }
 
+/// Fold select(X >s 0, 0, -X) | smax(X, 0) --> abs(X)
+///      select(X <s 0, -X, 0) | smax(X, 0) --> abs(X)
+static Value *FoldOrOfSelectSmaxToAbs(BinaryOperator &I,
+                                      InstCombiner::BuilderTy &Builder) {
+  Value *X;
+  Value *Sel;
+  if (match(&I,
+            m_c_Or(m_Value(Sel), m_OneUse(m_SMax(m_Value(X), m_ZeroInt()))))) {
+    auto NegX = m_Neg(m_Specific(X));
+    if (match(Sel, m_Select(m_SpecificICmp(ICmpInst::ICMP_SGT, m_Specific(X),
+                                           m_ZeroInt()),
+                            m_ZeroInt(), NegX)) ||
+        match(Sel, m_Select(m_SpecificICmp(ICmpInst::ICMP_SLT, m_Specific(X),
+                                           m_ZeroInt()),
+                            NegX, m_ZeroInt())))
+      return Builder.CreateBinaryIntrinsic(Intrinsic::abs, X,
+                                           Builder.getFalse());
+  }
+  return nullptr;
+}
+
 // FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
 // here. We should standardize that construct where it is needed or choose some
 // other way to ensure that commutated variants of patterns are not missed.
@@ -4545,6 +4566,9 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
     if (Value *V = SimplifyAddWithRemainder(I))
       return replaceInstUsesWith(I, V);
 
+  if (Value *Res = FoldOrOfSelectSmaxToAbs(I, Builder))
+    return replaceInstUsesWith(I, Res);
+
   return nullptr;
 }
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 8d9933bfab938..92fca90ddb88a 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3496,7 +3496,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
       if (isPowerOf2_64(AlignMask + 1)) {
         uint64_t Offset = 0;
         match(A, m_Add(m_Value(A), m_ConstantInt(Offset)));
-        if (match(A, m_PtrToInt(m_Value(A)))) {
+        if (match(A, m_PtrToIntOrAddr(m_Value(A)))) {
           /// Note: this doesn't preserve the offset information but merges
           /// offset and alignment.
           /// TODO: we can generate a GEP instead of merging the alignment with
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index f939e7aa78c33..614c6ebd63be6 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -2148,7 +2148,7 @@ Instruction *InstCombinerImpl::visitIntToPtr(IntToPtrInst &CI) {
   return nullptr;
 }
 
-Value *InstCombinerImpl::foldPtrToIntOfGEP(Type *IntTy, Value *Ptr) {
+Value *InstCombinerImpl::foldPtrToIntOrAddrOfGEP(Type *IntTy, Value *Ptr) {
   // Look through chain of one-use GEPs.
   Type *PtrTy = Ptr->getType();
   SmallVector<GEPOperator *> GEPs;
@@ -2210,7 +2210,7 @@ Instruction *InstCombinerImpl::visitPtrToInt(PtrToIntInst &CI) {
       Mask->getType() == Ty)
     return BinaryOperator::CreateAnd(Builder.CreatePtrToInt(Ptr, Ty), Mask);
 
-  if (Value *V = foldPtrToIntOfGEP(Ty, SrcOp))
+  if (Value *V = foldPtrToIntOrAddrOfGEP(Ty, SrcOp))
     return replaceInstUsesWith(CI, V);
 
   Value *Vec, *Scalar, *Index;
@@ -2228,6 +2228,21 @@ Instruction *InstCombinerImpl::visitPtrToInt(PtrToIntInst &CI) {
 }
 
 Instruction *InstCombinerImpl::visitPtrToAddr(PtrToAddrInst &CI) {
+  Value *SrcOp = CI.getPointerOperand();
+  Type *Ty = CI.getType();
+
+  // (ptrtoaddr (ptrmask P, M))
+  //    -> (and (ptrtoaddr P), M)
+  // This is generally beneficial as `and` is better supported than `ptrmask`.
+  Value *Ptr, *Mask;
+  if (match(SrcOp, m_OneUse(m_Intrinsic<Intrinsic::ptrmask>(m_Value(Ptr),
+                                                            m_Value(Mask)))) &&
+      Mask->getType() == Ty)
+    return BinaryOperator::CreateAnd(Builder.CreatePtrToAddr(Ptr), Mask);
+
+  if (Value *V = foldPtrToIntOrAddrOfGEP(Ty, SrcOp))
+    return replaceInstUsesWith(CI, V);
+
   // FIXME: Implement variants of ptrtoint folds.
   return commonCastTransforms(CI);
 }
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 9c75d9a6711b9..d85e4f7590197 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -700,7 +700,7 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
   /// folded operation.
   void PHIArgMergedDebugLoc(Instruction *Inst, PHINode &PN);
 
-  Value *foldPtrToIntOfGEP(Type *IntTy, Value *Ptr);
+  Value *foldPtrToIntOrAddrOfGEP(Type *IntTy, Value *Ptr);
   Instruction *foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, CmpPredicate Cond,
                            Instruction &I);
   Instruction *foldSelectICmp(CmpPredicate Pred, SelectInst *SI, Value *RHS,
diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index 5ba2167859490..cc53ec2c0f2f3 100644
--- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -1957,8 +1957,12 @@ Value *DataFlowSanitizer::getShadowAddress(Value *Addr,
 Value *DataFlowSanitizer::getShadowAddress(Value *Addr,
                                            BasicBlock::iterator Pos) {
   IRBuilder<> IRB(Pos->getParent(), Pos);
-  Value *ShadowOffset = getShadowOffset(Addr, IRB);
-  return getShadowAddress(Addr, Pos, ShadowOffset);
+  Value *ShadowAddr = getShadowOffset(Addr, IRB);
+  uint64_t ShadowBase = MapParams->ShadowBase;
+  if (ShadowBase != 0)
+    ShadowAddr =
+        IRB.CreateAdd(ShadowAddr, ConstantInt::get(IntptrTy, ShadowBase));
+  return getShadowAddress(Addr, Pos, ShadowAddr);
 }
 
 Value *DFSanFunction::combineShadowsThenConvert(Type *T, Value *V1, Value *V2,
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index 7795cce9d9d3c..b5548d4f24a2f 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -69,14 +69,6 @@ namespace llvm {
 // Command line option to enable vtable value profiling. Defined in
 // ProfileData/InstrProf.cpp: -enable-vtable-value-profiling=
 extern cl::opt<bool> EnableVTableValueProfiling;
-// TODO: Remove -debug-info-correlate in next LLVM release, in favor of
-// -profile-correlate=debug-info.
-cl::opt<bool> DebugInfoCorrelate(
-    "debug-info-correlate",
-    cl::desc("Use debug info to correlate profiles. (Deprecated, use "
-             "-profile-correlate=debug-info)"),
-    cl::init(false));
-
 LLVM_ABI cl::opt<InstrProfCorrelator::ProfCorrelatorKind> ProfileCorrelate(
     "profile-correlate",
     cl::desc("Use debug info or binary file to correlate profiles."),
@@ -1047,7 +1039,7 @@ void InstrLowerer::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
   // in lightweight mode. We need to move the value profile pointer to the
   // Counter struct to get this working.
   assert(
-      !DebugInfoCorrelate && ProfileCorrelate == InstrProfCorrelator::NONE &&
+      ProfileCorrelate == InstrProfCorrelator::NONE &&
       "Value profiling is not yet supported with lightweight instrumentation");
   GlobalVariable *Name = Ind->getName();
   auto It = ProfileDataMap.find(Name);
@@ -1504,7 +1496,7 @@ static inline Constant *getVTableAddrForProfData(GlobalVariable *GV) {
 }
 
 void InstrLowerer::getOrCreateVTableProfData(GlobalVariable *GV) {
-  assert(!DebugInfoCorrelate &&
+  assert(ProfileCorrelate != InstrProfCorrelator::DEBUG_INFO &&
          "Value profiling is not supported with lightweight instrumentation");
   if (GV->isDeclaration() || GV->hasAvailableExternallyLinkage())
     return;
@@ -1584,8 +1576,7 @@ GlobalVariable *InstrLowerer::setupProfileSection(InstrProfInstBase *Inc,
 
   // Use internal rather than private linkage so the counter variable shows up
   // in the symbol table when using debug info for correlation.
-  if ((DebugInfoCorrelate ||
-       ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO) &&
+  if (ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO &&
       TT.isOSBinFormatMachO() && Linkage == GlobalValue::PrivateLinkage)
     Linkage = GlobalValue::InternalLinkage;
 
@@ -1691,8 +1682,7 @@ InstrLowerer::getOrCreateRegionCounters(InstrProfCntrInstBase *Inc) {
   auto *CounterPtr = setupProfileSection(Inc, IPSK_cnts);
   PD.RegionCounters = CounterPtr;
 
-  if (DebugInfoCorrelate ||
-      ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO) {
+  if (ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO) {
     LLVMContext &Ctx = M.getContext();
     Function *Fn = Inc->getParent()->getParent();
     if (auto *SP = Fn->getSubprogram()) {
@@ -1737,7 +1727,7 @@ InstrLowerer::getOrCreateRegionCounters(InstrProfCntrInstBase *Inc) {
 void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
   // When debug information is correlated to profile data, a data variable
   // is not needed.
-  if (DebugInfoCorrelate || ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO)
+  if (ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO)
     return;
 
   GlobalVariable *NamePtr = Inc->getName();
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
index a6ec6c1207767..b72d41a748857 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
@@ -127,15 +127,19 @@ static uint64_t computeStackId(const memprof::Frame &Frame) {
   return computeStackId(Frame.Function, Frame.LineOffset, Frame.Column);
 }
 
+static AllocationType getAllocType(const AllocationInfo *AllocInfo) {
+  return getAllocType(AllocInfo->Info.getTotalLifetimeAccessDensity(),
+                      AllocInfo->Info.getAllocCount(),
+                      AllocInfo->Info.getTotalLifetime());
+}
+
 static AllocationType addCallStack(CallStackTrie &AllocTrie,
                                    const AllocationInfo *AllocInfo,
                                    uint64_t FullStackId) {
   SmallVector<uint64_t> StackIds;
   for (const auto &StackFrame : AllocInfo->CallStack)
     StackIds.push_back(computeStackId(StackFrame));
-  auto AllocType = getAllocType(AllocInfo->Info.getTotalLifetimeAccessDensity(),
-                                AllocInfo->Info.getAllocCount(),
-                                AllocInfo->Info.getTotalLifetime());
+  auto AllocType = getAllocType(AllocInfo);
   std::vector<ContextTotalSize> ContextSizeInfo;
   if (recordContextSizeInfoForAnalysis()) {
     auto TotalSize = AllocInfo->Info.getTotalSize();
@@ -216,7 +220,6 @@ static void HandleUnsupportedAnnotationKinds(GlobalVariable &GVar,
   }
   LLVM_DEBUG(dbgs() << "Skip annotation for " << GVar.getName() << " due to "
                     << Reason << ".\n");
-  return;
 }
 
 struct AllocMatchInfo {
@@ -406,22 +409,39 @@ handleAllocSite(Instruction &I, CallBase *CI,
                 const std::set<const AllocationInfo *> &AllocInfoSet,
                 std::map<std::pair<uint64_t, unsigned>, AllocMatchInfo>
                     &FullStackIdToAllocMatchInfo) {
+  // TODO: Remove this once the profile creation logic deduplicates contexts
+  // that are the same other than the IsInlineFrame bool. Until then, keep the
+  // largest.
+  DenseMap<uint64_t, const AllocationInfo *> UniqueFullContextIdAllocInfo;
+  for (auto *AllocInfo : AllocInfoSet) {
+    auto FullStackId = computeFullStackId(AllocInfo->CallStack);
+    auto [It, Inserted] =
+        UniqueFullContextIdAllocInfo.insert({FullStackId, AllocInfo});
+    // If inserted entry, done.
+    if (Inserted)
+      continue;
+    // Keep the larger one, or the noncold one if they are the same size.
+    auto CurSize = It->second->Info.getTotalSize();
+    auto NewSize = AllocInfo->Info.getTotalSize();
+    if ((CurSize > NewSize) ||
+        (CurSize == NewSize &&
+         getAllocType(AllocInfo) != AllocationType::NotCold))
+      continue;
+    It->second = AllocInfo;
+  }
   // We may match this instruction's location list to multiple MIB
   // contexts. Add them to a Trie specialized for trimming the contexts to
   // the minimal needed to disambiguate contexts with unique behavior.
   CallStackTrie AllocTrie(&ORE, MaxColdSize);
   uint64_t TotalSize = 0;
   uint64_t TotalColdSize = 0;
-  for (auto *AllocInfo : AllocInfoSet) {
+  for (auto &[FullStackId, AllocInfo] : UniqueFullContextIdAllocInfo) {
     // Check the full inlined call stack against this one.
     // If we found and thus matched all frames on the call, include
     // this MIB.
     if (stackFrameIncludesInlinedCallStack(AllocInfo->CallStack,
                                            InlinedCallStack)) {
       NumOfMemProfMatchedAllocContexts++;
-      uint64_t FullStackId = 0;
-      if (ClPrintMemProfMatchInfo || recordContextSizeInfoForAnalysis())
-        FullStackId = computeFullStackId(AllocInfo->CallStack);
       auto AllocType = addCallStack(AllocTrie, AllocInfo, FullStackId);
       TotalSize += AllocInfo->Info.getTotalSize();
       if (AllocType == AllocationType::Cold)
diff --git a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp
index 80e77e099c695..a2fad021e0480 100644
--- a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp
@@ -161,7 +161,7 @@ template <char NsanTypeId>
 class ShadowTypeConfigImpl : public ShadowTypeConfig {
 public:
   char getNsanTypeId() const override { return NsanTypeId; }
-  static constexpr const char kNsanTypeId = NsanTypeId;
+  static constexpr char kNsanTypeId = NsanTypeId;
 };
 
 // `double` (`d`) shadow type.
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index 71736cfa4d89a..af53fa0bae468 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -456,7 +456,7 @@ createIRLevelProfileFlagVar(Module &M,
     ProfileVersion |= VARIANT_MASK_INSTR_ENTRY;
   if (PGOInstrumentLoopEntries)
     ProfileVersion |= VARIANT_MASK_INSTR_LOOP_ENTRIES;
-  if (DebugInfoCorrelate || ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO)
+  if (ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO)
     ProfileVersion |= VARIANT_MASK_DBG_CORRELATE;
   if (PGOFunctionEntryCoverage)
     ProfileVersion |=
diff --git a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
index 78d4a57ecea87..87eba5f2c5242 100644
--- a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
@@ -58,6 +58,18 @@ static cl::opt<bool>
                           cl::desc("Writes always set the type"), cl::Hidden,
                           cl::init(false));
 
+static cl::opt<bool> ClOutlineInstrumentation(
+    "tysan-outline-instrumentation",
+    cl::desc("Uses function calls for all TySan instrumentation, reducing "
+             "ELF size"),
+    cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClVerifyOutlinedInstrumentation(
+    "tysan-verify-outlined-instrumentation",
+    cl::desc("Check types twice with both inlined instrumentation and "
+             "function calls. This verifies that they behave the same."),
+    cl::Hidden, cl::init(false));
+
 STATISTIC(NumInstrumentedAccesses, "Number of instrumented accesses");
 
 namespace {
@@ -105,12 +117,16 @@ struct TypeSanitizer {
   Regex AnonNameRegex;
   Type *IntptrTy;
   uint64_t PtrShift;
-  IntegerType *OrdTy;
+  IntegerType *OrdTy, *U64Ty;
 
   /// Callbacks to run-time library are computed in initializeCallbacks.
   FunctionCallee TysanCheck;
   FunctionCallee TysanCtorFunction;
 
+  FunctionCallee TysanIntrumentMemInst;
+  FunctionCallee TysanInstrumentWithShadowUpdate;
+  FunctionCallee TysanSetShadowType;
+
   /// Callback to set types for gloabls.
   Function *TysanGlobalsSetTypeFunction;
 };
@@ -130,6 +146,8 @@ TypeSanitizer::TypeSanitizer(Module &M)
 void TypeSanitizer::initializeCallbacks(Module &M) {
   IRBuilder<> IRB(M.getContext());
   OrdTy = IRB.getInt32Ty();
+  U64Ty = IRB.getInt64Ty();
+  Type *BoolType = IRB.getInt1Ty();
 
   AttributeList Attr;
   Attr = Attr.addFnAttribute(M.getContext(), Attribute::NoUnwind);
@@ -144,6 +162,30 @@ void TypeSanitizer::initializeCallbacks(Module &M) {
 
   TysanCtorFunction =
       M.getOrInsertFunction(kTysanModuleCtorName, Attr, IRB.getVoidTy());
+
+  TysanIntrumentMemInst = M.getOrInsertFunction(
+      "__tysan_instrument_mem_inst", Attr, IRB.getVoidTy(),
+      IRB.getPtrTy(), // Pointer of data to be written to
+      IRB.getPtrTy(), // Pointer of data to write
+      U64Ty,          // Size of the data in bytes
+      BoolType        // Do we need to call memmove
+  );
+
+  TysanInstrumentWithShadowUpdate = M.getOrInsertFunction(
+      "__tysan_instrument_with_shadow_update", Attr, IRB.getVoidTy(),
+      IRB.getPtrTy(), // Pointer to data to be read
+      IRB.getPtrTy(), // Pointer to type descriptor
+      BoolType,       // Do we need to type check this
+      U64Ty,          // Size of data we access in bytes
+      OrdTy           // Flags
+  );
+
+  TysanSetShadowType = M.getOrInsertFunction(
+      "__tysan_set_shadow_type", Attr, IRB.getVoidTy(),
+      IRB.getPtrTy(), // Pointer of data to be written to
+      IRB.getPtrTy(), // Pointer to the new type descriptor
+      U64Ty           // Size of data we access in bytes
+  );
 }
 
 void TypeSanitizer::instrumentGlobals(Module &M) {
@@ -587,6 +629,29 @@ bool TypeSanitizer::instrumentWithShadowUpdate(
 
   Value *TD = IRB.CreateBitCast(TDGV, IRB.getPtrTy());
 
+  if (ClOutlineInstrumentation) {
+    if (!ForceSetType && (!ClWritesAlwaysSetType || IsRead)) {
+      // We need to check the type here. If the type is unknown, then the read
+      // sets the type. If the type is known, then it is checked. If the type
+      // doesn't match, then we call the runtime type check (which may yet
+      // determine that the mismatch is okay).
+
+      Constant *Flags =
+          ConstantInt::get(OrdTy, (int)IsRead | (((int)IsWrite) << 1));
+
+      IRB.CreateCall(TysanInstrumentWithShadowUpdate,
+                     {Ptr, TD,
+                      SanitizeFunction ? IRB.getTrue() : IRB.getFalse(),
+                      IRB.getInt64(AccessSize), Flags});
+    } else if (ForceSetType || IsWrite) {
+      // In the mode where writes always set the type, for a write (which does
+      // not also read), we just set the type.
+      IRB.CreateCall(TysanSetShadowType, {Ptr, TD, IRB.getInt64(AccessSize)});
+    }
+
+    return true;
+  }
+
   Value *ShadowDataInt = convertToShadowDataInt(IRB, Ptr, IntptrTy, PtrShift,
                                                 ShadowBase, AppMemMask);
   Type *Int8PtrPtrTy = PointerType::get(IRB.getContext(), 0);
@@ -838,37 +903,47 @@ bool TypeSanitizer::instrumentMemInst(Value *V, Instruction *ShadowBase,
     }
   }
 
-  if (!ShadowBase)
-    ShadowBase = getShadowBase(*F);
-  if (!AppMemMask)
-    AppMemMask = getAppMemMask(*F);
+  if (ClOutlineInstrumentation) {
+    if (!Src)
+      Src = ConstantPointerNull::get(IRB.getPtrTy());
 
-  Value *ShadowDataInt = IRB.CreateAdd(
-      IRB.CreateShl(
-          IRB.CreateAnd(IRB.CreatePtrToInt(Dest, IntptrTy), AppMemMask),
-          PtrShift),
-      ShadowBase);
-  Value *ShadowData = IRB.CreateIntToPtr(ShadowDataInt, IRB.getPtrTy());
-
-  if (!Src) {
-    IRB.CreateMemSet(ShadowData, IRB.getInt8(0), IRB.CreateShl(Size, PtrShift),
-                     Align(1ull << PtrShift));
+    IRB.CreateCall(
+        TysanIntrumentMemInst,
+        {Dest, Src, Size, NeedsMemMove ? IRB.getTrue() : IRB.getFalse()});
     return true;
-  }
-
-  Value *SrcShadowDataInt = IRB.CreateAdd(
-      IRB.CreateShl(
-          IRB.CreateAnd(IRB.CreatePtrToInt(Src, IntptrTy), AppMemMask),
-          PtrShift),
-      ShadowBase);
-  Value *SrcShadowData = IRB.CreateIntToPtr(SrcShadowDataInt, IRB.getPtrTy());
-
-  if (NeedsMemMove) {
-    IRB.CreateMemMove(ShadowData, Align(1ull << PtrShift), SrcShadowData,
-                      Align(1ull << PtrShift), IRB.CreateShl(Size, PtrShift));
   } else {
-    IRB.CreateMemCpy(ShadowData, Align(1ull << PtrShift), SrcShadowData,
-                     Align(1ull << PtrShift), IRB.CreateShl(Size, PtrShift));
+    if (!ShadowBase)
+      ShadowBase = getShadowBase(*F);
+    if (!AppMemMask)
+      AppMemMask = getAppMemMask(*F);
+
+    Value *ShadowDataInt = IRB.CreateAdd(
+        IRB.CreateShl(
+            IRB.CreateAnd(IRB.CreatePtrToInt(Dest, IntptrTy), AppMemMask),
+            PtrShift),
+        ShadowBase);
+    Value *ShadowData = IRB.CreateIntToPtr(ShadowDataInt, IRB.getPtrTy());
+
+    if (!Src) {
+      IRB.CreateMemSet(ShadowData, IRB.getInt8(0),
+                       IRB.CreateShl(Size, PtrShift), Align(1ull << PtrShift));
+      return true;
+    }
+
+    Value *SrcShadowDataInt = IRB.CreateAdd(
+        IRB.CreateShl(
+            IRB.CreateAnd(IRB.CreatePtrToInt(Src, IntptrTy), AppMemMask),
+            PtrShift),
+        ShadowBase);
+    Value *SrcShadowData = IRB.CreateIntToPtr(SrcShadowDataInt, IRB.getPtrTy());
+
+    if (NeedsMemMove) {
+      IRB.CreateMemMove(ShadowData, Align(1ull << PtrShift), SrcShadowData,
+                        Align(1ull << PtrShift), IRB.CreateShl(Size, PtrShift));
+    } else {
+      IRB.CreateMemCpy(ShadowData, Align(1ull << PtrShift), SrcShadowData,
+                       Align(1ull << PtrShift), IRB.CreateShl(Size, PtrShift));
+    }
   }
 
   return true;
@@ -890,6 +965,16 @@ PreservedAnalyses TypeSanitizerPass::run(Module &M,
   for (Function &F : M) {
     const TargetLibraryInfo &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
     TySan.sanitizeFunction(F, TLI);
+    if (ClVerifyOutlinedInstrumentation && ClOutlineInstrumentation) {
+      // Outlined instrumentation is a new option, and so this exists to
+      // verify there is no difference in behaviour between the options.
+      // If the outlined instrumentation triggers a verification failure
+      // when the original inlined instrumentation does not, or vice versa,
+      // then there is a discrepency which should be investigated.
+      ClOutlineInstrumentation = false;
+      TySan.sanitizeFunction(F, TLI);
+      ClOutlineInstrumentation = true;
+    }
   }
 
   return PreservedAnalyses::none();
diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
index 66e45ecbde7df..e84ca819b93d8 100644
--- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -122,16 +122,22 @@ static cl::opt<unsigned>
                   cl::desc("Maximum cost accepted for the transformation"),
                   cl::Hidden, cl::init(50));
 
-extern cl::opt<bool> ProfcheckDisableMetadataFixes;
-
-} // namespace llvm
-
 static cl::opt<double> MaxClonedRate(
     "dfa-max-cloned-rate",
     cl::desc(
         "Maximum cloned instructions rate accepted for the transformation"),
     cl::Hidden, cl::init(7.5));
 
+static cl::opt<unsigned>
+    MaxOuterUseBlocks("dfa-max-out-use-blocks",
+                      cl::desc("Maximum unduplicated blocks with outer uses "
+                               "accepted for the transformation"),
+                      cl::Hidden, cl::init(40));
+
+extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+
+} // namespace llvm
+
 namespace {
 class SelectInstToUnfold {
   SelectInst *SI;
@@ -965,8 +971,16 @@ struct TransformDFA {
     // SLPVectorizer.
     // TODO: Thread the switch partially before reaching the threshold.
     uint64_t NumOrigInst = 0;
-    for (auto *BB : DuplicateMap.keys())
+    uint64_t NumOuterUseBlock = 0;
+    for (auto *BB : DuplicateMap.keys()) {
       NumOrigInst += BB->sizeWithoutDebug();
+      // Only unduplicated blocks with single predecessor require new phi
+      // nodes.
+      for (auto *Succ : successors(BB))
+        if (!DuplicateMap.count(Succ) && Succ->getSinglePredecessor())
+          NumOuterUseBlock++;
+    }
+
     if (double(NumClonedInst) / double(NumOrigInst) > MaxClonedRate) {
       LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, too much "
                            "instructions wll be cloned\n");
@@ -977,6 +991,20 @@ struct TransformDFA {
       return false;
     }
 
+    // Too much unduplicated blocks with outer uses may cause too much
+    // insertions of phi nodes for duplicated definitions. TODO: Drop this
+    // threshold if we come up with another way to reduce the number of inserted
+    // phi nodes.
+    if (NumOuterUseBlock > MaxOuterUseBlocks) {
+      LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, too much "
+                           "blocks with outer uses\n");
+      ORE->emit([&]() {
+        return OptimizationRemarkMissed(DEBUG_TYPE, "NotProfitable", Switch)
+               << "Too much blocks with outer uses.";
+      });
+      return false;
+    }
+
     InstructionCost DuplicationCost = 0;
 
     unsigned JumpTableSize = 0;
diff --git a/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp b/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp
index 89980d54ee897..a577f517d1e89 100644
--- a/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp
+++ b/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp
@@ -122,7 +122,8 @@ DropUnnecessaryAssumesPass::run(Function &F, FunctionAnalysisManager &FAM) {
 
     Value *Cond = Assume->getArgOperand(0);
     // Don't drop type tests, which have special semantics.
-    if (match(Cond, m_Intrinsic<Intrinsic::type_test>()))
+    if (match(Cond, m_Intrinsic<Intrinsic::type_test>()) ||
+        match(Cond, m_Intrinsic<Intrinsic::public_type_test>()))
       continue;
 
     SmallVector<Value *> Affected;
diff --git a/llvm/lib/Transforms/Scalar/GVNSink.cpp b/llvm/lib/Transforms/Scalar/GVNSink.cpp
index a06f8325c90bf..d564e32e26526 100644
--- a/llvm/lib/Transforms/Scalar/GVNSink.cpp
+++ b/llvm/lib/Transforms/Scalar/GVNSink.cpp
@@ -514,7 +514,7 @@ class ValueTable {
 
 class GVNSink {
 public:
-  GVNSink() {}
+  GVNSink() = default;
 
   bool run(Function &F) {
     LLVM_DEBUG(dbgs() << "GVNSink: running on function @" << F.getName()
diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
index 7ebcc219efc15..4ba4ba3850e58 100644
--- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -162,8 +162,6 @@ class IndVarSimplify {
                                  const SCEV *ExitCount,
                                  PHINode *IndVar, SCEVExpander &Rewriter);
 
-  bool sinkUnusedInvariants(Loop *L);
-
 public:
   IndVarSimplify(LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
                  const DataLayout &DL, TargetLibraryInfo *TLI,
@@ -1079,85 +1077,6 @@ linearFunctionTestReplace(Loop *L, BasicBlock *ExitingBB,
   return true;
 }
 
-//===----------------------------------------------------------------------===//
-//  sinkUnusedInvariants. A late subpass to cleanup loop preheaders.
-//===----------------------------------------------------------------------===//
-
-/// If there's a single exit block, sink any loop-invariant values that
-/// were defined in the preheader but not used inside the loop into the
-/// exit block to reduce register pressure in the loop.
-bool IndVarSimplify::sinkUnusedInvariants(Loop *L) {
-  BasicBlock *ExitBlock = L->getExitBlock();
-  if (!ExitBlock) return false;
-
-  BasicBlock *Preheader = L->getLoopPreheader();
-  if (!Preheader) return false;
-
-  bool MadeAnyChanges = false;
-  for (Instruction &I : llvm::make_early_inc_range(llvm::reverse(*Preheader))) {
-
-    // Skip BB Terminator.
-    if (Preheader->getTerminator() == &I)
-      continue;
-
-    // New instructions were inserted at the end of the preheader.
-    if (isa<PHINode>(I))
-      break;
-
-    // Don't move instructions which might have side effects, since the side
-    // effects need to complete before instructions inside the loop.  Also don't
-    // move instructions which might read memory, since the loop may modify
-    // memory. Note that it's okay if the instruction might have undefined
-    // behavior: LoopSimplify guarantees that the preheader dominates the exit
-    // block.
-    if (I.mayHaveSideEffects() || I.mayReadFromMemory())
-      continue;
-
-    // Skip debug or pseudo instructions.
-    if (I.isDebugOrPseudoInst())
-      continue;
-
-    // Skip eh pad instructions.
-    if (I.isEHPad())
-      continue;
-
-    // Don't sink alloca: we never want to sink static alloca's out of the
-    // entry block, and correctly sinking dynamic alloca's requires
-    // checks for stacksave/stackrestore intrinsics.
-    // FIXME: Refactor this check somehow?
-    if (isa<AllocaInst>(&I))
-      continue;
-
-    // Determine if there is a use in or before the loop (direct or
-    // otherwise).
-    bool UsedInLoop = false;
-    for (Use &U : I.uses()) {
-      Instruction *User = cast<Instruction>(U.getUser());
-      BasicBlock *UseBB = User->getParent();
-      if (PHINode *P = dyn_cast<PHINode>(User)) {
-        unsigned i =
-          PHINode::getIncomingValueNumForOperand(U.getOperandNo());
-        UseBB = P->getIncomingBlock(i);
-      }
-      if (UseBB == Preheader || L->contains(UseBB)) {
-        UsedInLoop = true;
-        break;
-      }
-    }
-
-    // If there is, the def must remain in the preheader.
-    if (UsedInLoop)
-      continue;
-
-    // Otherwise, sink it to the exit block.
-    I.moveBefore(ExitBlock->getFirstInsertionPt());
-    SE->forgetValue(&I);
-    MadeAnyChanges = true;
-  }
-
-  return MadeAnyChanges;
-}
-
 static void replaceExitCond(BranchInst *BI, Value *NewCond,
                             SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
   auto *OldCond = BI->getCondition();
@@ -2065,10 +1984,6 @@ bool IndVarSimplify::run(Loop *L) {
 
   // The Rewriter may not be used from this point on.
 
-  // Loop-invariant instructions in the preheader that aren't used in the
-  // loop may be sunk below the loop to reduce register pressure.
-  Changed |= sinkUnusedInvariants(L);
-
   // rewriteFirstIterationLoopExitValues does not rely on the computation of
   // trip count and therefore can further simplify exit values in addition to
   // rewriteLoopExitValues.
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index b2c526b41502b..d13b9909660ec 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -211,9 +211,15 @@ static Instruction *cloneInstructionInExitBlock(
 static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo,
                              MemorySSAUpdater &MSSAU);
 
-static void moveInstructionBefore(Instruction &I, BasicBlock::iterator Dest,
-                                  ICFLoopSafetyInfo &SafetyInfo,
-                                  MemorySSAUpdater &MSSAU, ScalarEvolution *SE);
+static void moveInstructionBefore(
+    Instruction &I, BasicBlock::iterator Dest, ICFLoopSafetyInfo &SafetyInfo,
+    MemorySSAUpdater &MSSAU, ScalarEvolution *SE,
+    MemorySSA::InsertionPlace Point = MemorySSA::BeforeTerminator);
+
+static bool sinkUnusedInvariantsFromPreheaderToExit(
+    Loop *L, AAResults *AA, ICFLoopSafetyInfo *SafetyInfo,
+    MemorySSAUpdater &MSSAU, ScalarEvolution *SE, DominatorTree *DT,
+    SinkAndHoistLICMFlags &SinkFlags, OptimizationRemarkEmitter *ORE);
 
 static void foreachMemoryAccess(MemorySSA *MSSA, Loop *L,
                                 function_ref<void(Instruction *)> Fn);
@@ -471,6 +477,12 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI,
                                     TLI, TTI, L, MSSAU, &SafetyInfo, Flags, ORE)
             : sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, TTI, L,
                          MSSAU, &SafetyInfo, Flags, ORE);
+
+  // sink pre-header defs that are unused in-loop into the unique exit to reduce
+  // pressure.
+  Changed |= sinkUnusedInvariantsFromPreheaderToExit(L, AA, &SafetyInfo, MSSAU,
+                                                     SE, DT, Flags, ORE);
+
   Flags.setIsSink(false);
   if (Preheader)
     Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, AC, TLI, L,
@@ -1456,19 +1468,80 @@ static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo,
 
 static void moveInstructionBefore(Instruction &I, BasicBlock::iterator Dest,
                                   ICFLoopSafetyInfo &SafetyInfo,
-                                  MemorySSAUpdater &MSSAU,
-                                  ScalarEvolution *SE) {
+                                  MemorySSAUpdater &MSSAU, ScalarEvolution *SE,
+                                  MemorySSA::InsertionPlace Point) {
   SafetyInfo.removeInstruction(&I);
   SafetyInfo.insertInstructionTo(&I, Dest->getParent());
   I.moveBefore(*Dest->getParent(), Dest);
   if (MemoryUseOrDef *OldMemAcc = cast_or_null<MemoryUseOrDef>(
           MSSAU.getMemorySSA()->getMemoryAccess(&I)))
-    MSSAU.moveToPlace(OldMemAcc, Dest->getParent(),
-                      MemorySSA::BeforeTerminator);
+    MSSAU.moveToPlace(OldMemAcc, Dest->getParent(), Point);
   if (SE)
     SE->forgetBlockAndLoopDispositions(&I);
 }
 
+// If there's a single exit block, sink any loop-invariant values that were
+// defined in the preheader but not used inside the loop into the exit block
+// to reduce register pressure in the loop.
+static bool sinkUnusedInvariantsFromPreheaderToExit(
+    Loop *L, AAResults *AA, ICFLoopSafetyInfo *SafetyInfo,
+    MemorySSAUpdater &MSSAU, ScalarEvolution *SE, DominatorTree *DT,
+    SinkAndHoistLICMFlags &SinkFlags, OptimizationRemarkEmitter *ORE) {
+  BasicBlock *ExitBlock = L->getExitBlock();
+  if (!ExitBlock)
+    return false;
+
+  BasicBlock *Preheader = L->getLoopPreheader();
+  if (!Preheader)
+    return false;
+
+  bool MadeAnyChanges = false;
+
+  for (Instruction &I : llvm::make_early_inc_range(llvm::reverse(*Preheader))) {
+
+    // Skip terminator.
+    if (Preheader->getTerminator() == &I)
+      continue;
+
+    // New instructions were inserted at the end of the preheader.
+    if (isa<PHINode>(I))
+      break;
+
+    // Don't move instructions which might have side effects, since the side
+    // effects need to complete before instructions inside the loop. Note that
+    // it's okay if the instruction might have undefined behavior: LoopSimplify
+    // guarantees that the preheader dominates the exit block.
+    if (I.mayHaveSideEffects())
+      continue;
+
+    if (!canSinkOrHoistInst(I, AA, DT, L, MSSAU, true, SinkFlags, nullptr))
+      continue;
+
+    // Determine if there is a use in or before the loop (direct or
+    // otherwise).
+    bool UsedInLoopOrPreheader = false;
+    for (Use &U : I.uses()) {
+      auto *UserI = cast<Instruction>(U.getUser());
+      BasicBlock *UseBB = UserI->getParent();
+      if (auto *PN = dyn_cast<PHINode>(UserI)) {
+        UseBB = PN->getIncomingBlock(U);
+      }
+      if (UseBB == Preheader || L->contains(UseBB)) {
+        UsedInLoopOrPreheader = true;
+        break;
+      }
+    }
+    if (UsedInLoopOrPreheader)
+      continue;
+
+    moveInstructionBefore(I, ExitBlock->getFirstInsertionPt(), *SafetyInfo,
+                          MSSAU, SE, MemorySSA::Beginning);
+    MadeAnyChanges = true;
+  }
+
+  return MadeAnyChanges;
+}
+
 static Instruction *sinkThroughTriviallyReplaceablePHI(
     PHINode *TPN, Instruction *I, LoopInfo *LI,
     SmallDenseMap<BasicBlock *, Instruction *, 32> &SunkCopies,
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 1a279b6198182..001215abcfb26 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1318,6 +1318,11 @@ class LSRUse {
   /// the loop, in which case some special-case heuristics may be used.
   bool AllFixupsOutsideLoop = true;
 
+  /// This records whether all of the fixups using this LSRUse are unconditional
+  /// within the loop, meaning they will be executed on every path to the loop
+  /// latch. This includes fixups before early exits.
+  bool AllFixupsUnconditional = true;
+
   /// RigidFormula is set to true to guarantee that this use will be associated
   /// with a single formula--the one that initially matched. Some SCEV
   /// expressions cannot be expanded. This allows LSR to consider the registers
@@ -1421,16 +1426,22 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg,
     if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) ||
         TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) {
       const SCEV *Start;
-      const SCEVConstant *Step;
-      if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant(Step))))
+      const APInt *Step;
+      if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_scev_APInt(Step)))) {
         // If the step size matches the base offset, we could use pre-indexed
         // addressing.
-        if (((AMK & TTI::AMK_PreIndexed) && F.BaseOffset.isFixed() &&
-             Step->getAPInt() == F.BaseOffset.getFixedValue()) ||
-            ((AMK & TTI::AMK_PostIndexed) && !isa<SCEVConstant>(Start) &&
-             SE->isLoopInvariant(Start, L)))
+        bool CanPreIndex = (AMK & TTI::AMK_PreIndexed) &&
+                           F.BaseOffset.isFixed() &&
+                           *Step == F.BaseOffset.getFixedValue();
+        bool CanPostIndex = (AMK & TTI::AMK_PostIndexed) &&
+                            !isa<SCEVConstant>(Start) &&
+                            SE->isLoopInvariant(Start, L);
+        // We can only pre or post index when the load/store is unconditional.
+        if ((CanPreIndex || CanPostIndex) && LU.AllFixupsUnconditional)
           LoopCost = 0;
+      }
     }
+
     // If the loop counts down to zero and we'll be using a hardware loop then
     // the addrec will be combined into the hardware loop instruction.
     if (LU.Kind == LSRUse::ICmpZero && F.countsDownToZero() &&
@@ -1783,6 +1794,9 @@ void LSRUse::print(raw_ostream &OS) const {
   if (AllFixupsOutsideLoop)
     OS << ", all-fixups-outside-loop";
 
+  if (AllFixupsUnconditional)
+    OS << ", all-fixups-unconditional";
+
   if (WidestFixupType)
     OS << ", widest fixup type: " << *WidestFixupType;
 }
@@ -2213,6 +2227,7 @@ class LSRInstance {
   void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
   void CountRegisters(const Formula &F, size_t LUIdx);
   bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
+  bool IsFixupExecutedEachIncrement(const LSRFixup &LF) const;
 
   void CollectLoopInvariantFixupsAndFormulae();
 
@@ -3607,6 +3622,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
     LF.PostIncLoops = TmpPostIncLoops;
     LF.Offset = Offset;
     LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
+    LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF);
 
     // Create SCEV as Formula for calculating baseline cost
     if (!VisitedLSRUse.count(LUIdx) && !LF.isUseFullyOutsideLoop(L)) {
@@ -3680,6 +3696,14 @@ bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
   return true;
 }
 
+/// Test whether this fixup will be executed each time the corresponding IV
+/// increment instruction is executed.
+bool LSRInstance::IsFixupExecutedEachIncrement(const LSRFixup &LF) const {
+  // If the fixup block dominates the IV increment block then there is no path
+  // through the loop to the increment that doesn't pass through the fixup.
+  return DT.dominates(LF.UserInst->getParent(), IVIncInsertPos->getParent());
+}
+
 /// Check for other uses of loop-invariant values which we're tracking. These
 /// other uses will pin these values in registers, making them less profitable
 /// for elimination.
@@ -3803,6 +3827,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
         LF.OperandValToReplace = U;
         LF.Offset = Offset;
         LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
+        LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF);
         if (!LU.WidestFixupType ||
             SE.getTypeSizeInBits(LU.WidestFixupType) <
             SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
@@ -4940,6 +4965,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
       LLVM_DEBUG(dbgs() << "  Deleting use "; LU.print(dbgs()); dbgs() << '\n');
 
       LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
+      LUThatHas->AllFixupsUnconditional &= LU.AllFixupsUnconditional;
 
       // Transfer the fixups of LU to LUThatHas.
       for (LSRFixup &Fixup : LU.Fixups) {
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 3487e812a68a3..7e70ba274f161 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -245,11 +245,14 @@ raw_ostream &operator<<(raw_ostream &OS, ShapeInfo SI) {
 
 } // namespace
 
-static bool isUniformShape(Value *V) {
+static bool isShapePreserving(Value *V) {
   Instruction *I = dyn_cast<Instruction>(V);
   if (!I)
     return true;
 
+  if (isa<SelectInst>(I))
+    return true;
+
   if (I->isBinaryOp())
     return true;
 
@@ -300,6 +303,16 @@ static bool isUniformShape(Value *V) {
   }
 }
 
+/// Return an iterator over the operands of \p I that should share shape
+/// information with \p I.
+static iterator_range<Use *> getShapedOperandsForInst(Instruction *I) {
+  assert(isShapePreserving(I) &&
+         "Can't retrieve shaped operands for an instruction that does not "
+         "preserve shape information");
+  auto Ops = I->operands();
+  return isa<SelectInst>(I) ? drop_begin(Ops) : Ops;
+}
+
 /// Return the ShapeInfo for the result of \p I, it it can be determined.
 static std::optional<ShapeInfo>
 computeShapeInfoForInst(Instruction *I,
@@ -329,9 +342,8 @@ computeShapeInfoForInst(Instruction *I,
       return OpShape->second;
   }
 
-  if (isUniformShape(I) || isa<SelectInst>(I)) {
-    auto Ops = I->operands();
-    auto ShapedOps = isa<SelectInst>(I) ? drop_begin(Ops) : Ops;
+  if (isShapePreserving(I)) {
+    auto ShapedOps = getShapedOperandsForInst(I);
     // Find the first operand that has a known shape and use that.
     for (auto &Op : ShapedOps) {
       auto OpShape = ShapeMap.find(Op.get());
@@ -710,10 +722,9 @@ class LowerMatrixIntrinsics {
       case Intrinsic::matrix_column_major_store:
         return true;
       default:
-        return isUniformShape(II);
+        break;
       }
-    return isUniformShape(V) || isa<StoreInst>(V) || isa<LoadInst>(V) ||
-           isa<SelectInst>(V);
+    return isShapePreserving(V) || isa<StoreInst>(V) || isa<LoadInst>(V);
   }
 
   /// Propagate the shape information of instructions to their users.
@@ -800,9 +811,8 @@ class LowerMatrixIntrinsics {
       } else if (isa<StoreInst>(V)) {
         // Nothing to do.  We forward-propagated to this so we would just
         // backward propagate to an instruction with an already known shape.
-      } else if (isUniformShape(V) || isa<SelectInst>(V)) {
-        auto Ops = cast<Instruction>(V)->operands();
-        auto ShapedOps = isa<SelectInst>(V) ? drop_begin(Ops) : Ops;
+      } else if (isShapePreserving(V)) {
+        auto ShapedOps = getShapedOperandsForInst(cast<Instruction>(V));
         // Propagate to all operands.
         ShapeInfo Shape = ShapeMap[V];
         for (Use &U : ShapedOps) {
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index e043d072a7638..08be5df9872b7 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -1534,8 +1534,8 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
   bool SrcNotDom = false;
 
   auto CaptureTrackingWithModRef =
-      [&](Instruction *AI,
-          function_ref<bool(Instruction *)> ModRefCallback) -> bool {
+      [&](Instruction *AI, function_ref<bool(Instruction *)> ModRefCallback,
+          bool &AddressCaptured) -> bool {
     SmallVector<Instruction *, 8> Worklist;
     Worklist.push_back(AI);
     unsigned MaxUsesToExplore = getDefaultMaxUsesToExploreForCaptureTracking();
@@ -1559,8 +1559,9 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
         if (!Visited.insert(&U).second)
           continue;
         UseCaptureInfo CI = DetermineUseCaptureKind(U, AI);
-        if (capturesAnything(CI.UseCC))
+        if (capturesAnyProvenance(CI.UseCC))
           return false;
+        AddressCaptured |= capturesAddress(CI.UseCC);
 
         if (UI->mayReadOrWriteMemory()) {
           if (UI->isLifetimeStartOrEnd()) {
@@ -1627,7 +1628,9 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
     return true;
   };
 
-  if (!CaptureTrackingWithModRef(DestAlloca, DestModRefCallback))
+  bool DestAddressCaptured = false;
+  if (!CaptureTrackingWithModRef(DestAlloca, DestModRefCallback,
+                                 DestAddressCaptured))
     return false;
   // Bailout if Dest may have any ModRef before Store.
   if (!ReachabilityWorklist.empty() &&
@@ -1653,7 +1656,14 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
     return true;
   };
 
-  if (!CaptureTrackingWithModRef(SrcAlloca, SrcModRefCallback))
+  bool SrcAddressCaptured = false;
+  if (!CaptureTrackingWithModRef(SrcAlloca, SrcModRefCallback,
+                                 SrcAddressCaptured))
+    return false;
+
+  // If both the source and destination address are captured, the fact that they
+  // are no longer two separate allocations may be observed.
+  if (DestAddressCaptured && SrcAddressCaptured)
     return false;
 
   // We can do the transformation. First, move the SrcAlloca to the start of the
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 5af6c96c56a06..bb6c879f4d47e 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -81,6 +81,7 @@ STATISTIC(
 STATISTIC(NumInvariantConditionsInjected,
           "Number of invariant conditions injected and unswitched");
 
+namespace llvm {
 static cl::opt<bool> EnableNonTrivialUnswitch(
     "enable-nontrivial-unswitch", cl::init(false), cl::Hidden,
     cl::desc("Forcibly enables non-trivial loop unswitching rather than "
@@ -131,11 +132,17 @@ static cl::opt<bool> InjectInvariantConditions(
 
 static cl::opt<unsigned> InjectInvariantConditionHotnesThreshold(
     "simple-loop-unswitch-inject-invariant-condition-hotness-threshold",
-    cl::Hidden, cl::desc("Only try to inject loop invariant conditions and "
-                         "unswitch on them to eliminate branches that are "
-                         "not-taken 1/<this option> times or less."),
+    cl::Hidden,
+    cl::desc("Only try to inject loop invariant conditions and "
+             "unswitch on them to eliminate branches that are "
+             "not-taken 1/<this option> times or less."),
     cl::init(16));
 
+static cl::opt<bool> EstimateProfile("simple-loop-unswitch-estimate-profile",
+                                     cl::Hidden, cl::init(true));
+extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+} // namespace llvm
+
 AnalysisKey ShouldRunExtraSimpleLoopUnswitch::Key;
 namespace {
 struct CompareDesc {
@@ -268,13 +275,42 @@ static bool areLoopExitPHIsLoopInvariant(const Loop &L,
   llvm_unreachable("Basic blocks should never be empty!");
 }
 
-/// Copy a set of loop invariant values \p ToDuplicate and insert them at the
+/// Copy a set of loop invariant values \p Invariants and insert them at the
 /// end of \p BB and conditionally branch on the copied condition. We only
 /// branch on a single value.
+/// We attempt to estimate the profile of the resulting conditional branch from
+/// \p ComputeProfFrom, which is the original conditional branch we're
+/// unswitching.
+/// When \p Direction is true, the \p Invariants form a disjunction, and the
+/// branch conditioned on it exits the loop on the "true" case. When \p
+/// Direction is false, the \p Invariants form a conjunction and the branch
+/// exits on the "false" case.
 static void buildPartialUnswitchConditionalBranch(
     BasicBlock &BB, ArrayRef<Value *> Invariants, bool Direction,
     BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, bool InsertFreeze,
-    const Instruction *I, AssumptionCache *AC, const DominatorTree &DT) {
+    const Instruction *I, AssumptionCache *AC, const DominatorTree &DT,
+    const BranchInst &ComputeProfFrom) {
+
+  SmallVector<uint32_t> BranchWeights;
+  bool HasBranchWeights = EstimateProfile && !ProfcheckDisableMetadataFixes &&
+                          extractBranchWeights(ComputeProfFrom, BranchWeights);
+  // If Direction is true, that means we had a disjunction and that the "true"
+  // case exits. The probability of the disjunction of the subset of terms is at
+  // most as high as the original one. So, if the probability is higher than the
+  // one we'd assign in absence of a profile (i.e. 0.5), we will use 0.5,
+  // but if it's lower, we will use the original probability.
+  // Conversely, if Direction is false, that means we had a conjunction, and the
+  // probability of exiting is captured in the second branch weight. That
+  // probability is a disjunction (of the negation of the original terms). The
+  // same reasoning applies as above.
+  // Issue #165649: should we expect BFI to conserve, and use that to calculate
+  // the branch weights?
+  if (HasBranchWeights &&
+      static_cast<double>(BranchWeights[Direction ? 0 : 1]) /
+              static_cast<double>(sum_of(BranchWeights)) >
+          0.5)
+    HasBranchWeights = false;
+
   IRBuilder<> IRB(&BB);
   IRB.SetCurrentDebugLocation(DebugLoc::getCompilerGenerated());
 
@@ -287,8 +323,14 @@ static void buildPartialUnswitchConditionalBranch(
 
   Value *Cond = Direction ? IRB.CreateOr(FrozenInvariants)
                           : IRB.CreateAnd(FrozenInvariants);
-  IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc,
-                   Direction ? &NormalSucc : &UnswitchedSucc);
+  auto *BR = IRB.CreateCondBr(
+      Cond, Direction ? &UnswitchedSucc : &NormalSucc,
+      Direction ? &NormalSucc : &UnswitchedSucc,
+      HasBranchWeights ? ComputeProfFrom.getMetadata(LLVMContext::MD_prof)
+                       : nullptr);
+  if (!HasBranchWeights)
+    setExplicitlyUnknownBranchWeightsIfProfiled(
+        *BR, *BR->getParent()->getParent(), DEBUG_TYPE);
 }
 
 /// Copy a set of loop invariant values, and conditionally branch on them.
@@ -658,7 +700,7 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
              " condition!");
     buildPartialUnswitchConditionalBranch(
         *OldPH, Invariants, ExitDirection, *UnswitchedBB, *NewPH,
-        FreezeLoopUnswitchCond, OldPH->getTerminator(), nullptr, DT);
+        FreezeLoopUnswitchCond, OldPH->getTerminator(), nullptr, DT, BI);
   }
 
   // Update the dominator tree with the added edge.
@@ -2477,7 +2519,7 @@ static void unswitchNontrivialInvariants(
     else {
       buildPartialUnswitchConditionalBranch(
           *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH,
-          FreezeLoopUnswitchCond, BI, &AC, DT);
+          FreezeLoopUnswitchCond, BI, &AC, DT, *BI);
     }
     DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH});
 
diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index 0f3978f56045e..5f6f66a4bc213 100644
--- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -143,8 +143,8 @@ struct SubGraphTraits {
   class WrappedSuccIterator
       : public iterator_adaptor_base<
             WrappedSuccIterator, BaseSuccIterator,
-            typename std::iterator_traits<BaseSuccIterator>::iterator_category,
-            NodeRef, std::ptrdiff_t, NodeRef *, NodeRef> {
+            std::iterator_traits<BaseSuccIterator>::iterator_category, NodeRef,
+            std::ptrdiff_t, NodeRef *, NodeRef> {
     SmallDenseSet<RegionNode *> *Nodes;
 
   public:
diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
index 9829d4d50098c..11db0ec487328 100644
--- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -674,6 +674,79 @@ BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, DominatorTree *DT,
   return SplitBlock(BB, BB->getTerminator(), DT, LI, MSSAU, BBName);
 }
 
+/// Helper function to update the cycle or loop information after inserting a
+/// new block between a callbr instruction and one of its target blocks.  Adds
+/// the new block to the innermost cycle or loop that the callbr instruction and
+/// the original target block share.
+/// \p LCI            cycle or loop information to update
+/// \p CallBrBlock    block containing the callbr instruction
+/// \p CallBrTarget   new target block of the callbr instruction
+/// \p Succ           original target block of the callbr instruction
+template <typename TI, typename T>
+static bool updateCycleLoopInfo(TI *LCI, BasicBlock *CallBrBlock,
+                                BasicBlock *CallBrTarget, BasicBlock *Succ) {
+  static_assert(std::is_same_v<TI, CycleInfo> || std::is_same_v<TI, LoopInfo>,
+                "type must be CycleInfo or LoopInfo");
+  if (!LCI)
+    return false;
+
+  T *LC;
+  if constexpr (std::is_same_v<TI, CycleInfo>)
+    LC = LCI->getSmallestCommonCycle(CallBrBlock, Succ);
+  else
+    LC = LCI->getSmallestCommonLoop(CallBrBlock, Succ);
+  if (!LC)
+    return false;
+
+  if constexpr (std::is_same_v<TI, CycleInfo>)
+    LCI->addBlockToCycle(CallBrTarget, LC);
+  else
+    LC->addBasicBlockToLoop(CallBrTarget, *LCI);
+
+  return true;
+}
+
+BasicBlock *llvm::SplitCallBrEdge(BasicBlock *CallBrBlock, BasicBlock *Succ,
+                                  unsigned SuccIdx, DomTreeUpdater *DTU,
+                                  CycleInfo *CI, LoopInfo *LI,
+                                  bool *UpdatedLI) {
+  CallBrInst *CallBr = dyn_cast<CallBrInst>(CallBrBlock->getTerminator());
+  assert(CallBr && "expected callbr terminator");
+  assert(SuccIdx < CallBr->getNumSuccessors() &&
+         Succ == CallBr->getSuccessor(SuccIdx) && "invalid successor index");
+
+  // Create a new block between callbr and the specified successor.
+  // splitBlockBefore cannot be re-used here since it cannot split if the split
+  // point is a PHI node (because BasicBlock::splitBasicBlockBefore cannot
+  // handle that). But we don't need to rewire every part of a potential PHI
+  // node. We only care about the edge between CallBrBlock and the original
+  // successor.
+  BasicBlock *CallBrTarget =
+      BasicBlock::Create(CallBrBlock->getContext(),
+                         CallBrBlock->getName() + ".target." + Succ->getName(),
+                         CallBrBlock->getParent());
+  // Rewire control flow from the new target block to the original successor.
+  Succ->replacePhiUsesWith(CallBrBlock, CallBrTarget);
+  // Rewire control flow from callbr to the new target block.
+  CallBr->setSuccessor(SuccIdx, CallBrTarget);
+  // Jump from the new target block to the original successor.
+  BranchInst::Create(Succ, CallBrTarget);
+
+  bool Updated =
+      updateCycleLoopInfo<LoopInfo, Loop>(LI, CallBrBlock, CallBrTarget, Succ);
+  if (UpdatedLI)
+    *UpdatedLI = Updated;
+  updateCycleLoopInfo<CycleInfo, Cycle>(CI, CallBrBlock, CallBrTarget, Succ);
+  if (DTU) {
+    DTU->applyUpdates({{DominatorTree::Insert, CallBrBlock, CallBrTarget}});
+    if (DTU->getDomTree().dominates(CallBrBlock, Succ))
+      DTU->applyUpdates({{DominatorTree::Delete, CallBrBlock, Succ},
+                         {DominatorTree::Insert, CallBrTarget, Succ}});
+  }
+
+  return CallBrTarget;
+}
+
 void llvm::setUnwindEdgeTo(Instruction *TI, BasicBlock *Succ) {
   if (auto *II = dyn_cast<InvokeInst>(TI))
     II->setUnwindDest(Succ);
diff --git a/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp b/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp
index 0046a00af4338..287a177371c80 100644
--- a/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp
+++ b/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp
@@ -13,6 +13,7 @@
 #include "llvm/Transforms/Utils/ControlFlowUtils.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/ValueHandle.h"
@@ -281,7 +282,9 @@ std::pair<BasicBlock *, bool> ControlFlowHub::finalize(
 
   for (auto [BB, Succ0, Succ1] : Branches) {
 #ifndef NDEBUG
-    assert(Incoming.insert(BB).second && "Duplicate entry for incoming block.");
+    assert(
+        (Incoming.insert(BB).second || isa<CallBrInst>(BB->getTerminator())) &&
+        "Duplicate entry for incoming block.");
 #endif
     if (Succ0)
       Outgoing.insert(Succ0);
diff --git a/llvm/lib/Transforms/Utils/FixIrreducible.cpp b/llvm/lib/Transforms/Utils/FixIrreducible.cpp
index 45e1d12c2bfff..804af22daa5af 100644
--- a/llvm/lib/Transforms/Utils/FixIrreducible.cpp
+++ b/llvm/lib/Transforms/Utils/FixIrreducible.cpp
@@ -79,6 +79,53 @@
 // Limitation: The pass cannot handle switch statements and indirect
 //             branches. Both must be lowered to plain branches first.
 //
+// CallBr support: CallBr is handled as a more general branch instruction which
+// can have multiple successors. The pass redirects the edges to intermediate
+// target blocks that unconditionally branch to the original callbr target
+// blocks. This allows the control flow hub to know to which of the original
+// target blocks to jump to.
+// Example input CFG:
+//                        Entry (callbr)
+//                       /     \
+//                      v       v
+//                      H ----> B
+//                      ^      /|
+//                       `----' |
+//                              v
+//                             Exit
+//
+// becomes:
+//                        Entry (callbr)
+//                       /     \
+//                      v       v
+//                 target.H   target.B
+//                      |       |
+//                      v       v
+//                      H ----> B
+//                      ^      /|
+//                       `----' |
+//                              v
+//                             Exit
+//
+// Note
+// OUTPUT CFG: Converted to a natural loop with a new header N.
+//
+//                        Entry (callbr)
+//                       /     \
+//                      v       v
+//                 target.H   target.B
+//                      \       /
+//                       \     /
+//                        v   v
+//                          N <---.
+//                         / \     \
+//                        /   \     |
+//                       v     v    /
+//                       H --> B --'
+//                             |
+//                             v
+//                            Exit
+//
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/FixIrreducible.h"
@@ -231,6 +278,7 @@ static bool fixIrreducible(Cycle &C, CycleInfo &CI, DominatorTree &DT,
     return false;
   LLVM_DEBUG(dbgs() << "Processing cycle:\n" << CI.print(&C) << "\n";);
 
+  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
   ControlFlowHub CHub;
   SetVector<BasicBlock *> Predecessors;
 
@@ -242,18 +290,32 @@ static bool fixIrreducible(Cycle &C, CycleInfo &CI, DominatorTree &DT,
   }
 
   for (BasicBlock *P : Predecessors) {
-    auto *Branch = cast<BranchInst>(P->getTerminator());
-    // Exactly one of the two successors is the header.
-    BasicBlock *Succ0 = Branch->getSuccessor(0) == Header ? Header : nullptr;
-    BasicBlock *Succ1 = Succ0 ? nullptr : Header;
-    if (!Succ0)
-      assert(Branch->getSuccessor(1) == Header);
-    assert(Succ0 || Succ1);
-    CHub.addBranch(P, Succ0, Succ1);
-
-    LLVM_DEBUG(dbgs() << "Added internal branch: " << P->getName() << " -> "
-                      << (Succ0 ? Succ0->getName() : "") << " "
-                      << (Succ1 ? Succ1->getName() : "") << "\n");
+    if (BranchInst *Branch = dyn_cast<BranchInst>(P->getTerminator())) {
+      // Exactly one of the two successors is the header.
+      BasicBlock *Succ0 = Branch->getSuccessor(0) == Header ? Header : nullptr;
+      BasicBlock *Succ1 = Succ0 ? nullptr : Header;
+      assert(Succ0 || Branch->getSuccessor(1) == Header);
+      assert(Succ0 || Succ1);
+      CHub.addBranch(P, Succ0, Succ1);
+
+      LLVM_DEBUG(dbgs() << "Added internal branch: " << printBasicBlock(P)
+                        << " -> " << printBasicBlock(Succ0)
+                        << (Succ0 && Succ1 ? " " : "") << printBasicBlock(Succ1)
+                        << '\n');
+    } else if (CallBrInst *CallBr = dyn_cast<CallBrInst>(P->getTerminator())) {
+      for (unsigned I = 0; I < CallBr->getNumSuccessors(); ++I) {
+        BasicBlock *Succ = CallBr->getSuccessor(I);
+        if (Succ != Header)
+          continue;
+        BasicBlock *NewSucc = SplitCallBrEdge(P, Succ, I, &DTU, &CI, LI);
+        CHub.addBranch(NewSucc, Succ);
+        LLVM_DEBUG(dbgs() << "Added internal branch: "
+                          << printBasicBlock(NewSucc) << " -> "
+                          << printBasicBlock(Succ) << '\n');
+      }
+    } else {
+      llvm_unreachable("unsupported block terminator");
+    }
   }
 
   // Redirect external incoming edges. This includes the edges on the header.
@@ -266,17 +328,32 @@ static bool fixIrreducible(Cycle &C, CycleInfo &CI, DominatorTree &DT,
   }
 
   for (BasicBlock *P : Predecessors) {
-    auto *Branch = cast<BranchInst>(P->getTerminator());
-    BasicBlock *Succ0 = Branch->getSuccessor(0);
-    Succ0 = C.contains(Succ0) ? Succ0 : nullptr;
-    BasicBlock *Succ1 =
-        Branch->isUnconditional() ? nullptr : Branch->getSuccessor(1);
-    Succ1 = Succ1 && C.contains(Succ1) ? Succ1 : nullptr;
-    CHub.addBranch(P, Succ0, Succ1);
-
-    LLVM_DEBUG(dbgs() << "Added external branch: " << P->getName() << " -> "
-                      << (Succ0 ? Succ0->getName() : "") << " "
-                      << (Succ1 ? Succ1->getName() : "") << "\n");
+    if (BranchInst *Branch = dyn_cast<BranchInst>(P->getTerminator()); Branch) {
+      BasicBlock *Succ0 = Branch->getSuccessor(0);
+      Succ0 = C.contains(Succ0) ? Succ0 : nullptr;
+      BasicBlock *Succ1 =
+          Branch->isUnconditional() ? nullptr : Branch->getSuccessor(1);
+      Succ1 = Succ1 && C.contains(Succ1) ? Succ1 : nullptr;
+      CHub.addBranch(P, Succ0, Succ1);
+
+      LLVM_DEBUG(dbgs() << "Added external branch: " << printBasicBlock(P)
+                        << " -> " << printBasicBlock(Succ0)
+                        << (Succ0 && Succ1 ? " " : "") << printBasicBlock(Succ1)
+                        << '\n');
+    } else if (CallBrInst *CallBr = dyn_cast<CallBrInst>(P->getTerminator())) {
+      for (unsigned I = 0; I < CallBr->getNumSuccessors(); ++I) {
+        BasicBlock *Succ = CallBr->getSuccessor(I);
+        if (!C.contains(Succ))
+          continue;
+        BasicBlock *NewSucc = SplitCallBrEdge(P, Succ, I, &DTU, &CI, LI);
+        CHub.addBranch(NewSucc, Succ);
+        LLVM_DEBUG(dbgs() << "Added external branch: "
+                          << printBasicBlock(NewSucc) << " -> "
+                          << printBasicBlock(Succ) << '\n');
+      }
+    } else {
+      llvm_unreachable("unsupported block terminator");
+    }
   }
 
   // Redirect all the backedges through a "hub" consisting of a series
@@ -292,7 +369,6 @@ static bool fixIrreducible(Cycle &C, CycleInfo &CI, DominatorTree &DT,
   SetVector<BasicBlock *> Entries;
   Entries.insert(C.entry_rbegin(), C.entry_rend());
 
-  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
   CHub.finalize(&DTU, GuardBlocks, "irr");
 #if defined(EXPENSIVE_CHECKS)
   assert(DT.verify(DominatorTree::VerificationLevel::Full));
@@ -325,8 +401,6 @@ static bool FixIrreducibleImpl(Function &F, CycleInfo &CI, DominatorTree &DT,
   LLVM_DEBUG(dbgs() << "===== Fix irreducible control-flow in function: "
                     << F.getName() << "\n");
 
-  assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator.");
-
   bool Changed = false;
   for (Cycle *TopCycle : CI.toplevel_cycles()) {
     for (Cycle *C : depth_first(TopCycle)) {
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index 4fe736ac29b0a..94dfd3a974923 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -499,9 +499,9 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
 
   const unsigned MaxTripCount = SE->getSmallConstantMaxTripCount(L);
   const bool MaxOrZero = SE->isBackedgeTakenCountMaxOrZero(L);
-  unsigned EstimatedLoopInvocationWeight = 0;
   std::optional<unsigned> OriginalTripCount =
-      llvm::getLoopEstimatedTripCount(L, &EstimatedLoopInvocationWeight);
+      llvm::getLoopEstimatedTripCount(L);
+  BranchProbability OriginalLoopProb = llvm::getLoopProbability(L);
 
   // Effectively "DCE" unrolled iterations that are beyond the max tripcount
   // and will never be executed.
@@ -592,11 +592,11 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
                                               : isEpilogProfitable(L);
 
   if (ULO.Runtime &&
-      !UnrollRuntimeLoopRemainder(L, ULO.Count, ULO.AllowExpensiveTripCount,
-                                  EpilogProfitability, ULO.UnrollRemainder,
-                                  ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI,
-                                  PreserveLCSSA, ULO.SCEVExpansionBudget,
-                                  ULO.RuntimeUnrollMultiExit, RemainderLoop)) {
+      !UnrollRuntimeLoopRemainder(
+          L, ULO.Count, ULO.AllowExpensiveTripCount, EpilogProfitability,
+          ULO.UnrollRemainder, ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI,
+          PreserveLCSSA, ULO.SCEVExpansionBudget, ULO.RuntimeUnrollMultiExit,
+          RemainderLoop, OriginalTripCount, OriginalLoopProb)) {
     if (ULO.Force)
       ULO.Runtime = false;
     else {
@@ -1130,11 +1130,46 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
     LI->erase(L);
     // We shouldn't try to use `L` anymore.
     L = nullptr;
-  } else if (OriginalTripCount) {
-    // Update the trip count. Note that the remainder has already logic
-    // computing it in `UnrollRuntimeLoopRemainder`.
-    setLoopEstimatedTripCount(L, *OriginalTripCount / ULO.Count,
-                              EstimatedLoopInvocationWeight);
+  } else {
+    // Update metadata for the loop's branch weights and estimated trip count:
+    // - If ULO.Runtime, UnrollRuntimeLoopRemainder sets the guard branch
+    //   weights, latch branch weights, and estimated trip count of the
+    //   remainder loop it creates.  It also sets the branch weights for the
+    //   unrolled loop guard it creates.  The branch weights for the unrolled
+    //   loop latch are adjusted below.  FIXME: Handle prologue loops.
+    // - Otherwise, if unrolled loop iteration latches become unconditional,
+    //   branch weights are adjusted above.  FIXME: Actually handle such
+    //   unconditional latches.
+    // - Otherwise, the original loop's branch weights are correct for the
+    //   unrolled loop, so do not adjust them.
+    // - In all cases, the unrolled loop's estimated trip count is set below.
+    //
+    // As an example of the last case, consider what happens if the unroll count
+    // is 4 for a loop with an estimated trip count of 10 when we do not create
+    // a remainder loop and all iterations' latches remain conditional.  Each
+    // unrolled iteration's latch still has the same probability of exiting the
+    // loop as it did when in the original loop, and thus it should still have
+    // the same branch weights.  Each unrolled iteration's non-zero probability
+    // of exiting already appropriately reduces the probability of reaching the
+    // remaining iterations just as it did in the original loop.  Trying to also
+    // adjust the branch weights of the final unrolled iteration's latch (i.e.,
+    // the backedge for the unrolled loop as a whole) to reflect its new trip
+    // count of 3 will erroneously further reduce its block frequencies.
+    // However, in case an analysis later needs to estimate the trip count of
+    // the unrolled loop as a whole without considering the branch weights for
+    // each unrolled iteration's latch within it, we store the new trip count as
+    // separate metadata.
+    if (!OriginalLoopProb.isUnknown() && ULO.Runtime && EpilogProfitability) {
+      // Where p is always the probability of executing at least 1 more
+      // iteration, the probability for at least n more iterations is p^n.
+      setLoopProbability(L, OriginalLoopProb.pow(ULO.Count));
+    }
+    if (OriginalTripCount) {
+      unsigned NewTripCount = *OriginalTripCount / ULO.Count;
+      if (!ULO.Runtime && *OriginalTripCount % ULO.Count)
+        ++NewTripCount;
+      setLoopEstimatedTripCount(L, NewTripCount);
+    }
   }
 
   // LoopInfo should not be valid, confirm that.
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index 6312831cf0ee0..1e8f6cc76900c 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -40,6 +40,7 @@
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
 #include "llvm/Transforms/Utils/UnrollLoop.h"
+#include <cmath>
 
 using namespace llvm;
 
@@ -195,6 +196,21 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
   }
 }
 
+/// Assume, due to our position in the remainder loop or its guard, anywhere
+/// from 0 to \p N more iterations can possibly execute.  Among such cases in
+/// the original loop (with loop probability \p OriginalLoopProb), what is the
+/// probability of executing at least one more iteration?
+static BranchProbability
+probOfNextInRemainder(BranchProbability OriginalLoopProb, unsigned N) {
+  // Each of these variables holds the original loop's probability that the
+  // number of iterations it will execute is some m in the specified range.
+  BranchProbability ProbOne = OriginalLoopProb;                // 1 <= m
+  BranchProbability ProbTooMany = ProbOne.pow(N + 1);          // N + 1 <= m
+  BranchProbability ProbNotTooMany = ProbTooMany.getCompl();   // 0 <= m <= N
+  BranchProbability ProbOneNotTooMany = ProbOne - ProbTooMany; // 1 <= m <= N
+  return ProbOneNotTooMany / ProbNotTooMany;
+}
+
 /// Connect the unrolling epilog code to the original loop.
 /// The unrolling epilog code contains code to execute the
 /// 'extra' iterations if the run-time trip count modulo the
@@ -221,7 +237,8 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
                           BasicBlock *EpilogPreHeader, BasicBlock *NewPreHeader,
                           ValueToValueMapTy &VMap, DominatorTree *DT,
                           LoopInfo *LI, bool PreserveLCSSA, ScalarEvolution &SE,
-                          unsigned Count, AssumptionCache &AC) {
+                          unsigned Count, AssumptionCache &AC,
+                          BranchProbability OriginalLoopProb) {
   BasicBlock *Latch = L->getLoopLatch();
   assert(Latch && "Loop must have a latch");
   BasicBlock *EpilogLatch = cast<BasicBlock>(VMap[Latch]);
@@ -332,12 +349,19 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
                          PreserveLCSSA);
   // Add the branch to the exit block (around the epilog loop)
   MDNode *BranchWeights = nullptr;
-  if (hasBranchWeightMD(*Latch->getTerminator())) {
+  if (OriginalLoopProb.isUnknown() &&
+      hasBranchWeightMD(*Latch->getTerminator())) {
     // Assume equal distribution in interval [0, Count).
     MDBuilder MDB(B.getContext());
     BranchWeights = MDB.createBranchWeights(1, Count - 1);
   }
-  B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit, BranchWeights);
+  BranchInst *RemainderLoopGuard =
+      B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit, BranchWeights);
+  if (!OriginalLoopProb.isUnknown()) {
+    setBranchProbability(RemainderLoopGuard,
+                         probOfNextInRemainder(OriginalLoopProb, Count - 1),
+                         /*ForFirstTarget=*/true);
+  }
   InsertPt->eraseFromParent();
   if (DT) {
     auto *NewDom = DT->findNearestCommonDominator(Exit, NewExit);
@@ -357,14 +381,15 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
 /// The cloned blocks should be inserted between InsertTop and InsertBot.
 /// InsertTop should be new preheader, InsertBot new loop exit.
 /// Returns the new cloned loop that is created.
-static Loop *
-CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder,
-                const bool UnrollRemainder,
-                BasicBlock *InsertTop,
-                BasicBlock *InsertBot, BasicBlock *Preheader,
+static Loop *CloneLoopBlocks(Loop *L, Value *NewIter,
+                             const bool UseEpilogRemainder,
+                             const bool UnrollRemainder, BasicBlock *InsertTop,
+                             BasicBlock *InsertBot, BasicBlock *Preheader,
                              std::vector<BasicBlock *> &NewBlocks,
                              LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap,
-                             DominatorTree *DT, LoopInfo *LI, unsigned Count) {
+                             DominatorTree *DT, LoopInfo *LI, unsigned Count,
+                             std::optional<unsigned> OriginalTripCount,
+                             BranchProbability OriginalLoopProb) {
   StringRef suffix = UseEpilogRemainder ? "epil" : "prol";
   BasicBlock *Header = L->getHeader();
   BasicBlock *Latch = L->getLoopLatch();
@@ -419,7 +444,8 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder,
           Builder.CreateAdd(NewIdx, One, NewIdx->getName() + ".next");
       Value *IdxCmp = Builder.CreateICmpNE(IdxNext, NewIter, NewIdx->getName() + ".cmp");
       MDNode *BranchWeights = nullptr;
-      if (hasBranchWeightMD(*LatchBR)) {
+      if ((OriginalLoopProb.isUnknown() || !UseEpilogRemainder) &&
+          hasBranchWeightMD(*LatchBR)) {
         uint32_t ExitWeight;
         uint32_t BackEdgeWeight;
         if (Count >= 3) {
@@ -437,7 +463,29 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder,
         MDBuilder MDB(Builder.getContext());
         BranchWeights = MDB.createBranchWeights(BackEdgeWeight, ExitWeight);
       }
-      Builder.CreateCondBr(IdxCmp, FirstLoopBB, InsertBot, BranchWeights);
+      BranchInst *RemainderLoopLatch =
+          Builder.CreateCondBr(IdxCmp, FirstLoopBB, InsertBot, BranchWeights);
+      if (!OriginalLoopProb.isUnknown() && UseEpilogRemainder) {
+        // Compute the total frequency of the original loop body from the
+        // remainder iterations.  Once we've reached them, the first of them
+        // always executes, so its frequency and probability are 1.
+        double FreqRemIters = 1;
+        if (Count > 2) {
+          BranchProbability ProbReaching = BranchProbability::getOne();
+          for (unsigned N = Count - 2; N >= 1; --N) {
+            ProbReaching *= probOfNextInRemainder(OriginalLoopProb, N);
+            FreqRemIters += double(ProbReaching.getNumerator()) /
+                            ProbReaching.getDenominator();
+          }
+        }
+        // Solve for the loop probability that would produce that frequency.
+        // Sum(i=0..inf)(Prob^i) = 1/(1-Prob) = FreqRemIters.
+        double ProbDouble = 1 - 1 / FreqRemIters;
+        BranchProbability Prob = BranchProbability::getBranchProbability(
+            std::round(ProbDouble * BranchProbability::getDenominator()),
+            BranchProbability::getDenominator());
+        setBranchProbability(RemainderLoopLatch, Prob, /*ForFirstTarget=*/true);
+      }
       NewIdx->addIncoming(Zero, InsertTop);
       NewIdx->addIncoming(IdxNext, NewBB);
       LatchBR->eraseFromParent();
@@ -460,25 +508,13 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder,
 
   Loop *NewLoop = NewLoops[L];
   assert(NewLoop && "L should have been cloned");
-  MDNode *LoopID = NewLoop->getLoopID();
 
-  // Only add loop metadata if the loop is not going to be completely
-  // unrolled.
-  if (UnrollRemainder)
-    return NewLoop;
-
-  std::optional<MDNode *> NewLoopID = makeFollowupLoopID(
-      LoopID, {LLVMLoopUnrollFollowupAll, LLVMLoopUnrollFollowupRemainder});
-  if (NewLoopID) {
-    NewLoop->setLoopID(*NewLoopID);
-
-    // Do not setLoopAlreadyUnrolled if loop attributes have been defined
-    // explicitly.
-    return NewLoop;
-  }
+  if (OriginalTripCount && UseEpilogRemainder)
+    setLoopEstimatedTripCount(NewLoop, *OriginalTripCount % Count);
 
   // Add unroll disable metadata to disable future unrolling for this loop.
-  NewLoop->setLoopAlreadyUnrolled();
+  if (!UnrollRemainder)
+    NewLoop->setLoopAlreadyUnrolled();
   return NewLoop;
 }
 
@@ -603,7 +639,8 @@ bool llvm::UnrollRuntimeLoopRemainder(
     LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
     const TargetTransformInfo *TTI, bool PreserveLCSSA,
     unsigned SCEVExpansionBudget, bool RuntimeUnrollMultiExit,
-    Loop **ResultLoop) {
+    Loop **ResultLoop, std::optional<unsigned> OriginalTripCount,
+    BranchProbability OriginalLoopProb) {
   LLVM_DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n");
   LLVM_DEBUG(L->dump());
   LLVM_DEBUG(UseEpilogRemainder ? dbgs() << "Using epilog remainder.\n"
@@ -823,12 +860,23 @@ bool llvm::UnrollRuntimeLoopRemainder(
   BasicBlock *UnrollingLoop = UseEpilogRemainder ? NewPreHeader : PrologExit;
   // Branch to either remainder (extra iterations) loop or unrolling loop.
   MDNode *BranchWeights = nullptr;
-  if (hasBranchWeightMD(*Latch->getTerminator())) {
+  if ((OriginalLoopProb.isUnknown() || !UseEpilogRemainder) &&
+      hasBranchWeightMD(*Latch->getTerminator())) {
     // Assume loop is nearly always entered.
     MDBuilder MDB(B.getContext());
     BranchWeights = MDB.createBranchWeights(EpilogHeaderWeights);
   }
-  B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop, BranchWeights);
+  BranchInst *UnrollingLoopGuard =
+      B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop, BranchWeights);
+  if (!OriginalLoopProb.isUnknown() && UseEpilogRemainder) {
+    // The original loop's first iteration always happens.  Compute the
+    // probability of the original loop executing Count-1 iterations after that
+    // to complete the first iteration of the unrolled loop.
+    BranchProbability ProbOne = OriginalLoopProb;
+    BranchProbability ProbRest = ProbOne.pow(Count - 1);
+    setBranchProbability(UnrollingLoopGuard, ProbRest,
+                         /*ForFirstTarget=*/false);
+  }
   PreHeaderBR->eraseFromParent();
   if (DT) {
     if (UseEpilogRemainder)
@@ -855,9 +903,10 @@ bool llvm::UnrollRuntimeLoopRemainder(
   // iterations. This function adds the appropriate CFG connections.
   BasicBlock *InsertBot = UseEpilogRemainder ? LatchExit : PrologExit;
   BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader;
-  Loop *remainderLoop = CloneLoopBlocks(
-      L, ModVal, UseEpilogRemainder, UnrollRemainder, InsertTop, InsertBot,
-      NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI, Count);
+  Loop *remainderLoop =
+      CloneLoopBlocks(L, ModVal, UseEpilogRemainder, UnrollRemainder, InsertTop,
+                      InsertBot, NewPreHeader, NewBlocks, LoopBlocks, VMap, DT,
+                      LI, Count, OriginalTripCount, OriginalLoopProb);
 
   // Insert the cloned blocks into the function.
   F->splice(InsertBot->getIterator(), F, NewBlocks[0]->getIterator(), F->end());
@@ -956,7 +1005,8 @@ bool llvm::UnrollRuntimeLoopRemainder(
     // Connect the epilog code to the original loop and update the
     // PHI functions.
     ConnectEpilog(L, ModVal, NewExit, LatchExit, PreHeader, EpilogPreHeader,
-                  NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE, Count, *AC);
+                  NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE, Count, *AC,
+                  OriginalLoopProb);
 
     // Update counter in loop for unrolling.
     // Use an incrementing IV.  Pre-incr/post-incr is backedge/trip count.
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index b6ba82288aeb4..8be471bee5579 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -962,13 +962,51 @@ bool llvm::setLoopEstimatedTripCount(
   if (LatchBranch->getSuccessor(0) != L->getHeader())
     std::swap(BackedgeTakenWeight, LatchExitWeight);
 
-  MDBuilder MDB(LatchBranch->getContext());
-
   // Set/Update profile metadata.
-  LatchBranch->setMetadata(
-      LLVMContext::MD_prof,
-      MDB.createBranchWeights(BackedgeTakenWeight, LatchExitWeight));
+  setBranchWeights(*LatchBranch, {BackedgeTakenWeight, LatchExitWeight},
+                   /*IsExpected=*/false);
+
+  return true;
+}
+
+BranchProbability llvm::getLoopProbability(Loop *L) {
+  BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
+  if (!LatchBranch)
+    return BranchProbability::getUnknown();
+  bool FirstTargetIsLoop = LatchBranch->getSuccessor(0) == L->getHeader();
+  return getBranchProbability(LatchBranch, FirstTargetIsLoop);
+}
 
+bool llvm::setLoopProbability(Loop *L, BranchProbability P) {
+  BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
+  if (!LatchBranch)
+    return false;
+  bool FirstTargetIsLoop = LatchBranch->getSuccessor(0) == L->getHeader();
+  return setBranchProbability(LatchBranch, P, FirstTargetIsLoop);
+}
+
+BranchProbability llvm::getBranchProbability(BranchInst *B,
+                                             bool ForFirstTarget) {
+  if (B->getNumSuccessors() != 2)
+    return BranchProbability::getUnknown();
+  uint64_t Weight0, Weight1;
+  if (!extractBranchWeights(*B, Weight0, Weight1))
+    return BranchProbability::getUnknown();
+  if (!ForFirstTarget)
+    std::swap(Weight0, Weight1);
+  return BranchProbability::getBranchProbability(Weight0, Weight0 + Weight1);
+}
+
+bool llvm::setBranchProbability(BranchInst *B, BranchProbability P,
+                                bool ForFirstTarget) {
+  if (B->getNumSuccessors() != 2)
+    return false;
+  BranchProbability Prob0 = P;
+  BranchProbability Prob1 = P.getCompl();
+  if (!ForFirstTarget)
+    std::swap(Prob0, Prob1);
+  setBranchWeights(*B, {Prob0.getNumerator(), Prob1.getNumerator()},
+                   /*IsExpected=*/false);
   return true;
 }
 
diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
index a9ab3b3144829..27fed7340411b 100644
--- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp
+++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
@@ -809,7 +809,6 @@ class PredicateInfoAnnotatedWriter : public AssemblyAnnotationWriter {
   void emitInstructionAnnot(const Instruction *I,
                             formatted_raw_ostream &OS) override {
     if (const auto *PI = PredInfo->getPredicateInfoFor(I)) {
-      OS << "; Has predicate info\n";
       if (const auto *PB = dyn_cast<PredicateBranch>(PI)) {
         OS << "; branch predicate info { TrueEdge: " << PB->TrueEdge
            << " Comparison:" << *PB->Condition << " Edge: [";
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index c537be5cba37c..cbc604e87cf1a 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -80,6 +80,7 @@
 #include <algorithm>
 #include <cassert>
 #include <climits>
+#include <cmath>
 #include <cstddef>
 #include <cstdint>
 #include <iterator>
@@ -1866,10 +1867,19 @@ bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(Instruction *TI,
   // If either of the blocks has it's address taken, then we can't do this fold,
   // because the code we'd hoist would no longer run when we jump into the block
   // by it's address.
-  for (auto *Succ : successors(BB))
-    if (Succ->hasAddressTaken() || !Succ->getSinglePredecessor())
+  for (auto *Succ : successors(BB)) {
+    if (Succ->hasAddressTaken())
       return false;
-
+    if (Succ->getSinglePredecessor())
+      continue;
+    // If Succ has >1 predecessors, continue to check if the Succ contains only
+    // one `unreachable` inst. Since executing `unreachable` inst is an UB, we
+    // can relax the condition based on the assumptiom that the program would
+    // never enter Succ and trigger such an UB.
+    if (isa<UnreachableInst>(*Succ->begin()))
+      continue;
+    return false;
+  }
   // The second of pair is a SkipFlags bitmask.
   using SuccIterPair = std::pair<BasicBlock::iterator, unsigned>;
   SmallVector<SuccIterPair, 8> SuccIterPairs;
@@ -5228,32 +5238,52 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI,
         CompVal, DL.getIntPtrType(CompVal->getType()), "magicptr");
   }
 
-  // Create the new switch instruction now.
-  SwitchInst *New = Builder.CreateSwitch(CompVal, DefaultBB, Values.size());
-  if (HasProfile) {
-    // We know the weight of the default case. We don't know the weight of the
-    // other cases, but rather than completely lose profiling info, we split
-    // the remaining probability equally over them.
-    SmallVector<uint32_t> NewWeights(Values.size() + 1);
-    NewWeights[0] = BranchWeights[1]; // this is the default, and we swapped if
-                                      // TrueWhenEqual.
-    for (auto &V : drop_begin(NewWeights))
-      V = BranchWeights[0] / Values.size();
-    setBranchWeights(*New, NewWeights, /*IsExpected=*/false);
-  }
-
-  // Add all of the 'cases' to the switch instruction.
-  for (ConstantInt *Val : Values)
-    New->addCase(Val, EdgeBB);
+  // Check if we can represent the values as a contiguous range. If so, we use a
+  // range check + conditional branch instead of a switch.
+  if (Values.front()->getValue() - Values.back()->getValue() ==
+      Values.size() - 1) {
+    ConstantRange RangeToCheck = ConstantRange::getNonEmpty(
+        Values.back()->getValue(), Values.front()->getValue() + 1);
+    APInt Offset, RHS;
+    ICmpInst::Predicate Pred;
+    RangeToCheck.getEquivalentICmp(Pred, RHS, Offset);
+    Value *X = CompVal;
+    if (!Offset.isZero())
+      X = Builder.CreateAdd(X, ConstantInt::get(CompVal->getType(), Offset));
+    Value *Cond =
+        Builder.CreateICmp(Pred, X, ConstantInt::get(CompVal->getType(), RHS));
+    BranchInst *NewBI = Builder.CreateCondBr(Cond, EdgeBB, DefaultBB);
+    if (HasProfile)
+      setBranchWeights(*NewBI, BranchWeights, /*IsExpected=*/false);
+    // We don't need to update PHI nodes since we don't add any new edges.
+  } else {
+    // Create the new switch instruction now.
+    SwitchInst *New = Builder.CreateSwitch(CompVal, DefaultBB, Values.size());
+    if (HasProfile) {
+      // We know the weight of the default case. We don't know the weight of the
+      // other cases, but rather than completely lose profiling info, we split
+      // the remaining probability equally over them.
+      SmallVector<uint32_t> NewWeights(Values.size() + 1);
+      NewWeights[0] = BranchWeights[1]; // this is the default, and we swapped
+                                        // if TrueWhenEqual.
+      for (auto &V : drop_begin(NewWeights))
+        V = BranchWeights[0] / Values.size();
+      setBranchWeights(*New, NewWeights, /*IsExpected=*/false);
+    }
 
-  // We added edges from PI to the EdgeBB.  As such, if there were any
-  // PHI nodes in EdgeBB, they need entries to be added corresponding to
-  // the number of edges added.
-  for (BasicBlock::iterator BBI = EdgeBB->begin(); isa<PHINode>(BBI); ++BBI) {
-    PHINode *PN = cast<PHINode>(BBI);
-    Value *InVal = PN->getIncomingValueForBlock(BB);
-    for (unsigned i = 0, e = Values.size() - 1; i != e; ++i)
-      PN->addIncoming(InVal, BB);
+    // Add all of the 'cases' to the switch instruction.
+    for (ConstantInt *Val : Values)
+      New->addCase(Val, EdgeBB);
+
+    // We added edges from PI to the EdgeBB.  As such, if there were any
+    // PHI nodes in EdgeBB, they need entries to be added corresponding to
+    // the number of edges added.
+    for (BasicBlock::iterator BBI = EdgeBB->begin(); isa<PHINode>(BBI); ++BBI) {
+      PHINode *PN = cast<PHINode>(BBI);
+      Value *InVal = PN->getIncomingValueForBlock(BB);
+      for (unsigned i = 0, e = Values.size() - 1; i != e; ++i)
+        PN->addIncoming(InVal, BB);
+    }
   }
 
   // Erase the old branch instruction.
@@ -5926,7 +5956,7 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI,
   }
 
   // Update weight for the newly-created conditional branch.
-  if (hasBranchWeightMD(*SI)) {
+  if (hasBranchWeightMD(*SI) && NewBI->isConditional()) {
     SmallVector<uint64_t, 8> Weights;
     getBranchWeights(SI, Weights);
     if (Weights.size() == 1 + SI->getNumCases()) {
@@ -5948,14 +5978,14 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI,
   }
 
   // Prune obsolete incoming values off the successors' PHI nodes.
-  for (auto BBI = Dest->begin(); isa<PHINode>(BBI); ++BBI) {
+  for (auto &PHI : make_early_inc_range(Dest->phis())) {
     unsigned PreviousEdges = Cases->size();
     if (Dest == SI->getDefaultDest())
       ++PreviousEdges;
     for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I)
-      cast<PHINode>(BBI)->removeIncomingValue(SI->getParent());
+      PHI.removeIncomingValue(SI->getParent());
   }
-  for (auto BBI = OtherDest->begin(); isa<PHINode>(BBI); ++BBI) {
+  for (auto &PHI : make_early_inc_range(OtherDest->phis())) {
     unsigned PreviousEdges = OtherCases->size();
     if (OtherDest == SI->getDefaultDest())
       ++PreviousEdges;
@@ -5964,7 +5994,7 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI,
     if (NewBI->isUnconditional())
       ++E;
     for (unsigned I = 0; I != E; ++I)
-      cast<PHINode>(BBI)->removeIncomingValue(SI->getParent());
+      PHI.removeIncomingValue(SI->getParent());
   }
 
   // Clean up the default block - it may have phis or other instructions before
@@ -7603,7 +7633,35 @@ static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder,
     auto *DefaultCaseBB = SI->getDefaultDest();
     BasicBlock *SplitBB = SplitBlock(OrigBB, SI, DTU);
     auto It = OrigBB->getTerminator()->getIterator();
-    BranchInst::Create(SplitBB, DefaultCaseBB, IsPow2, It);
+    SmallVector<uint32_t> Weights;
+    auto HasWeights =
+        !ProfcheckDisableMetadataFixes && extractBranchWeights(*SI, Weights);
+    auto *BI = BranchInst::Create(SplitBB, DefaultCaseBB, IsPow2, It);
+    if (HasWeights && any_of(Weights, [](const auto &V) { return V != 0; })) {
+      // IsPow2 covers a subset of the cases in which we'd go to the default
+      // label. The other is those powers of 2 that don't appear in the case
+      // statement. We don't know the distribution of the values coming in, so
+      // the safest is to split 50-50 the original probability to `default`.
+      uint64_t OrigDenominator = sum_of(map_range(
+          Weights, [](const auto &V) { return static_cast<uint64_t>(V); }));
+      SmallVector<uint64_t> NewWeights(2);
+      NewWeights[1] = Weights[0] / 2;
+      NewWeights[0] = OrigDenominator - NewWeights[1];
+      setFittedBranchWeights(*BI, NewWeights, /*IsExpected=*/false);
+
+      // For the original switch, we reduce the weight of the default by the
+      // amount by which the previous branch contributes to getting to default,
+      // and then make sure the remaining weights have the same relative ratio
+      // wrt eachother.
+      uint64_t CasesDenominator = OrigDenominator - Weights[0];
+      Weights[0] /= 2;
+      for (auto &W : drop_begin(Weights))
+        W = NewWeights[0] * static_cast<double>(W) / CasesDenominator;
+
+      setBranchWeights(*SI, Weights, /*IsExpected=*/false);
+    }
+    // BI is handling the default case for SI, and so should share its DebugLoc.
+    BI->setDebugLoc(SI->getDebugLoc());
     It->eraseFromParent();
 
     addPredecessorToBlock(DefaultCaseBB, OrigBB, SplitBB);
diff --git a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
index 9f338dbc78cff..94c5c1709f43e 100644
--- a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
+++ b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
@@ -12,7 +12,11 @@
 //
 // Limitation: This assumes that all terminators in the CFG are direct branches
 //             (the "br" instruction). The presence of any other control flow
-//             such as indirectbr, switch or callbr will cause an assert.
+//             such as indirectbr or switch will cause an assert.
+//             The callbr terminator is supported by creating intermediate
+//             target blocks that unconditionally branch to the original target
+//             blocks. These intermediate target blocks can then be redirected
+//             through the ControlFlowHub as usual.
 //
 //===----------------------------------------------------------------------===//
 
@@ -150,25 +154,55 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) {
   SmallVector<BasicBlock *, 8> ExitingBlocks;
   L->getExitingBlocks(ExitingBlocks);
 
+  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+  SmallVector<BasicBlock *, 8> CallBrTargetBlocksToFix;
   // Redirect exiting edges through a control flow hub.
   ControlFlowHub CHub;
-  for (auto *BB : ExitingBlocks) {
-    auto *Branch = cast<BranchInst>(BB->getTerminator());
-    BasicBlock *Succ0 = Branch->getSuccessor(0);
-    Succ0 = L->contains(Succ0) ? nullptr : Succ0;
-
-    BasicBlock *Succ1 =
-        Branch->isUnconditional() ? nullptr : Branch->getSuccessor(1);
-    Succ1 = L->contains(Succ1) ? nullptr : Succ1;
-    CHub.addBranch(BB, Succ0, Succ1);
-
-    LLVM_DEBUG(dbgs() << "Added exiting branch: " << BB->getName() << " -> {"
-                      << (Succ0 ? Succ0->getName() : "<none>") << ", "
-                      << (Succ1 ? Succ1->getName() : "<none>") << "}\n");
+
+  for (unsigned I = 0; I < ExitingBlocks.size(); ++I) {
+    BasicBlock *BB = ExitingBlocks[I];
+    if (BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator())) {
+      BasicBlock *Succ0 = Branch->getSuccessor(0);
+      Succ0 = L->contains(Succ0) ? nullptr : Succ0;
+
+      BasicBlock *Succ1 =
+          Branch->isUnconditional() ? nullptr : Branch->getSuccessor(1);
+      Succ1 = L->contains(Succ1) ? nullptr : Succ1;
+      CHub.addBranch(BB, Succ0, Succ1);
+
+      LLVM_DEBUG(dbgs() << "Added extiting branch: " << printBasicBlock(BB)
+                        << " -> " << printBasicBlock(Succ0)
+                        << (Succ0 && Succ1 ? " " : "") << printBasicBlock(Succ1)
+                        << '\n');
+    } else if (CallBrInst *CallBr = dyn_cast<CallBrInst>(BB->getTerminator())) {
+      for (unsigned J = 0; J < CallBr->getNumSuccessors(); ++J) {
+        BasicBlock *Succ = CallBr->getSuccessor(J);
+        if (L->contains(Succ))
+          continue;
+        bool UpdatedLI = false;
+        BasicBlock *NewSucc =
+            SplitCallBrEdge(BB, Succ, J, &DTU, nullptr, &LI, &UpdatedLI);
+        // Even if CallBr and Succ do not have a common parent loop, we need to
+        // add the new target block to the parent loop of the current loop.
+        if (!UpdatedLI)
+          CallBrTargetBlocksToFix.push_back(NewSucc);
+        // ExitingBlocks is later used to restore SSA, so we need to make sure
+        // that the blocks used for phi nodes in the guard blocks match the
+        // predecessors of the guard blocks, which, in the case of callbr, are
+        // the new intermediate target blocks instead of the callbr blocks
+        // themselves.
+        ExitingBlocks[I] = NewSucc;
+        CHub.addBranch(NewSucc, Succ);
+        LLVM_DEBUG(dbgs() << "Added exiting branch: "
+                          << printBasicBlock(NewSucc) << " -> "
+                          << printBasicBlock(Succ) << '\n');
+      }
+    } else {
+      llvm_unreachable("unsupported block terminator");
+    }
   }
 
   SmallVector<BasicBlock *, 8> GuardBlocks;
-  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
   BasicBlock *LoopExitBlock;
   bool ChangedCFG;
   std::tie(LoopExitBlock, ChangedCFG) = CHub.finalize(
@@ -187,10 +221,19 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) {
 
   // The guard blocks were created outside the loop, so they need to become
   // members of the parent loop.
-  if (auto ParentLoop = L->getParentLoop()) {
+  // Same goes for the callbr target blocks.  Although we try to add them to the
+  // smallest common parent loop of the callbr block and the corresponding
+  // original target block, there might not have been such a loop, in which case
+  // the newly created callbr target blocks are not part of any loop. For nested
+  // loops, this might result in them leading to a loop with multiple entry
+  // points.
+  if (auto *ParentLoop = L->getParentLoop()) {
     for (auto *G : GuardBlocks) {
       ParentLoop->addBasicBlockToLoop(G, LI);
     }
+    for (auto *C : CallBrTargetBlocksToFix) {
+      ParentLoop->addBasicBlockToLoop(C, LI);
+    }
     ParentLoop->verifyLoop();
   }
 
@@ -218,8 +261,6 @@ bool UnifyLoopExitsLegacyPass::runOnFunction(Function &F) {
   auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
 
-  assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator.");
-
   return runImpl(LI, DT);
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 3fed003282f2b..04b05627fa769 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -167,7 +167,7 @@ class VPBuilder {
                               DebugLoc DL = DebugLoc::getUnknown(),
                               const Twine &Name = "") {
     return tryInsertInstruction(
-        new VPInstruction(Opcode, Operands, Flags, DL, Name));
+        new VPInstruction(Opcode, Operands, Flags, {}, DL, Name));
   }
 
   VPInstruction *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
@@ -184,7 +184,7 @@ class VPBuilder {
                                      DebugLoc DL = DebugLoc::getUnknown(),
                                      const Twine &Name = "") {
     return tryInsertInstruction(
-        new VPInstruction(Opcode, Operands, WrapFlags, DL, Name));
+        new VPInstruction(Opcode, Operands, WrapFlags, {}, DL, Name));
   }
 
   VPInstruction *createNot(VPValue *Operand,
@@ -205,7 +205,7 @@ class VPBuilder {
 
     return tryInsertInstruction(new VPInstruction(
         Instruction::BinaryOps::Or, {LHS, RHS},
-        VPRecipeWithIRFlags::DisjointFlagsTy(false), DL, Name));
+        VPRecipeWithIRFlags::DisjointFlagsTy(false), {}, DL, Name));
   }
 
   VPInstruction *createLogicalAnd(VPValue *LHS, VPValue *RHS,
@@ -221,7 +221,7 @@ class VPBuilder {
                std::optional<FastMathFlags> FMFs = std::nullopt) {
     auto *Select =
         FMFs ? new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal},
-                                 *FMFs, DL, Name)
+                                 *FMFs, {}, DL, Name)
              : new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal},
                                  DL, Name);
     return tryInsertInstruction(Select);
@@ -235,7 +235,7 @@ class VPBuilder {
     assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE &&
            Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate");
     return tryInsertInstruction(
-        new VPInstruction(Instruction::ICmp, {A, B}, Pred, DL, Name));
+        new VPInstruction(Instruction::ICmp, {A, B}, Pred, {}, DL, Name));
   }
 
   /// Create a new FCmp VPInstruction with predicate \p Pred and operands \p A
@@ -246,7 +246,7 @@ class VPBuilder {
     assert(Pred >= CmpInst::FIRST_FCMP_PREDICATE &&
            Pred <= CmpInst::LAST_FCMP_PREDICATE && "invalid predicate");
     return tryInsertInstruction(
-        new VPInstruction(Instruction::FCmp, {A, B}, Pred, DL, Name));
+        new VPInstruction(Instruction::FCmp, {A, B}, Pred, {}, DL, Name));
   }
 
   VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset,
@@ -254,7 +254,7 @@ class VPBuilder {
                               const Twine &Name = "") {
     return tryInsertInstruction(
         new VPInstruction(VPInstruction::PtrAdd, {Ptr, Offset},
-                          GEPNoWrapFlags::none(), DL, Name));
+                          GEPNoWrapFlags::none(), {}, DL, Name));
   }
 
   VPInstruction *createNoWrapPtrAdd(VPValue *Ptr, VPValue *Offset,
@@ -262,7 +262,7 @@ class VPBuilder {
                                     DebugLoc DL = DebugLoc::getUnknown(),
                                     const Twine &Name = "") {
     return tryInsertInstruction(new VPInstruction(
-        VPInstruction::PtrAdd, {Ptr, Offset}, GEPFlags, DL, Name));
+        VPInstruction::PtrAdd, {Ptr, Offset}, GEPFlags, {}, DL, Name));
   }
 
   VPInstruction *createWidePtrAdd(VPValue *Ptr, VPValue *Offset,
@@ -270,7 +270,7 @@ class VPBuilder {
                                   const Twine &Name = "") {
     return tryInsertInstruction(
         new VPInstruction(VPInstruction::WidePtrAdd, {Ptr, Offset},
-                          GEPNoWrapFlags::none(), DL, Name));
+                          GEPNoWrapFlags::none(), {}, DL, Name));
   }
 
   VPPhi *createScalarPhi(ArrayRef<VPValue *> IncomingValues, DebugLoc DL,
@@ -280,8 +280,7 @@ class VPBuilder {
 
   VPValue *createElementCount(Type *Ty, ElementCount EC) {
     VPlan &Plan = *getInsertBlock()->getPlan();
-    VPValue *RuntimeEC =
-        Plan.getOrAddLiveIn(ConstantInt::get(Ty, EC.getKnownMinValue()));
+    VPValue *RuntimeEC = Plan.getConstantInt(Ty, EC.getKnownMinValue());
     if (EC.isScalable()) {
       VPValue *VScale = createNaryOp(VPInstruction::VScale, {}, Ty);
       RuntimeEC = EC.getKnownMinValue() == 1
@@ -304,9 +303,11 @@ class VPBuilder {
   }
 
   VPInstruction *createScalarCast(Instruction::CastOps Opcode, VPValue *Op,
-                                  Type *ResultTy, DebugLoc DL) {
+                                  Type *ResultTy, DebugLoc DL,
+                                  const VPIRFlags &Flags = {},
+                                  const VPIRMetadata &Metadata = {}) {
     return tryInsertInstruction(
-        new VPInstructionWithType(Opcode, Op, ResultTy, {}, DL));
+        new VPInstructionWithType(Opcode, Op, ResultTy, DL, Flags, Metadata));
   }
 
   VPValue *createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy,
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f7968abbe5b6b..e5c3f17860103 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3908,7 +3908,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
         continue;
 
       VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
-                            *CM.PSE.getSE());
+                            *CM.PSE.getSE(), OrigLoop);
       precomputeCosts(*Plan, VF, CostCtx);
       auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
       for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
@@ -4166,7 +4166,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
       // Add on other costs that are modelled in VPlan, but not in the legacy
       // cost model.
       VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind,
-                            *CM.PSE.getSE());
+                            *CM.PSE.getSE(), OrigLoop);
       VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
       assert(VectorRegion && "Expected to have a vector region!");
       for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
@@ -5750,13 +5750,18 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
         UpdateMemOpUserCost(cast<LoadInst>(I));
       } else if (const auto *Group = getInterleavedAccessGroup(I)) {
-        // Scalarize an interleave group of address loads.
-        for (unsigned I = 0; I < Group->getFactor(); ++I) {
-          if (Instruction *Member = Group->getMember(I)) {
-            setWideningDecision(
-                Member, VF, CM_Scalarize,
-                (VF.getKnownMinValue() *
-                 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
+        // Scalarize all members of this interleaved group when any member
+        // is used as an address. The address-used load skips scalarization
+        // overhead, other members include it.
+        for (unsigned Idx = 0; Idx < Group->getFactor(); ++Idx) {
+          if (Instruction *Member = Group->getMember(Idx)) {
+            InstructionCost Cost =
+                AddrDefs.contains(Member)
+                    ? (VF.getKnownMinValue() *
+                       getMemoryInstructionCost(Member,
+                                                ElementCount::getFixed(1)))
+                    : getMemInstScalarizationCost(Member, VF);
+            setWideningDecision(Member, VF, CM_Scalarize, Cost);
             UpdateMemOpUserCost(cast<LoadInst>(Member));
           }
         }
@@ -6871,7 +6876,8 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
 
 InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
                                                ElementCount VF) const {
-  VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE());
+  VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE(),
+                        OrigLoop);
   InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
 
   // Now compute and add the VPlan-based cost.
@@ -7105,12 +7111,13 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
   // case, don't trigger the assertion, as the extra simplifications may cause a
   // different VF to be picked by the VPlan-based cost model.
   VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind,
-                        *CM.PSE.getSE());
+                        *CM.PSE.getSE(), OrigLoop);
   precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
   // Verify that the VPlan-based and legacy cost models agree, except for VPlans
   // with early exits and plans with additional VPlan simplifications. The
   // legacy cost model doesn't properly model costs for such loops.
   assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() ||
+          !Legal->getLAI()->getSymbolicStrides().empty() ||
           planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
                                                 CostCtx, OrigLoop,
                                                 BestFactor.Width) ||
@@ -7745,8 +7752,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
     if (CM.isPredicatedInst(I)) {
       SmallVector<VPValue *> Ops(Operands);
       VPValue *Mask = getBlockInMask(Builder.getInsertBlock());
-      VPValue *One =
-          Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
+      VPValue *One = Plan.getConstantInt(I->getType(), 1u);
       auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
       Ops[1] = SafeRHS;
       return new VPWidenRecipe(*I, Ops);
@@ -7799,11 +7805,10 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
   }
   case Instruction::ExtractValue: {
     SmallVector<VPValue *> NewOps(Operands);
-    Type *I32Ty = IntegerType::getInt32Ty(I->getContext());
     auto *EVI = cast<ExtractValueInst>(I);
     assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index");
     unsigned Idx = EVI->getIndices()[0];
-    NewOps.push_back(Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, Idx, false)));
+    NewOps.push_back(Plan.getConstantInt(32, Idx));
     return new VPWidenRecipe(*I, NewOps);
   }
   };
@@ -8172,8 +8177,7 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
            "Expected an ADD or SUB operation for predicated partial "
            "reductions (because the neutral element in the mask is zero)!");
     Cond = getBlockInMask(Builder.getInsertBlock());
-    VPValue *Zero =
-        Plan.getOrAddLiveIn(ConstantInt::get(Reduction->getType(), 0));
+    VPValue *Zero = Plan.getConstantInt(Reduction->getType(), 0);
     BinOp = Builder.createSelect(Cond, BinOp, Zero, Reduction->getDebugLoc());
   }
   return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond,
@@ -8335,11 +8339,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
               &R) ||
           (isa<VPInstruction>(&R) && !UnderlyingValue))
         continue;
-
-      // FIXME: VPlan0, which models a copy of the original scalar loop, should
-      // not use VPWidenPHIRecipe to model the phis.
-      assert((isa<VPWidenPHIRecipe>(&R) || isa<VPInstruction>(&R)) &&
-             UnderlyingValue && "unsupported recipe");
+      assert(isa<VPInstruction>(&R) && UnderlyingValue && "unsupported recipe");
 
       // TODO: Gradually replace uses of underlying instruction by analyses on
       // VPlan.
@@ -8440,7 +8440,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   // and mulacc-reduction are implemented.
   if (!CM.foldTailWithEVL()) {
     VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
-                          *CM.PSE.getSE());
+                          *CM.PSE.getSE(), OrigLoop);
     VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
                              CostCtx, Range);
   }
@@ -8640,7 +8640,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
       } else if (PhiR->isInLoop() && Kind == RecurKind::AddChainWithSubs &&
                  CurrentLinkI->getOpcode() == Instruction::Sub) {
         Type *PhiTy = PhiR->getUnderlyingValue()->getType();
-        auto *Zero = Plan->getOrAddLiveIn(ConstantInt::get(PhiTy, 0));
+        auto *Zero = Plan->getConstantInt(PhiTy, 0);
         VPWidenRecipe *Sub = new VPWidenRecipe(
             Instruction::Sub, {Zero, CurrentLink->getOperand(1)}, {},
             VPIRMetadata(), CurrentLinkI->getDebugLoc());
@@ -8854,8 +8854,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
       ToDelete.push_back(Select);
 
       // Convert the reduction phi to operate on bools.
-      PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse(
-                              OrigLoop->getHeader()->getContext())));
+      PhiR->setOperand(0, Plan->getFalse());
       continue;
     }
 
@@ -8877,9 +8876,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
       unsigned ScaleFactor =
           RecipeBuilder.getScalingForReduction(RdxDesc.getLoopExitInstr())
               .value_or(1);
-      Type *I32Ty = IntegerType::getInt32Ty(PhiTy->getContext());
-      auto *ScaleFactorVPV =
-          Plan->getOrAddLiveIn(ConstantInt::get(I32Ty, ScaleFactor));
+      auto *ScaleFactorVPV = Plan->getConstantInt(32, ScaleFactor);
       VPValue *StartV = PHBuilder.createNaryOp(
           VPInstruction::ReductionStartVector,
           {PhiR->getStartValue(), Iden, ScaleFactorVPV},
@@ -9910,7 +9907,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     bool ForceVectorization =
         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
     VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM,
-                          CM.CostKind, *CM.PSE.getSE());
+                          CM.CostKind, *CM.PSE.getSE(), L);
     if (!ForceVectorization &&
         !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
                                      LVP.getPlanFor(VF.Width), SEL,
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 4fcaf6dabb513..34b405ced8c0a 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5608,6 +5608,7 @@ class BoUpSLP {
           for (ScheduleBundle *Bundle : Bundles) {
             if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
               break;
+            SmallPtrSet<Value *, 4> ParentsUniqueUsers;
             // Need to search for the lane since the tree entry can be
             // reordered.
             auto *It = find(Bundle->getTreeEntry()->Scalars, In);
@@ -5636,6 +5637,22 @@ class BoUpSLP {
                       Bundle->getTreeEntry()->isCopyableElement(In)) &&
                      "Missed TreeEntry operands?");
 
+              bool IsNonSchedulableWithParentPhiNode =
+                  Bundle->getTreeEntry()->doesNotNeedToSchedule() &&
+                  Bundle->getTreeEntry()->UserTreeIndex &&
+                  Bundle->getTreeEntry()->UserTreeIndex.UserTE->hasState() &&
+                  Bundle->getTreeEntry()->UserTreeIndex.UserTE->getOpcode() ==
+                      Instruction::PHI;
+              // Count the number of unique phi nodes, which are the parent for
+              // parent entry, and exit, if all the unique phis are processed.
+              if (IsNonSchedulableWithParentPhiNode) {
+                const TreeEntry *ParentTE =
+                    Bundle->getTreeEntry()->UserTreeIndex.UserTE;
+                Value *User = ParentTE->Scalars[Lane];
+                if (!ParentsUniqueUsers.insert(User).second)
+                  break;
+              }
+
               for (unsigned OpIdx :
                    seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
                 if (auto *I = dyn_cast<Instruction>(
@@ -5644,8 +5661,8 @@ class BoUpSLP {
                                     << *I << "\n");
                   DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx, Checked);
                 }
-              // If parent node is schedulable, it will be handle correctly.
-              if (!Bundle->getTreeEntry()->doesNotNeedToSchedule())
+              // If parent node is schedulable, it will be handled correctly.
+              if (!IsNonSchedulableWithParentPhiNode)
                 break;
               It = std::find(std::next(It),
                              Bundle->getTreeEntry()->Scalars.end(), In);
@@ -16903,7 +16920,10 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
       // otherwise TEPtr depends on TE.
       if ((TEInsertBlock != InsertPt->getParent() ||
            TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
-          !CheckOrdering(InsertPt))
+          (!CheckOrdering(InsertPt) ||
+           (UseEI.UserTE->hasCopyableElements() &&
+            isUsedOutsideBlock(const_cast<Instruction *>(TEInsertPt)) &&
+            is_contained(UseEI.UserTE->Scalars, TEInsertPt))))
         continue;
       // The node is reused - exit.
       if (CheckAndUseSameNode(TEPtr))
@@ -22114,6 +22134,27 @@ bool BoUpSLP::collectValuesToDemote(
         {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
          VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
 
+  if (E.isAltShuffle()) {
+    // Combining these opcodes may lead to incorrect analysis, skip for now.
+    auto IsDangerousOpcode = [](unsigned Opcode) {
+      switch (Opcode) {
+      case Instruction::Shl:
+      case Instruction::AShr:
+      case Instruction::LShr:
+      case Instruction::UDiv:
+      case Instruction::SDiv:
+      case Instruction::URem:
+      case Instruction::SRem:
+        return true;
+      default:
+        break;
+      }
+      return false;
+    };
+    if (IsDangerousOpcode(E.getAltOpcode()))
+      return FinalAnalysis();
+  }
+
   switch (E.getOpcode()) {
 
   // We can always demote truncations and extensions. Since truncations can
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp
index 9c869dd1bbdca..d354933f9d4ec 100644
--- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp
+++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp
@@ -92,7 +92,7 @@ void MemDGNode::print(raw_ostream &OS, bool PrintDeps) const {
   DGNode::print(OS, false);
   if (PrintDeps) {
     // Print memory preds.
-    static constexpr const unsigned Indent = 4;
+    static constexpr unsigned Indent = 4;
     for (auto *Pred : MemPreds)
       OS.indent(Indent) << "<-" << *Pred->getInstruction() << "\n";
   }
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp
index 86dbd2171a560..5534da902b968 100644
--- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp
+++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp
@@ -25,14 +25,14 @@ static cl::opt<bool>
                           "emit new instructions (*very* expensive)."));
 #endif // NDEBUG
 
-static constexpr const unsigned long StopAtDisabled =
+static constexpr unsigned long StopAtDisabled =
     std::numeric_limits<unsigned long>::max();
 static cl::opt<unsigned long>
     StopAt("sbvec-stop-at", cl::init(StopAtDisabled), cl::Hidden,
            cl::desc("Vectorize if the invocation count is < than this. 0 "
                     "disables vectorization."));
 
-static constexpr const unsigned long StopBundleDisabled =
+static constexpr unsigned long StopBundleDisabled =
     std::numeric_limits<unsigned long>::max();
 static cl::opt<unsigned long>
     StopBundle("sbvec-stop-bndl", cl::init(StopBundleDisabled), cl::Hidden,
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp
index ed2f80ba8900a..2de692143c1b6 100644
--- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp
@@ -43,7 +43,7 @@ cl::opt<std::string> AllowFiles(
     "sbvec-allow-files", cl::init(".*"), cl::Hidden,
     cl::desc("Run the vectorizer only on file paths that match any in the "
              "list of comma-separated regex's."));
-static constexpr const char AllowFilesDelim = ',';
+static constexpr char AllowFilesDelim = ',';
 
 SandboxVectorizerPass::SandboxVectorizerPass() : FPM("fpm") {
   if (UserDefinedPassPipeline == DefaultPipelineMagicStr) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 1f10058ab4a9a..e1da070a1fb7f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -939,7 +939,7 @@ class VPIRMetadata {
   SmallVector<std::pair<unsigned, MDNode *>> Metadata;
 
 public:
-  VPIRMetadata() {}
+  VPIRMetadata() = default;
 
   /// Adds metatadata that can be preserved from the original instruction
   /// \p I.
@@ -950,12 +950,9 @@ class VPIRMetadata {
   VPIRMetadata(Instruction &I, LoopVersioning *LVer);
 
   /// Copy constructor for cloning.
-  VPIRMetadata(const VPIRMetadata &Other) : Metadata(Other.Metadata) {}
+  VPIRMetadata(const VPIRMetadata &Other) = default;
 
-  VPIRMetadata &operator=(const VPIRMetadata &Other) {
-    Metadata = Other.Metadata;
-    return *this;
-  }
+  VPIRMetadata &operator=(const VPIRMetadata &Other) = default;
 
   /// Add all metadata to \p I.
   void applyMetadata(Instruction &I) const;
@@ -1107,14 +1104,15 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
         VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {}
 
   VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
-                const VPIRFlags &Flags, DebugLoc DL = DebugLoc::getUnknown(),
-                const Twine &Name = "");
+                const VPIRFlags &Flags, const VPIRMetadata &MD = {},
+                DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "");
 
   VP_CLASSOF_IMPL(VPDef::VPInstructionSC)
 
   VPInstruction *clone() override {
     SmallVector<VPValue *, 2> Operands(operands());
-    auto *New = new VPInstruction(Opcode, Operands, *this, getDebugLoc(), Name);
+    auto *New =
+        new VPInstruction(Opcode, Operands, *this, *this, getDebugLoc(), Name);
     if (getUnderlyingValue())
       New->setUnderlyingValue(getUnderlyingInstr());
     return New;
@@ -1196,7 +1194,14 @@ class VPInstructionWithType : public VPInstruction {
   VPInstructionWithType(unsigned Opcode, ArrayRef<VPValue *> Operands,
                         Type *ResultTy, const VPIRFlags &Flags, DebugLoc DL,
                         const Twine &Name = "")
-      : VPInstruction(Opcode, Operands, Flags, DL, Name), ResultTy(ResultTy) {}
+      : VPInstruction(Opcode, Operands, Flags, {}, DL, Name),
+        ResultTy(ResultTy) {}
+
+  VPInstructionWithType(unsigned Opcode, ArrayRef<VPValue *> Operands,
+                        Type *ResultTy, DebugLoc DL, const VPIRFlags &Flags,
+                        const VPIRMetadata &Metadata, const Twine &Name = "")
+      : VPInstruction(Opcode, Operands, Flags, Metadata, DL, Name),
+        ResultTy(ResultTy) {}
 
   static inline bool classof(const VPRecipeBase *R) {
     // VPInstructionWithType are VPInstructions with specific opcodes requiring
@@ -3206,6 +3211,9 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
       : VPRecipeBase(SC, Operands, DL), VPIRMetadata(Metadata), Ingredient(I),
         Alignment(Alignment), Consecutive(Consecutive), Reverse(Reverse) {
     assert((Consecutive || !Reverse) && "Reverse implies consecutive");
+    assert(isa<VPVectorEndPointerRecipe>(getAddr()) ||
+           !Reverse &&
+               "Reversed acccess without VPVectorEndPointerRecipe address?");
   }
 
 public:
@@ -3977,7 +3985,7 @@ class VPIRBasicBlock : public VPBasicBlock {
         IRBB(IRBB) {}
 
 public:
-  ~VPIRBasicBlock() override {}
+  ~VPIRBasicBlock() override = default;
 
   static inline bool classof(const VPBlockBase *V) {
     return V->getVPBlockID() == VPBlockBase::VPIRBasicBlockSC;
@@ -4029,7 +4037,7 @@ class LLVM_ABI_FOR_TEST VPRegionBlock : public VPBlockBase {
         IsReplicator(IsReplicator) {}
 
 public:
-  ~VPRegionBlock() override {}
+  ~VPRegionBlock() override = default;
 
   /// Method to support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const VPBlockBase *V) {
@@ -4109,6 +4117,12 @@ class LLVM_ABI_FOR_TEST VPRegionBlock : public VPBlockBase {
   const VPCanonicalIVPHIRecipe *getCanonicalIV() const {
     return const_cast<VPRegionBlock *>(this)->getCanonicalIV();
   }
+
+  /// Return the type of the canonical IV for loop regions.
+  Type *getCanonicalIVType() { return getCanonicalIV()->getScalarType(); }
+  const Type *getCanonicalIVType() const {
+    return getCanonicalIV()->getScalarType();
+  }
 };
 
 inline VPRegionBlock *VPRecipeBase::getRegion() {
@@ -4387,15 +4401,25 @@ class VPlan {
   }
 
   /// Return a VPValue wrapping i1 true.
-  VPValue *getTrue() {
-    LLVMContext &Ctx = getContext();
-    return getOrAddLiveIn(ConstantInt::getTrue(Ctx));
-  }
+  VPValue *getTrue() { return getConstantInt(1, 1); }
 
   /// Return a VPValue wrapping i1 false.
-  VPValue *getFalse() {
-    LLVMContext &Ctx = getContext();
-    return getOrAddLiveIn(ConstantInt::getFalse(Ctx));
+  VPValue *getFalse() { return getConstantInt(1, 0); }
+
+  /// Return a VPValue wrapping a ConstantInt with the given type and value.
+  VPValue *getConstantInt(Type *Ty, uint64_t Val, bool IsSigned = false) {
+    return getOrAddLiveIn(ConstantInt::get(Ty, Val, IsSigned));
+  }
+
+  /// Return a VPValue wrapping a ConstantInt with the given bitwidth and value.
+  VPValue *getConstantInt(unsigned BitWidth, uint64_t Val,
+                          bool IsSigned = false) {
+    return getConstantInt(APInt(BitWidth, Val, IsSigned));
+  }
+
+  /// Return a VPValue wrapping a ConstantInt with the given APInt value.
+  VPValue *getConstantInt(const APInt &Val) {
+    return getOrAddLiveIn(ConstantInt::get(getContext(), Val));
   }
 
   /// Return the live-in VPValue for \p V, if there is one or nullptr otherwise.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 65688a3f0b6be..1a66d2049a8db 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -612,8 +612,7 @@ void VPlanTransforms::addMiddleCheck(VPlan &Plan,
   if (!RequiresScalarEpilogueCheck)
     Cmp = Plan.getFalse();
   else if (TailFolded)
-    Cmp = Plan.getOrAddLiveIn(
-        ConstantInt::getTrue(IntegerType::getInt1Ty(Plan.getContext())));
+    Cmp = Plan.getTrue();
   else
     Cmp = Builder.createICmp(CmpInst::ICMP_EQ, Plan.getTripCount(),
                              &Plan.getVectorTripCount(), LatchDL, "cmp.n");
@@ -712,8 +711,8 @@ void VPlanTransforms::addMinimumIterationCheck(
       // additional overflow check is required before entering the vector loop.
 
       // Get the maximum unsigned value for the type.
-      VPValue *MaxUIntTripCount = Plan.getOrAddLiveIn(ConstantInt::get(
-          TripCountTy, cast<IntegerType>(TripCountTy)->getMask()));
+      VPValue *MaxUIntTripCount =
+          Plan.getConstantInt(cast<IntegerType>(TripCountTy)->getMask());
       VPValue *DistanceToMax = Builder.createNaryOp(
           Instruction::Sub, {MaxUIntTripCount, TripCountVPV},
           DebugLoc::getUnknown());
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
index 2aaabd9ebdd04..965426f86ff21 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
@@ -350,13 +350,14 @@ struct VPCostContext {
   SmallPtrSet<Instruction *, 8> SkipCostComputation;
   TargetTransformInfo::TargetCostKind CostKind;
   ScalarEvolution &SE;
+  const Loop *L;
 
   VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI,
                 const VPlan &Plan, LoopVectorizationCostModel &CM,
                 TargetTransformInfo::TargetCostKind CostKind,
-                ScalarEvolution &SE)
+                ScalarEvolution &SE, const Loop *L)
       : TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM),
-        CostKind(CostKind), SE(SE) {}
+        CostKind(CostKind), SE(SE), L(L) {}
 
   /// Return the cost for \p UI with \p VF using the legacy cost model as
   /// fallback until computing the cost of all recipes migrates to VPlan.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index b5b98c64543e4..b57c44872c1b6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -313,7 +313,8 @@ struct Recipe_match {
     // Check for recipes that do not have opcodes.
     if constexpr (std::is_same_v<RecipeTy, VPScalarIVStepsRecipe> ||
                   std::is_same_v<RecipeTy, VPCanonicalIVPHIRecipe> ||
-                  std::is_same_v<RecipeTy, VPDerivedIVRecipe>)
+                  std::is_same_v<RecipeTy, VPDerivedIVRecipe> ||
+                  std::is_same_v<RecipeTy, VPVectorEndPointerRecipe>)
       return DefR;
     else
       return DefR && DefR->getOpcode() == Opcode;
@@ -686,6 +687,64 @@ m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) {
   return VPDerivedIV_match<Op0_t, Op1_t, Op2_t>({Op0, Op1, Op2});
 }
 
+template <typename Addr_t, typename Mask_t> struct Load_match {
+  Addr_t Addr;
+  Mask_t Mask;
+
+  Load_match(Addr_t Addr, Mask_t Mask) : Addr(Addr), Mask(Mask) {}
+
+  template <typename OpTy> bool match(const OpTy *V) const {
+    auto *Load = dyn_cast<VPWidenLoadRecipe>(V);
+    if (!Load || !Addr.match(Load->getAddr()) || !Load->isMasked() ||
+        !Mask.match(Load->getMask()))
+      return false;
+    return true;
+  }
+};
+
+/// Match a (possibly reversed) masked load.
+template <typename Addr_t, typename Mask_t>
+inline Load_match<Addr_t, Mask_t> m_MaskedLoad(const Addr_t &Addr,
+                                               const Mask_t &Mask) {
+  return Load_match<Addr_t, Mask_t>(Addr, Mask);
+}
+
+template <typename Addr_t, typename Val_t, typename Mask_t> struct Store_match {
+  Addr_t Addr;
+  Val_t Val;
+  Mask_t Mask;
+
+  Store_match(Addr_t Addr, Val_t Val, Mask_t Mask)
+      : Addr(Addr), Val(Val), Mask(Mask) {}
+
+  template <typename OpTy> bool match(const OpTy *V) const {
+    auto *Store = dyn_cast<VPWidenStoreRecipe>(V);
+    if (!Store || !Addr.match(Store->getAddr()) ||
+        !Val.match(Store->getStoredValue()) || !Store->isMasked() ||
+        !Mask.match(Store->getMask()))
+      return false;
+    return true;
+  }
+};
+
+/// Match a (possibly reversed) masked store.
+template <typename Addr_t, typename Val_t, typename Mask_t>
+inline Store_match<Addr_t, Val_t, Mask_t>
+m_MaskedStore(const Addr_t &Addr, const Val_t &Val, const Mask_t &Mask) {
+  return Store_match<Addr_t, Val_t, Mask_t>(Addr, Val, Mask);
+}
+
+template <typename Op0_t, typename Op1_t>
+using VectorEndPointerRecipe_match =
+    Recipe_match<std::tuple<Op0_t, Op1_t>, 0,
+                 /*Commutative*/ false, VPVectorEndPointerRecipe>;
+
+template <typename Op0_t, typename Op1_t>
+VectorEndPointerRecipe_match<Op0_t, Op1_t> m_VecEndPtr(const Op0_t &Op0,
+                                                       const Op1_t &Op1) {
+  return VectorEndPointerRecipe_match<Op0_t, Op1_t>(Op0, Op1);
+}
+
 /// Match a call argument at a given argument index.
 template <typename Opnd_t> struct Argument_match {
   /// Call argument index to match.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 9a63c802047ea..1ee405a62aa68 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -162,8 +162,12 @@ bool VPRecipeBase::mayHaveSideEffects() const {
   case VPPredInstPHISC:
   case VPVectorEndPointerSC:
     return false;
-  case VPInstructionSC:
-    return mayWriteToMemory();
+  case VPInstructionSC: {
+    auto *VPI = cast<VPInstruction>(this);
+    return mayWriteToMemory() ||
+           VPI->getOpcode() == VPInstruction::BranchOnCount ||
+           VPI->getOpcode() == VPInstruction::BranchOnCond;
+  }
   case VPWidenCallSC: {
     Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
     return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
@@ -490,10 +494,10 @@ template class VPUnrollPartAccessor<3>;
 }
 
 VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
-                             const VPIRFlags &Flags, DebugLoc DL,
-                             const Twine &Name)
+                             const VPIRFlags &Flags, const VPIRMetadata &MD,
+                             DebugLoc DL, const Twine &Name)
     : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, Flags, DL),
-      VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {
+      VPIRMetadata(MD), Opcode(Opcode), Name(Name.str()) {
   assert(flagsValidForOpcode(getOpcode()) &&
          "Set flags not supported for the provided opcode");
   assert((getNumOperandsForOpcode(Opcode) == -1u ||
@@ -1241,6 +1245,8 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
   case Instruction::Select:
   case Instruction::PHI:
   case VPInstruction::AnyOf:
+  case VPInstruction::BranchOnCond:
+  case VPInstruction::BranchOnCount:
   case VPInstruction::Broadcast:
   case VPInstruction::BuildStructVector:
   case VPInstruction::BuildVector:
@@ -2372,9 +2378,8 @@ bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
     return false;
   auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
   auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
-  auto *CanIV = getRegion()->getCanonicalIV();
   return StartC && StartC->isZero() && StepC && StepC->isOne() &&
-         getScalarType() == CanIV->getScalarType();
+         getScalarType() == getRegion()->getCanonicalIVType();
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -3167,26 +3172,30 @@ bool VPReplicateRecipe::shouldPack() const {
   });
 }
 
-/// Returns true if \p Ptr is a pointer computation for which the legacy cost
-/// model computes a SCEV expression when computing the address cost.
-static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) {
+/// Returns a SCEV expression for \p Ptr if it is a pointer computation for
+/// which the legacy cost model computes a SCEV expression when computing the
+/// address cost. Computing SCEVs for VPValues is incomplete and returns
+/// SCEVCouldNotCompute in cases the legacy cost model can compute SCEVs. In
+/// those cases we fall back to the legacy cost model. Otherwise return nullptr.
+static const SCEV *getAddressAccessSCEV(const VPValue *Ptr, ScalarEvolution &SE,
+                                        const Loop *L) {
   auto *PtrR = Ptr->getDefiningRecipe();
   if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&
                   cast<VPReplicateRecipe>(PtrR)->getOpcode() ==
                       Instruction::GetElementPtr) ||
                  isa<VPWidenGEPRecipe>(PtrR) ||
                  match(Ptr, m_GetElementPtr(m_VPValue(), m_VPValue()))))
-    return false;
+    return nullptr;
 
   // We are looking for a GEP where all indices are either loop invariant or
   // inductions.
   for (VPValue *Opd : drop_begin(PtrR->operands())) {
     if (!Opd->isDefinedOutsideLoopRegions() &&
         !isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd))
-      return false;
+      return nullptr;
   }
 
-  return true;
+  return vputils::getSCEVExprForVPValue(Ptr, SE, L);
 }
 
 /// Returns true if \p V is used as part of the address of another load or
@@ -3354,9 +3363,8 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
 
     bool IsLoad = UI->getOpcode() == Instruction::Load;
     const VPValue *PtrOp = getOperand(!IsLoad);
-    // TODO: Handle cases where we need to pass a SCEV to
-    // getAddressComputationCost.
-    if (shouldUseAddressAccessSCEV(PtrOp))
+    const SCEV *PtrSCEV = getAddressAccessSCEV(PtrOp, Ctx.SE, Ctx.L);
+    if (isa_and_nonnull<SCEVCouldNotCompute>(PtrSCEV))
       break;
 
     Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
@@ -3374,7 +3382,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
     InstructionCost ScalarCost =
         ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
                               PtrTy, UsedByLoadStoreAddress ? nullptr : &Ctx.SE,
-                              nullptr, Ctx.CostKind);
+                              PtrSCEV, Ctx.CostKind);
     if (isSingleScalar())
       return ScalarCost;
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanSLP.h b/llvm/lib/Transforms/Vectorize/VPlanSLP.h
index 77ff36cc2c600..44972c68ba9c9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanSLP.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanSLP.h
@@ -89,8 +89,7 @@ class VPlanSlp {
   /// Width of the widest combined bundle in bits.
   unsigned WidestBundleBits = 0;
 
-  using MultiNodeOpTy =
-      typename std::pair<VPInstruction *, SmallVector<VPValue *, 4>>;
+  using MultiNodeOpTy = std::pair<VPInstruction *, SmallVector<VPValue *, 4>>;
 
   // Input operand bundles for the current multi node. Each multi node operand
   // bundle contains values not matching the multi node's opcode. They will
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index acad795e327ba..9d9bb14530539 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -151,7 +151,27 @@ static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R) {
 
 static bool sinkScalarOperands(VPlan &Plan) {
   auto Iter = vp_depth_first_deep(Plan.getEntry());
+  bool ScalarVFOnly = Plan.hasScalarVFOnly();
   bool Changed = false;
+
+  auto IsValidSinkCandidate = [ScalarVFOnly](VPBasicBlock *SinkTo,
+                                             VPSingleDefRecipe *Candidate) {
+    // We only know how to duplicate VPReplicateRecipes and
+    // VPScalarIVStepsRecipes for now.
+    if (!isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Candidate))
+      return false;
+
+    if (Candidate->getParent() == SinkTo || Candidate->mayHaveSideEffects() ||
+        Candidate->mayReadOrWriteMemory())
+      return false;
+
+    if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
+      if (!ScalarVFOnly && RepR->isSingleScalar())
+        return false;
+
+    return true;
+  };
+
   // First, collect the operands of all recipes in replicate blocks as seeds for
   // sinking.
   SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
@@ -159,51 +179,37 @@ static bool sinkScalarOperands(VPlan &Plan) {
     VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
     if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
       continue;
-    VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(EntryVPBB->getSuccessors()[0]);
-    if (!VPBB || VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
+    VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
+    if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
       continue;
     for (auto &Recipe : *VPBB) {
-      for (VPValue *Op : Recipe.operands())
+      for (VPValue *Op : Recipe.operands()) {
         if (auto *Def =
                 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
-          WorkList.insert({VPBB, Def});
+          if (IsValidSinkCandidate(VPBB, Def))
+            WorkList.insert({VPBB, Def});
+      }
     }
   }
 
-  bool ScalarVFOnly = Plan.hasScalarVFOnly();
   // Try to sink each replicate or scalar IV steps recipe in the worklist.
   for (unsigned I = 0; I != WorkList.size(); ++I) {
     VPBasicBlock *SinkTo;
     VPSingleDefRecipe *SinkCandidate;
     std::tie(SinkTo, SinkCandidate) = WorkList[I];
-    if (SinkCandidate->getParent() == SinkTo ||
-        SinkCandidate->mayHaveSideEffects() ||
-        SinkCandidate->mayReadOrWriteMemory())
-      continue;
-    if (auto *RepR = dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
-      if (!ScalarVFOnly && RepR->isSingleScalar())
-        continue;
-    } else if (!isa<VPScalarIVStepsRecipe>(SinkCandidate))
-      continue;
 
-    bool NeedsDuplicating = false;
     // All recipe users of the sink candidate must be in the same block SinkTo
-    // or all users outside of SinkTo must be uniform-after-vectorization (
-    // i.e., only first lane is used) . In the latter case, we need to duplicate
-    // SinkCandidate.
-    auto CanSinkWithUser = [SinkTo, &NeedsDuplicating,
-                            SinkCandidate](VPUser *U) {
-      auto *UI = cast<VPRecipeBase>(U);
-      if (UI->getParent() == SinkTo)
-        return true;
-      NeedsDuplicating = UI->onlyFirstLaneUsed(SinkCandidate);
-      // We only know how to duplicate VPReplicateRecipes and
-      // VPScalarIVStepsRecipes for now.
-      return NeedsDuplicating &&
-             isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(SinkCandidate);
-    };
-    if (!all_of(SinkCandidate->users(), CanSinkWithUser))
+    // or all users outside of SinkTo must have only their first lane used. In
+    // the latter case, we need to duplicate SinkCandidate.
+    auto UsersOutsideSinkTo =
+        make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
+          return cast<VPRecipeBase>(U)->getParent() != SinkTo;
+        });
+    if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
+          return !U->onlyFirstLaneUsed(SinkCandidate);
+        }))
       continue;
+    bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
 
     if (NeedsDuplicating) {
       if (ScalarVFOnly)
@@ -230,7 +236,8 @@ static bool sinkScalarOperands(VPlan &Plan) {
     for (VPValue *Op : SinkCandidate->operands())
       if (auto *Def =
               dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
-        WorkList.insert({SinkTo, Def});
+        if (IsValidSinkCandidate(SinkTo, Def))
+          WorkList.insert({SinkTo, Def});
     Changed = true;
   }
   return Changed;
@@ -699,8 +706,7 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) {
         continue;
 
       const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
-      VPValue *StartV =
-          Plan.getOrAddLiveIn(ConstantInt::get(ID.getStep()->getType(), 0));
+      VPValue *StartV = Plan.getConstantInt(ID.getStep()->getType(), 0);
       VPValue *StepV = PtrIV->getOperand(1);
       VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
           Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
@@ -820,7 +826,7 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
   // Calculate the final index.
   VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
   auto *CanonicalIV = LoopRegion->getCanonicalIV();
-  Type *CanonicalIVType = CanonicalIV->getScalarType();
+  Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
   VPBuilder B(cast<VPBasicBlock>(PredVPBB));
 
   DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
@@ -836,7 +842,7 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
   // changed it means the exit is using the incremented value, so we need to
   // add the step.
   if (Incoming != WideIV) {
-    VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(CanonicalIVType, 1));
+    VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
     EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
   }
 
@@ -882,7 +888,7 @@ static VPValue *optimizeLatchExitInductionUser(
     return B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape");
   if (ScalarTy->isPointerTy()) {
     Type *StepTy = TypeInfo.inferScalarType(Step);
-    auto *Zero = Plan.getOrAddLiveIn(ConstantInt::get(StepTy, 0));
+    auto *Zero = Plan.getConstantInt(StepTy, 0);
     return B.createPtrAdd(EndValue,
                           B.createNaryOp(Instruction::Sub, {Zero, Step}),
                           DebugLoc::getUnknown(), "ind.escape");
@@ -1057,13 +1063,9 @@ static VPValue *tryToFoldLiveIns(VPSingleDefRecipe &R,
   return nullptr;
 }
 
-/// Try to simplify recipe \p R.
-static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
-  VPlan *Plan = R.getParent()->getPlan();
-
-  auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
-  if (!Def)
-    return;
+/// Try to simplify VPSingleDefRecipe \p Def.
+static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
+  VPlan *Plan = Def->getParent()->getPlan();
 
   // Simplification of live-in IR values for SingleDef recipes using
   // InstSimplifyFolder.
@@ -1073,7 +1075,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
     return Def->replaceAllUsesWith(V);
 
   // Fold PredPHI LiveIn -> LiveIn.
-  if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(&R)) {
+  if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
     VPValue *Op = PredPHI->getOperand(0);
     if (Op->isLiveIn())
       PredPHI->replaceAllUsesWith(Op);
@@ -1092,12 +1094,12 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
         return;
       if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
 
-        unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))
+        unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
                                  ? Instruction::SExt
                                  : Instruction::ZExt;
         auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
                                             TruncTy);
-        if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) {
+        if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
           // UnderlyingExt has distinct return type, used to retain legacy cost.
           Ext->setUnderlyingValue(UnderlyingExt);
         }
@@ -1160,7 +1162,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
         Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
 
   // x && !x -> 0
-  if (match(&R, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X)))))
+  if (match(Def, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X)))))
     return Def->replaceAllUsesWith(Plan->getFalse());
 
   if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
@@ -1188,8 +1190,8 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
     return Def->replaceAllUsesWith(A);
 
   if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
-    return Def->replaceAllUsesWith(R.getOperand(0) == A ? R.getOperand(1)
-                                                        : R.getOperand(0));
+    return Def->replaceAllUsesWith(
+        Def->getOperand(0) == A ? Def->getOperand(1) : Def->getOperand(0));
 
   if (match(Def, m_Not(m_VPValue(A)))) {
     if (match(A, m_Not(m_VPValue(A))))
@@ -1218,8 +1220,8 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
         }
         // If Cmp doesn't have a debug location, use the one from the negation,
         // to preserve the location.
-        if (!Cmp->getDebugLoc() && R.getDebugLoc())
-          Cmp->setDebugLoc(R.getDebugLoc());
+        if (!Cmp->getDebugLoc() && Def->getDebugLoc())
+          Cmp->setDebugLoc(Def->getDebugLoc());
       }
     }
   }
@@ -1245,7 +1247,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
   if (match(Def, m_Intrinsic<Intrinsic::vp_merge>(m_True(), m_VPValue(A),
                                                   m_VPValue(X), m_VPValue())) &&
       match(A, m_c_BinaryOr(m_Specific(X), m_VPValue(Y))) &&
-      TypeInfo.inferScalarType(R.getVPSingleValue())->isIntegerTy(1)) {
+      TypeInfo.inferScalarType(Def)->isIntegerTy(1)) {
     Def->setOperand(1, Def->getOperand(0));
     Def->setOperand(0, Y);
     return;
@@ -1253,35 +1255,41 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
 
   if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
     if (Phi->getOperand(0) == Phi->getOperand(1))
-      Def->replaceAllUsesWith(Phi->getOperand(0));
+      Phi->replaceAllUsesWith(Phi->getOperand(0));
     return;
   }
 
   // Look through ExtractLastElement (BuildVector ....).
-  if (match(&R, m_CombineOr(m_ExtractLastElement(m_BuildVector()),
-                            m_ExtractLastLanePerPart(m_BuildVector())))) {
-    auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
+  if (match(Def, m_CombineOr(m_ExtractLastElement(m_BuildVector()),
+                             m_ExtractLastLanePerPart(m_BuildVector())))) {
+    auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
     Def->replaceAllUsesWith(
         BuildVector->getOperand(BuildVector->getNumOperands() - 1));
     return;
   }
 
   // Look through ExtractPenultimateElement (BuildVector ....).
-  if (match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>(
-                    m_BuildVector()))) {
-    auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
+  if (match(Def, m_VPInstruction<VPInstruction::ExtractPenultimateElement>(
+                     m_BuildVector()))) {
+    auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
     Def->replaceAllUsesWith(
         BuildVector->getOperand(BuildVector->getNumOperands() - 2));
     return;
   }
 
   uint64_t Idx;
-  if (match(&R, m_ExtractElement(m_BuildVector(), m_ConstantInt(Idx)))) {
-    auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
+  if (match(Def, m_ExtractElement(m_BuildVector(), m_ConstantInt(Idx)))) {
+    auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
     Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
     return;
   }
 
+  if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
+    Def->replaceAllUsesWith(
+        Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
+    return;
+  }
+
   if (auto *Phi = dyn_cast<VPPhi>(Def)) {
     if (Phi->getNumOperands() == 1)
       Phi->replaceAllUsesWith(Phi->getOperand(0));
@@ -1298,7 +1306,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
       isa<VPPhi>(X)) {
     auto *Phi = cast<VPPhi>(X);
     if (Phi->getOperand(1) != Def && match(Phi->getOperand(0), m_ZeroInt()) &&
-        Phi->getNumUsers() == 1 && (*Phi->user_begin() == &R)) {
+        Phi->getNumUsers() == 1 && (*Phi->user_begin() == Def)) {
       Phi->setOperand(0, Y);
       Def->replaceAllUsesWith(Phi);
       return;
@@ -1306,7 +1314,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
   }
 
   // VPVectorPointer for part 0 can be replaced by their start pointer.
-  if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(&R)) {
+  if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(Def)) {
     if (VecPtr->isFirstPart()) {
       VecPtr->replaceAllUsesWith(VecPtr->getOperand(0));
       return;
@@ -1361,9 +1369,9 @@ void VPlanTransforms::simplifyRecipes(VPlan &Plan) {
       Plan.getEntry());
   VPTypeAnalysis TypeInfo(Plan);
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
-    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
-      simplifyRecipe(R, TypeInfo);
-    }
+    for (VPRecipeBase &R : make_early_inc_range(*VPBB))
+      if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R))
+        simplifyRecipe(Def, TypeInfo);
   }
 }
 
@@ -1419,6 +1427,8 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
                                           true /*IsSingleScalar*/);
       Clone->insertBefore(RepOrWidenR);
       RepOrWidenR->replaceAllUsesWith(Clone);
+      if (isDeadRecipe(*RepOrWidenR))
+        RepOrWidenR->eraseFromParent();
     }
   }
 }
@@ -1572,9 +1582,9 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
       continue;
 
     // Update IV operands and comparison bound to use new narrower type.
-    auto *NewStart = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 0));
+    auto *NewStart = Plan.getConstantInt(NewIVTy, 0);
     WideIV->setStartValue(NewStart);
-    auto *NewStep = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 1));
+    auto *NewStep = Plan.getConstantInt(NewIVTy, 1);
     WideIV->setStepValue(NewStep);
 
     auto *NewBTC = new VPWidenCastRecipe(
@@ -1693,8 +1703,7 @@ static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF,
 
   // When using wide lane masks, the return type of the get.active.lane.mask
   // intrinsic is VF x UF (last operand).
-  VPValue *ALMMultiplier =
-      Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::getInt64Ty(Ctx), UF));
+  VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
   EntryALM->setOperand(2, ALMMultiplier);
   LoopALM->setOperand(2, ALMMultiplier);
 
@@ -2400,8 +2409,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
       "index.part.next");
 
   // Create the active lane mask instruction in the VPlan preheader.
-  VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
-      ConstantInt::get(TopRegion->getCanonicalIV()->getScalarType(), 1));
+  VPValue *ALMMultiplier =
+      Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
   auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
                                         {EntryIncrement, TC, ALMMultiplier}, DL,
                                         "active.lane.mask.entry");
@@ -2501,7 +2510,7 @@ void VPlanTransforms::addActiveLaneMask(
   } else {
     VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
     VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
-        ConstantInt::get(LoopRegion->getCanonicalIV()->getScalarType(), 1));
+        ConstantInt::get(LoopRegion->getCanonicalIVType(), 1));
     LaneMask =
         B.createNaryOp(VPInstruction::ActiveLaneMask,
                        {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
@@ -2515,90 +2524,102 @@ void VPlanTransforms::addActiveLaneMask(
   HeaderMask->eraseFromParent();
 }
 
+template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
+  Op0_t In;
+  Op1_t &Out;
+
+  RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}
+
+  template <typename OpTy> bool match(OpTy *V) const {
+    if (m_Specific(In).match(V)) {
+      Out = nullptr;
+      return true;
+    }
+    if (m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V))
+      return true;
+    return false;
+  }
+};
+
+/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
+/// Returns the remaining part \p Out if so, or nullptr otherwise.
+template <typename Op0_t, typename Op1_t>
+static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
+                                                          Op1_t &Out) {
+  return RemoveMask_match<Op0_t, Op1_t>(In, Out);
+}
+
 /// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
 /// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
 /// recipe could be created.
 /// \p HeaderMask  Header Mask.
 /// \p CurRecipe   Recipe to be transform.
 /// \p TypeInfo    VPlan-based type analysis.
-/// \p AllOneMask  The vector mask parameter of vector-predication intrinsics.
 /// \p EVL         The explicit vector length parameter of vector-predication
 /// intrinsics.
 static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
                                        VPRecipeBase &CurRecipe,
-                                       VPTypeAnalysis &TypeInfo,
-                                       VPValue &AllOneMask, VPValue &EVL) {
-  // FIXME: Don't transform recipes to EVL recipes if they're not masked by the
-  // header mask.
-  auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * {
-    assert(OrigMask && "Unmasked recipe when folding tail");
-    // HeaderMask will be handled using EVL.
-    VPValue *Mask;
-    if (match(OrigMask, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask))))
-      return Mask;
-    return HeaderMask == OrigMask ? nullptr : OrigMask;
-  };
+                                       VPTypeAnalysis &TypeInfo, VPValue &EVL) {
+  VPlan *Plan = CurRecipe.getParent()->getPlan();
+  VPValue *Addr, *Mask, *EndPtr;
 
   /// Adjust any end pointers so that they point to the end of EVL lanes not VF.
-  auto GetNewAddr = [&CurRecipe, &EVL](VPValue *Addr) -> VPValue * {
-    auto *EndPtr = dyn_cast<VPVectorEndPointerRecipe>(Addr);
-    if (!EndPtr)
-      return Addr;
-    assert(EndPtr->getOperand(1) == &EndPtr->getParent()->getPlan()->getVF() &&
-           "VPVectorEndPointerRecipe with non-VF VF operand?");
-    assert(
-        all_of(EndPtr->users(),
-               [](VPUser *U) {
-                 return cast<VPWidenMemoryRecipe>(U)->isReverse();
-               }) &&
-        "VPVectorEndPointRecipe not used by reversed widened memory recipe?");
-    VPVectorEndPointerRecipe *EVLAddr = EndPtr->clone();
-    EVLAddr->insertBefore(&CurRecipe);
-    EVLAddr->setOperand(1, &EVL);
-    return EVLAddr;
+  auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
+    auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
+    EVLEndPtr->insertBefore(&CurRecipe);
+    EVLEndPtr->setOperand(1, &EVL);
+    return EVLEndPtr;
   };
 
-  return TypeSwitch<VPRecipeBase *, VPRecipeBase *>(&CurRecipe)
-      .Case<VPWidenLoadRecipe>([&](VPWidenLoadRecipe *L) {
-        VPValue *NewMask = GetNewMask(L->getMask());
-        VPValue *NewAddr = GetNewAddr(L->getAddr());
-        return new VPWidenLoadEVLRecipe(*L, NewAddr, EVL, NewMask);
-      })
-      .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) {
-        VPValue *NewMask = GetNewMask(S->getMask());
-        VPValue *NewAddr = GetNewAddr(S->getAddr());
-        return new VPWidenStoreEVLRecipe(*S, NewAddr, EVL, NewMask);
-      })
-      .Case<VPInterleaveRecipe>([&](VPInterleaveRecipe *IR) {
-        VPValue *NewMask = GetNewMask(IR->getMask());
-        return new VPInterleaveEVLRecipe(*IR, EVL, NewMask);
-      })
-      .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
-        VPValue *NewMask = GetNewMask(Red->getCondOp());
-        return new VPReductionEVLRecipe(*Red, EVL, NewMask);
-      })
-      .Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * {
-        VPValue *LHS, *RHS;
-        // Transform select with a header mask condition
-        //   select(header_mask, LHS, RHS)
-        // into vector predication merge.
-        //   vp.merge(all-true, LHS, RHS, EVL)
-        if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS),
-                                 m_VPValue(RHS))))
-          return nullptr;
-        // Use all true as the condition because this transformation is
-        // limited to selects whose condition is a header mask.
-        return new VPWidenIntrinsicRecipe(
-            Intrinsic::vp_merge, {&AllOneMask, LHS, RHS, &EVL},
-            TypeInfo.inferScalarType(LHS), VPI->getDebugLoc());
-      })
-      .Default([&](VPRecipeBase *R) { return nullptr; });
+  if (match(&CurRecipe,
+            m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))) &&
+      !cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
+    return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
+                                    EVL, Mask);
+
+  if (match(&CurRecipe,
+            m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) &&
+      match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
+      cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
+    return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe),
+                                    AdjustEndPtr(EndPtr), EVL, Mask);
+
+  if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(),
+                                      m_RemoveMask(HeaderMask, Mask))) &&
+      !cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
+    return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
+                                     EVL, Mask);
+
+  if (match(&CurRecipe, m_MaskedStore(m_VPValue(EndPtr), m_VPValue(),
+                                      m_RemoveMask(HeaderMask, Mask))) &&
+      match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
+      cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
+    return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe),
+                                     AdjustEndPtr(EndPtr), EVL, Mask);
+
+  if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
+    if (Rdx->isConditional() &&
+        match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
+      return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
+
+  if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
+    if (Interleave->getMask() &&
+        match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
+      return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
+
+  VPValue *LHS, *RHS;
+  if (match(&CurRecipe,
+            m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
+    return new VPWidenIntrinsicRecipe(
+        Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
+        TypeInfo.inferScalarType(LHS), CurRecipe.getDebugLoc());
+
+  return nullptr;
 }
 
 /// Replace recipes with their EVL variants.
 static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
   VPTypeAnalysis TypeInfo(Plan);
-  VPValue *AllOneMask = Plan.getTrue();
   VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
   VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
 
@@ -2658,7 +2679,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
             ConstantInt::getSigned(Type::getInt32Ty(Plan.getContext()), -1));
         VPWidenIntrinsicRecipe *VPSplice = new VPWidenIntrinsicRecipe(
             Intrinsic::experimental_vp_splice,
-            {V1, V2, Imm, AllOneMask, PrevEVL, &EVL},
+            {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
             TypeInfo.inferScalarType(R.getVPSingleValue()), R.getDebugLoc());
         VPSplice->insertBefore(&R);
         R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
@@ -2692,7 +2713,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
   for (VPUser *U : collectUsersRecursively(EVLMask)) {
     auto *CurRecipe = cast<VPRecipeBase>(U);
     VPRecipeBase *EVLRecipe =
-        optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, *AllOneMask, EVL);
+        optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, EVL);
     if (!EVLRecipe)
       continue;
 
@@ -2773,7 +2794,7 @@ void VPlanTransforms::addExplicitVectorLength(
   VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
 
   auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
-  auto *CanIVTy = CanonicalIVPHI->getScalarType();
+  auto *CanIVTy = LoopRegion->getCanonicalIVType();
   VPValue *StartV = CanonicalIVPHI->getStartValue();
 
   // Create the ExplicitVectorLengthPhi recipe in the main loop.
@@ -2788,8 +2809,7 @@ void VPlanTransforms::addExplicitVectorLength(
 
   if (MaxSafeElements) {
     // Support for MaxSafeDist for correct loop emission.
-    VPValue *AVLSafe =
-        Plan.getOrAddLiveIn(ConstantInt::get(CanIVTy, *MaxSafeElements));
+    VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
     VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
     AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
                                "safe_avl");
@@ -2902,9 +2922,8 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
 
   Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
   VPBuilder Builder(LatchExitingBr);
-  VPValue *Cmp =
-      Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
-                         Plan.getOrAddLiveIn(ConstantInt::getNullValue(AVLTy)));
+  VPValue *Cmp = Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
+                                    Plan.getConstantInt(AVLTy, 0));
   Builder.createNaryOp(VPInstruction::BranchOnCond, Cmp);
   LatchExitingBr->eraseFromParent();
 }
@@ -2928,8 +2947,7 @@ void VPlanTransforms::replaceSymbolicStrides(
       // Only handle constant strides for now.
       continue;
 
-    auto *CI =
-        Plan.getOrAddLiveIn(ConstantInt::get(Stride->getType(), *StrideConst));
+    auto *CI = Plan.getConstantInt(*StrideConst);
     if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
       StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
 
@@ -2944,7 +2962,7 @@ void VPlanTransforms::replaceSymbolicStrides(
       unsigned BW = U->getType()->getScalarSizeInBits();
       APInt C =
           isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
-      VPValue *CI = Plan.getOrAddLiveIn(ConstantInt::get(U->getType(), C));
+      VPValue *CI = Plan.getConstantInt(C);
       StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
     }
     RewriteMap[StrideV] = PSE.getSCEV(StrideV);
@@ -3123,8 +3141,7 @@ void VPlanTransforms::createInterleaveGroups(
                    DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
                        IG->getIndex(IRInsertPos),
                    /*IsSigned=*/true);
-      VPValue *OffsetVPV =
-          Plan.getOrAddLiveIn(ConstantInt::get(Plan.getContext(), -Offset));
+      VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
       VPBuilder B(InsertPos);
       Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
     }
@@ -3648,6 +3665,37 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
     Sub = VecOp->getDefiningRecipe();
     VecOp = Tmp;
   }
+
+  // If ValB is a constant and can be safely extended, truncate it to the same
+  // type as ExtA's operand, then extend it to the same type as ExtA. This
+  // creates two uniform extends that can more easily be matched by the rest of
+  // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
+  // replaced with the new extend of the constant.
+  auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA,
+                                           VPWidenCastRecipe *&ExtB,
+                                           VPValue *&ValB, VPWidenRecipe *Mul) {
+    if (!ExtA || ExtB || !ValB->isLiveIn())
+      return;
+    Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
+    Instruction::CastOps ExtOpc = ExtA->getOpcode();
+    const APInt *Const;
+    if (!match(ValB, m_APInt(Const)) ||
+        !llvm::canConstantBeExtended(
+            Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
+      return;
+    // The truncate ensures that the type of each extended operand is the
+    // same, and it's been proven that the constant can be extended from
+    // NarrowTy safely. Necessary since ExtA's extended operand would be
+    // e.g. an i8, while the const will likely be an i32. This will be
+    // elided by later optimisations.
+    VPBuilder Builder(Mul);
+    auto *Trunc =
+        Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
+    Type *WideTy = Ctx.Types.inferScalarType(ExtA);
+    ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
+    Mul->setOperand(1, ExtB);
+  };
+
   // Try to match reduce.add(mul(...)).
   if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
     auto *RecipeA =
@@ -3656,6 +3704,9 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
         dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
     auto *Mul = cast<VPWidenRecipe>(VecOp->getDefiningRecipe());
 
+    // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
+    ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
+
     // Match reduce.add/sub(mul(ext, ext)).
     if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
         match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
@@ -3665,7 +3716,6 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
                                       cast<VPWidenRecipe>(Sub), Red);
       return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
     }
-    // Match reduce.add(mul).
     // TODO: Add an expression type for this variant with a negated mul
     if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
       return new VPExpressionRecipe(Mul, Red);
@@ -3674,18 +3724,26 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
   // variants.
   if (Sub)
     return nullptr;
-  // Match reduce.add(ext(mul(ext(A), ext(B)))).
-  // All extend recipes must have same opcode or A == B
-  // which can be transform to reduce.add(zext(mul(sext(A), sext(B)))).
-  if (match(VecOp, m_ZExtOrSExt(m_Mul(m_ZExtOrSExt(m_VPValue()),
-                                      m_ZExtOrSExt(m_VPValue()))))) {
+
+  // Match reduce.add(ext(mul(A, B))).
+  if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
     auto *Ext = cast<VPWidenCastRecipe>(VecOp->getDefiningRecipe());
     auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0)->getDefiningRecipe());
-    auto *Ext0 =
-        cast<VPWidenCastRecipe>(Mul->getOperand(0)->getDefiningRecipe());
-    auto *Ext1 =
-        cast<VPWidenCastRecipe>(Mul->getOperand(1)->getDefiningRecipe());
-    if ((Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
+    auto *Ext0 = dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe());
+    auto *Ext1 = dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
+
+    // reduce.add(ext(mul(ext, const)))
+    // -> reduce.add(ext(mul(ext, ext(const))))
+    ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
+
+    // reduce.add(ext(mul(ext(A), ext(B))))
+    // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
+    // The inner extends must either have the same opcode as the outer extend or
+    // be the same, in which case the multiply can never result in a negative
+    // value and the outer extend can be folded away by doing wider
+    // extends for the operands of the mul.
+    if (Ext0 && Ext1 &&
+        (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
         Ext0->getOpcode() == Ext1->getOpcode() &&
         IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
       auto *NewExt0 = new VPWidenCastRecipe(
@@ -3824,8 +3882,7 @@ void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan,
   VPBuilder Builder(VectorPH, VectorPH->begin());
   auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
   auto *TCMO = Builder.createNaryOp(
-      Instruction::Sub,
-      {Plan.getTripCount(), Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))},
+      Instruction::Sub, {Plan.getTripCount(), Plan.getConstantInt(TCTy, 1)},
       DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
   BTC->replaceAllUsesWith(TCMO);
 }
@@ -3950,9 +4007,8 @@ void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,
   if (TailByMasking) {
     TC = Builder.createNaryOp(
         Instruction::Add,
-        {TC, Builder.createNaryOp(
-                 Instruction::Sub,
-                 {Step, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))})},
+        {TC, Builder.createNaryOp(Instruction::Sub,
+                                  {Step, Plan.getConstantInt(TCTy, 1)})},
         DebugLoc::getCompilerGenerated(), "n.rnd.up");
   }
 
@@ -3974,8 +4030,8 @@ void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,
   if (RequiresScalarEpilogue) {
     assert(!TailByMasking &&
            "requiring scalar epilogue is not supported with fail folding");
-    VPValue *IsZero = Builder.createICmp(
-        CmpInst::ICMP_EQ, R, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 0)));
+    VPValue *IsZero =
+        Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getConstantInt(TCTy, 0));
     R = Builder.createSelect(IsZero, Step, R);
   }
 
@@ -4013,7 +4069,7 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
   }
   VF.replaceAllUsesWith(RuntimeVF);
 
-  VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF()));
+  VPValue *UF = Plan.getConstantInt(TCTy, Plan.getUF());
   VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF});
   VFxUF.replaceAllUsesWith(MulByUF);
 }
@@ -4021,7 +4077,7 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
 DenseMap<const SCEV *, Value *>
 VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) {
   const DataLayout &DL = SE.getDataLayout();
-  SCEVExpander Expander(SE, DL, "induction", /*PreserveLCSSA=*/true);
+  SCEVExpander Expander(SE, DL, "induction", /*PreserveLCSSA=*/false);
 
   auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
   BasicBlock *EntryBB = Entry->getIRBasicBlock();
@@ -4133,7 +4189,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
   unsigned VFMinVal = VF.getKnownMinValue();
   SmallVector<VPInterleaveRecipe *> StoreGroups;
   for (auto &R : *VectorLoop->getEntryBasicBlock()) {
-    if (isa<VPCanonicalIVPHIRecipe>(&R) || match(&R, m_BranchOnCount()))
+    if (isa<VPCanonicalIVPHIRecipe>(&R))
       continue;
 
     if (isa<VPDerivedIVRecipe, VPScalarIVStepsRecipe>(&R) &&
@@ -4293,17 +4349,17 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
   VPBuilder PHBuilder(Plan.getVectorPreheader());
 
   VPValue *UF = Plan.getOrAddLiveIn(
-      ConstantInt::get(CanIV->getScalarType(), 1 * Plan.getUF()));
+      ConstantInt::get(VectorLoop->getCanonicalIVType(), 1 * Plan.getUF()));
   if (VF.isScalable()) {
     VPValue *VScale = PHBuilder.createElementCount(
-        CanIV->getScalarType(), ElementCount::getScalable(1));
+        VectorLoop->getCanonicalIVType(), ElementCount::getScalable(1));
     VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF});
     Inc->setOperand(1, VScaleUF);
     Plan.getVF().replaceAllUsesWith(VScale);
   } else {
     Inc->setOperand(1, UF);
     Plan.getVF().replaceAllUsesWith(
-        Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
+        Plan.getConstantInt(CanIV->getScalarType(), 1));
   }
   removeDeadRecipes(Plan);
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index cfd1a741ee841..d6a002825e38d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -68,10 +68,9 @@ class UnrollState {
   void unrollWidenInductionByUF(VPWidenInductionRecipe *IV,
                                 VPBasicBlock::iterator InsertPtForPhi);
 
-  VPValue *getConstantVPV(unsigned Part) {
-    Type *CanIVIntTy =
-        Plan.getVectorLoopRegion()->getCanonicalIV()->getScalarType();
-    return Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, Part));
+  VPValue *getConstantInt(unsigned Part) {
+    Type *CanIVIntTy = Plan.getVectorLoopRegion()->getCanonicalIVType();
+    return Plan.getConstantInt(CanIVIntTy, Part);
   }
 
 public:
@@ -138,7 +137,7 @@ void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) {
       for (const auto &[PartIR, Part0R] : zip(*PartIVPBB, *Part0VPBB)) {
         remapOperands(&PartIR, Part);
         if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(&PartIR)) {
-          ScalarIVSteps->addOperand(getConstantVPV(Part));
+          ScalarIVSteps->addOperand(getConstantInt(Part));
         }
 
         addRecipeForPart(&Part0R, &PartIR, Part);
@@ -250,7 +249,7 @@ void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
         for (unsigned Part = 1; Part != UF; ++Part)
           VPV2Parts[VPI][Part - 1] = StartV;
       }
-      Copy->addOperand(getConstantVPV(Part));
+      Copy->addOperand(getConstantInt(Part));
     } else {
       assert(isa<VPActiveLaneMaskPHIRecipe>(R) &&
              "unexpected header phi recipe not needing unrolled part");
@@ -319,7 +318,7 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
             VPVectorPointerRecipe, VPVectorEndPointerRecipe>(Copy) ||
         match(Copy,
               m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>()))
-      Copy->addOperand(getConstantVPV(Part));
+      Copy->addOperand(getConstantInt(Part));
 
     if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe>(R))
       Copy->setOperand(0, R.getOperand(0));
@@ -475,8 +474,7 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
     if (LaneDefs != Def2LaneDefs.end())
       return LaneDefs->second[Lane.getKnownLane()];
 
-    VPValue *Idx =
-        Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane()));
+    VPValue *Idx = Plan.getConstantInt(IdxTy, Lane.getKnownLane());
     return Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx});
   }
 
@@ -510,8 +508,7 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
           cast<VPInstruction>(Op)->getOperand(Lane.getKnownLane()));
       continue;
     }
-    VPValue *Idx =
-        Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane()));
+    VPValue *Idx = Plan.getConstantInt(IdxTy, Lane.getKnownLane());
     VPValue *Ext = Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx});
     NewOps.push_back(Ext);
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 4db92e7def3ed..c6380d30ab2e2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -32,22 +32,17 @@ bool vputils::onlyScalarValuesUsed(const VPValue *Def) {
 }
 
 VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr) {
-  VPValue *Expanded = nullptr;
   if (auto *E = dyn_cast<SCEVConstant>(Expr))
-    Expanded = Plan.getOrAddLiveIn(E->getValue());
-  else {
-    auto *U = dyn_cast<SCEVUnknown>(Expr);
-    // Skip SCEV expansion if Expr is a SCEVUnknown wrapping a non-instruction
-    // value. Otherwise the value may be defined in a loop and using it directly
-    // will break LCSSA form. The SCEV expansion takes care of preserving LCSSA
-    // form.
-    if (U && !isa<Instruction>(U->getValue())) {
-      Expanded = Plan.getOrAddLiveIn(U->getValue());
-    } else {
-      Expanded = new VPExpandSCEVRecipe(Expr);
-      Plan.getEntry()->appendRecipe(Expanded->getDefiningRecipe());
-    }
-  }
+    return Plan.getOrAddLiveIn(E->getValue());
+  // Skip SCEV expansion if Expr is a SCEVUnknown wrapping a non-instruction
+  // value. Otherwise the value may be defined in a loop and using it directly
+  // will break LCSSA form. The SCEV expansion takes care of preserving LCSSA
+  // form.
+  auto *U = dyn_cast<SCEVUnknown>(Expr);
+  if (U && !isa<Instruction>(U->getValue()))
+    return Plan.getOrAddLiveIn(U->getValue());
+  auto *Expanded = new VPExpandSCEVRecipe(Expr);
+  Plan.getEntry()->appendRecipe(Expanded);
   return Expanded;
 }
 
@@ -75,7 +70,8 @@ bool vputils::isHeaderMask(const VPValue *V, const VPlan &Plan) {
          B == Plan.getBackedgeTakenCount();
 }
 
-const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) {
+const SCEV *vputils::getSCEVExprForVPValue(const VPValue *V,
+                                           ScalarEvolution &SE, const Loop *L) {
   if (V->isLiveIn()) {
     if (Value *LiveIn = V->getLiveInIRValue())
       return SE.getSCEV(LiveIn);
@@ -86,6 +82,53 @@ const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) {
   return TypeSwitch<const VPRecipeBase *, const SCEV *>(V->getDefiningRecipe())
       .Case<VPExpandSCEVRecipe>(
           [](const VPExpandSCEVRecipe *R) { return R->getSCEV(); })
+      .Case<VPCanonicalIVPHIRecipe>([&SE, L](const VPCanonicalIVPHIRecipe *R) {
+        if (!L)
+          return SE.getCouldNotCompute();
+        const SCEV *Start = getSCEVExprForVPValue(R->getOperand(0), SE, L);
+        return SE.getAddRecExpr(Start, SE.getOne(Start->getType()), L,
+                                SCEV::FlagAnyWrap);
+      })
+      .Case<VPDerivedIVRecipe>([&SE, L](const VPDerivedIVRecipe *R) {
+        const SCEV *Start = getSCEVExprForVPValue(R->getOperand(0), SE, L);
+        const SCEV *IV = getSCEVExprForVPValue(R->getOperand(1), SE, L);
+        const SCEV *Scale = getSCEVExprForVPValue(R->getOperand(2), SE, L);
+        if (any_of(ArrayRef({Start, IV, Scale}), IsaPred<SCEVCouldNotCompute>))
+          return SE.getCouldNotCompute();
+
+        return SE.getAddExpr(SE.getTruncateOrSignExtend(Start, IV->getType()),
+                             SE.getMulExpr(IV, SE.getTruncateOrSignExtend(
+                                                   Scale, IV->getType())));
+      })
+      .Case<VPScalarIVStepsRecipe>([&SE, L](const VPScalarIVStepsRecipe *R) {
+        const SCEV *IV = getSCEVExprForVPValue(R->getOperand(0), SE, L);
+        const SCEV *Step = getSCEVExprForVPValue(R->getOperand(1), SE, L);
+        if (isa<SCEVCouldNotCompute>(IV) || isa<SCEVCouldNotCompute>(Step) ||
+            !Step->isOne())
+          return SE.getCouldNotCompute();
+        return SE.getMulExpr(SE.getTruncateOrSignExtend(IV, Step->getType()),
+                             Step);
+      })
+      .Case<VPReplicateRecipe>([&SE, L](const VPReplicateRecipe *R) {
+        if (R->getOpcode() != Instruction::GetElementPtr)
+          return SE.getCouldNotCompute();
+
+        const SCEV *Base = getSCEVExprForVPValue(R->getOperand(0), SE, L);
+        if (isa<SCEVCouldNotCompute>(Base))
+          return SE.getCouldNotCompute();
+
+        SmallVector<const SCEV *> IndexExprs;
+        for (VPValue *Index : drop_begin(R->operands())) {
+          const SCEV *IndexExpr = getSCEVExprForVPValue(Index, SE, L);
+          if (isa<SCEVCouldNotCompute>(IndexExpr))
+            return SE.getCouldNotCompute();
+          IndexExprs.push_back(IndexExpr);
+        }
+
+        Type *SrcElementTy = cast<GetElementPtrInst>(R->getUnderlyingInstr())
+                                 ->getSourceElementType();
+        return SE.getGEPExpr(Base, IndexExprs, SrcElementTy);
+      })
       .Default([&SE](const VPRecipeBase *) { return SE.getCouldNotCompute(); });
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 37cd413da9079..c21a0e70c1392 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -37,7 +37,8 @@ VPValue *getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr);
 
 /// Return the SCEV expression for \p V. Returns SCEVCouldNotCompute if no
 /// SCEV expression could be constructed.
-const SCEV *getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE);
+const SCEV *getSCEVExprForVPValue(const VPValue *V, ScalarEvolution &SE,
+                                  const Loop *L = nullptr);
 
 /// Returns true if \p VPV is a single scalar, either because it produces the
 /// same value for all lanes or only has its first lane used.
diff --git a/llvm/test/Analysis/CostModel/AArch64/extract-last-active.ll b/llvm/test/Analysis/CostModel/AArch64/extract-last-active.ll
new file mode 100644
index 0000000000000..9efcf912076b0
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AArch64/extract-last-active.ll
@@ -0,0 +1,216 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=aarch64--linux-gnu | FileCheck %s --check-prefix=NEON
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sve | FileCheck %s --check-prefix=SVE
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sme -force-streaming | FileCheck %s --check-prefix=SME-STREAMING
+
+define void @extractions() {
+; NEON-LABEL: 'extractions'
+; NEON-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v16i8 = call i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8> poison, <16 x i1> poison, i8 poison)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v8i16 = call i16 @llvm.experimental.vector.extract.last.active.v8i16(<8 x i16> poison, <8 x i1> poison, i16 poison)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v4i32 = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> poison, <4 x i1> poison, i32 poison)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v2i64 = call i64 @llvm.experimental.vector.extract.last.active.v2i64(<2 x i64> poison, <2 x i1> poison, i64 poison)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %v8f16 = call half @llvm.experimental.vector.extract.last.active.v8f16(<8 x half> poison, <8 x i1> poison, half poison)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %v8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v8bf16(<8 x bfloat> poison, <8 x i1> poison, bfloat poison)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v4f32 = call float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> poison, <4 x i1> poison, float poison)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call double @llvm.experimental.vector.extract.last.active.v2f64(<2 x double> poison, <2 x i1> poison, double poison)
+; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv16i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> poison, i8 poison)
+; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv8i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> poison, i16 poison)
+; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv4i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> poison, i32 poison)
+; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv2i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i1> poison, i64 poison)
+; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f16 = call half @llvm.experimental.vector.extract.last.active.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> poison, half poison)
+; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, bfloat poison)
+; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f32 = call float @llvm.experimental.vector.extract.last.active.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> poison, float poison)
+; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f64 = call double @llvm.experimental.vector.extract.last.active.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> poison, double poison)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v32i8 = call i8 @llvm.experimental.vector.extract.last.active.v32i8(<32 x i8> poison, <32 x i1> poison, i8 poison)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v16i16 = call i16 @llvm.experimental.vector.extract.last.active.v16i16(<16 x i16> poison, <16 x i1> poison, i16 poison)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v8i32 = call i32 @llvm.experimental.vector.extract.last.active.v8i32(<8 x i32> poison, <8 x i1> poison, i32 poison)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v4i64 = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> poison, <4 x i1> poison, i64 poison)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %v16f16 = call half @llvm.experimental.vector.extract.last.active.v16f16(<16 x half> poison, <16 x i1> poison, half poison)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %v16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v16bf16(<16 x bfloat> poison, <16 x i1> poison, bfloat poison)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v8f32 = call float @llvm.experimental.vector.extract.last.active.v8f32(<8 x float> poison, <8 x i1> poison, float poison)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call double @llvm.experimental.vector.extract.last.active.v4f64(<4 x double> poison, <4 x i1> poison, double poison)
+; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv32i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv32i8(<vscale x 32 x i8> poison, <vscale x 32 x i1> poison, i8 poison)
+; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv16i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv16i16(<vscale x 16 x i16> poison, <vscale x 16 x i1> poison, i16 poison)
+; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv8i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv8i32(<vscale x 8 x i32> poison, <vscale x 8 x i1> poison, i32 poison)
+; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv4i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i1> poison, i64 poison)
+; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f16 = call half @llvm.experimental.vector.extract.last.active.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x i1> poison, half poison)
+; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, bfloat poison)
+; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f32 = call float @llvm.experimental.vector.extract.last.active.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x i1> poison, float poison)
+; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f64 = call double @llvm.experimental.vector.extract.last.active.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x i1> poison, double poison)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v8i8 = call i8 @llvm.experimental.vector.extract.last.active.v8i8(<8 x i8> poison, <8 x i1> poison, i8 poison)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v4i16 = call i16 @llvm.experimental.vector.extract.last.active.v4i16(<4 x i16> poison, <4 x i1> poison, i16 poison)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v2i32 = call i32 @llvm.experimental.vector.extract.last.active.v2i32(<2 x i32> poison, <2 x i1> poison, i32 poison)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1i64 = call i64 @llvm.experimental.vector.extract.last.active.v1i64(<1 x i64> poison, <1 x i1> poison, i64 poison)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v4f16 = call half @llvm.experimental.vector.extract.last.active.v4f16(<4 x half> poison, <4 x i1> poison, half poison)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v4bf16(<4 x bfloat> poison, <4 x i1> poison, bfloat poison)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f32 = call float @llvm.experimental.vector.extract.last.active.v2f32(<2 x float> poison, <2 x i1> poison, float poison)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1f64 = call double @llvm.experimental.vector.extract.last.active.v1f64(<1 x double> poison, <1 x i1> poison, double poison)
+; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv8i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv8i8(<vscale x 8 x i8> poison, <vscale x 8 x i1> poison, i8 poison)
+; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv4i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i1> poison, i16 poison)
+; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv2i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv2i32(<vscale x 2 x i32> poison, <vscale x 2 x i1> poison, i32 poison)
+; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv1i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv1i64(<vscale x 1 x i64> poison, <vscale x 1 x i1> poison, i64 poison)
+; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f16 = call half @llvm.experimental.vector.extract.last.active.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x i1> poison, half poison)
+; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, bfloat poison)
+; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f32 = call float @llvm.experimental.vector.extract.last.active.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x i1> poison, float poison)
+; NEON-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f64 = call double @llvm.experimental.vector.extract.last.active.nxv1f64(<vscale x 1 x double> poison, <vscale x 1 x i1> poison, double poison)
+; NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SVE-LABEL: 'extractions'
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = call i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8> poison, <16 x i1> poison, i8 poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = call i16 @llvm.experimental.vector.extract.last.active.v8i16(<8 x i16> poison, <8 x i1> poison, i16 poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> poison, <4 x i1> poison, i32 poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = call i64 @llvm.experimental.vector.extract.last.active.v2i64(<2 x i64> poison, <2 x i1> poison, i64 poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = call half @llvm.experimental.vector.extract.last.active.v8f16(<8 x half> poison, <8 x i1> poison, half poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v8bf16(<8 x bfloat> poison, <8 x i1> poison, bfloat poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = call float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> poison, <4 x i1> poison, float poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64 = call double @llvm.experimental.vector.extract.last.active.v2f64(<2 x double> poison, <2 x i1> poison, double poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> poison, i8 poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> poison, i16 poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> poison, i32 poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i1> poison, i64 poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = call half @llvm.experimental.vector.extract.last.active.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> poison, half poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, bfloat poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32 = call float @llvm.experimental.vector.extract.last.active.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> poison, float poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64 = call double @llvm.experimental.vector.extract.last.active.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> poison, double poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8 = call i8 @llvm.experimental.vector.extract.last.active.v32i8(<32 x i8> poison, <32 x i1> poison, i8 poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = call i16 @llvm.experimental.vector.extract.last.active.v16i16(<16 x i16> poison, <16 x i1> poison, i16 poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = call i32 @llvm.experimental.vector.extract.last.active.v8i32(<8 x i32> poison, <8 x i1> poison, i32 poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> poison, <4 x i1> poison, i64 poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call half @llvm.experimental.vector.extract.last.active.v16f16(<16 x half> poison, <16 x i1> poison, half poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v16bf16(<16 x bfloat> poison, <16 x i1> poison, bfloat poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = call float @llvm.experimental.vector.extract.last.active.v8f32(<8 x float> poison, <8 x i1> poison, float poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64 = call double @llvm.experimental.vector.extract.last.active.v4f64(<4 x double> poison, <4 x i1> poison, double poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv32i8(<vscale x 32 x i8> poison, <vscale x 32 x i1> poison, i8 poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv16i16(<vscale x 16 x i16> poison, <vscale x 16 x i1> poison, i16 poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv8i32(<vscale x 8 x i32> poison, <vscale x 8 x i1> poison, i32 poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i1> poison, i64 poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16f16 = call half @llvm.experimental.vector.extract.last.active.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x i1> poison, half poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, bfloat poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32 = call float @llvm.experimental.vector.extract.last.active.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x i1> poison, float poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64 = call double @llvm.experimental.vector.extract.last.active.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x i1> poison, double poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = call i8 @llvm.experimental.vector.extract.last.active.v8i8(<8 x i8> poison, <8 x i1> poison, i8 poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = call i16 @llvm.experimental.vector.extract.last.active.v4i16(<4 x i16> poison, <4 x i1> poison, i16 poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = call i32 @llvm.experimental.vector.extract.last.active.v2i32(<2 x i32> poison, <2 x i1> poison, i32 poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1i64 = call i64 @llvm.experimental.vector.extract.last.active.v1i64(<1 x i64> poison, <1 x i1> poison, i64 poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = call half @llvm.experimental.vector.extract.last.active.v4f16(<4 x half> poison, <4 x i1> poison, half poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v4bf16(<4 x bfloat> poison, <4 x i1> poison, bfloat poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = call float @llvm.experimental.vector.extract.last.active.v2f32(<2 x float> poison, <2 x i1> poison, float poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1f64 = call double @llvm.experimental.vector.extract.last.active.v1f64(<1 x double> poison, <1 x i1> poison, double poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv8i8(<vscale x 8 x i8> poison, <vscale x 8 x i1> poison, i8 poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i1> poison, i16 poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv2i32(<vscale x 2 x i32> poison, <vscale x 2 x i1> poison, i32 poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv1i64(<vscale x 1 x i64> poison, <vscale x 1 x i1> poison, i64 poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = call half @llvm.experimental.vector.extract.last.active.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x i1> poison, half poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, bfloat poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32 = call float @llvm.experimental.vector.extract.last.active.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x i1> poison, float poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64 = call double @llvm.experimental.vector.extract.last.active.nxv1f64(<vscale x 1 x double> poison, <vscale x 1 x i1> poison, double poison)
+; SVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SME-STREAMING-LABEL: 'extractions'
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = call i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8> poison, <16 x i1> poison, i8 poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = call i16 @llvm.experimental.vector.extract.last.active.v8i16(<8 x i16> poison, <8 x i1> poison, i16 poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> poison, <4 x i1> poison, i32 poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = call i64 @llvm.experimental.vector.extract.last.active.v2i64(<2 x i64> poison, <2 x i1> poison, i64 poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = call half @llvm.experimental.vector.extract.last.active.v8f16(<8 x half> poison, <8 x i1> poison, half poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v8bf16(<8 x bfloat> poison, <8 x i1> poison, bfloat poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = call float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> poison, <4 x i1> poison, float poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64 = call double @llvm.experimental.vector.extract.last.active.v2f64(<2 x double> poison, <2 x i1> poison, double poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> poison, i8 poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> poison, i16 poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> poison, i32 poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i1> poison, i64 poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = call half @llvm.experimental.vector.extract.last.active.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> poison, half poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, bfloat poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32 = call float @llvm.experimental.vector.extract.last.active.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> poison, float poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64 = call double @llvm.experimental.vector.extract.last.active.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> poison, double poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8 = call i8 @llvm.experimental.vector.extract.last.active.v32i8(<32 x i8> poison, <32 x i1> poison, i8 poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = call i16 @llvm.experimental.vector.extract.last.active.v16i16(<16 x i16> poison, <16 x i1> poison, i16 poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = call i32 @llvm.experimental.vector.extract.last.active.v8i32(<8 x i32> poison, <8 x i1> poison, i32 poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> poison, <4 x i1> poison, i64 poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call half @llvm.experimental.vector.extract.last.active.v16f16(<16 x half> poison, <16 x i1> poison, half poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v16bf16(<16 x bfloat> poison, <16 x i1> poison, bfloat poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = call float @llvm.experimental.vector.extract.last.active.v8f32(<8 x float> poison, <8 x i1> poison, float poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64 = call double @llvm.experimental.vector.extract.last.active.v4f64(<4 x double> poison, <4 x i1> poison, double poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv32i8(<vscale x 32 x i8> poison, <vscale x 32 x i1> poison, i8 poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv16i16(<vscale x 16 x i16> poison, <vscale x 16 x i1> poison, i16 poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv8i32(<vscale x 8 x i32> poison, <vscale x 8 x i1> poison, i32 poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i1> poison, i64 poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16f16 = call half @llvm.experimental.vector.extract.last.active.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x i1> poison, half poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, bfloat poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32 = call float @llvm.experimental.vector.extract.last.active.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x i1> poison, float poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64 = call double @llvm.experimental.vector.extract.last.active.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x i1> poison, double poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = call i8 @llvm.experimental.vector.extract.last.active.v8i8(<8 x i8> poison, <8 x i1> poison, i8 poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = call i16 @llvm.experimental.vector.extract.last.active.v4i16(<4 x i16> poison, <4 x i1> poison, i16 poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = call i32 @llvm.experimental.vector.extract.last.active.v2i32(<2 x i32> poison, <2 x i1> poison, i32 poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1i64 = call i64 @llvm.experimental.vector.extract.last.active.v1i64(<1 x i64> poison, <1 x i1> poison, i64 poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = call half @llvm.experimental.vector.extract.last.active.v4f16(<4 x half> poison, <4 x i1> poison, half poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v4bf16(<4 x bfloat> poison, <4 x i1> poison, bfloat poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = call float @llvm.experimental.vector.extract.last.active.v2f32(<2 x float> poison, <2 x i1> poison, float poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1f64 = call double @llvm.experimental.vector.extract.last.active.v1f64(<1 x double> poison, <1 x i1> poison, double poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv8i8(<vscale x 8 x i8> poison, <vscale x 8 x i1> poison, i8 poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i1> poison, i16 poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv2i32(<vscale x 2 x i32> poison, <vscale x 2 x i1> poison, i32 poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv1i64(<vscale x 1 x i64> poison, <vscale x 1 x i1> poison, i64 poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = call half @llvm.experimental.vector.extract.last.active.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x i1> poison, half poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, bfloat poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32 = call float @llvm.experimental.vector.extract.last.active.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x i1> poison, float poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64 = call double @llvm.experimental.vector.extract.last.active.nxv1f64(<vscale x 1 x double> poison, <vscale x 1 x i1> poison, double poison)
+; SME-STREAMING-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+
+  ;; Legal types
+  %v16i8 = call i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8> poison, <16 x i1> poison, i8 poison)
+  %v8i16 = call i16 @llvm.experimental.vector.extract.last.active.v8i16(<8 x i16> poison, <8 x i1> poison, i16 poison)
+  %v4i32 = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> poison, <4 x i1> poison, i32 poison)
+  %v2i64 = call i64 @llvm.experimental.vector.extract.last.active.v2i64(<2 x i64> poison, <2 x i1> poison, i64 poison)
+  %v8f16 = call half @llvm.experimental.vector.extract.last.active.v8f16(<8 x half> poison, <8 x i1> poison, half poison)
+  %v8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v8bf16(<8 x bfloat> poison, <8 x i1> poison, bfloat poison)
+  %v4f32 = call float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> poison, <4 x i1> poison, float poison)
+  %v2f64 = call double @llvm.experimental.vector.extract.last.active.v2f64(<2 x double> poison, <2 x i1> poison, double poison)
+  %nxv16i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> poison, i8 poison)
+  %nxv8i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> poison, i16 poison)
+  %nxv4i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> poison, i32 poison)
+  %nxv2i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i1> poison, i64 poison)
+  %nxv8f16 = call half @llvm.experimental.vector.extract.last.active.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> poison, half poison)
+  %nxv8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, bfloat poison)
+  %nxv4f32 = call float @llvm.experimental.vector.extract.last.active.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> poison, float poison)
+  %nxv2f64 = call double @llvm.experimental.vector.extract.last.active.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> poison, double poison)
+
+  ;; Wider-than-legal
+  %v32i8 = call i8 @llvm.experimental.vector.extract.last.active.v32i8(<32 x i8> poison, <32 x i1> poison, i8 poison)
+  %v16i16 = call i16 @llvm.experimental.vector.extract.last.active.v16i16(<16 x i16> poison, <16 x i1> poison, i16 poison)
+  %v8i32 = call i32 @llvm.experimental.vector.extract.last.active.v8i32(<8 x i32> poison, <8 x i1> poison, i32 poison)
+  %v4i64 = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> poison, <4 x i1> poison, i64 poison)
+  %v16f16 = call half @llvm.experimental.vector.extract.last.active.v16f16(<16 x half> poison, <16 x i1> poison, half poison)
+  %v16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v16bf16(<16 x bfloat> poison, <16 x i1> poison, bfloat poison)
+  %v8f32 = call float @llvm.experimental.vector.extract.last.active.v8f32(<8 x float> poison, <8 x i1> poison, float poison)
+  %v4f64 = call double @llvm.experimental.vector.extract.last.active.v4f64(<4 x double> poison, <4 x i1> poison, double poison)
+  %nxv32i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv32i8(<vscale x 32 x i8> poison, <vscale x 32 x i1> poison, i8 poison)
+  %nxv16i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv16i16(<vscale x 16 x i16> poison, <vscale x 16 x i1> poison, i16 poison)
+  %nxv8i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv8i32(<vscale x 8 x i32> poison, <vscale x 8 x i1> poison, i32 poison)
+  %nxv4i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i1> poison, i64 poison)
+  %nxv16f16 = call half @llvm.experimental.vector.extract.last.active.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x i1> poison, half poison)
+  %nxv16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, bfloat poison)
+  %nxv8f32 = call float @llvm.experimental.vector.extract.last.active.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x i1> poison, float poison)
+  %nxv4f64 = call double @llvm.experimental.vector.extract.last.active.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x i1> poison, double poison)
+
+  ;; Narrower-than-legal
+  %v8i8 = call i8 @llvm.experimental.vector.extract.last.active.v8i8(<8 x i8> poison, <8 x i1> poison, i8 poison)
+  %v4i16 = call i16 @llvm.experimental.vector.extract.last.active.v4i16(<4 x i16> poison, <4 x i1> poison, i16 poison)
+  %v2i32 = call i32 @llvm.experimental.vector.extract.last.active.v2i32(<2 x i32> poison, <2 x i1> poison, i32 poison)
+  %v1i64 = call i64 @llvm.experimental.vector.extract.last.active.v1i64(<1 x i64> poison, <1 x i1> poison, i64 poison)
+  %v4f16 = call half @llvm.experimental.vector.extract.last.active.v4f16(<4 x half> poison, <4 x i1> poison, half poison)
+  %v4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v4bf16(<4 x bfloat> poison, <4 x i1> poison, bfloat poison)
+  %v2f32 = call float @llvm.experimental.vector.extract.last.active.v2f32(<2 x float> poison, <2 x i1> poison, float poison)
+  %v1f64 = call double @llvm.experimental.vector.extract.last.active.v1f64(<1 x double> poison, <1 x i1> poison, double poison)
+  %nxv8i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv8i8(<vscale x 8 x i8> poison, <vscale x 8 x i1> poison, i8 poison)
+  %nxv4i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i1> poison, i16 poison)
+  %nxv2i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv2i32(<vscale x 2 x i32> poison, <vscale x 2 x i1> poison, i32 poison)
+  %nxv1i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv1i64(<vscale x 1 x i64> poison, <vscale x 1 x i1> poison, i64 poison)
+  %nxv4f16 = call half @llvm.experimental.vector.extract.last.active.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x i1> poison, half poison)
+  %nxv4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, bfloat poison)
+  %nxv2f32 = call float @llvm.experimental.vector.extract.last.active.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x i1> poison, float poison)
+  %nxv1f64 = call double @llvm.experimental.vector.extract.last.active.nxv1f64(<vscale x 1 x double> poison, <vscale x 1 x i1> poison, double poison)
+
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AArch64/masked_ldst_vls.ll b/llvm/test/Analysis/CostModel/AArch64/masked_ldst_vls.ll
index fa53a184e317b..1920fc9b4a640 100644
--- a/llvm/test/Analysis/CostModel/AArch64/masked_ldst_vls.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/masked_ldst_vls.ll
@@ -1,17 +1,6 @@
 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=256 | FileCheck %s -D#VBITS=256
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=384 | FileCheck %s -D#VBITS=256
 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=512 | FileCheck %s -D#VBITS=512
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=640 | FileCheck %s -D#VBITS=512
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=768 | FileCheck %s -D#VBITS=512
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=896 | FileCheck %s -D#VBITS=512
 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1024 | FileCheck %s -D#VBITS=1024
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1152 | FileCheck %s -D#VBITS=1024
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1280 | FileCheck %s -D#VBITS=1024
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1408 | FileCheck %s -D#VBITS=1024
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1536 | FileCheck %s -D#VBITS=1024
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1664 | FileCheck %s -D#VBITS=1024
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1792 | FileCheck %s -D#VBITS=1024
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1920 | FileCheck %s -D#VBITS=1024
 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=2048 | FileCheck %s -D#VBITS=2048
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-fixed-length.ll b/llvm/test/Analysis/CostModel/AArch64/sve-fixed-length.ll
index df40a962d0def..e128987c5ab8d 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-fixed-length.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-fixed-length.ll
@@ -1,19 +1,8 @@
 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s -D#VBITS=128
 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=128 | FileCheck %s -D#VBITS=128
 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=256 | FileCheck %s -D#VBITS=256
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=384 | FileCheck %s -D#VBITS=256
 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=512 | FileCheck %s -D#VBITS=512
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=640 | FileCheck %s -D#VBITS=512
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=768 | FileCheck %s -D#VBITS=512
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=896 | FileCheck %s -D#VBITS=512
 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1024 | FileCheck %s -D#VBITS=1024
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1152 | FileCheck %s -D#VBITS=1024
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1280 | FileCheck %s -D#VBITS=1024
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1408 | FileCheck %s -D#VBITS=1024
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1536 | FileCheck %s -D#VBITS=1024
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1664 | FileCheck %s -D#VBITS=1024
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1792 | FileCheck %s -D#VBITS=1024
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1920 | FileCheck %s -D#VBITS=1024
 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=2048 | FileCheck %s -D#VBITS=2048
 
 ; VBITS represents the useful bit size of a vector register from the code
diff --git a/llvm/test/Analysis/DependenceAnalysis/GCD.ll b/llvm/test/Analysis/DependenceAnalysis/GCD.ll
index 03343e7a98211..cb14d189afe4c 100644
--- a/llvm/test/Analysis/DependenceAnalysis/GCD.ll
+++ b/llvm/test/Analysis/DependenceAnalysis/GCD.ll
@@ -254,7 +254,7 @@ define void @gcd4(ptr %A, ptr %B, i64 %M, i64 %N) nounwind uwtable ssp {
 ; CHECK-NEXT:  Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %conv, ptr %arrayidx, align 4
 ; CHECK-NEXT:    da analyze - output [* *]!
 ; CHECK-NEXT:  Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: %0 = load i32, ptr %arrayidx16, align 4
-; CHECK-NEXT:    da analyze - none!
+; CHECK-NEXT:    da analyze - flow [* *|<]!
 ; CHECK-NEXT:  Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %0, ptr %B.addr.11, align 4
 ; CHECK-NEXT:    da analyze - confused!
 ; CHECK-NEXT:  Src: %0 = load i32, ptr %arrayidx16, align 4 --> Dst: %0 = load i32, ptr %arrayidx16, align 4
@@ -322,7 +322,7 @@ define void @gcd5(ptr %A, ptr %B, i64 %M, i64 %N) nounwind uwtable ssp {
 ; CHECK-NEXT:  Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %conv, ptr %arrayidx, align 4
 ; CHECK-NEXT:    da analyze - output [* *]!
 ; CHECK-NEXT:  Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: %0 = load i32, ptr %arrayidx16, align 4
-; CHECK-NEXT:    da analyze - flow [<> *]!
+; CHECK-NEXT:    da analyze - flow [* *|<]!
 ; CHECK-NEXT:  Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %0, ptr %B.addr.11, align 4
 ; CHECK-NEXT:    da analyze - confused!
 ; CHECK-NEXT:  Src: %0 = load i32, ptr %arrayidx16, align 4 --> Dst: %0 = load i32, ptr %arrayidx16, align 4
@@ -390,7 +390,7 @@ define void @gcd6(i64 %n, ptr %A, ptr %B) nounwind uwtable ssp {
 ; CHECK-NEXT:  Src: store i32 %conv, ptr %arrayidx5, align 4 --> Dst: store i32 %conv, ptr %arrayidx5, align 4
 ; CHECK-NEXT:    da analyze - output [* *]!
 ; CHECK-NEXT:  Src: store i32 %conv, ptr %arrayidx5, align 4 --> Dst: %2 = load i32, ptr %arrayidx9, align 4
-; CHECK-NEXT:    da analyze - none!
+; CHECK-NEXT:    da analyze - flow [* *|<]!
 ; CHECK-NEXT:  Src: store i32 %conv, ptr %arrayidx5, align 4 --> Dst: store i32 %2, ptr %B.addr.12, align 4
 ; CHECK-NEXT:    da analyze - confused!
 ; CHECK-NEXT:  Src: %2 = load i32, ptr %arrayidx9, align 4 --> Dst: %2 = load i32, ptr %arrayidx9, align 4
diff --git a/llvm/test/Analysis/DependenceAnalysis/SymbolicSIV.ll b/llvm/test/Analysis/DependenceAnalysis/SymbolicSIV.ll
index cdfaec76fa892..73a415baef4c4 100644
--- a/llvm/test/Analysis/DependenceAnalysis/SymbolicSIV.ll
+++ b/llvm/test/Analysis/DependenceAnalysis/SymbolicSIV.ll
@@ -384,7 +384,7 @@ define void @symbolicsiv6(ptr %A, ptr %B, i64 %n, i64 %N, i64 %M) nounwind uwtab
 ; CHECK-NEXT:  Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %conv, ptr %arrayidx, align 4
 ; CHECK-NEXT:    da analyze - none!
 ; CHECK-NEXT:  Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: %0 = load i32, ptr %arrayidx7, align 4
-; CHECK-NEXT:    da analyze - none!
+; CHECK-NEXT:    da analyze - flow [*|<]!
 ; CHECK-NEXT:  Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4
 ; CHECK-NEXT:    da analyze - confused!
 ; CHECK-NEXT:  Src: %0 = load i32, ptr %arrayidx7, align 4 --> Dst: %0 = load i32, ptr %arrayidx7, align 4
@@ -440,7 +440,7 @@ define void @symbolicsiv7(ptr %A, ptr %B, i64 %n, i64 %N, i64 %M) nounwind uwtab
 ; CHECK-NEXT:  Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %conv, ptr %arrayidx, align 4
 ; CHECK-NEXT:    da analyze - none!
 ; CHECK-NEXT:  Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: %1 = load i32, ptr %arrayidx6, align 4
-; CHECK-NEXT:    da analyze - flow [<>]!
+; CHECK-NEXT:    da analyze - flow [*|<]!
 ; CHECK-NEXT:  Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %1, ptr %B.addr.02, align 4
 ; CHECK-NEXT:    da analyze - confused!
 ; CHECK-NEXT:  Src: %1 = load i32, ptr %arrayidx6, align 4 --> Dst: %1 = load i32, ptr %arrayidx6, align 4
diff --git a/llvm/test/Analysis/DependenceAnalysis/compute-absolute-value.ll b/llvm/test/Analysis/DependenceAnalysis/compute-absolute-value.ll
index 64fad37ab699a..783150af2cd13 100644
--- a/llvm/test/Analysis/DependenceAnalysis/compute-absolute-value.ll
+++ b/llvm/test/Analysis/DependenceAnalysis/compute-absolute-value.ll
@@ -18,7 +18,7 @@ define void @unknown_sign(ptr %a, i64 %k) {
 ; CHECK-NEXT:  Src: store i8 1, ptr %idx.0, align 1 --> Dst: store i8 1, ptr %idx.0, align 1
 ; CHECK-NEXT:    da analyze - none!
 ; CHECK-NEXT:  Src: store i8 1, ptr %idx.0, align 1 --> Dst: store i8 2, ptr %idx.1, align 1
-; CHECK-NEXT:    da analyze - output [<>]!
+; CHECK-NEXT:    da analyze - output [*|<]!
 ; CHECK-NEXT:  Src: store i8 2, ptr %idx.1, align 1 --> Dst: store i8 2, ptr %idx.1, align 1
 ; CHECK-NEXT:    da analyze - none!
 ;
diff --git a/llvm/test/Analysis/DependenceAnalysis/gcd-miv-overflow.ll b/llvm/test/Analysis/DependenceAnalysis/gcd-miv-overflow.ll
new file mode 100644
index 0000000000000..9169ac323d834
--- /dev/null
+++ b/llvm/test/Analysis/DependenceAnalysis/gcd-miv-overflow.ll
@@ -0,0 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -disable-output "-passes=print<da>" 2>&1 \
+; RUN:     | FileCheck %s --check-prefixes=CHECK,CHECK-ALL
+; RUN: opt < %s -disable-output "-passes=print<da>" -da-enable-dependence-test=gcd-miv 2>&1 \
+; RUN:     | FileCheck %s --check-prefixes=CHECK,CHECK-GCD-MIV
+
+; offset0 = 4;
+; offset1 = 0;
+; for (i = 0; i < 100; i++) {
+;   A[offset0] = 1;
+;   A[offset1] = 2;
+;   offset0 += 3*m;
+;   offset1 += 3;
+; }
+;
+; Dependency exists between the two stores. E.g., consider `m` is
+; 12297829382473034411, which is a modular multiplicative inverse of 3 under
+; modulo 2^64. Then `offset0` is effectively `i + 4`, so accesses will be as
+; follows:
+;
+;   - A[offset0] : A[4], A[5], A[6], ...
+;   - A[offset1] : A[0], A[3], A[6], ...
+;
+define void @gcdmiv_coef_ovfl(ptr %A, i64 %m) {
+; CHECK-ALL-LABEL: 'gcdmiv_coef_ovfl'
+; CHECK-ALL-NEXT:  Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 1, ptr %gep.0, align 1
+; CHECK-ALL-NEXT:    da analyze - none!
+; CHECK-ALL-NEXT:  Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 2, ptr %gep.1, align 1
+; CHECK-ALL-NEXT:    da analyze - output [*|<]!
+; CHECK-ALL-NEXT:  Src: store i8 2, ptr %gep.1, align 1 --> Dst: store i8 2, ptr %gep.1, align 1
+; CHECK-ALL-NEXT:    da analyze - none!
+;
+; CHECK-GCD-MIV-LABEL: 'gcdmiv_coef_ovfl'
+; CHECK-GCD-MIV-NEXT:  Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 1, ptr %gep.0, align 1
+; CHECK-GCD-MIV-NEXT:    da analyze - consistent output [*]!
+; CHECK-GCD-MIV-NEXT:  Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 2, ptr %gep.1, align 1
+; CHECK-GCD-MIV-NEXT:    da analyze - consistent output [*|<]!
+; CHECK-GCD-MIV-NEXT:  Src: store i8 2, ptr %gep.1, align 1 --> Dst: store i8 2, ptr %gep.1, align 1
+; CHECK-GCD-MIV-NEXT:    da analyze - consistent output [*]!
+;
+entry:
+  %step = mul i64 3, %m
+  br label %loop
+
+loop:
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %loop ]
+  %offset.0 = phi i64 [ 4, %entry ] , [ %offset.0.next, %loop ]
+  %offset.1 = phi i64 [ 0, %entry ] , [ %offset.1.next, %loop ]
+  %gep.0 = getelementptr inbounds i8, ptr %A, i64 %offset.0
+  %gep.1 = getelementptr inbounds i8, ptr %A, i64 %offset.1
+  store i8 1, ptr %gep.0
+  store i8 2, ptr %gep.1
+  %i.inc = add nuw nsw i64 %i, 1
+  %offset.0.next = add nsw i64 %offset.0, %step
+  %offset.1.next = add nsw i64 %offset.1, 3
+  %ec = icmp eq i64 %i.inc, 100
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/Analysis/DependenceAnalysis/strong-siv-overflow.ll b/llvm/test/Analysis/DependenceAnalysis/strong-siv-overflow.ll
new file mode 100644
index 0000000000000..bf0fafcbfd6c9
--- /dev/null
+++ b/llvm/test/Analysis/DependenceAnalysis/strong-siv-overflow.ll
@@ -0,0 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -disable-output "-passes=print<da>" 2>&1 \
+; RUN:     | FileCheck %s --check-prefixes=CHECK,CHECK-ALL
+; RUN: opt < %s -disable-output "-passes=print<da>" -da-enable-dependence-test=strong-siv 2>&1 \
+; RUN:     | FileCheck %s --check-prefixes=CHECK,CHECK-STRONG-SIV
+
+; for (i = 0; i < (1LL << 62); i++) {
+;   if (0 <= 2*i - 2)
+;     A[2*i - 2] = 1;
+;
+;   if (0 <= 2*i - 4)
+;     A[2*i - 4] = 2;
+; }
+;
+; FIXME: DependenceAnalysis currently detects no dependency between the two
+; stores, but it does exist. For example, each store will access A[0] when i
+; is 1 and 2 respectively.
+; The root cause is that the product of the BTC and the coefficient
+; ((1LL << 62) - 1 and 2) overflows in a signed sense.
+define void @strongsiv_const_ovfl(ptr %A) {
+; CHECK-LABEL: 'strongsiv_const_ovfl'
+; CHECK-NEXT:  Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 1, ptr %gep.0, align 1
+; CHECK-NEXT:    da analyze - none!
+; CHECK-NEXT:  Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 2, ptr %gep.1, align 1
+; CHECK-NEXT:    da analyze - none!
+; CHECK-NEXT:  Src: store i8 2, ptr %gep.1, align 1 --> Dst: store i8 2, ptr %gep.1, align 1
+; CHECK-NEXT:    da analyze - none!
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.latch ]
+  %offset.0 = phi i64 [ -2, %entry ], [ %offset.0.next, %loop.latch ]
+  %offset.1 = phi i64 [ -4, %entry ], [ %offset.1.next, %loop.latch ]
+  %ec = icmp eq i64 %i, 4611686018427387904
+  br i1 %ec, label %exit, label %loop.body
+
+loop.body:
+  %cond.0 = icmp sge i64 %offset.0, 0
+  %cond.1 = icmp sge i64 %offset.1, 0
+  br i1 %cond.0, label %if.then.0, label %loop.middle
+
+if.then.0:
+  %gep.0 = getelementptr inbounds i8, ptr %A, i64 %offset.0
+  store i8 1, ptr %gep.0
+  br label %loop.middle
+
+loop.middle:
+  br i1 %cond.1, label %if.then.1, label %loop.latch
+
+if.then.1:
+  %gep.1 = getelementptr inbounds i8, ptr %A, i64 %offset.1
+  store i8 2, ptr %gep.1
+  br label %loop.latch
+
+loop.latch:
+  %i.inc = add nuw nsw i64 %i, 1
+  %offset.0.next = add nsw i64 %offset.0, 2
+  %offset.1.next = add nsw i64 %offset.1, 2
+  br label %loop.header
+
+exit:
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-ALL: {{.*}}
+; CHECK-STRONG-SIV: {{.*}}
diff --git a/llvm/test/Analysis/DependenceAnalysis/symbolic-rdiv-overflow.ll b/llvm/test/Analysis/DependenceAnalysis/symbolic-rdiv-overflow.ll
new file mode 100644
index 0000000000000..c5ff9884a0c62
--- /dev/null
+++ b/llvm/test/Analysis/DependenceAnalysis/symbolic-rdiv-overflow.ll
@@ -0,0 +1,137 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -disable-output "-passes=print<da>" 2>&1 \
+; RUN:     | FileCheck %s --check-prefixes=CHECK,CHECK-ALL
+; RUN: opt < %s -disable-output "-passes=print<da>" -da-enable-dependence-test=symbolic-rdiv 2>&1 \
+; RUN:     | FileCheck %s --check-prefixes=CHECK,CHECK-SYMBOLIC-RDIV
+
+; for (i = 0; i < (1LL << 62); i++) {
+;   if (0 <= 2*i - 2)
+;     A[2*i - 2] = 1;
+;   A[i] = 2;
+; }
+;
+; FIXME: DependenceAnalysis currently detects no dependency between the two
+; stores, but it does exist. For example, each store will access A[0] when i
+; is 1 and 0 respectively.
+; The root cause is that the product of the BTC and the coefficient 
+; ((1LL << 62) - 1 and 2) overflows in a signed sense.
+define void @symbolicrdiv_prod_ovfl(ptr %A) {
+; CHECK-ALL-LABEL: 'symbolicrdiv_prod_ovfl'
+; CHECK-ALL-NEXT:  Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 1, ptr %gep.0, align 1
+; CHECK-ALL-NEXT:    da analyze - none!
+; CHECK-ALL-NEXT:  Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 2, ptr %gep.1, align 1
+; CHECK-ALL-NEXT:    da analyze - none!
+; CHECK-ALL-NEXT:  Src: store i8 2, ptr %gep.1, align 1 --> Dst: store i8 2, ptr %gep.1, align 1
+; CHECK-ALL-NEXT:    da analyze - none!
+;
+; CHECK-SYMBOLIC-RDIV-LABEL: 'symbolicrdiv_prod_ovfl'
+; CHECK-SYMBOLIC-RDIV-NEXT:  Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 1, ptr %gep.0, align 1
+; CHECK-SYMBOLIC-RDIV-NEXT:    da analyze - none!
+; CHECK-SYMBOLIC-RDIV-NEXT:  Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 2, ptr %gep.1, align 1
+; CHECK-SYMBOLIC-RDIV-NEXT:    da analyze - none!
+; CHECK-SYMBOLIC-RDIV-NEXT:  Src: store i8 2, ptr %gep.1, align 1 --> Dst: store i8 2, ptr %gep.1, align 1
+; CHECK-SYMBOLIC-RDIV-NEXT:    da analyze - consistent output [*]!
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.latch ]
+  %offset = phi i64 [ -2, %entry ], [ %offset.next, %loop.latch ]
+  %ec = icmp eq i64 %i, 4611686018427387904
+  br i1 %ec, label %exit, label %loop.body
+
+loop.body:
+  %cond = icmp sge i64 %offset, 0
+  br i1 %cond, label %if.then, label %loop.latch
+
+if.then:
+  %gep.0 = getelementptr inbounds i8, ptr %A, i64 %offset
+  store i8 1, ptr %gep.0
+  br label %loop.latch
+
+loop.latch:
+  %gep.1 = getelementptr inbounds i8, ptr %A, i64 %i
+  store i8 2, ptr %gep.1
+  %i.inc = add nuw nsw i64 %i, 1
+  %offset.next = add nsw i64 %offset, 2
+  br label %loop.header
+
+exit:
+  ret void
+}
+
+; offset0 = -4611686018427387904;  // -2^62
+; offset1 =  4611686018427387904;  // 2^62
+; for (i = 0; i < (1LL << 62) - 100; i++) {
+;   if (0 <= offset0)
+;     A[offset0] = 1;
+;   if (0 <= offset1)
+;     A[offset1] = 2;
+;   offset0 += 2;
+;   offset1 -= 1;
+; }
+;
+; FIXME: DependenceAnalysis currently detects no dependency between the two
+; stores, but it does exist. For example,
+;
+;  memory access           | i == 2^61 | i == 2^61 + 2^59 | i == 2^61 + 2^60  
+; -------------------------|-----------|------------------|-------------------
+;  A[2*i - 2^62] (offset0) |           | A[2^60]          | A[2^61]           
+;  A[-i + 2^62]  (offset1) | A[2^61]   |                  | A[2^60]           
+;
+; The root cause is that the calculation of the differenct between the two
+; constants (-2^62 and 2^62) overflows in a signed sense.
+define void @symbolicrdiv_delta_ovfl(ptr %A) {
+; CHECK-ALL-LABEL: 'symbolicrdiv_delta_ovfl'
+; CHECK-ALL-NEXT:  Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 1, ptr %gep.0, align 1
+; CHECK-ALL-NEXT:    da analyze - none!
+; CHECK-ALL-NEXT:  Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 2, ptr %gep.1, align 1
+; CHECK-ALL-NEXT:    da analyze - none!
+; CHECK-ALL-NEXT:  Src: store i8 2, ptr %gep.1, align 1 --> Dst: store i8 2, ptr %gep.1, align 1
+; CHECK-ALL-NEXT:    da analyze - none!
+;
+; CHECK-SYMBOLIC-RDIV-LABEL: 'symbolicrdiv_delta_ovfl'
+; CHECK-SYMBOLIC-RDIV-NEXT:  Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 1, ptr %gep.0, align 1
+; CHECK-SYMBOLIC-RDIV-NEXT:    da analyze - consistent output [*]!
+; CHECK-SYMBOLIC-RDIV-NEXT:  Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 2, ptr %gep.1, align 1
+; CHECK-SYMBOLIC-RDIV-NEXT:    da analyze - none!
+; CHECK-SYMBOLIC-RDIV-NEXT:  Src: store i8 2, ptr %gep.1, align 1 --> Dst: store i8 2, ptr %gep.1, align 1
+; CHECK-SYMBOLIC-RDIV-NEXT:    da analyze - consistent output [*]!
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.latch ]
+  %offset.0 = phi i64 [ -4611686018427387904, %entry ], [ %offset.0.next, %loop.latch ]
+  %offset.1 = phi i64 [ 4611686018427387904, %entry ], [ %offset.1.next, %loop.latch ]
+  %cond.0 = icmp sge i64 %offset.0, 0
+  %cond.1 = icmp sge i64 %offset.1, 0
+  br i1 %cond.0, label %if.then.0, label %loop.middle
+
+if.then.0:
+  %gep.0 = getelementptr inbounds i8, ptr %A, i64 %offset.0
+  store i8 1, ptr %gep.0
+  br label %loop.middle
+
+loop.middle:
+  br i1 %cond.1, label %if.then.1, label %loop.latch
+
+if.then.1:
+  %gep.1 = getelementptr inbounds i8, ptr %A, i64 %offset.1
+  store i8 2, ptr %gep.1
+  br label %loop.latch
+
+loop.latch:
+  %i.inc = add nuw nsw i64 %i, 1
+  %offset.0.next = add nsw i64 %offset.0, 2
+  %offset.1.next = sub nsw i64 %offset.1, 1
+  %ec = icmp eq i64 %i.inc, 4611686018427387804 ; 2^62 - 100
+  br i1 %ec, label %exit, label %loop.header
+
+exit:
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/Analysis/DependenceAnalysis/weak-crossing-siv-overflow.ll b/llvm/test/Analysis/DependenceAnalysis/weak-crossing-siv-overflow.ll
new file mode 100644
index 0000000000000..ba57c7bf5736a
--- /dev/null
+++ b/llvm/test/Analysis/DependenceAnalysis/weak-crossing-siv-overflow.ll
@@ -0,0 +1,125 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -disable-output "-passes=print<da>" 2>&1 \
+; RUN:     | FileCheck %s --check-prefixes=CHECK,CHECK-ALL
+; RUN: opt < %s -disable-output "-passes=print<da>" -da-enable-dependence-test=weak-crossing-siv 2>&1 \
+; RUN:     | FileCheck %s --check-prefixes=CHECK,CHECK-WEAK-CROSSING-SIV
+
+; max_i = INT64_MAX/3  // 3074457345618258602
+; for (long long i = 0; i <= max_i; i++) {
+;   A[-3*i + INT64_MAX] = 0;
+;   if (i)
+;     A[3*i - 2] = 1;
+; }
+;
+; FIXME: DependenceAnalysis currently detects no dependency between
+; `A[-3*i + INT64_MAX]` and `A[3*i - 2]`, but it does exist. For example,
+;
+;  memory access       | i == 1           | i == max_i
+; ---------------------|------------------|------------------
+;  A[-3*i + INT64_MAX] | A[INT64_MAX - 3] | A[1]
+;  A[3*i - 2]          | A[1]             | A[INT64_MAX - 3]
+;
+; The root cause is that the calculation of the differenct between the two
+; constants (INT64_MAX and -2) triggers an overflow.
+
+define void @weakcorssing_delta_ovfl(ptr %A) {
+; CHECK-ALL-LABEL: 'weakcorssing_delta_ovfl'
+; CHECK-ALL-NEXT:  Src: store i8 0, ptr %idx.0, align 1 --> Dst: store i8 0, ptr %idx.0, align 1
+; CHECK-ALL-NEXT:    da analyze - none!
+; CHECK-ALL-NEXT:  Src: store i8 0, ptr %idx.0, align 1 --> Dst: store i8 1, ptr %idx.1, align 1
+; CHECK-ALL-NEXT:    da analyze - none!
+; CHECK-ALL-NEXT:  Src: store i8 1, ptr %idx.1, align 1 --> Dst: store i8 1, ptr %idx.1, align 1
+; CHECK-ALL-NEXT:    da analyze - none!
+;
+; CHECK-WEAK-CROSSING-SIV-LABEL: 'weakcorssing_delta_ovfl'
+; CHECK-WEAK-CROSSING-SIV-NEXT:  Src: store i8 0, ptr %idx.0, align 1 --> Dst: store i8 0, ptr %idx.0, align 1
+; CHECK-WEAK-CROSSING-SIV-NEXT:    da analyze - consistent output [*]!
+; CHECK-WEAK-CROSSING-SIV-NEXT:  Src: store i8 0, ptr %idx.0, align 1 --> Dst: store i8 1, ptr %idx.1, align 1
+; CHECK-WEAK-CROSSING-SIV-NEXT:    da analyze - none!
+; CHECK-WEAK-CROSSING-SIV-NEXT:  Src: store i8 1, ptr %idx.1, align 1 --> Dst: store i8 1, ptr %idx.1, align 1
+; CHECK-WEAK-CROSSING-SIV-NEXT:    da analyze - consistent output [*]!
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.latch ]
+  %subscript.0 = phi i64 [ 9223372036854775807, %entry ], [ %subscript.0.next, %loop.latch ]
+  %subscript.1 = phi i64 [ -2, %entry ], [ %subscript.1.next, %loop.latch ]
+  %idx.0 = getelementptr inbounds i8, ptr %A, i64 %subscript.0
+  store i8 0, ptr %idx.0
+  %cond.store = icmp ne i64 %i, 0
+  br i1 %cond.store, label %if.store, label %loop.latch
+
+if.store:
+  %idx.1 = getelementptr inbounds i8, ptr %A, i64 %subscript.1
+  store i8 1, ptr %idx.1
+  br label %loop.latch
+
+loop.latch:
+  %i.inc = add nuw nsw i64 %i, 1
+  %subscript.0.next = add nsw i64 %subscript.0, -3
+  %subscript.1.next = add nsw i64 %subscript.1, 3
+  %ec = icmp sgt i64 %i.inc, 3074457345618258602
+  br i1 %ec, label %exit, label %loop.header
+
+exit:
+  ret void
+}
+
+; max_i = INT64_MAX/3  // 3074457345618258602
+; for (long long i = 0; i <= max_i; i++) {
+;   A[-3*i + INT64_MAX] = 0;
+;   A[3*i + 1] = 1;
+; }
+;
+; FIXME: DependenceAnalysis currently detects no dependency between
+; `A[-3*i + INT64_MAX]` and `A[3*i - 2]`, but it does exist. For example,
+;
+;  memory access       | i == 0 | i == 1           | i == max_i - 1 | i == max_i
+; ---------------------|--------|------------------|----------------|------------------
+;  A[-3*i + INT64_MAX] |        | A[INT64_MAX - 3] | A[1]           |
+;  A[3*i + 1]          | A[1]   |                  |                | A[INT64_MAX - 3]
+;
+; The root cause is that the product of the BTC, the coefficient, and 2
+; triggers an overflow.
+;
+define void @weakcorssing_prod_ovfl(ptr %A) {
+; CHECK-ALL-LABEL: 'weakcorssing_prod_ovfl'
+; CHECK-ALL-NEXT:  Src: store i8 0, ptr %idx.0, align 1 --> Dst: store i8 0, ptr %idx.0, align 1
+; CHECK-ALL-NEXT:    da analyze - none!
+; CHECK-ALL-NEXT:  Src: store i8 0, ptr %idx.0, align 1 --> Dst: store i8 1, ptr %idx.1, align 1
+; CHECK-ALL-NEXT:    da analyze - none!
+; CHECK-ALL-NEXT:  Src: store i8 1, ptr %idx.1, align 1 --> Dst: store i8 1, ptr %idx.1, align 1
+; CHECK-ALL-NEXT:    da analyze - none!
+;
+; CHECK-WEAK-CROSSING-SIV-LABEL: 'weakcorssing_prod_ovfl'
+; CHECK-WEAK-CROSSING-SIV-NEXT:  Src: store i8 0, ptr %idx.0, align 1 --> Dst: store i8 0, ptr %idx.0, align 1
+; CHECK-WEAK-CROSSING-SIV-NEXT:    da analyze - consistent output [*]!
+; CHECK-WEAK-CROSSING-SIV-NEXT:  Src: store i8 0, ptr %idx.0, align 1 --> Dst: store i8 1, ptr %idx.1, align 1
+; CHECK-WEAK-CROSSING-SIV-NEXT:    da analyze - none!
+; CHECK-WEAK-CROSSING-SIV-NEXT:  Src: store i8 1, ptr %idx.1, align 1 --> Dst: store i8 1, ptr %idx.1, align 1
+; CHECK-WEAK-CROSSING-SIV-NEXT:    da analyze - consistent output [*]!
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %loop ]
+  %subscript.0 = phi i64 [ 9223372036854775807, %entry ], [ %subscript.0.next, %loop ]
+  %subscript.1 = phi i64 [ 1, %entry ], [ %subscript.1.next, %loop ]
+  %idx.0 = getelementptr inbounds i8, ptr %A, i64 %subscript.0
+  %idx.1 = getelementptr inbounds i8, ptr %A, i64 %subscript.1
+  store i8 0, ptr %idx.0
+  store i8 1, ptr %idx.1
+  %i.inc = add nuw nsw i64 %i, 1
+  %subscript.0.next = add nsw i64 %subscript.0, -3
+  %subscript.1.next = add nsw i64 %subscript.1, 3
+  %ec = icmp sgt i64 %i.inc, 3074457345618258602
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/Analysis/DependenceAnalysis/weak-zero-siv-overflow.ll b/llvm/test/Analysis/DependenceAnalysis/weak-zero-siv-overflow.ll
new file mode 100644
index 0000000000000..6317c387858d3
--- /dev/null
+++ b/llvm/test/Analysis/DependenceAnalysis/weak-zero-siv-overflow.ll
@@ -0,0 +1,122 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -disable-output "-passes=print<da>" 2>&1 \
+; RUN:     | FileCheck %s --check-prefixes=CHECK,CHECK-ALL
+; RUN: opt < %s -disable-output "-passes=print<da>" -da-enable-dependence-test=weak-zero-siv 2>&1 \
+; RUN:     | FileCheck %s --check-prefixes=CHECK,CHECK-WEAK-ZERO-SIV
+
+; for (i = 0; i < (1LL << 62); i++) {
+;   if (0 <= 2*i - 2)
+;     A[2*i - 2] = 1;
+;   A[2] = 2;
+; }
+;
+; FIXME: DependenceAnalysis currently detects no dependency between the two
+; stores, but it does exist. The root cause is that the product of the BTC and
+; the coefficient ((1LL << 62) - 1 and 2) overflows in a signed sense.
+;
+define void @weakzero_dst_siv_prod_ovfl(ptr %A) {
+; CHECK-ALL-LABEL: 'weakzero_dst_siv_prod_ovfl'
+; CHECK-ALL-NEXT:  Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 1, ptr %gep.0, align 1
+; CHECK-ALL-NEXT:    da analyze - none!
+; CHECK-ALL-NEXT:  Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 2, ptr %gep.1, align 1
+; CHECK-ALL-NEXT:    da analyze - none!
+; CHECK-ALL-NEXT:  Src: store i8 2, ptr %gep.1, align 1 --> Dst: store i8 2, ptr %gep.1, align 1
+; CHECK-ALL-NEXT:    da analyze - consistent output [S]!
+;
+; CHECK-WEAK-ZERO-SIV-LABEL: 'weakzero_dst_siv_prod_ovfl'
+; CHECK-WEAK-ZERO-SIV-NEXT:  Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 1, ptr %gep.0, align 1
+; CHECK-WEAK-ZERO-SIV-NEXT:    da analyze - consistent output [*]!
+; CHECK-WEAK-ZERO-SIV-NEXT:  Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 2, ptr %gep.1, align 1
+; CHECK-WEAK-ZERO-SIV-NEXT:    da analyze - none!
+; CHECK-WEAK-ZERO-SIV-NEXT:  Src: store i8 2, ptr %gep.1, align 1 --> Dst: store i8 2, ptr %gep.1, align 1
+; CHECK-WEAK-ZERO-SIV-NEXT:    da analyze - consistent output [S]!
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.latch ]
+  %offset = phi i64 [ -2, %entry ], [ %offset.next, %loop.latch ]
+  %ec = icmp eq i64 %i, 4611686018427387904
+  br i1 %ec, label %exit, label %loop.body
+
+loop.body:
+  %cond = icmp sge i64 %offset, 0
+  br i1 %cond, label %if.then, label %loop.latch
+
+if.then:
+  %gep.0 = getelementptr inbounds i8, ptr %A, i64 %offset
+  store i8 1, ptr %gep.0
+  br label %loop.latch
+
+loop.latch:
+  %gep.1 = getelementptr inbounds i8, ptr %A, i64 2
+  store i8 2, ptr %gep.1
+  %i.inc = add nuw nsw i64 %i, 1
+  %offset.next = add nsw i64 %offset, 2
+  br label %loop.header
+
+exit:
+  ret void
+}
+
+; for (i = 0; i < n; i++) {
+;   if (0 <= 2*i - 1)
+;     A[2*i - 1] = 1;
+;   A[INT64_MAX] = 2;
+; }
+;
+; FIXME: DependenceAnalysis currently detects no dependency between the two
+; stores, but it does exist. When `%n` is 2^62, the value of `%offset` will be
+; the same as INT64_MAX at the last iteration.
+; The root cause is that the calculation of the difference between the two
+; constants (INT64_MAX and -1) overflows in a signed sense.
+;
+define void @weakzero_dst_siv_delta_ovfl(ptr %A, i64 %n) {
+; CHECK-ALL-LABEL: 'weakzero_dst_siv_delta_ovfl'
+; CHECK-ALL-NEXT:  Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 1, ptr %gep.0, align 1
+; CHECK-ALL-NEXT:    da analyze - none!
+; CHECK-ALL-NEXT:  Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 2, ptr %gep.1, align 1
+; CHECK-ALL-NEXT:    da analyze - none!
+; CHECK-ALL-NEXT:  Src: store i8 2, ptr %gep.1, align 1 --> Dst: store i8 2, ptr %gep.1, align 1
+; CHECK-ALL-NEXT:    da analyze - consistent output [S]!
+;
+; CHECK-WEAK-ZERO-SIV-LABEL: 'weakzero_dst_siv_delta_ovfl'
+; CHECK-WEAK-ZERO-SIV-NEXT:  Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 1, ptr %gep.0, align 1
+; CHECK-WEAK-ZERO-SIV-NEXT:    da analyze - consistent output [*]!
+; CHECK-WEAK-ZERO-SIV-NEXT:  Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 2, ptr %gep.1, align 1
+; CHECK-WEAK-ZERO-SIV-NEXT:    da analyze - none!
+; CHECK-WEAK-ZERO-SIV-NEXT:  Src: store i8 2, ptr %gep.1, align 1 --> Dst: store i8 2, ptr %gep.1, align 1
+; CHECK-WEAK-ZERO-SIV-NEXT:    da analyze - consistent output [S]!
+;
+entry:
+  %guard = icmp sgt i64 %n, 0
+  br i1 %guard, label %loop.header, label %exit
+
+loop.header:
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.latch ]
+  %offset = phi i64 [ -2, %entry ], [ %offset.next, %loop.latch ]
+  %ec = icmp eq i64 %i, %n
+  br i1 %ec, label %exit, label %loop.body
+
+loop.body:
+  %cond = icmp sge i64 %offset, 0
+  br i1 %cond, label %if.then, label %loop.latch
+
+if.then:
+  %gep.0 = getelementptr inbounds i8, ptr %A, i64 %offset
+  store i8 1, ptr %gep.0
+  br label %loop.latch
+
+loop.latch:
+  %gep.1 = getelementptr inbounds i8, ptr %A, i64 9223372036854775807
+  store i8 2, ptr %gep.1
+  %i.inc = add nuw nsw i64 %i, 1
+  %offset.next = add nsw i64 %offset, 2
+  br label %loop.header
+
+exit:
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/Analysis/HashRecognize/cyclic-redundancy-check.ll b/llvm/test/Analysis/HashRecognize/cyclic-redundancy-check.ll
index 7dec2f8f96906..78b4139d21982 100644
--- a/llvm/test/Analysis/HashRecognize/cyclic-redundancy-check.ll
+++ b/llvm/test/Analysis/HashRecognize/cyclic-redundancy-check.ll
@@ -1448,4 +1448,85 @@ exit:                                              ; preds = %loop
   ret i16 %crc.next
 }
 
+define i16 @not.crc.data.next.outside.user(i16 %crc.init, i16 %data.init) {
+; CHECK-LABEL: 'not.crc.data.next.outside.user'
+; CHECK-NEXT:  Did not find a hash algorithm
+; CHECK-NEXT:  Reason: Recurrences have stray uses
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %crc = phi i16 [ %crc.init, %entry ], [ %crc.next, %loop ]
+  %data = phi i16 [ %data.init, %entry ], [ %data.next, %loop ]
+  %xor.crc.data = xor i16 %data, %crc
+  %crc.shl = shl i16 %crc, 1
+  %crc.xor = xor i16 %crc.shl, 3
+  %check.sb = icmp slt i16 %xor.crc.data, 0
+  %crc.next = select i1 %check.sb, i16 %crc.xor, i16 %crc.shl
+  %data.next = shl i16 %data, 1
+  %iv.next = add nuw nsw i32 %iv, 1
+  %exit.cond = icmp samesign ult i32 %iv, 7
+  br i1 %exit.cond, label %loop, label %exit
+
+exit:
+  %ret = xor i16 %data.next, %crc.next
+  ret i16 %ret
+}
+
+define i16 @not.crc.data.phi.outside.user(i16 %crc.init, i16 %data.init) {
+; CHECK-LABEL: 'not.crc.data.phi.outside.user'
+; CHECK-NEXT:  Did not find a hash algorithm
+; CHECK-NEXT:  Reason: Recurrences have stray uses
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %crc = phi i16 [ %crc.init, %entry ], [ %crc.next, %loop ]
+  %data = phi i16 [ %data.init, %entry ], [ %data.next, %loop ]
+  %xor.crc.data = xor i16 %data, %crc
+  %crc.shl = shl i16 %crc, 1
+  %crc.xor = xor i16 %crc.shl, 3
+  %check.sb = icmp slt i16 %xor.crc.data, 0
+  %crc.next = select i1 %check.sb, i16 %crc.xor, i16 %crc.shl
+  %data.next = shl i16 %data, 1
+  %iv.next = add nuw nsw i32 %iv, 1
+  %exit.cond = icmp samesign ult i32 %iv, 7
+  br i1 %exit.cond, label %loop, label %exit
+
+exit:
+  %ret = xor i16 %data, %crc.next
+  ret i16 %ret
+}
+
+define i16 @not.crc.crc.phi.outside.user(i16 %crc.init, i16 %data.init) {
+; CHECK-LABEL: 'not.crc.crc.phi.outside.user'
+; CHECK-NEXT:  Did not find a hash algorithm
+; CHECK-NEXT:  Reason: Recurrences have stray uses
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %crc = phi i16 [ %crc.init, %entry ], [ %crc.next, %loop ]
+  %data = phi i16 [ %data.init, %entry ], [ %data.next, %loop ]
+  %xor.crc.data = xor i16 %data, %crc
+  %crc.shl = shl i16 %crc, 1
+  %crc.xor = xor i16 %crc.shl, 3
+  %check.sb = icmp slt i16 %xor.crc.data, 0
+  %crc.next = select i1 %check.sb, i16 %crc.xor, i16 %crc.shl
+  %data.next = shl i16 %data, 1
+  %iv.next = add nuw nsw i32 %iv, 1
+  %exit.cond = icmp samesign ult i32 %iv, 7
+  br i1 %exit.cond, label %loop, label %exit
+
+exit:
+  %ret = xor i16 %crc, %crc.next
+  ret i16 %ret
+}
+
 declare i16 @side.effect()
diff --git a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
index 362586af4f9b7..4fc506f1f5edf 100644
--- a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
+++ b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
@@ -87,6 +87,11 @@ declare void @llvm.nvvm.barrier(i32, i32)
 declare void @llvm.nvvm.barrier.sync(i32)
 declare void @llvm.nvvm.barrier.sync.cnt(i32, i32)
 
+declare float @llvm.nvvm.ex2.approx.f(float)
+declare double @llvm.nvvm.ex2.approx.d(double)
+declare <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half>)
+declare float @llvm.nvvm.ex2.approx.ftz.f(float)
+
 ; CHECK-LABEL: @simple_upgrade
 define void @simple_upgrade(i32 %a, i64 %b, i16 %c) {
 ; CHECK: call i32 @llvm.bitreverse.i32(i32 %a)
@@ -355,3 +360,15 @@ define void @cta_barriers(i32 %x, i32 %y) {
   call void @llvm.nvvm.barrier.sync.cnt(i32 %x, i32 %y)
   ret void
 }
+
+define void @nvvm_ex2_approx(float %a, double %b, half %c, <2 x half> %d) {
+; CHECK: call float @llvm.nvvm.ex2.approx.f32(float %a)
+; CHECK: call double @llvm.nvvm.ex2.approx.f64(double %b)
+; CHECK: call <2 x half> @llvm.nvvm.ex2.approx.v2f16(<2 x half> %d)
+; CHECK: call float @llvm.nvvm.ex2.approx.ftz.f32(float %a)
+  %r1 = call float @llvm.nvvm.ex2.approx.f(float %a)
+  %r2 = call double @llvm.nvvm.ex2.approx.d(double %b)
+  %r3 = call <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half> %d)
+  %r4 = call float @llvm.nvvm.ex2.approx.ftz.f(float %a)
+  ret void
+}
diff --git a/llvm/test/Assembler/constant-getelementptr-scalable_pointee.ll b/llvm/test/Assembler/constant-getelementptr-scalable_pointee.ll
new file mode 100644
index 0000000000000..d39039964b3b3
--- /dev/null
+++ b/llvm/test/Assembler/constant-getelementptr-scalable_pointee.ll
@@ -0,0 +1,8 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+; Test the case of an invalid pointee type on a constant GEP
+
+; CHECK: invalid base element for constant getelementptr
+
+define ptr @test_scalable_vector_gep(ptr %a) {
+  ret ptr getelementptr (<vscale x 1 x i8>, ptr @a, i64 1)
+}
diff --git a/llvm/test/Bitcode/dbg-data-size-roundtrip.ll b/llvm/test/Bitcode/dbg-data-size-roundtrip.ll
new file mode 100644
index 0000000000000..36a92538b8b7c
--- /dev/null
+++ b/llvm/test/Bitcode/dbg-data-size-roundtrip.ll
@@ -0,0 +1,19 @@
+; RUN: opt %s -o - -S | llvm-as - | llvm-dis - | FileCheck %s
+
+; CHECK: !DIBasicType(name: "unsigned _BitInt", size: 32, dataSize: 17, encoding: DW_ATE_unsigned)
+
+@a = global i8 0, align 1, !dbg !0
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!6, !7}
+!llvm.ident = !{!8}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "a", scope: !2, file: !3, line: 4, type: !5, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang version 22.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, splitDebugInlining: false, nameTableKind: None)
+!3 = !DIFile(filename: "bit-int.c", directory: "/")
+!4 = !{!0}
+!5 = !DIBasicType(name: "unsigned _BitInt", size: 32, dataSize: 17, encoding: DW_ATE_unsigned)
+!6 = !{i32 2, !"Debug Info Version", i32 3}
+!7 = !{i32 1, !"wchar_size", i32 4}
+!8 = !{!"clang version 22.0.0git"}
diff --git a/llvm/test/Bitcode/dwarf-objc-property.ll b/llvm/test/Bitcode/dwarf-objc-property.ll
new file mode 100644
index 0000000000000..f054f572feffa
--- /dev/null
+++ b/llvm/test/Bitcode/dwarf-objc-property.ll
@@ -0,0 +1,46 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s
+
+; CHECK: !DIObjCProperty(name: "autoSynthProp", file: !3, line: 5, attributes: 2316, type: !8)
+; CHECK: !DIObjCProperty(name: "synthProp", file: !3, line: 6, attributes: 2316, type: !8)
+; CHECK: !DIObjCProperty(name: "customGetterProp", file: !3, line: 7, getter: "customGetter", attributes: 2318, type: !8)
+; CHECK: !DIObjCProperty(name: "customSetterProp", file: !3, line: 8, setter: "customSetter:", attributes: 2444, type: !8)
+; CHECK: !DIObjCProperty(name: "customAccessorsProp", file: !3, line: 9, setter: "customSetter:", getter: "customGetter", attributes: 2446, type: !8)
+
+!llvm.module.flags = !{!0, !1}
+!llvm.dbg.cu = !{!2}
+
+!0 = !{i32 7, !"Dwarf Version", i32 5}
+!1 = !{i32 2, !"Debug Info Version", i32 3}
+!2 = distinct !DICompileUnit(language: DW_LANG_ObjC, file: !3, producer: "hand written", isOptimized: false, runtimeVersion: 2, emissionKind: FullDebug, retainedTypes: !4, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: Apple)
+!3 = !DIFile(filename: "main.m", directory: "/tmp")
+!4 = !{!5}
+!5 = !DICompositeType(tag: DW_TAG_structure_type, name: "Foo", scope: !3, file: !3, line: 1, size: 128, flags: DIFlagObjcClassComplete, elements: !6, runtimeLang: DW_LANG_ObjC)
+!6 = !{!7, !9, !10, !11, !12, !13, !14, !15, !16, !17, !24, !27, !28, !29, !30, !31, !32}
+!7 = !DIObjCProperty(name: "autoSynthProp", file: !3, line: 5, attributes: 2316, type: !8)
+!8 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!9 = !DIObjCProperty(name: "synthProp", file: !3, line: 6, attributes: 2316, type: !8)
+!10 = !DIObjCProperty(name: "customGetterProp", file: !3, line: 7, getter: "customGetter", attributes: 2318, type: !8)
+!11 = !DIObjCProperty(name: "customSetterProp", file: !3, line: 8, setter: "customSetter:", attributes: 2444, type: !8)
+!12 = !DIObjCProperty(name: "customAccessorsProp", file: !3, line: 9, setter: "customSetter:", getter: "customGetter", attributes: 2446, type: !8)
+!13 = !DIDerivedType(tag: DW_TAG_member, name: "someBackingIvar", scope: !3, file: !3, line: 2, baseType: !8, size: 32, flags: DIFlagProtected, extraData: !9)
+!14 = !DIDerivedType(tag: DW_TAG_member, name: "_autoSynthProp", scope: !3, file: !3, line: 5, baseType: !8, size: 32, flags: DIFlagPrivate, extraData: !7)
+!15 = !DIDerivedType(tag: DW_TAG_member, name: "_customGetterProp", scope: !3, file: !3, line: 7, baseType: !8, size: 32, flags: DIFlagPrivate, extraData: !10)
+!16 = !DIDerivedType(tag: DW_TAG_member, name: "_customSetterProp", scope: !3, file: !3, line: 8, baseType: !8, size: 32, flags: DIFlagPrivate, extraData: !11)
+!17 = !DISubprogram(name: "-[Foo customGetter]", scope: !5, file: !3, line: 19, type: !18, scopeLine: 19, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit)
+!18 = !DISubroutineType(types: !19)
+!19 = !{!8, !20, !21}
+!20 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !5, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer)
+!21 = !DIDerivedType(tag: DW_TAG_typedef, name: "SEL", file: !3, baseType: !22, flags: DIFlagArtificial)
+!22 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !23, size: 64)
+!23 = !DICompositeType(tag: DW_TAG_structure_type, name: "objc_selector", file: !3, flags: DIFlagFwdDecl)
+!24 = !DISubprogram(name: "-[Foo customSetter:]", scope: !5, file: !3, line: 23, type: !25, scopeLine: 23, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit)
+!25 = !DISubroutineType(types: !26)
+!26 = !{null, !20, !21, !8}
+!27 = !DISubprogram(name: "-[Foo synthProp]", scope: !5, file: !3, line: 17, type: !18, scopeLine: 17, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit)
+!28 = !DISubprogram(name: "-[Foo setSynthProp:]", scope: !5, file: !3, line: 17, type: !25, scopeLine: 17, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit)
+!29 = !DISubprogram(name: "-[Foo autoSynthProp]", scope: !5, file: !3, line: 5, type: !18, scopeLine: 5, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit)
+!30 = !DISubprogram(name: "-[Foo setAutoSynthProp:]", scope: !5, file: !3, line: 5, type: !25, scopeLine: 5, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit)
+!31 = !DISubprogram(name: "-[Foo setCustomGetterProp:]", scope: !5, file: !3, line: 7, type: !25, scopeLine: 7, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit)
+!32 = !DISubprogram(name: "-[Foo customSetterProp]", scope: !5, file: !3, line: 8, type: !18, scopeLine: 8, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit)
+
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-freeze.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-freeze.mir
index 6b84a8488e478..1950e602ec83a 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-freeze.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-freeze.mir
@@ -1440,3 +1440,50 @@ body:             |
     %freeze:_(<4 x s32>) = G_FREEZE %extract
     $q0 = COPY %freeze(<4 x s32>)
     RET_ReallyLR implicit $x0
+...
+---
+name:            ubfx_does_not_generate_poison
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: ubfx_does_not_generate_poison
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: %c1:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY]]
+    ; CHECK-NEXT: [[UBFX:%[0-9]+]]:_(s64) = G_UBFX [[FREEZE]], %c1(s64), %c1
+    ; CHECK-NEXT: $x0 = COPY [[UBFX]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %c1:_(s64) = G_CONSTANT i64 1
+    %1:_(s64) = G_UBFX %0, %c1, %c1
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+
+...
+---
+name:            sbfx_does_not_generate_poison
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: sbfx_does_not_generate_poison
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: %c1:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY]]
+    ; CHECK-NEXT: [[SBFX:%[0-9]+]]:_(s64) = G_SBFX [[FREEZE]], %c1(s64), %c1
+    ; CHECK-NEXT: $x0 = COPY [[SBFX]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %c1:_(s64) = G_CONSTANT i64 1
+    %1:_(s64) = G_SBFX %0, %c1, %c1
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll
index 41f7ab89094ad..480fcbd6a9788 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll
@@ -4992,28 +4992,21 @@ define void @test_shl_i512_const_32(ptr %result, ptr %input) {
 ; GISEL-LABEL: test_shl_i512_const_32:
 ; GISEL:       ; %bb.0: ; %entry
 ; GISEL-NEXT:    ldp x8, x9, [x1]
-; GISEL-NEXT:    ldp x11, x12, [x1, #16]
-; GISEL-NEXT:    ldp x14, x15, [x1, #32]
-; GISEL-NEXT:    lsr x10, x8, #32
-; GISEL-NEXT:    lsr x13, x9, #32
-; GISEL-NEXT:    lsl x8, x8, #32
-; GISEL-NEXT:    orr x9, x10, x9, lsl #32
-; GISEL-NEXT:    lsr x10, x11, #32
-; GISEL-NEXT:    orr x11, x13, x11, lsl #32
-; GISEL-NEXT:    ldp x13, x16, [x1, #48]
-; GISEL-NEXT:    stp x8, x9, [x0]
-; GISEL-NEXT:    lsr x8, x12, #32
-; GISEL-NEXT:    orr x10, x10, x12, lsl #32
-; GISEL-NEXT:    lsr x12, x14, #32
-; GISEL-NEXT:    lsr x9, x15, #32
-; GISEL-NEXT:    orr x8, x8, x14, lsl #32
-; GISEL-NEXT:    stp x11, x10, [x0, #16]
-; GISEL-NEXT:    orr x11, x12, x15, lsl #32
-; GISEL-NEXT:    lsr x12, x13, #32
-; GISEL-NEXT:    orr x9, x9, x13, lsl #32
-; GISEL-NEXT:    stp x8, x11, [x0, #32]
-; GISEL-NEXT:    orr x8, x12, x16, lsl #32
-; GISEL-NEXT:    stp x9, x8, [x0, #48]
+; GISEL-NEXT:    ldp x10, x11, [x1, #16]
+; GISEL-NEXT:    ldp x13, x14, [x1, #32]
+; GISEL-NEXT:    lsl x12, x8, #32
+; GISEL-NEXT:    extr x8, x9, x8, #32
+; GISEL-NEXT:    extr x9, x10, x9, #32
+; GISEL-NEXT:    extr x10, x11, x10, #32
+; GISEL-NEXT:    ldp x15, x16, [x1, #48]
+; GISEL-NEXT:    stp x12, x8, [x0]
+; GISEL-NEXT:    extr x8, x13, x11, #32
+; GISEL-NEXT:    stp x9, x10, [x0, #16]
+; GISEL-NEXT:    extr x9, x14, x13, #32
+; GISEL-NEXT:    extr x10, x15, x14, #32
+; GISEL-NEXT:    stp x8, x9, [x0, #32]
+; GISEL-NEXT:    extr x8, x16, x15, #32
+; GISEL-NEXT:    stp x10, x8, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -5044,30 +5037,22 @@ define void @test_lshr_i512_const_32(ptr %result, ptr %input) {
 ;
 ; GISEL-LABEL: test_lshr_i512_const_32:
 ; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    ldp x8, x9, [x1, #8]
-; GISEL-NEXT:    ldr x11, [x1]
-; GISEL-NEXT:    ldp x10, x14, [x1, #24]
-; GISEL-NEXT:    ldr x16, [x1, #56]
-; GISEL-NEXT:    lsl x12, x8, #32
-; GISEL-NEXT:    lsl x13, x9, #32
-; GISEL-NEXT:    lsl x15, x10, #32
-; GISEL-NEXT:    orr x11, x12, x11, lsr #32
-; GISEL-NEXT:    orr x8, x13, x8, lsr #32
-; GISEL-NEXT:    lsl x13, x14, #32
-; GISEL-NEXT:    orr x9, x15, x9, lsr #32
-; GISEL-NEXT:    ldp x12, x15, [x1, #40]
-; GISEL-NEXT:    stp x11, x8, [x0]
-; GISEL-NEXT:    orr x10, x13, x10, lsr #32
-; GISEL-NEXT:    lsl x8, x16, #32
-; GISEL-NEXT:    lsl x11, x12, #32
-; GISEL-NEXT:    lsl x13, x15, #32
-; GISEL-NEXT:    stp x9, x10, [x0, #16]
-; GISEL-NEXT:    orr x8, x8, x15, lsr #32
-; GISEL-NEXT:    lsr x10, x16, #32
-; GISEL-NEXT:    orr x11, x11, x14, lsr #32
-; GISEL-NEXT:    orr x9, x13, x12, lsr #32
-; GISEL-NEXT:    stp x8, x10, [x0, #48]
-; GISEL-NEXT:    stp x11, x9, [x0, #32]
+; GISEL-NEXT:    ldp x8, x9, [x1]
+; GISEL-NEXT:    ldp x10, x11, [x1, #16]
+; GISEL-NEXT:    ldp x12, x13, [x1, #32]
+; GISEL-NEXT:    extr x8, x9, x8, #32
+; GISEL-NEXT:    ldp x14, x15, [x1, #48]
+; GISEL-NEXT:    extr x9, x10, x9, #32
+; GISEL-NEXT:    extr x10, x11, x10, #32
+; GISEL-NEXT:    stp x8, x9, [x0]
+; GISEL-NEXT:    extr x8, x12, x11, #32
+; GISEL-NEXT:    extr x9, x13, x12, #32
+; GISEL-NEXT:    stp x10, x8, [x0, #16]
+; GISEL-NEXT:    extr x10, x14, x13, #32
+; GISEL-NEXT:    extr x8, x15, x14, #32
+; GISEL-NEXT:    stp x9, x10, [x0, #32]
+; GISEL-NEXT:    lsr x9, x15, #32
+; GISEL-NEXT:    stp x8, x9, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -5098,32 +5083,24 @@ define void @test_ashr_i512_const_32(ptr %result, ptr %input) {
 ;
 ; GISEL-LABEL: test_ashr_i512_const_32:
 ; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    ldp x8, x9, [x1, #8]
-; GISEL-NEXT:    ldr x11, [x1]
-; GISEL-NEXT:    ldp x10, x13, [x1, #24]
-; GISEL-NEXT:    ldr x17, [x1, #56]
-; GISEL-NEXT:    lsl x12, x8, #32
-; GISEL-NEXT:    lsl x15, x9, #32
-; GISEL-NEXT:    lsl x16, x10, #32
-; GISEL-NEXT:    orr x11, x12, x11, lsr #32
-; GISEL-NEXT:    ldp x14, x12, [x1, #40]
-; GISEL-NEXT:    orr x8, x15, x8, lsr #32
-; GISEL-NEXT:    lsl x15, x13, #32
-; GISEL-NEXT:    orr x9, x16, x9, lsr #32
-; GISEL-NEXT:    asr x16, x17, #63
-; GISEL-NEXT:    stp x11, x8, [x0]
-; GISEL-NEXT:    lsl x11, x14, #32
-; GISEL-NEXT:    orr x10, x15, x10, lsr #32
-; GISEL-NEXT:    lsl x15, x12, #32
-; GISEL-NEXT:    orr x8, x11, x13, lsr #32
-; GISEL-NEXT:    lsl x11, x17, #32
-; GISEL-NEXT:    stp x9, x10, [x0, #16]
-; GISEL-NEXT:    orr x9, x15, x14, lsr #32
-; GISEL-NEXT:    lsl x13, x16, #32
-; GISEL-NEXT:    orr x10, x11, x12, lsr #32
-; GISEL-NEXT:    stp x8, x9, [x0, #32]
-; GISEL-NEXT:    orr x8, x13, x17, asr #32
-; GISEL-NEXT:    stp x10, x8, [x0, #48]
+; GISEL-NEXT:    ldp x8, x9, [x1]
+; GISEL-NEXT:    ldp x10, x11, [x1, #16]
+; GISEL-NEXT:    ldp x12, x13, [x1, #48]
+; GISEL-NEXT:    extr x8, x9, x8, #32
+; GISEL-NEXT:    ldp x14, x15, [x1, #32]
+; GISEL-NEXT:    extr x9, x10, x9, #32
+; GISEL-NEXT:    extr x10, x11, x10, #32
+; GISEL-NEXT:    stp x8, x9, [x0]
+; GISEL-NEXT:    asr x8, x13, #63
+; GISEL-NEXT:    extr x11, x14, x11, #32
+; GISEL-NEXT:    extr x9, x15, x14, #32
+; GISEL-NEXT:    lsl x8, x8, #32
+; GISEL-NEXT:    stp x10, x11, [x0, #16]
+; GISEL-NEXT:    extr x10, x12, x15, #32
+; GISEL-NEXT:    extr x11, x13, x12, #32
+; GISEL-NEXT:    orr x8, x8, x13, asr #32
+; GISEL-NEXT:    stp x9, x10, [x0, #32]
+; GISEL-NEXT:    stp x11, x8, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -5252,23 +5229,17 @@ define void @test_shl_i512_const_96(ptr %result, ptr %input) {
 ; GISEL-NEXT:    ldr x15, [x1, #48]
 ; GISEL-NEXT:    ldp x10, x11, [x1, #16]
 ; GISEL-NEXT:    ldp x12, x13, [x1, #32]
-; GISEL-NEXT:    lsr x14, x8, #32
-; GISEL-NEXT:    lsr x16, x9, #32
-; GISEL-NEXT:    lsl x8, x8, #32
-; GISEL-NEXT:    orr x9, x14, x9, lsl #32
-; GISEL-NEXT:    lsr x14, x10, #32
-; GISEL-NEXT:    orr x10, x16, x10, lsl #32
-; GISEL-NEXT:    stp xzr, x8, [x0]
-; GISEL-NEXT:    lsr x8, x11, #32
-; GISEL-NEXT:    orr x11, x14, x11, lsl #32
-; GISEL-NEXT:    lsr x14, x12, #32
-; GISEL-NEXT:    stp x9, x10, [x0, #16]
-; GISEL-NEXT:    lsr x9, x13, #32
-; GISEL-NEXT:    orr x8, x8, x12, lsl #32
-; GISEL-NEXT:    orr x10, x14, x13, lsl #32
-; GISEL-NEXT:    orr x9, x9, x15, lsl #32
-; GISEL-NEXT:    stp x11, x8, [x0, #32]
-; GISEL-NEXT:    stp x10, x9, [x0, #48]
+; GISEL-NEXT:    lsl x14, x8, #32
+; GISEL-NEXT:    extr x8, x9, x8, #32
+; GISEL-NEXT:    extr x9, x10, x9, #32
+; GISEL-NEXT:    extr x10, x11, x10, #32
+; GISEL-NEXT:    stp xzr, x14, [x0]
+; GISEL-NEXT:    stp x8, x9, [x0, #16]
+; GISEL-NEXT:    extr x8, x12, x11, #32
+; GISEL-NEXT:    extr x9, x13, x12, #32
+; GISEL-NEXT:    stp x10, x8, [x0, #32]
+; GISEL-NEXT:    extr x10, x15, x13, #32
+; GISEL-NEXT:    stp x9, x10, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -5297,27 +5268,21 @@ define void @test_lshr_i512_const_96(ptr %result, ptr %input) {
 ;
 ; GISEL-LABEL: test_lshr_i512_const_96:
 ; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    ldp x8, x9, [x1, #16]
-; GISEL-NEXT:    ldr x10, [x1, #8]
-; GISEL-NEXT:    ldp x11, x14, [x1, #32]
-; GISEL-NEXT:    ldp x15, x16, [x1, #48]
-; GISEL-NEXT:    lsl x12, x8, #32
-; GISEL-NEXT:    lsl x13, x9, #32
-; GISEL-NEXT:    orr x10, x12, x10, lsr #32
-; GISEL-NEXT:    lsl x12, x11, #32
-; GISEL-NEXT:    orr x8, x13, x8, lsr #32
-; GISEL-NEXT:    lsl x13, x14, #32
-; GISEL-NEXT:    orr x9, x12, x9, lsr #32
-; GISEL-NEXT:    stp x10, x8, [x0]
-; GISEL-NEXT:    lsl x10, x15, #32
-; GISEL-NEXT:    orr x11, x13, x11, lsr #32
-; GISEL-NEXT:    lsl x12, x16, #32
-; GISEL-NEXT:    orr x8, x10, x14, lsr #32
-; GISEL-NEXT:    lsr x10, x16, #32
-; GISEL-NEXT:    stp x9, x11, [x0, #16]
-; GISEL-NEXT:    orr x9, x12, x15, lsr #32
-; GISEL-NEXT:    stp x10, xzr, [x0, #48]
-; GISEL-NEXT:    stp x8, x9, [x0, #32]
+; GISEL-NEXT:    ldp x8, x9, [x1, #8]
+; GISEL-NEXT:    ldr x14, [x1, #56]
+; GISEL-NEXT:    ldp x10, x11, [x1, #24]
+; GISEL-NEXT:    ldp x12, x13, [x1, #40]
+; GISEL-NEXT:    extr x8, x9, x8, #32
+; GISEL-NEXT:    extr x9, x10, x9, #32
+; GISEL-NEXT:    extr x10, x11, x10, #32
+; GISEL-NEXT:    stp x8, x9, [x0]
+; GISEL-NEXT:    extr x8, x12, x11, #32
+; GISEL-NEXT:    extr x9, x13, x12, #32
+; GISEL-NEXT:    stp x10, x8, [x0, #16]
+; GISEL-NEXT:    extr x10, x14, x13, #32
+; GISEL-NEXT:    lsr x8, x14, #32
+; GISEL-NEXT:    stp x9, x10, [x0, #32]
+; GISEL-NEXT:    stp x8, xzr, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -5347,29 +5312,23 @@ define void @test_ashr_i512_const_96(ptr %result, ptr %input) {
 ;
 ; GISEL-LABEL: test_ashr_i512_const_96:
 ; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    ldp x8, x9, [x1, #16]
-; GISEL-NEXT:    ldr x11, [x1, #8]
-; GISEL-NEXT:    ldp x10, x13, [x1, #32]
-; GISEL-NEXT:    lsl x12, x8, #32
-; GISEL-NEXT:    lsl x14, x9, #32
-; GISEL-NEXT:    lsl x15, x10, #32
-; GISEL-NEXT:    orr x11, x12, x11, lsr #32
-; GISEL-NEXT:    ldp x12, x16, [x1, #48]
-; GISEL-NEXT:    orr x8, x14, x8, lsr #32
-; GISEL-NEXT:    lsl x14, x13, #32
-; GISEL-NEXT:    orr x9, x15, x9, lsr #32
-; GISEL-NEXT:    asr x15, x16, #63
-; GISEL-NEXT:    stp x11, x8, [x0]
-; GISEL-NEXT:    lsl x11, x12, #32
-; GISEL-NEXT:    orr x10, x14, x10, lsr #32
-; GISEL-NEXT:    lsl x14, x16, #32
-; GISEL-NEXT:    orr x8, x11, x13, lsr #32
+; GISEL-NEXT:    ldp x8, x9, [x1, #8]
+; GISEL-NEXT:    ldr x13, [x1, #40]
+; GISEL-NEXT:    ldp x10, x11, [x1, #24]
+; GISEL-NEXT:    ldp x14, x12, [x1, #48]
+; GISEL-NEXT:    extr x8, x9, x8, #32
+; GISEL-NEXT:    extr x9, x10, x9, #32
+; GISEL-NEXT:    extr x10, x11, x10, #32
+; GISEL-NEXT:    asr x15, x12, #63
+; GISEL-NEXT:    stp x8, x9, [x0]
+; GISEL-NEXT:    extr x8, x13, x11, #32
+; GISEL-NEXT:    extr x9, x14, x13, #32
 ; GISEL-NEXT:    lsl x11, x15, #32
-; GISEL-NEXT:    stp x9, x10, [x0, #16]
-; GISEL-NEXT:    orr x9, x14, x12, lsr #32
-; GISEL-NEXT:    orr x10, x11, x16, asr #32
-; GISEL-NEXT:    stp x8, x9, [x0, #32]
-; GISEL-NEXT:    stp x10, x15, [x0, #48]
+; GISEL-NEXT:    stp x10, x8, [x0, #16]
+; GISEL-NEXT:    extr x10, x12, x14, #32
+; GISEL-NEXT:    orr x8, x11, x12, asr #32
+; GISEL-NEXT:    stp x9, x10, [x0, #32]
+; GISEL-NEXT:    stp x8, x15, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -5404,28 +5363,21 @@ define void @test_shl_i512_const_1(ptr %result, ptr %input) {
 ; GISEL-LABEL: test_shl_i512_const_1:
 ; GISEL:       ; %bb.0: ; %entry
 ; GISEL-NEXT:    ldp x8, x9, [x1]
-; GISEL-NEXT:    ldp x11, x12, [x1, #16]
-; GISEL-NEXT:    ldp x14, x15, [x1, #32]
-; GISEL-NEXT:    lsr x10, x8, #63
-; GISEL-NEXT:    lsr x13, x9, #63
-; GISEL-NEXT:    lsl x8, x8, #1
-; GISEL-NEXT:    orr x9, x10, x9, lsl #1
-; GISEL-NEXT:    lsr x10, x11, #63
-; GISEL-NEXT:    orr x11, x13, x11, lsl #1
-; GISEL-NEXT:    ldp x13, x16, [x1, #48]
-; GISEL-NEXT:    stp x8, x9, [x0]
-; GISEL-NEXT:    lsr x8, x12, #63
-; GISEL-NEXT:    orr x10, x10, x12, lsl #1
-; GISEL-NEXT:    lsr x12, x14, #63
-; GISEL-NEXT:    lsr x9, x15, #63
-; GISEL-NEXT:    orr x8, x8, x14, lsl #1
-; GISEL-NEXT:    stp x11, x10, [x0, #16]
-; GISEL-NEXT:    orr x11, x12, x15, lsl #1
-; GISEL-NEXT:    lsr x12, x13, #63
-; GISEL-NEXT:    orr x9, x9, x13, lsl #1
-; GISEL-NEXT:    stp x8, x11, [x0, #32]
-; GISEL-NEXT:    orr x8, x12, x16, lsl #1
-; GISEL-NEXT:    stp x9, x8, [x0, #48]
+; GISEL-NEXT:    ldp x10, x11, [x1, #16]
+; GISEL-NEXT:    ldp x13, x14, [x1, #32]
+; GISEL-NEXT:    lsl x12, x8, #1
+; GISEL-NEXT:    extr x8, x9, x8, #63
+; GISEL-NEXT:    extr x9, x10, x9, #63
+; GISEL-NEXT:    extr x10, x11, x10, #63
+; GISEL-NEXT:    ldp x15, x16, [x1, #48]
+; GISEL-NEXT:    stp x12, x8, [x0]
+; GISEL-NEXT:    extr x8, x13, x11, #63
+; GISEL-NEXT:    stp x9, x10, [x0, #16]
+; GISEL-NEXT:    extr x9, x14, x13, #63
+; GISEL-NEXT:    extr x10, x15, x14, #63
+; GISEL-NEXT:    stp x8, x9, [x0, #32]
+; GISEL-NEXT:    extr x8, x16, x15, #63
+; GISEL-NEXT:    stp x10, x8, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -5457,30 +5409,22 @@ define void @test_lshr_i512_const_1(ptr %result, ptr %input) {
 ;
 ; GISEL-LABEL: test_lshr_i512_const_1:
 ; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    ldp x8, x9, [x1, #8]
-; GISEL-NEXT:    ldr x11, [x1]
-; GISEL-NEXT:    ldp x10, x14, [x1, #24]
-; GISEL-NEXT:    ldr x16, [x1, #56]
-; GISEL-NEXT:    lsl x12, x8, #63
-; GISEL-NEXT:    lsl x13, x9, #63
-; GISEL-NEXT:    lsl x15, x10, #63
-; GISEL-NEXT:    orr x11, x12, x11, lsr #1
-; GISEL-NEXT:    orr x8, x13, x8, lsr #1
-; GISEL-NEXT:    lsl x13, x14, #63
-; GISEL-NEXT:    orr x9, x15, x9, lsr #1
-; GISEL-NEXT:    ldp x12, x15, [x1, #40]
-; GISEL-NEXT:    stp x11, x8, [x0]
-; GISEL-NEXT:    orr x10, x13, x10, lsr #1
-; GISEL-NEXT:    lsl x8, x16, #63
-; GISEL-NEXT:    lsl x11, x12, #63
-; GISEL-NEXT:    lsl x13, x15, #63
-; GISEL-NEXT:    stp x9, x10, [x0, #16]
-; GISEL-NEXT:    orr x8, x8, x15, lsr #1
-; GISEL-NEXT:    lsr x10, x16, #1
-; GISEL-NEXT:    orr x11, x11, x14, lsr #1
-; GISEL-NEXT:    orr x9, x13, x12, lsr #1
-; GISEL-NEXT:    stp x8, x10, [x0, #48]
-; GISEL-NEXT:    stp x11, x9, [x0, #32]
+; GISEL-NEXT:    ldp x8, x9, [x1]
+; GISEL-NEXT:    ldp x10, x11, [x1, #16]
+; GISEL-NEXT:    ldp x12, x13, [x1, #32]
+; GISEL-NEXT:    extr x8, x9, x8, #1
+; GISEL-NEXT:    ldp x14, x15, [x1, #48]
+; GISEL-NEXT:    extr x9, x10, x9, #1
+; GISEL-NEXT:    extr x10, x11, x10, #1
+; GISEL-NEXT:    stp x8, x9, [x0]
+; GISEL-NEXT:    extr x8, x12, x11, #1
+; GISEL-NEXT:    extr x9, x13, x12, #1
+; GISEL-NEXT:    stp x10, x8, [x0, #16]
+; GISEL-NEXT:    extr x10, x14, x13, #1
+; GISEL-NEXT:    extr x8, x15, x14, #1
+; GISEL-NEXT:    stp x9, x10, [x0, #32]
+; GISEL-NEXT:    lsr x9, x15, #1
+; GISEL-NEXT:    stp x8, x9, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -5512,32 +5456,24 @@ define void @test_ashr_i512_const_1(ptr %result, ptr %input) {
 ;
 ; GISEL-LABEL: test_ashr_i512_const_1:
 ; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    ldp x8, x9, [x1, #8]
-; GISEL-NEXT:    ldr x11, [x1]
-; GISEL-NEXT:    ldp x10, x13, [x1, #24]
-; GISEL-NEXT:    ldr x17, [x1, #56]
-; GISEL-NEXT:    lsl x12, x8, #63
-; GISEL-NEXT:    lsl x15, x9, #63
-; GISEL-NEXT:    lsl x16, x10, #63
-; GISEL-NEXT:    orr x11, x12, x11, lsr #1
-; GISEL-NEXT:    ldp x14, x12, [x1, #40]
-; GISEL-NEXT:    orr x8, x15, x8, lsr #1
-; GISEL-NEXT:    lsl x15, x13, #63
-; GISEL-NEXT:    orr x9, x16, x9, lsr #1
-; GISEL-NEXT:    asr x16, x17, #63
-; GISEL-NEXT:    stp x11, x8, [x0]
-; GISEL-NEXT:    lsl x11, x14, #63
-; GISEL-NEXT:    orr x10, x15, x10, lsr #1
-; GISEL-NEXT:    lsl x15, x12, #63
-; GISEL-NEXT:    orr x8, x11, x13, lsr #1
-; GISEL-NEXT:    lsl x11, x17, #63
-; GISEL-NEXT:    stp x9, x10, [x0, #16]
-; GISEL-NEXT:    orr x9, x15, x14, lsr #1
-; GISEL-NEXT:    lsl x13, x16, #63
-; GISEL-NEXT:    orr x10, x11, x12, lsr #1
-; GISEL-NEXT:    stp x8, x9, [x0, #32]
-; GISEL-NEXT:    orr x8, x13, x17, asr #1
-; GISEL-NEXT:    stp x10, x8, [x0, #48]
+; GISEL-NEXT:    ldp x8, x9, [x1]
+; GISEL-NEXT:    ldp x10, x11, [x1, #16]
+; GISEL-NEXT:    ldp x12, x13, [x1, #48]
+; GISEL-NEXT:    extr x8, x9, x8, #1
+; GISEL-NEXT:    ldp x14, x15, [x1, #32]
+; GISEL-NEXT:    extr x9, x10, x9, #1
+; GISEL-NEXT:    extr x10, x11, x10, #1
+; GISEL-NEXT:    stp x8, x9, [x0]
+; GISEL-NEXT:    asr x8, x13, #63
+; GISEL-NEXT:    extr x11, x14, x11, #1
+; GISEL-NEXT:    extr x9, x15, x14, #1
+; GISEL-NEXT:    lsl x8, x8, #63
+; GISEL-NEXT:    stp x10, x11, [x0, #16]
+; GISEL-NEXT:    extr x10, x12, x15, #1
+; GISEL-NEXT:    extr x11, x13, x12, #1
+; GISEL-NEXT:    orr x8, x8, x13, asr #1
+; GISEL-NEXT:    stp x9, x10, [x0, #32]
+; GISEL-NEXT:    stp x11, x8, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -5571,28 +5507,21 @@ define void @test_shl_i512_const_15(ptr %result, ptr %input) {
 ; GISEL-LABEL: test_shl_i512_const_15:
 ; GISEL:       ; %bb.0: ; %entry
 ; GISEL-NEXT:    ldp x8, x9, [x1]
-; GISEL-NEXT:    ldp x11, x12, [x1, #16]
-; GISEL-NEXT:    ldp x14, x15, [x1, #32]
-; GISEL-NEXT:    lsr x10, x8, #49
-; GISEL-NEXT:    lsr x13, x9, #49
-; GISEL-NEXT:    lsl x8, x8, #15
-; GISEL-NEXT:    orr x9, x10, x9, lsl #15
-; GISEL-NEXT:    lsr x10, x11, #49
-; GISEL-NEXT:    orr x11, x13, x11, lsl #15
-; GISEL-NEXT:    ldp x13, x16, [x1, #48]
-; GISEL-NEXT:    stp x8, x9, [x0]
-; GISEL-NEXT:    lsr x8, x12, #49
-; GISEL-NEXT:    orr x10, x10, x12, lsl #15
-; GISEL-NEXT:    lsr x12, x14, #49
-; GISEL-NEXT:    lsr x9, x15, #49
-; GISEL-NEXT:    orr x8, x8, x14, lsl #15
-; GISEL-NEXT:    stp x11, x10, [x0, #16]
-; GISEL-NEXT:    orr x11, x12, x15, lsl #15
-; GISEL-NEXT:    lsr x12, x13, #49
-; GISEL-NEXT:    orr x9, x9, x13, lsl #15
-; GISEL-NEXT:    stp x8, x11, [x0, #32]
-; GISEL-NEXT:    orr x8, x12, x16, lsl #15
-; GISEL-NEXT:    stp x9, x8, [x0, #48]
+; GISEL-NEXT:    ldp x10, x11, [x1, #16]
+; GISEL-NEXT:    ldp x13, x14, [x1, #32]
+; GISEL-NEXT:    lsl x12, x8, #15
+; GISEL-NEXT:    extr x8, x9, x8, #49
+; GISEL-NEXT:    extr x9, x10, x9, #49
+; GISEL-NEXT:    extr x10, x11, x10, #49
+; GISEL-NEXT:    ldp x15, x16, [x1, #48]
+; GISEL-NEXT:    stp x12, x8, [x0]
+; GISEL-NEXT:    extr x8, x13, x11, #49
+; GISEL-NEXT:    stp x9, x10, [x0, #16]
+; GISEL-NEXT:    extr x9, x14, x13, #49
+; GISEL-NEXT:    extr x10, x15, x14, #49
+; GISEL-NEXT:    stp x8, x9, [x0, #32]
+; GISEL-NEXT:    extr x8, x16, x15, #49
+; GISEL-NEXT:    stp x10, x8, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -5624,30 +5553,22 @@ define void @test_lshr_i512_const_15(ptr %result, ptr %input) {
 ;
 ; GISEL-LABEL: test_lshr_i512_const_15:
 ; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    ldp x8, x9, [x1, #8]
-; GISEL-NEXT:    ldr x11, [x1]
-; GISEL-NEXT:    ldp x10, x14, [x1, #24]
-; GISEL-NEXT:    ldr x16, [x1, #56]
-; GISEL-NEXT:    lsl x12, x8, #49
-; GISEL-NEXT:    lsl x13, x9, #49
-; GISEL-NEXT:    lsl x15, x10, #49
-; GISEL-NEXT:    orr x11, x12, x11, lsr #15
-; GISEL-NEXT:    orr x8, x13, x8, lsr #15
-; GISEL-NEXT:    lsl x13, x14, #49
-; GISEL-NEXT:    orr x9, x15, x9, lsr #15
-; GISEL-NEXT:    ldp x12, x15, [x1, #40]
-; GISEL-NEXT:    stp x11, x8, [x0]
-; GISEL-NEXT:    orr x10, x13, x10, lsr #15
-; GISEL-NEXT:    lsl x8, x16, #49
-; GISEL-NEXT:    lsl x11, x12, #49
-; GISEL-NEXT:    lsl x13, x15, #49
-; GISEL-NEXT:    stp x9, x10, [x0, #16]
-; GISEL-NEXT:    orr x8, x8, x15, lsr #15
-; GISEL-NEXT:    lsr x10, x16, #15
-; GISEL-NEXT:    orr x11, x11, x14, lsr #15
-; GISEL-NEXT:    orr x9, x13, x12, lsr #15
-; GISEL-NEXT:    stp x8, x10, [x0, #48]
-; GISEL-NEXT:    stp x11, x9, [x0, #32]
+; GISEL-NEXT:    ldp x8, x9, [x1]
+; GISEL-NEXT:    ldp x10, x11, [x1, #16]
+; GISEL-NEXT:    ldp x12, x13, [x1, #32]
+; GISEL-NEXT:    extr x8, x9, x8, #15
+; GISEL-NEXT:    ldp x14, x15, [x1, #48]
+; GISEL-NEXT:    extr x9, x10, x9, #15
+; GISEL-NEXT:    extr x10, x11, x10, #15
+; GISEL-NEXT:    stp x8, x9, [x0]
+; GISEL-NEXT:    extr x8, x12, x11, #15
+; GISEL-NEXT:    extr x9, x13, x12, #15
+; GISEL-NEXT:    stp x10, x8, [x0, #16]
+; GISEL-NEXT:    extr x10, x14, x13, #15
+; GISEL-NEXT:    extr x8, x15, x14, #15
+; GISEL-NEXT:    stp x9, x10, [x0, #32]
+; GISEL-NEXT:    lsr x9, x15, #15
+; GISEL-NEXT:    stp x8, x9, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -5679,32 +5600,24 @@ define void @test_ashr_i512_const_15(ptr %result, ptr %input) {
 ;
 ; GISEL-LABEL: test_ashr_i512_const_15:
 ; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    ldp x8, x9, [x1, #8]
-; GISEL-NEXT:    ldr x11, [x1]
-; GISEL-NEXT:    ldp x10, x13, [x1, #24]
-; GISEL-NEXT:    ldr x17, [x1, #56]
-; GISEL-NEXT:    lsl x12, x8, #49
-; GISEL-NEXT:    lsl x15, x9, #49
-; GISEL-NEXT:    lsl x16, x10, #49
-; GISEL-NEXT:    orr x11, x12, x11, lsr #15
-; GISEL-NEXT:    ldp x14, x12, [x1, #40]
-; GISEL-NEXT:    orr x8, x15, x8, lsr #15
-; GISEL-NEXT:    lsl x15, x13, #49
-; GISEL-NEXT:    orr x9, x16, x9, lsr #15
-; GISEL-NEXT:    asr x16, x17, #63
-; GISEL-NEXT:    stp x11, x8, [x0]
-; GISEL-NEXT:    lsl x11, x14, #49
-; GISEL-NEXT:    orr x10, x15, x10, lsr #15
-; GISEL-NEXT:    lsl x15, x12, #49
-; GISEL-NEXT:    orr x8, x11, x13, lsr #15
-; GISEL-NEXT:    lsl x11, x17, #49
-; GISEL-NEXT:    stp x9, x10, [x0, #16]
-; GISEL-NEXT:    orr x9, x15, x14, lsr #15
-; GISEL-NEXT:    lsl x13, x16, #49
-; GISEL-NEXT:    orr x10, x11, x12, lsr #15
-; GISEL-NEXT:    stp x8, x9, [x0, #32]
-; GISEL-NEXT:    orr x8, x13, x17, asr #15
-; GISEL-NEXT:    stp x10, x8, [x0, #48]
+; GISEL-NEXT:    ldp x8, x9, [x1]
+; GISEL-NEXT:    ldp x10, x11, [x1, #16]
+; GISEL-NEXT:    ldp x12, x13, [x1, #48]
+; GISEL-NEXT:    extr x8, x9, x8, #15
+; GISEL-NEXT:    ldp x14, x15, [x1, #32]
+; GISEL-NEXT:    extr x9, x10, x9, #15
+; GISEL-NEXT:    extr x10, x11, x10, #15
+; GISEL-NEXT:    stp x8, x9, [x0]
+; GISEL-NEXT:    asr x8, x13, #63
+; GISEL-NEXT:    extr x11, x14, x11, #15
+; GISEL-NEXT:    extr x9, x15, x14, #15
+; GISEL-NEXT:    lsl x8, x8, #49
+; GISEL-NEXT:    stp x10, x11, [x0, #16]
+; GISEL-NEXT:    extr x10, x12, x15, #15
+; GISEL-NEXT:    extr x11, x13, x12, #15
+; GISEL-NEXT:    orr x8, x8, x13, asr #15
+; GISEL-NEXT:    stp x9, x10, [x0, #32]
+; GISEL-NEXT:    stp x11, x8, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -5738,28 +5651,21 @@ define void @test_shl_i512_const_63(ptr %result, ptr %input) {
 ; GISEL-LABEL: test_shl_i512_const_63:
 ; GISEL:       ; %bb.0: ; %entry
 ; GISEL-NEXT:    ldp x8, x9, [x1]
-; GISEL-NEXT:    ldp x11, x12, [x1, #16]
-; GISEL-NEXT:    ldp x14, x15, [x1, #32]
-; GISEL-NEXT:    lsr x10, x8, #1
-; GISEL-NEXT:    lsr x13, x9, #1
-; GISEL-NEXT:    lsl x8, x8, #63
-; GISEL-NEXT:    orr x9, x10, x9, lsl #63
-; GISEL-NEXT:    lsr x10, x11, #1
-; GISEL-NEXT:    orr x11, x13, x11, lsl #63
-; GISEL-NEXT:    ldp x13, x16, [x1, #48]
-; GISEL-NEXT:    stp x8, x9, [x0]
-; GISEL-NEXT:    lsr x8, x12, #1
-; GISEL-NEXT:    orr x10, x10, x12, lsl #63
-; GISEL-NEXT:    lsr x12, x14, #1
-; GISEL-NEXT:    lsr x9, x15, #1
-; GISEL-NEXT:    orr x8, x8, x14, lsl #63
-; GISEL-NEXT:    stp x11, x10, [x0, #16]
-; GISEL-NEXT:    orr x11, x12, x15, lsl #63
-; GISEL-NEXT:    lsr x12, x13, #1
-; GISEL-NEXT:    orr x9, x9, x13, lsl #63
-; GISEL-NEXT:    stp x8, x11, [x0, #32]
-; GISEL-NEXT:    orr x8, x12, x16, lsl #63
-; GISEL-NEXT:    stp x9, x8, [x0, #48]
+; GISEL-NEXT:    ldp x10, x11, [x1, #16]
+; GISEL-NEXT:    ldp x13, x14, [x1, #32]
+; GISEL-NEXT:    lsl x12, x8, #63
+; GISEL-NEXT:    extr x8, x9, x8, #1
+; GISEL-NEXT:    extr x9, x10, x9, #1
+; GISEL-NEXT:    extr x10, x11, x10, #1
+; GISEL-NEXT:    ldp x15, x16, [x1, #48]
+; GISEL-NEXT:    stp x12, x8, [x0]
+; GISEL-NEXT:    extr x8, x13, x11, #1
+; GISEL-NEXT:    stp x9, x10, [x0, #16]
+; GISEL-NEXT:    extr x9, x14, x13, #1
+; GISEL-NEXT:    extr x10, x15, x14, #1
+; GISEL-NEXT:    stp x8, x9, [x0, #32]
+; GISEL-NEXT:    extr x8, x16, x15, #1
+; GISEL-NEXT:    stp x10, x8, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -5791,30 +5697,22 @@ define void @test_lshr_i512_const_63(ptr %result, ptr %input) {
 ;
 ; GISEL-LABEL: test_lshr_i512_const_63:
 ; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    ldp x8, x9, [x1, #8]
-; GISEL-NEXT:    ldr x11, [x1]
-; GISEL-NEXT:    ldp x10, x14, [x1, #24]
-; GISEL-NEXT:    ldr x16, [x1, #56]
-; GISEL-NEXT:    lsl x12, x8, #1
-; GISEL-NEXT:    lsl x13, x9, #1
-; GISEL-NEXT:    lsl x15, x10, #1
-; GISEL-NEXT:    orr x11, x12, x11, lsr #63
-; GISEL-NEXT:    orr x8, x13, x8, lsr #63
-; GISEL-NEXT:    lsl x13, x14, #1
-; GISEL-NEXT:    orr x9, x15, x9, lsr #63
-; GISEL-NEXT:    ldp x12, x15, [x1, #40]
-; GISEL-NEXT:    stp x11, x8, [x0]
-; GISEL-NEXT:    orr x10, x13, x10, lsr #63
-; GISEL-NEXT:    lsl x8, x16, #1
-; GISEL-NEXT:    lsl x11, x12, #1
-; GISEL-NEXT:    lsl x13, x15, #1
-; GISEL-NEXT:    stp x9, x10, [x0, #16]
-; GISEL-NEXT:    orr x8, x8, x15, lsr #63
-; GISEL-NEXT:    lsr x10, x16, #63
-; GISEL-NEXT:    orr x11, x11, x14, lsr #63
-; GISEL-NEXT:    orr x9, x13, x12, lsr #63
-; GISEL-NEXT:    stp x8, x10, [x0, #48]
-; GISEL-NEXT:    stp x11, x9, [x0, #32]
+; GISEL-NEXT:    ldp x8, x9, [x1]
+; GISEL-NEXT:    ldp x10, x11, [x1, #16]
+; GISEL-NEXT:    ldp x12, x13, [x1, #32]
+; GISEL-NEXT:    extr x8, x9, x8, #63
+; GISEL-NEXT:    ldp x14, x15, [x1, #48]
+; GISEL-NEXT:    extr x9, x10, x9, #63
+; GISEL-NEXT:    extr x10, x11, x10, #63
+; GISEL-NEXT:    stp x8, x9, [x0]
+; GISEL-NEXT:    extr x8, x12, x11, #63
+; GISEL-NEXT:    extr x9, x13, x12, #63
+; GISEL-NEXT:    stp x10, x8, [x0, #16]
+; GISEL-NEXT:    extr x10, x14, x13, #63
+; GISEL-NEXT:    extr x8, x15, x14, #63
+; GISEL-NEXT:    stp x9, x10, [x0, #32]
+; GISEL-NEXT:    lsr x9, x15, #63
+; GISEL-NEXT:    stp x8, x9, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -5846,30 +5744,22 @@ define void @test_ashr_i512_const_63(ptr %result, ptr %input) {
 ;
 ; GISEL-LABEL: test_ashr_i512_const_63:
 ; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    ldp x8, x9, [x1, #8]
-; GISEL-NEXT:    ldr x10, [x1]
-; GISEL-NEXT:    ldp x11, x13, [x1, #24]
-; GISEL-NEXT:    ldr x17, [x1, #56]
-; GISEL-NEXT:    lsl x15, x9, #1
-; GISEL-NEXT:    lsl x12, x8, #1
-; GISEL-NEXT:    lsl x16, x11, #1
-; GISEL-NEXT:    orr x8, x15, x8, lsr #63
-; GISEL-NEXT:    lsl x15, x13, #1
-; GISEL-NEXT:    orr x10, x12, x10, lsr #63
-; GISEL-NEXT:    ldp x14, x12, [x1, #40]
-; GISEL-NEXT:    orr x9, x16, x9, lsr #63
-; GISEL-NEXT:    orr x11, x15, x11, lsr #63
-; GISEL-NEXT:    stp x10, x8, [x0]
-; GISEL-NEXT:    lsl x8, x17, #1
-; GISEL-NEXT:    lsl x16, x14, #1
-; GISEL-NEXT:    lsl x10, x12, #1
-; GISEL-NEXT:    stp x9, x11, [x0, #16]
-; GISEL-NEXT:    asr x9, x17, #63
-; GISEL-NEXT:    orr x8, x8, x12, lsr #63
-; GISEL-NEXT:    orr x13, x16, x13, lsr #63
-; GISEL-NEXT:    orr x10, x10, x14, lsr #63
-; GISEL-NEXT:    orr x9, x9, x9, lsl #1
-; GISEL-NEXT:    stp x13, x10, [x0, #32]
+; GISEL-NEXT:    ldp x8, x9, [x1]
+; GISEL-NEXT:    ldp x10, x11, [x1, #16]
+; GISEL-NEXT:    ldp x12, x13, [x1, #32]
+; GISEL-NEXT:    extr x8, x9, x8, #63
+; GISEL-NEXT:    ldp x14, x15, [x1, #48]
+; GISEL-NEXT:    extr x9, x10, x9, #63
+; GISEL-NEXT:    extr x10, x11, x10, #63
+; GISEL-NEXT:    stp x8, x9, [x0]
+; GISEL-NEXT:    extr x8, x12, x11, #63
+; GISEL-NEXT:    extr x9, x13, x12, #63
+; GISEL-NEXT:    extr x11, x14, x13, #63
+; GISEL-NEXT:    stp x10, x8, [x0, #16]
+; GISEL-NEXT:    asr x10, x15, #63
+; GISEL-NEXT:    extr x8, x15, x14, #63
+; GISEL-NEXT:    stp x9, x11, [x0, #32]
+; GISEL-NEXT:    orr x9, x10, x10, lsl #1
 ; GISEL-NEXT:    stp x8, x9, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
@@ -5906,23 +5796,17 @@ define void @test_shl_i512_const_65(ptr %result, ptr %input) {
 ; GISEL-NEXT:    ldr x15, [x1, #48]
 ; GISEL-NEXT:    ldp x10, x11, [x1, #16]
 ; GISEL-NEXT:    ldp x12, x13, [x1, #32]
-; GISEL-NEXT:    lsr x14, x8, #63
-; GISEL-NEXT:    lsr x16, x9, #63
-; GISEL-NEXT:    lsl x8, x8, #1
-; GISEL-NEXT:    orr x9, x14, x9, lsl #1
-; GISEL-NEXT:    lsr x14, x10, #63
-; GISEL-NEXT:    orr x10, x16, x10, lsl #1
-; GISEL-NEXT:    stp xzr, x8, [x0]
-; GISEL-NEXT:    lsr x8, x11, #63
-; GISEL-NEXT:    orr x11, x14, x11, lsl #1
-; GISEL-NEXT:    lsr x14, x12, #63
-; GISEL-NEXT:    stp x9, x10, [x0, #16]
-; GISEL-NEXT:    lsr x9, x13, #63
-; GISEL-NEXT:    orr x8, x8, x12, lsl #1
-; GISEL-NEXT:    orr x10, x14, x13, lsl #1
-; GISEL-NEXT:    orr x9, x9, x15, lsl #1
-; GISEL-NEXT:    stp x11, x8, [x0, #32]
-; GISEL-NEXT:    stp x10, x9, [x0, #48]
+; GISEL-NEXT:    lsl x14, x8, #1
+; GISEL-NEXT:    extr x8, x9, x8, #63
+; GISEL-NEXT:    extr x9, x10, x9, #63
+; GISEL-NEXT:    extr x10, x11, x10, #63
+; GISEL-NEXT:    stp xzr, x14, [x0]
+; GISEL-NEXT:    stp x8, x9, [x0, #16]
+; GISEL-NEXT:    extr x8, x12, x11, #63
+; GISEL-NEXT:    extr x9, x13, x12, #63
+; GISEL-NEXT:    stp x10, x8, [x0, #32]
+; GISEL-NEXT:    extr x10, x15, x13, #63
+; GISEL-NEXT:    stp x9, x10, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -5953,27 +5837,21 @@ define void @test_lshr_i512_const_65(ptr %result, ptr %input) {
 ;
 ; GISEL-LABEL: test_lshr_i512_const_65:
 ; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    ldp x8, x9, [x1, #16]
-; GISEL-NEXT:    ldr x10, [x1, #8]
-; GISEL-NEXT:    ldp x11, x14, [x1, #32]
-; GISEL-NEXT:    ldp x15, x16, [x1, #48]
-; GISEL-NEXT:    lsl x12, x8, #63
-; GISEL-NEXT:    lsl x13, x9, #63
-; GISEL-NEXT:    orr x10, x12, x10, lsr #1
-; GISEL-NEXT:    lsl x12, x11, #63
-; GISEL-NEXT:    orr x8, x13, x8, lsr #1
-; GISEL-NEXT:    lsl x13, x14, #63
-; GISEL-NEXT:    orr x9, x12, x9, lsr #1
-; GISEL-NEXT:    stp x10, x8, [x0]
-; GISEL-NEXT:    lsl x10, x15, #63
-; GISEL-NEXT:    orr x11, x13, x11, lsr #1
-; GISEL-NEXT:    lsl x12, x16, #63
-; GISEL-NEXT:    orr x8, x10, x14, lsr #1
-; GISEL-NEXT:    lsr x10, x16, #1
-; GISEL-NEXT:    stp x9, x11, [x0, #16]
-; GISEL-NEXT:    orr x9, x12, x15, lsr #1
-; GISEL-NEXT:    stp x10, xzr, [x0, #48]
-; GISEL-NEXT:    stp x8, x9, [x0, #32]
+; GISEL-NEXT:    ldp x8, x9, [x1, #8]
+; GISEL-NEXT:    ldr x14, [x1, #56]
+; GISEL-NEXT:    ldp x10, x11, [x1, #24]
+; GISEL-NEXT:    ldp x12, x13, [x1, #40]
+; GISEL-NEXT:    extr x8, x9, x8, #1
+; GISEL-NEXT:    extr x9, x10, x9, #1
+; GISEL-NEXT:    extr x10, x11, x10, #1
+; GISEL-NEXT:    stp x8, x9, [x0]
+; GISEL-NEXT:    extr x8, x12, x11, #1
+; GISEL-NEXT:    extr x9, x13, x12, #1
+; GISEL-NEXT:    stp x10, x8, [x0, #16]
+; GISEL-NEXT:    extr x10, x14, x13, #1
+; GISEL-NEXT:    lsr x8, x14, #1
+; GISEL-NEXT:    stp x9, x10, [x0, #32]
+; GISEL-NEXT:    stp x8, xzr, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -6005,29 +5883,23 @@ define void @test_ashr_i512_const_65(ptr %result, ptr %input) {
 ;
 ; GISEL-LABEL: test_ashr_i512_const_65:
 ; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    ldp x8, x9, [x1, #16]
-; GISEL-NEXT:    ldr x11, [x1, #8]
-; GISEL-NEXT:    ldp x10, x13, [x1, #32]
-; GISEL-NEXT:    lsl x12, x8, #63
-; GISEL-NEXT:    lsl x14, x9, #63
-; GISEL-NEXT:    lsl x15, x10, #63
-; GISEL-NEXT:    orr x11, x12, x11, lsr #1
-; GISEL-NEXT:    ldp x12, x16, [x1, #48]
-; GISEL-NEXT:    orr x8, x14, x8, lsr #1
-; GISEL-NEXT:    lsl x14, x13, #63
-; GISEL-NEXT:    orr x9, x15, x9, lsr #1
-; GISEL-NEXT:    asr x15, x16, #63
-; GISEL-NEXT:    stp x11, x8, [x0]
-; GISEL-NEXT:    lsl x11, x12, #63
-; GISEL-NEXT:    orr x10, x14, x10, lsr #1
-; GISEL-NEXT:    lsl x14, x16, #63
-; GISEL-NEXT:    orr x8, x11, x13, lsr #1
+; GISEL-NEXT:    ldp x8, x9, [x1, #8]
+; GISEL-NEXT:    ldr x13, [x1, #40]
+; GISEL-NEXT:    ldp x10, x11, [x1, #24]
+; GISEL-NEXT:    ldp x14, x12, [x1, #48]
+; GISEL-NEXT:    extr x8, x9, x8, #1
+; GISEL-NEXT:    extr x9, x10, x9, #1
+; GISEL-NEXT:    extr x10, x11, x10, #1
+; GISEL-NEXT:    asr x15, x12, #63
+; GISEL-NEXT:    stp x8, x9, [x0]
+; GISEL-NEXT:    extr x8, x13, x11, #1
+; GISEL-NEXT:    extr x9, x14, x13, #1
 ; GISEL-NEXT:    lsl x11, x15, #63
-; GISEL-NEXT:    stp x9, x10, [x0, #16]
-; GISEL-NEXT:    orr x9, x14, x12, lsr #1
-; GISEL-NEXT:    orr x10, x11, x16, asr #1
-; GISEL-NEXT:    stp x8, x9, [x0, #32]
-; GISEL-NEXT:    stp x10, x15, [x0, #48]
+; GISEL-NEXT:    stp x10, x8, [x0, #16]
+; GISEL-NEXT:    extr x10, x12, x14, #1
+; GISEL-NEXT:    orr x8, x11, x12, asr #1
+; GISEL-NEXT:    stp x9, x10, [x0, #32]
+; GISEL-NEXT:    stp x8, x15, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -6062,23 +5934,17 @@ define void @test_shl_i512_const_100(ptr %result, ptr %input) {
 ; GISEL-NEXT:    ldr x15, [x1, #48]
 ; GISEL-NEXT:    ldp x10, x11, [x1, #16]
 ; GISEL-NEXT:    ldp x12, x13, [x1, #32]
-; GISEL-NEXT:    lsr x14, x8, #28
-; GISEL-NEXT:    lsr x16, x9, #28
-; GISEL-NEXT:    lsl x8, x8, #36
-; GISEL-NEXT:    orr x9, x14, x9, lsl #36
-; GISEL-NEXT:    lsr x14, x10, #28
-; GISEL-NEXT:    orr x10, x16, x10, lsl #36
-; GISEL-NEXT:    stp xzr, x8, [x0]
-; GISEL-NEXT:    lsr x8, x11, #28
-; GISEL-NEXT:    orr x11, x14, x11, lsl #36
-; GISEL-NEXT:    lsr x14, x12, #28
-; GISEL-NEXT:    stp x9, x10, [x0, #16]
-; GISEL-NEXT:    lsr x9, x13, #28
-; GISEL-NEXT:    orr x8, x8, x12, lsl #36
-; GISEL-NEXT:    orr x10, x14, x13, lsl #36
-; GISEL-NEXT:    orr x9, x9, x15, lsl #36
-; GISEL-NEXT:    stp x11, x8, [x0, #32]
-; GISEL-NEXT:    stp x10, x9, [x0, #48]
+; GISEL-NEXT:    lsl x14, x8, #36
+; GISEL-NEXT:    extr x8, x9, x8, #28
+; GISEL-NEXT:    extr x9, x10, x9, #28
+; GISEL-NEXT:    extr x10, x11, x10, #28
+; GISEL-NEXT:    stp xzr, x14, [x0]
+; GISEL-NEXT:    stp x8, x9, [x0, #16]
+; GISEL-NEXT:    extr x8, x12, x11, #28
+; GISEL-NEXT:    extr x9, x13, x12, #28
+; GISEL-NEXT:    stp x10, x8, [x0, #32]
+; GISEL-NEXT:    extr x10, x15, x13, #28
+; GISEL-NEXT:    stp x9, x10, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -6109,27 +5975,21 @@ define void @test_lshr_i512_const_100(ptr %result, ptr %input) {
 ;
 ; GISEL-LABEL: test_lshr_i512_const_100:
 ; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    ldp x8, x9, [x1, #16]
-; GISEL-NEXT:    ldr x10, [x1, #8]
-; GISEL-NEXT:    ldp x11, x14, [x1, #32]
-; GISEL-NEXT:    ldp x15, x16, [x1, #48]
-; GISEL-NEXT:    lsl x12, x8, #28
-; GISEL-NEXT:    lsl x13, x9, #28
-; GISEL-NEXT:    orr x10, x12, x10, lsr #36
-; GISEL-NEXT:    lsl x12, x11, #28
-; GISEL-NEXT:    orr x8, x13, x8, lsr #36
-; GISEL-NEXT:    lsl x13, x14, #28
-; GISEL-NEXT:    orr x9, x12, x9, lsr #36
-; GISEL-NEXT:    stp x10, x8, [x0]
-; GISEL-NEXT:    lsl x10, x15, #28
-; GISEL-NEXT:    orr x11, x13, x11, lsr #36
-; GISEL-NEXT:    lsl x12, x16, #28
-; GISEL-NEXT:    orr x8, x10, x14, lsr #36
-; GISEL-NEXT:    lsr x10, x16, #36
-; GISEL-NEXT:    stp x9, x11, [x0, #16]
-; GISEL-NEXT:    orr x9, x12, x15, lsr #36
-; GISEL-NEXT:    stp x10, xzr, [x0, #48]
-; GISEL-NEXT:    stp x8, x9, [x0, #32]
+; GISEL-NEXT:    ldp x8, x9, [x1, #8]
+; GISEL-NEXT:    ldr x14, [x1, #56]
+; GISEL-NEXT:    ldp x10, x11, [x1, #24]
+; GISEL-NEXT:    ldp x12, x13, [x1, #40]
+; GISEL-NEXT:    extr x8, x9, x8, #36
+; GISEL-NEXT:    extr x9, x10, x9, #36
+; GISEL-NEXT:    extr x10, x11, x10, #36
+; GISEL-NEXT:    stp x8, x9, [x0]
+; GISEL-NEXT:    extr x8, x12, x11, #36
+; GISEL-NEXT:    extr x9, x13, x12, #36
+; GISEL-NEXT:    stp x10, x8, [x0, #16]
+; GISEL-NEXT:    extr x10, x14, x13, #36
+; GISEL-NEXT:    lsr x8, x14, #36
+; GISEL-NEXT:    stp x9, x10, [x0, #32]
+; GISEL-NEXT:    stp x8, xzr, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -6161,29 +6021,23 @@ define void @test_ashr_i512_const_100(ptr %result, ptr %input) {
 ;
 ; GISEL-LABEL: test_ashr_i512_const_100:
 ; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    ldp x8, x9, [x1, #16]
-; GISEL-NEXT:    ldr x11, [x1, #8]
-; GISEL-NEXT:    ldp x10, x13, [x1, #32]
-; GISEL-NEXT:    lsl x12, x8, #28
-; GISEL-NEXT:    lsl x14, x9, #28
-; GISEL-NEXT:    lsl x15, x10, #28
-; GISEL-NEXT:    orr x11, x12, x11, lsr #36
-; GISEL-NEXT:    ldp x12, x16, [x1, #48]
-; GISEL-NEXT:    orr x8, x14, x8, lsr #36
-; GISEL-NEXT:    lsl x14, x13, #28
-; GISEL-NEXT:    orr x9, x15, x9, lsr #36
-; GISEL-NEXT:    asr x15, x16, #63
-; GISEL-NEXT:    stp x11, x8, [x0]
-; GISEL-NEXT:    lsl x11, x12, #28
-; GISEL-NEXT:    orr x10, x14, x10, lsr #36
-; GISEL-NEXT:    lsl x14, x16, #28
-; GISEL-NEXT:    orr x8, x11, x13, lsr #36
+; GISEL-NEXT:    ldp x8, x9, [x1, #8]
+; GISEL-NEXT:    ldr x13, [x1, #40]
+; GISEL-NEXT:    ldp x10, x11, [x1, #24]
+; GISEL-NEXT:    ldp x14, x12, [x1, #48]
+; GISEL-NEXT:    extr x8, x9, x8, #36
+; GISEL-NEXT:    extr x9, x10, x9, #36
+; GISEL-NEXT:    extr x10, x11, x10, #36
+; GISEL-NEXT:    asr x15, x12, #63
+; GISEL-NEXT:    stp x8, x9, [x0]
+; GISEL-NEXT:    extr x8, x13, x11, #36
+; GISEL-NEXT:    extr x9, x14, x13, #36
 ; GISEL-NEXT:    lsl x11, x15, #28
-; GISEL-NEXT:    stp x9, x10, [x0, #16]
-; GISEL-NEXT:    orr x9, x14, x12, lsr #36
-; GISEL-NEXT:    orr x10, x11, x16, asr #36
-; GISEL-NEXT:    stp x8, x9, [x0, #32]
-; GISEL-NEXT:    stp x10, x15, [x0, #48]
+; GISEL-NEXT:    stp x10, x8, [x0, #16]
+; GISEL-NEXT:    extr x10, x12, x14, #36
+; GISEL-NEXT:    orr x8, x11, x12, asr #36
+; GISEL-NEXT:    stp x9, x10, [x0, #32]
+; GISEL-NEXT:    stp x8, x15, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -6219,23 +6073,17 @@ define void @test_shl_i512_const_127(ptr %result, ptr %input) {
 ; GISEL-NEXT:    ldr x15, [x1, #48]
 ; GISEL-NEXT:    ldp x10, x11, [x1, #16]
 ; GISEL-NEXT:    ldp x12, x13, [x1, #32]
-; GISEL-NEXT:    lsr x14, x8, #1
-; GISEL-NEXT:    lsr x16, x9, #1
-; GISEL-NEXT:    lsl x8, x8, #63
-; GISEL-NEXT:    orr x9, x14, x9, lsl #63
-; GISEL-NEXT:    lsr x14, x10, #1
-; GISEL-NEXT:    orr x10, x16, x10, lsl #63
-; GISEL-NEXT:    stp xzr, x8, [x0]
-; GISEL-NEXT:    lsr x8, x11, #1
-; GISEL-NEXT:    orr x11, x14, x11, lsl #63
-; GISEL-NEXT:    lsr x14, x12, #1
-; GISEL-NEXT:    stp x9, x10, [x0, #16]
-; GISEL-NEXT:    lsr x9, x13, #1
-; GISEL-NEXT:    orr x8, x8, x12, lsl #63
-; GISEL-NEXT:    orr x10, x14, x13, lsl #63
-; GISEL-NEXT:    orr x9, x9, x15, lsl #63
-; GISEL-NEXT:    stp x11, x8, [x0, #32]
-; GISEL-NEXT:    stp x10, x9, [x0, #48]
+; GISEL-NEXT:    lsl x14, x8, #63
+; GISEL-NEXT:    extr x8, x9, x8, #1
+; GISEL-NEXT:    extr x9, x10, x9, #1
+; GISEL-NEXT:    extr x10, x11, x10, #1
+; GISEL-NEXT:    stp xzr, x14, [x0]
+; GISEL-NEXT:    stp x8, x9, [x0, #16]
+; GISEL-NEXT:    extr x8, x12, x11, #1
+; GISEL-NEXT:    extr x9, x13, x12, #1
+; GISEL-NEXT:    stp x10, x8, [x0, #32]
+; GISEL-NEXT:    extr x10, x15, x13, #1
+; GISEL-NEXT:    stp x9, x10, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -6266,27 +6114,21 @@ define void @test_lshr_i512_const_127(ptr %result, ptr %input) {
 ;
 ; GISEL-LABEL: test_lshr_i512_const_127:
 ; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    ldp x8, x9, [x1, #16]
-; GISEL-NEXT:    ldr x10, [x1, #8]
-; GISEL-NEXT:    ldp x11, x14, [x1, #32]
-; GISEL-NEXT:    ldp x15, x16, [x1, #48]
-; GISEL-NEXT:    lsl x12, x8, #1
-; GISEL-NEXT:    lsl x13, x9, #1
-; GISEL-NEXT:    orr x10, x12, x10, lsr #63
-; GISEL-NEXT:    lsl x12, x11, #1
-; GISEL-NEXT:    orr x8, x13, x8, lsr #63
-; GISEL-NEXT:    lsl x13, x14, #1
-; GISEL-NEXT:    orr x9, x12, x9, lsr #63
-; GISEL-NEXT:    stp x10, x8, [x0]
-; GISEL-NEXT:    lsl x10, x15, #1
-; GISEL-NEXT:    orr x11, x13, x11, lsr #63
-; GISEL-NEXT:    lsl x12, x16, #1
-; GISEL-NEXT:    orr x8, x10, x14, lsr #63
-; GISEL-NEXT:    lsr x10, x16, #63
-; GISEL-NEXT:    stp x9, x11, [x0, #16]
-; GISEL-NEXT:    orr x9, x12, x15, lsr #63
-; GISEL-NEXT:    stp x10, xzr, [x0, #48]
-; GISEL-NEXT:    stp x8, x9, [x0, #32]
+; GISEL-NEXT:    ldp x8, x9, [x1, #8]
+; GISEL-NEXT:    ldr x14, [x1, #56]
+; GISEL-NEXT:    ldp x10, x11, [x1, #24]
+; GISEL-NEXT:    ldp x12, x13, [x1, #40]
+; GISEL-NEXT:    extr x8, x9, x8, #63
+; GISEL-NEXT:    extr x9, x10, x9, #63
+; GISEL-NEXT:    extr x10, x11, x10, #63
+; GISEL-NEXT:    stp x8, x9, [x0]
+; GISEL-NEXT:    extr x8, x12, x11, #63
+; GISEL-NEXT:    extr x9, x13, x12, #63
+; GISEL-NEXT:    stp x10, x8, [x0, #16]
+; GISEL-NEXT:    extr x10, x14, x13, #63
+; GISEL-NEXT:    lsr x8, x14, #63
+; GISEL-NEXT:    stp x9, x10, [x0, #32]
+; GISEL-NEXT:    stp x8, xzr, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -6317,28 +6159,22 @@ define void @test_ashr_i512_const_127(ptr %result, ptr %input) {
 ;
 ; GISEL-LABEL: test_ashr_i512_const_127:
 ; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    ldp x8, x9, [x1, #16]
-; GISEL-NEXT:    ldr x10, [x1, #8]
-; GISEL-NEXT:    ldp x11, x14, [x1, #32]
-; GISEL-NEXT:    ldp x15, x16, [x1, #48]
-; GISEL-NEXT:    lsl x12, x8, #1
-; GISEL-NEXT:    lsl x13, x9, #1
-; GISEL-NEXT:    orr x10, x12, x10, lsr #63
-; GISEL-NEXT:    lsl x12, x11, #1
-; GISEL-NEXT:    orr x8, x13, x8, lsr #63
-; GISEL-NEXT:    lsl x13, x14, #1
-; GISEL-NEXT:    orr x9, x12, x9, lsr #63
-; GISEL-NEXT:    lsl x12, x15, #1
-; GISEL-NEXT:    stp x10, x8, [x0]
-; GISEL-NEXT:    lsl x10, x16, #1
-; GISEL-NEXT:    orr x11, x13, x11, lsr #63
-; GISEL-NEXT:    asr x8, x16, #63
-; GISEL-NEXT:    orr x12, x12, x14, lsr #63
-; GISEL-NEXT:    stp x9, x11, [x0, #16]
-; GISEL-NEXT:    orr x9, x10, x15, lsr #63
-; GISEL-NEXT:    orr x10, x8, x8, lsl #1
-; GISEL-NEXT:    stp x12, x9, [x0, #32]
-; GISEL-NEXT:    stp x10, x8, [x0, #48]
+; GISEL-NEXT:    ldp x8, x9, [x1, #8]
+; GISEL-NEXT:    ldr x14, [x1, #56]
+; GISEL-NEXT:    ldp x10, x11, [x1, #24]
+; GISEL-NEXT:    ldp x12, x13, [x1, #40]
+; GISEL-NEXT:    extr x8, x9, x8, #63
+; GISEL-NEXT:    extr x9, x10, x9, #63
+; GISEL-NEXT:    extr x10, x11, x10, #63
+; GISEL-NEXT:    stp x8, x9, [x0]
+; GISEL-NEXT:    extr x8, x12, x11, #63
+; GISEL-NEXT:    asr x9, x14, #63
+; GISEL-NEXT:    extr x11, x13, x12, #63
+; GISEL-NEXT:    stp x10, x8, [x0, #16]
+; GISEL-NEXT:    extr x10, x14, x13, #63
+; GISEL-NEXT:    orr x8, x9, x9, lsl #1
+; GISEL-NEXT:    stp x11, x10, [x0, #32]
+; GISEL-NEXT:    stp x8, x9, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
index b54f262dbbf4a..4894932d3c9b1 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -755,199 +755,117 @@ define i64 @red_mla_dup_ext_u8_s8_s64(ptr noalias noundef readonly captures(none
 ; CHECK-SD-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-SD-NEXT:    cbz w2, .LBB6_3
 ; CHECK-SD-NEXT:  // %bb.1: // %iter.check
-; CHECK-SD-NEXT:    str x25, [sp, #-64]! // 8-byte Folded Spill
-; CHECK-SD-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    .cfi_def_cfa_offset 64
-; CHECK-SD-NEXT:    .cfi_offset w19, -8
-; CHECK-SD-NEXT:    .cfi_offset w20, -16
-; CHECK-SD-NEXT:    .cfi_offset w21, -24
-; CHECK-SD-NEXT:    .cfi_offset w22, -32
-; CHECK-SD-NEXT:    .cfi_offset w23, -40
-; CHECK-SD-NEXT:    .cfi_offset w24, -48
-; CHECK-SD-NEXT:    .cfi_offset w25, -64
-; CHECK-SD-NEXT:    sxtb x9, w1
 ; CHECK-SD-NEXT:    cmp w2, #3
-; CHECK-SD-NEXT:    mov w10, w2
+; CHECK-SD-NEXT:    mov w9, w2
 ; CHECK-SD-NEXT:    b.hi .LBB6_4
 ; CHECK-SD-NEXT:  // %bb.2:
-; CHECK-SD-NEXT:    mov x11, xzr
+; CHECK-SD-NEXT:    mov x10, xzr
 ; CHECK-SD-NEXT:    mov x8, xzr
 ; CHECK-SD-NEXT:    b .LBB6_13
 ; CHECK-SD-NEXT:  .LBB6_3:
-; CHECK-SD-NEXT:    mov x0, xzr
+; CHECK-SD-NEXT:    mov x8, xzr
+; CHECK-SD-NEXT:    mov x0, x8
 ; CHECK-SD-NEXT:    ret
 ; CHECK-SD-NEXT:  .LBB6_4: // %vector.main.loop.iter.check
-; CHECK-SD-NEXT:    dup v0.2d, x9
 ; CHECK-SD-NEXT:    cmp w2, #16
 ; CHECK-SD-NEXT:    b.hs .LBB6_6
 ; CHECK-SD-NEXT:  // %bb.5:
-; CHECK-SD-NEXT:    mov x11, xzr
+; CHECK-SD-NEXT:    mov x10, xzr
 ; CHECK-SD-NEXT:    mov x8, xzr
 ; CHECK-SD-NEXT:    b .LBB6_10
 ; CHECK-SD-NEXT:  .LBB6_6: // %vector.ph
+; CHECK-SD-NEXT:    mov w8, w1
+; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
 ; CHECK-SD-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-SD-NEXT:    mov x8, v0.d[1]
-; CHECK-SD-NEXT:    and x12, x10, #0xc
+; CHECK-SD-NEXT:    sxtb x8, w8
+; CHECK-SD-NEXT:    movi v3.2d, #0000000000000000
 ; CHECK-SD-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-SD-NEXT:    movi v6.2d, #0000000000000000
 ; CHECK-SD-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-SD-NEXT:    and x11, x10, #0xfffffff0
-; CHECK-SD-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-SD-NEXT:    and x11, x9, #0xc
 ; CHECK-SD-NEXT:    movi v7.2d, #0000000000000000
-; CHECK-SD-NEXT:    mov x15, x0
 ; CHECK-SD-NEXT:    movi v5.2d, #0000000000000000
-; CHECK-SD-NEXT:    movi v16.2d, #0000000000000000
-; CHECK-SD-NEXT:    and x16, x10, #0xfffffff0
-; CHECK-SD-NEXT:    movi v6.2d, #0000000000000000
-; CHECK-SD-NEXT:    fmov x13, d0
-; CHECK-SD-NEXT:    fmov x14, d0
+; CHECK-SD-NEXT:    and x10, x9, #0xfffffff0
+; CHECK-SD-NEXT:    dup v16.4s, w8
+; CHECK-SD-NEXT:    mov x8, x0
+; CHECK-SD-NEXT:    and x12, x9, #0xfffffff0
 ; CHECK-SD-NEXT:  .LBB6_7: // %vector.body
 ; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-SD-NEXT:    ldr q17, [x15], #16
-; CHECK-SD-NEXT:    subs x16, x16, #16
+; CHECK-SD-NEXT:    ldr q17, [x8], #16
+; CHECK-SD-NEXT:    subs x12, x12, #16
 ; CHECK-SD-NEXT:    ushll v18.8h, v17.8b, #0
-; CHECK-SD-NEXT:    ushll2 v19.8h, v17.16b, #0
-; CHECK-SD-NEXT:    ushll v17.4s, v18.4h, #0
-; CHECK-SD-NEXT:    ushll2 v20.4s, v19.8h, #0
-; CHECK-SD-NEXT:    ushll2 v18.4s, v18.8h, #0
-; CHECK-SD-NEXT:    ushll v19.4s, v19.4h, #0
-; CHECK-SD-NEXT:    ushll v21.2d, v17.2s, #0
-; CHECK-SD-NEXT:    ushll2 v22.2d, v20.4s, #0
-; CHECK-SD-NEXT:    ushll2 v17.2d, v17.4s, #0
-; CHECK-SD-NEXT:    ushll v23.2d, v18.2s, #0
-; CHECK-SD-NEXT:    ushll v20.2d, v20.2s, #0
-; CHECK-SD-NEXT:    ushll2 v18.2d, v18.4s, #0
-; CHECK-SD-NEXT:    fmov x17, d21
-; CHECK-SD-NEXT:    mov x2, v21.d[1]
-; CHECK-SD-NEXT:    ushll v21.2d, v19.2s, #0
-; CHECK-SD-NEXT:    ushll2 v19.2d, v19.4s, #0
-; CHECK-SD-NEXT:    fmov x18, d22
-; CHECK-SD-NEXT:    fmov x1, d17
-; CHECK-SD-NEXT:    fmov x3, d23
-; CHECK-SD-NEXT:    fmov x21, d20
-; CHECK-SD-NEXT:    fmov x22, d18
-; CHECK-SD-NEXT:    fmov x19, d21
-; CHECK-SD-NEXT:    mul x17, x13, x17
-; CHECK-SD-NEXT:    mov x4, v22.d[1]
-; CHECK-SD-NEXT:    fmov x24, d19
-; CHECK-SD-NEXT:    mov x5, v23.d[1]
-; CHECK-SD-NEXT:    mov x6, v21.d[1]
-; CHECK-SD-NEXT:    mov x7, v20.d[1]
-; CHECK-SD-NEXT:    mov x20, v18.d[1]
-; CHECK-SD-NEXT:    mov x23, v19.d[1]
-; CHECK-SD-NEXT:    mov x25, v17.d[1]
-; CHECK-SD-NEXT:    mul x18, x14, x18
-; CHECK-SD-NEXT:    mul x1, x13, x1
-; CHECK-SD-NEXT:    fmov d17, x17
-; CHECK-SD-NEXT:    mul x3, x13, x3
-; CHECK-SD-NEXT:    fmov d18, x18
-; CHECK-SD-NEXT:    mul x19, x13, x19
-; CHECK-SD-NEXT:    fmov d19, x1
-; CHECK-SD-NEXT:    mul x21, x13, x21
-; CHECK-SD-NEXT:    fmov d20, x3
-; CHECK-SD-NEXT:    mul x22, x13, x22
-; CHECK-SD-NEXT:    fmov d21, x19
-; CHECK-SD-NEXT:    mul x24, x13, x24
-; CHECK-SD-NEXT:    fmov d24, x21
-; CHECK-SD-NEXT:    mul x2, x8, x2
-; CHECK-SD-NEXT:    fmov d22, x22
-; CHECK-SD-NEXT:    mul x4, x8, x4
-; CHECK-SD-NEXT:    fmov d23, x24
-; CHECK-SD-NEXT:    mul x5, x8, x5
-; CHECK-SD-NEXT:    mov v17.d[1], x2
-; CHECK-SD-NEXT:    mul x6, x8, x6
-; CHECK-SD-NEXT:    mov v18.d[1], x4
-; CHECK-SD-NEXT:    mul x7, x8, x7
-; CHECK-SD-NEXT:    mov v20.d[1], x5
-; CHECK-SD-NEXT:    add v1.2d, v17.2d, v1.2d
-; CHECK-SD-NEXT:    mul x20, x8, x20
-; CHECK-SD-NEXT:    mov v21.d[1], x6
-; CHECK-SD-NEXT:    add v6.2d, v18.2d, v6.2d
-; CHECK-SD-NEXT:    mul x23, x8, x23
-; CHECK-SD-NEXT:    mov v24.d[1], x7
-; CHECK-SD-NEXT:    add v4.2d, v20.2d, v4.2d
-; CHECK-SD-NEXT:    mul x17, x8, x25
-; CHECK-SD-NEXT:    mov v22.d[1], x20
-; CHECK-SD-NEXT:    add v7.2d, v21.2d, v7.2d
-; CHECK-SD-NEXT:    mov v23.d[1], x23
-; CHECK-SD-NEXT:    add v16.2d, v24.2d, v16.2d
-; CHECK-SD-NEXT:    mov v19.d[1], x17
-; CHECK-SD-NEXT:    add v3.2d, v22.2d, v3.2d
-; CHECK-SD-NEXT:    add v5.2d, v23.2d, v5.2d
-; CHECK-SD-NEXT:    add v2.2d, v19.2d, v2.2d
+; CHECK-SD-NEXT:    ushll2 v17.8h, v17.16b, #0
+; CHECK-SD-NEXT:    ushll2 v19.4s, v18.8h, #0
+; CHECK-SD-NEXT:    ushll v20.4s, v17.4h, #0
+; CHECK-SD-NEXT:    ushll v18.4s, v18.4h, #0
+; CHECK-SD-NEXT:    ushll2 v17.4s, v17.8h, #0
+; CHECK-SD-NEXT:    smlal2 v2.2d, v16.4s, v19.4s
+; CHECK-SD-NEXT:    smlal2 v4.2d, v16.4s, v20.4s
+; CHECK-SD-NEXT:    smlal v6.2d, v16.2s, v20.2s
+; CHECK-SD-NEXT:    smlal v3.2d, v16.2s, v19.2s
+; CHECK-SD-NEXT:    smlal2 v1.2d, v16.4s, v18.4s
+; CHECK-SD-NEXT:    smlal v7.2d, v16.2s, v17.2s
+; CHECK-SD-NEXT:    smlal v0.2d, v16.2s, v18.2s
+; CHECK-SD-NEXT:    smlal2 v5.2d, v16.4s, v17.4s
 ; CHECK-SD-NEXT:    b.ne .LBB6_7
 ; CHECK-SD-NEXT:  // %bb.8: // %middle.block
-; CHECK-SD-NEXT:    add v1.2d, v1.2d, v7.2d
-; CHECK-SD-NEXT:    add v4.2d, v4.2d, v16.2d
-; CHECK-SD-NEXT:    cmp x11, x10
-; CHECK-SD-NEXT:    add v2.2d, v2.2d, v5.2d
-; CHECK-SD-NEXT:    add v3.2d, v3.2d, v6.2d
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v6.2d
+; CHECK-SD-NEXT:    add v3.2d, v3.2d, v7.2d
+; CHECK-SD-NEXT:    cmp x10, x9
 ; CHECK-SD-NEXT:    add v1.2d, v1.2d, v4.2d
-; CHECK-SD-NEXT:    add v2.2d, v2.2d, v3.2d
+; CHECK-SD-NEXT:    add v2.2d, v2.2d, v5.2d
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v3.2d
 ; CHECK-SD-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-SD-NEXT:    addp d1, v1.2d
-; CHECK-SD-NEXT:    fmov x8, d1
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT:    addp d0, v0.2d
+; CHECK-SD-NEXT:    fmov x8, d0
 ; CHECK-SD-NEXT:    b.eq .LBB6_15
 ; CHECK-SD-NEXT:  // %bb.9: // %vec.epilog.iter.check
-; CHECK-SD-NEXT:    cbz x12, .LBB6_13
+; CHECK-SD-NEXT:    cbz x11, .LBB6_13
 ; CHECK-SD-NEXT:  .LBB6_10: // %vec.epilog.ph
+; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-SD-NEXT:    mov w11, w1
 ; CHECK-SD-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-SD-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-SD-NEXT:    mov x13, x11
+; CHECK-SD-NEXT:    sxtb x11, w11
 ; CHECK-SD-NEXT:    movi v3.2d, #0x000000000000ff
-; CHECK-SD-NEXT:    fmov x14, d0
-; CHECK-SD-NEXT:    and x11, x10, #0xfffffffc
-; CHECK-SD-NEXT:    fmov x15, d0
-; CHECK-SD-NEXT:    sub x12, x13, x11
-; CHECK-SD-NEXT:    add x13, x0, x13
-; CHECK-SD-NEXT:    mov v1.d[0], x8
-; CHECK-SD-NEXT:    mov x8, v0.d[1]
+; CHECK-SD-NEXT:    dup v2.2s, w11
+; CHECK-SD-NEXT:    mov x11, x10
+; CHECK-SD-NEXT:    and x10, x9, #0xfffffffc
+; CHECK-SD-NEXT:    mov v0.d[0], x8
+; CHECK-SD-NEXT:    sub x8, x11, x10
+; CHECK-SD-NEXT:    add x11, x0, x11
 ; CHECK-SD-NEXT:  .LBB6_11: // %vec.epilog.vector.body
 ; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-SD-NEXT:    ldr s0, [x13], #4
-; CHECK-SD-NEXT:    adds x12, x12, #4
-; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-NEXT:    ushll v4.2d, v0.2s, #0
-; CHECK-SD-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-SD-NEXT:    ldr s4, [x11], #4
+; CHECK-SD-NEXT:    adds x8, x8, #4
+; CHECK-SD-NEXT:    ushll v4.8h, v4.8b, #0
+; CHECK-SD-NEXT:    ushll v4.4s, v4.4h, #0
+; CHECK-SD-NEXT:    ushll v5.2d, v4.2s, #0
+; CHECK-SD-NEXT:    ushll2 v4.2d, v4.4s, #0
+; CHECK-SD-NEXT:    and v5.16b, v5.16b, v3.16b
 ; CHECK-SD-NEXT:    and v4.16b, v4.16b, v3.16b
-; CHECK-SD-NEXT:    and v0.16b, v0.16b, v3.16b
-; CHECK-SD-NEXT:    fmov x16, d4
-; CHECK-SD-NEXT:    fmov x18, d0
-; CHECK-SD-NEXT:    mov x17, v4.d[1]
-; CHECK-SD-NEXT:    mov x1, v0.d[1]
-; CHECK-SD-NEXT:    mul x16, x14, x16
-; CHECK-SD-NEXT:    mul x18, x15, x18
-; CHECK-SD-NEXT:    mul x17, x8, x17
-; CHECK-SD-NEXT:    fmov d0, x16
-; CHECK-SD-NEXT:    mul x1, x8, x1
-; CHECK-SD-NEXT:    fmov d4, x18
-; CHECK-SD-NEXT:    mov v0.d[1], x17
-; CHECK-SD-NEXT:    mov v4.d[1], x1
-; CHECK-SD-NEXT:    add v1.2d, v0.2d, v1.2d
-; CHECK-SD-NEXT:    add v2.2d, v4.2d, v2.2d
+; CHECK-SD-NEXT:    xtn v5.2s, v5.2d
+; CHECK-SD-NEXT:    xtn v4.2s, v4.2d
+; CHECK-SD-NEXT:    smlal v1.2d, v2.2s, v4.2s
+; CHECK-SD-NEXT:    smlal v0.2d, v2.2s, v5.2s
 ; CHECK-SD-NEXT:    b.ne .LBB6_11
 ; CHECK-SD-NEXT:  // %bb.12: // %vec.epilog.middle.block
-; CHECK-SD-NEXT:    add v0.2d, v1.2d, v2.2d
-; CHECK-SD-NEXT:    cmp x11, x10
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT:    cmp x10, x9
 ; CHECK-SD-NEXT:    addp d0, v0.2d
 ; CHECK-SD-NEXT:    fmov x8, d0
 ; CHECK-SD-NEXT:    b.eq .LBB6_15
 ; CHECK-SD-NEXT:  .LBB6_13: // %for.body.preheader
-; CHECK-SD-NEXT:    sub x10, x10, x11
-; CHECK-SD-NEXT:    add x11, x0, x11
+; CHECK-SD-NEXT:    sxtb x11, w1
+; CHECK-SD-NEXT:    sub x9, x9, x10
+; CHECK-SD-NEXT:    add x10, x0, x10
 ; CHECK-SD-NEXT:  .LBB6_14: // %for.body
 ; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-SD-NEXT:    ldrb w12, [x11], #1
-; CHECK-SD-NEXT:    subs x10, x10, #1
-; CHECK-SD-NEXT:    smaddl x8, w12, w9, x8
+; CHECK-SD-NEXT:    ldrb w12, [x10], #1
+; CHECK-SD-NEXT:    subs x9, x9, #1
+; CHECK-SD-NEXT:    smaddl x8, w12, w11, x8
 ; CHECK-SD-NEXT:    b.ne .LBB6_14
-; CHECK-SD-NEXT:  .LBB6_15:
-; CHECK-SD-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-SD-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
-; CHECK-SD-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
-; CHECK-SD-NEXT:    ldr x25, [sp], #64 // 8-byte Folded Reload
+; CHECK-SD-NEXT:  .LBB6_15: // %for.cond.cleanup
 ; CHECK-SD-NEXT:    mov x0, x8
 ; CHECK-SD-NEXT:    ret
 ;
@@ -957,63 +875,64 @@ define i64 @red_mla_dup_ext_u8_s8_s64(ptr noalias noundef readonly captures(none
 ; CHECK-GI-NEXT:    cbz w2, .LBB6_7
 ; CHECK-GI-NEXT:  // %bb.1: // %iter.check
 ; CHECK-GI-NEXT:    movi d0, #0000000000000000
-; CHECK-GI-NEXT:    sxtb x9, w1
-; CHECK-GI-NEXT:    mov x11, xzr
+; CHECK-GI-NEXT:    mov x10, xzr
 ; CHECK-GI-NEXT:    cmp w2, #4
-; CHECK-GI-NEXT:    mov w10, w2
+; CHECK-GI-NEXT:    mov w9, w2
 ; CHECK-GI-NEXT:    b.lo .LBB6_12
 ; CHECK-GI-NEXT:  // %bb.2: // %vector.main.loop.iter.check
 ; CHECK-GI-NEXT:    movi d0, #0000000000000000
-; CHECK-GI-NEXT:    dup v1.2d, x9
-; CHECK-GI-NEXT:    mov x11, xzr
+; CHECK-GI-NEXT:    mov x10, xzr
 ; CHECK-GI-NEXT:    cmp w2, #16
 ; CHECK-GI-NEXT:    b.lo .LBB6_9
 ; CHECK-GI-NEXT:  // %bb.3: // %vector.ph
+; CHECK-GI-NEXT:    mov w8, w1
 ; CHECK-GI-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-GI-NEXT:    xtn v2.2s, v1.2d
-; CHECK-GI-NEXT:    and x8, x10, #0xc
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    sxtb x8, w8
+; CHECK-GI-NEXT:    movi v2.2d, #0000000000000000
 ; CHECK-GI-NEXT:    movi v3.2d, #0000000000000000
 ; CHECK-GI-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-GI-NEXT:    and x11, x10, #0xfffffff0
-; CHECK-GI-NEXT:    movi v5.2d, #0000000000000000
 ; CHECK-GI-NEXT:    movi v6.2d, #0000000000000000
-; CHECK-GI-NEXT:    mov x12, x0
+; CHECK-GI-NEXT:    and x10, x9, #0xfffffff0
+; CHECK-GI-NEXT:    dup v5.2d, x8
 ; CHECK-GI-NEXT:    movi v7.2d, #0000000000000000
-; CHECK-GI-NEXT:    movi v16.2d, #0000000000000000
-; CHECK-GI-NEXT:    and x13, x10, #0xfffffff0
-; CHECK-GI-NEXT:    movi v17.2d, #0000000000000000
+; CHECK-GI-NEXT:    and x8, x9, #0xc
+; CHECK-GI-NEXT:    mov x11, x0
+; CHECK-GI-NEXT:    and x12, x9, #0xfffffff0
+; CHECK-GI-NEXT:    xtn v16.2s, v5.2d
+; CHECK-GI-NEXT:    movi v5.2d, #0000000000000000
 ; CHECK-GI-NEXT:  .LBB6_4: // %vector.body
 ; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-GI-NEXT:    ldr q18, [x12], #16
-; CHECK-GI-NEXT:    subs x13, x13, #16
-; CHECK-GI-NEXT:    ushll v19.8h, v18.8b, #0
-; CHECK-GI-NEXT:    ushll2 v18.8h, v18.16b, #0
-; CHECK-GI-NEXT:    ushll v20.4s, v19.4h, #0
-; CHECK-GI-NEXT:    ushll2 v19.4s, v19.8h, #0
-; CHECK-GI-NEXT:    ushll v21.4s, v18.4h, #0
+; CHECK-GI-NEXT:    ldr q17, [x11], #16
+; CHECK-GI-NEXT:    subs x12, x12, #16
+; CHECK-GI-NEXT:    ushll v18.8h, v17.8b, #0
+; CHECK-GI-NEXT:    ushll2 v17.8h, v17.16b, #0
+; CHECK-GI-NEXT:    ushll v19.4s, v18.4h, #0
 ; CHECK-GI-NEXT:    ushll2 v18.4s, v18.8h, #0
-; CHECK-GI-NEXT:    mov d22, v20.d[1]
-; CHECK-GI-NEXT:    mov d23, v19.d[1]
-; CHECK-GI-NEXT:    mov d24, v21.d[1]
-; CHECK-GI-NEXT:    mov d25, v18.d[1]
-; CHECK-GI-NEXT:    smlal v0.2d, v2.2s, v20.2s
-; CHECK-GI-NEXT:    smlal v4.2d, v2.2s, v19.2s
-; CHECK-GI-NEXT:    smlal v6.2d, v2.2s, v21.2s
-; CHECK-GI-NEXT:    smlal v16.2d, v2.2s, v18.2s
-; CHECK-GI-NEXT:    smlal v3.2d, v2.2s, v22.2s
-; CHECK-GI-NEXT:    smlal v5.2d, v2.2s, v23.2s
-; CHECK-GI-NEXT:    smlal v7.2d, v2.2s, v24.2s
-; CHECK-GI-NEXT:    smlal v17.2d, v2.2s, v25.2s
+; CHECK-GI-NEXT:    ushll v20.4s, v17.4h, #0
+; CHECK-GI-NEXT:    ushll2 v17.4s, v17.8h, #0
+; CHECK-GI-NEXT:    mov d21, v19.d[1]
+; CHECK-GI-NEXT:    mov d22, v18.d[1]
+; CHECK-GI-NEXT:    mov d23, v20.d[1]
+; CHECK-GI-NEXT:    mov d24, v17.d[1]
+; CHECK-GI-NEXT:    smlal v0.2d, v16.2s, v19.2s
+; CHECK-GI-NEXT:    smlal v2.2d, v16.2s, v18.2s
+; CHECK-GI-NEXT:    smlal v4.2d, v16.2s, v20.2s
+; CHECK-GI-NEXT:    smlal v6.2d, v16.2s, v17.2s
+; CHECK-GI-NEXT:    smlal v1.2d, v16.2s, v21.2s
+; CHECK-GI-NEXT:    smlal v3.2d, v16.2s, v22.2s
+; CHECK-GI-NEXT:    smlal v5.2d, v16.2s, v23.2s
+; CHECK-GI-NEXT:    smlal v7.2d, v16.2s, v24.2s
 ; CHECK-GI-NEXT:    b.ne .LBB6_4
 ; CHECK-GI-NEXT:  // %bb.5: // %middle.block
-; CHECK-GI-NEXT:    add v0.2d, v0.2d, v3.2d
+; CHECK-GI-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    add v1.2d, v2.2d, v3.2d
+; CHECK-GI-NEXT:    cmp x10, x9
 ; CHECK-GI-NEXT:    add v2.2d, v4.2d, v5.2d
-; CHECK-GI-NEXT:    cmp x11, x10
 ; CHECK-GI-NEXT:    add v3.2d, v6.2d, v7.2d
-; CHECK-GI-NEXT:    add v4.2d, v16.2d, v17.2d
-; CHECK-GI-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-GI-NEXT:    add v2.2d, v3.2d, v4.2d
-; CHECK-GI-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    add v1.2d, v2.2d, v3.2d
+; CHECK-GI-NEXT:    add v0.2d, v0.2d, v1.2d
 ; CHECK-GI-NEXT:    addp d0, v0.2d
 ; CHECK-GI-NEXT:    b.ne .LBB6_8
 ; CHECK-GI-NEXT:  // %bb.6:
@@ -1027,50 +946,54 @@ define i64 @red_mla_dup_ext_u8_s8_s64(ptr noalias noundef readonly captures(none
 ; CHECK-GI-NEXT:  .LBB6_8: // %vec.epilog.iter.check
 ; CHECK-GI-NEXT:    cbz x8, .LBB6_12
 ; CHECK-GI-NEXT:  .LBB6_9: // %vec.epilog.ph
+; CHECK-GI-NEXT:    mov w8, w1
 ; CHECK-GI-NEXT:    mov v0.d[1], xzr
-; CHECK-GI-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-GI-NEXT:    mov x12, x11
-; CHECK-GI-NEXT:    xtn v1.2s, v1.2d
-; CHECK-GI-NEXT:    and x11, x10, #0xfffffffc
-; CHECK-GI-NEXT:    sub x8, x12, x11
-; CHECK-GI-NEXT:    add x12, x0, x12
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    sxtb x8, w8
+; CHECK-GI-NEXT:    mov x11, x10
+; CHECK-GI-NEXT:    and x10, x9, #0xfffffffc
+; CHECK-GI-NEXT:    dup v2.2d, x8
+; CHECK-GI-NEXT:    sub x8, x11, x10
+; CHECK-GI-NEXT:    add x11, x0, x11
+; CHECK-GI-NEXT:    xtn v2.2s, v2.2d
 ; CHECK-GI-NEXT:  .LBB6_10: // %vec.epilog.vector.body
 ; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-GI-NEXT:    ldr w13, [x12], #4
+; CHECK-GI-NEXT:    ldr w12, [x11], #4
 ; CHECK-GI-NEXT:    adds x8, x8, #4
-; CHECK-GI-NEXT:    fmov s3, w13
-; CHECK-GI-NEXT:    uxtb w13, w13
+; CHECK-GI-NEXT:    fmov s3, w12
+; CHECK-GI-NEXT:    uxtb w12, w12
 ; CHECK-GI-NEXT:    mov b4, v3.b[2]
 ; CHECK-GI-NEXT:    mov b5, v3.b[1]
 ; CHECK-GI-NEXT:    mov b6, v3.b[3]
-; CHECK-GI-NEXT:    fmov s3, w13
-; CHECK-GI-NEXT:    fmov w14, s4
-; CHECK-GI-NEXT:    fmov w15, s5
-; CHECK-GI-NEXT:    fmov w16, s6
+; CHECK-GI-NEXT:    fmov s3, w12
+; CHECK-GI-NEXT:    fmov w13, s4
+; CHECK-GI-NEXT:    fmov w14, s5
+; CHECK-GI-NEXT:    fmov w15, s6
+; CHECK-GI-NEXT:    uxtb w13, w13
 ; CHECK-GI-NEXT:    uxtb w14, w14
 ; CHECK-GI-NEXT:    uxtb w15, w15
-; CHECK-GI-NEXT:    uxtb w16, w16
-; CHECK-GI-NEXT:    fmov s4, w14
-; CHECK-GI-NEXT:    mov v3.s[1], w15
-; CHECK-GI-NEXT:    mov v4.s[1], w16
-; CHECK-GI-NEXT:    smlal v0.2d, v1.2s, v3.2s
-; CHECK-GI-NEXT:    smlal v2.2d, v1.2s, v4.2s
+; CHECK-GI-NEXT:    fmov s4, w13
+; CHECK-GI-NEXT:    mov v3.s[1], w14
+; CHECK-GI-NEXT:    mov v4.s[1], w15
+; CHECK-GI-NEXT:    smlal v0.2d, v2.2s, v3.2s
+; CHECK-GI-NEXT:    smlal v1.2d, v2.2s, v4.2s
 ; CHECK-GI-NEXT:    b.ne .LBB6_10
 ; CHECK-GI-NEXT:  // %bb.11: // %vec.epilog.middle.block
-; CHECK-GI-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-GI-NEXT:    cmp x11, x10
+; CHECK-GI-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    cmp x10, x9
 ; CHECK-GI-NEXT:    addp d0, v0.2d
 ; CHECK-GI-NEXT:    fmov x8, d0
 ; CHECK-GI-NEXT:    b.eq .LBB6_14
 ; CHECK-GI-NEXT:  .LBB6_12: // %for.body.preheader
-; CHECK-GI-NEXT:    sub x10, x10, x11
-; CHECK-GI-NEXT:    add x11, x0, x11
+; CHECK-GI-NEXT:    sxtb x11, w1
+; CHECK-GI-NEXT:    sub x9, x9, x10
+; CHECK-GI-NEXT:    add x10, x0, x10
 ; CHECK-GI-NEXT:  .LBB6_13: // %for.body
 ; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-GI-NEXT:    ldrb w8, [x11], #1
+; CHECK-GI-NEXT:    ldrb w8, [x10], #1
 ; CHECK-GI-NEXT:    fmov x12, d0
-; CHECK-GI-NEXT:    subs x10, x10, #1
-; CHECK-GI-NEXT:    madd x8, x8, x9, x12
+; CHECK-GI-NEXT:    subs x9, x9, #1
+; CHECK-GI-NEXT:    madd x8, x8, x11, x12
 ; CHECK-GI-NEXT:    fmov d0, x8
 ; CHECK-GI-NEXT:    b.ne .LBB6_13
 ; CHECK-GI-NEXT:  .LBB6_14: // %for.cond.cleanup
diff --git a/llvm/test/CodeGen/AArch64/adc.ll b/llvm/test/CodeGen/AArch64/adc.ll
index 12e8bf26c9eac..03f3cf192102d 100644
--- a/llvm/test/CodeGen/AArch64/adc.ll
+++ b/llvm/test/CodeGen/AArch64/adc.ll
@@ -71,9 +71,8 @@ define i128 @test_shifted(i128 %a, i128 %b) {
 ;
 ; CHECK-GI-LABEL: test_shifted:
 ; CHECK-GI:       ; %bb.0:
-; CHECK-GI-NEXT:    lsr x8, x2, #19
+; CHECK-GI-NEXT:    extr x8, x3, x2, #19
 ; CHECK-GI-NEXT:    adds x0, x0, x2, lsl #45
-; CHECK-GI-NEXT:    orr x8, x8, x3, lsl #45
 ; CHECK-GI-NEXT:    adc x1, x1, x8
 ; CHECK-GI-NEXT:    ret
   %rhs = shl i128 %b, 45
@@ -108,8 +107,7 @@ define i128 @test_extended(i128 %a, i16 %b) {
 ; CHECK-GI-NEXT:    sxth x8, w2
 ; CHECK-GI-NEXT:    adds x0, x0, w2, sxth #3
 ; CHECK-GI-NEXT:    asr x9, x8, #63
-; CHECK-GI-NEXT:    lsr x8, x8, #61
-; CHECK-GI-NEXT:    orr x8, x8, x9, lsl #3
+; CHECK-GI-NEXT:    extr x8, x9, x8, #61
 ; CHECK-GI-NEXT:    adc x1, x1, x8
 ; CHECK-GI-NEXT:    ret
   %ext = sext i16 %b to i128
diff --git a/llvm/test/CodeGen/AArch64/arm64-srl-and.ll b/llvm/test/CodeGen/AArch64/arm64-srl-and.ll
index b58f6ba96a5b8..330f27bd6c0cd 100644
--- a/llvm/test/CodeGen/AArch64/arm64-srl-and.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-srl-and.ll
@@ -1,22 +1,38 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -O3 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -O3 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64-linux-gnu -O3 -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 ; This used to miscompile:
 ; The 16-bit -1 should not become 32-bit -1 (sub w8, w8, #1).
 
 @g = global i16 0, align 4
 define i32 @srl_and()  {
-; CHECK-LABEL: srl_and:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    adrp x8, :got:g
-; CHECK-NEXT:    mov w9, #50
-; CHECK-NEXT:    ldr x8, [x8, :got_lo12:g]
-; CHECK-NEXT:    ldrh w8, [x8]
-; CHECK-NEXT:    eor w8, w8, w9
-; CHECK-NEXT:    mov w9, #65535
-; CHECK-NEXT:    add w8, w8, w9
-; CHECK-NEXT:    and w0, w8, w8, lsr #16
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: srl_and:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    adrp x8, :got:g
+; CHECK-SD-NEXT:    mov w9, #50 // =0x32
+; CHECK-SD-NEXT:    ldr x8, [x8, :got_lo12:g]
+; CHECK-SD-NEXT:    ldrh w8, [x8]
+; CHECK-SD-NEXT:    eor w8, w8, w9
+; CHECK-SD-NEXT:    mov w9, #65535 // =0xffff
+; CHECK-SD-NEXT:    add w8, w8, w9
+; CHECK-SD-NEXT:    and w0, w8, w8, lsr #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: srl_and:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, :got:g
+; CHECK-GI-NEXT:    mov w9, #50 // =0x32
+; CHECK-GI-NEXT:    ldr x8, [x8, :got_lo12:g]
+; CHECK-GI-NEXT:    ldrh w8, [x8]
+; CHECK-GI-NEXT:    eor w8, w8, w9
+; CHECK-GI-NEXT:    mov w9, #65535 // =0xffff
+; CHECK-GI-NEXT:    add w8, w9, w8, uxth
+; CHECK-GI-NEXT:    and w9, w8, #0xffff
+; CHECK-GI-NEXT:    cmp w8, w9
+; CHECK-GI-NEXT:    cset w8, ne
+; CHECK-GI-NEXT:    and w0, w9, w8
+; CHECK-GI-NEXT:    ret
 entry:
   %0 = load i16, ptr @g, align 4
   %1 = xor i16 %0, 50
@@ -29,3 +45,5 @@ entry:
 
   ret i32 %and
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/eor3.ll b/llvm/test/CodeGen/AArch64/eor3.ll
index eccd09131b525..594a73f70a7f9 100644
--- a/llvm/test/CodeGen/AArch64/eor3.ll
+++ b/llvm/test/CodeGen/AArch64/eor3.ll
@@ -277,3 +277,154 @@ define <2 x i64> @eor3_vnot(<2 x i64> %0, <2 x i64> %1) {
   ret <2 x i64> %4
 }
 
+define <1 x i64> @eor3_1x64(<1 x i64> %0, <1 x i64> %1, <1 x i64> %2) {
+; SHA3-LABEL: eor3_1x64:
+; SHA3:       // %bb.0:
+; SHA3-NEXT:    // kill: def $d0 killed $d0 def $q0
+; SHA3-NEXT:    // kill: def $d2 killed $d2 def $q2
+; SHA3-NEXT:    // kill: def $d1 killed $d1 def $q1
+; SHA3-NEXT:    eor3 v0.16b, v1.16b, v2.16b, v0.16b
+; SHA3-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; SHA3-NEXT:    ret
+;
+; NOSHA3-LABEL: eor3_1x64:
+; NOSHA3:       // %bb.0:
+; NOSHA3-NEXT:    eor v1.8b, v1.8b, v2.8b
+; NOSHA3-NEXT:    eor v0.8b, v1.8b, v0.8b
+; NOSHA3-NEXT:    ret
+;
+; SVE2-LABEL: eor3_1x64:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    // kill: def $d1 killed $d1 def $z1
+; SVE2-NEXT:    // kill: def $d2 killed $d2 def $z2
+; SVE2-NEXT:    // kill: def $d0 killed $d0 def $z0
+; SVE2-NEXT:    eor3 z1.d, z1.d, z2.d, z0.d
+; SVE2-NEXT:    fmov d0, d1
+; SVE2-NEXT:    ret
+;
+; SHA3-SVE2-LABEL: eor3_1x64:
+; SHA3-SVE2:       // %bb.0:
+; SHA3-SVE2-NEXT:    // kill: def $d0 killed $d0 def $q0
+; SHA3-SVE2-NEXT:    // kill: def $d2 killed $d2 def $q2
+; SHA3-SVE2-NEXT:    // kill: def $d1 killed $d1 def $q1
+; SHA3-SVE2-NEXT:    eor3 v0.16b, v1.16b, v2.16b, v0.16b
+; SHA3-SVE2-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; SHA3-SVE2-NEXT:    ret
+  %4 = xor <1 x i64> %1, %2
+  %5 = xor <1 x i64> %4, %0
+  ret <1 x i64> %5
+}
+
+define <2 x i32> @eor3_2x32(<2 x i32> %0, <2 x i32> %1, <2 x i32> %2) {
+; SHA3-LABEL: eor3_2x32:
+; SHA3:       // %bb.0:
+; SHA3-NEXT:    // kill: def $d0 killed $d0 def $q0
+; SHA3-NEXT:    // kill: def $d2 killed $d2 def $q2
+; SHA3-NEXT:    // kill: def $d1 killed $d1 def $q1
+; SHA3-NEXT:    eor3 v0.16b, v1.16b, v2.16b, v0.16b
+; SHA3-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; SHA3-NEXT:    ret
+;
+; NOSHA3-LABEL: eor3_2x32:
+; NOSHA3:       // %bb.0:
+; NOSHA3-NEXT:    eor v1.8b, v1.8b, v2.8b
+; NOSHA3-NEXT:    eor v0.8b, v1.8b, v0.8b
+; NOSHA3-NEXT:    ret
+;
+; SVE2-LABEL: eor3_2x32:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    // kill: def $d1 killed $d1 def $z1
+; SVE2-NEXT:    // kill: def $d2 killed $d2 def $z2
+; SVE2-NEXT:    // kill: def $d0 killed $d0 def $z0
+; SVE2-NEXT:    eor3 z1.d, z1.d, z2.d, z0.d
+; SVE2-NEXT:    fmov d0, d1
+; SVE2-NEXT:    ret
+;
+; SHA3-SVE2-LABEL: eor3_2x32:
+; SHA3-SVE2:       // %bb.0:
+; SHA3-SVE2-NEXT:    // kill: def $d0 killed $d0 def $q0
+; SHA3-SVE2-NEXT:    // kill: def $d2 killed $d2 def $q2
+; SHA3-SVE2-NEXT:    // kill: def $d1 killed $d1 def $q1
+; SHA3-SVE2-NEXT:    eor3 v0.16b, v1.16b, v2.16b, v0.16b
+; SHA3-SVE2-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; SHA3-SVE2-NEXT:    ret
+  %4 = xor <2 x i32> %1, %2
+  %5 = xor <2 x i32> %4, %0
+  ret <2 x i32> %5
+}
+
+define <4 x i16> @eor3_4x16(<4 x i16> %0, <4 x i16> %1, <4 x i16> %2) {
+; SHA3-LABEL: eor3_4x16:
+; SHA3:       // %bb.0:
+; SHA3-NEXT:    // kill: def $d0 killed $d0 def $q0
+; SHA3-NEXT:    // kill: def $d2 killed $d2 def $q2
+; SHA3-NEXT:    // kill: def $d1 killed $d1 def $q1
+; SHA3-NEXT:    eor3 v0.16b, v1.16b, v2.16b, v0.16b
+; SHA3-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; SHA3-NEXT:    ret
+;
+; NOSHA3-LABEL: eor3_4x16:
+; NOSHA3:       // %bb.0:
+; NOSHA3-NEXT:    eor v1.8b, v1.8b, v2.8b
+; NOSHA3-NEXT:    eor v0.8b, v1.8b, v0.8b
+; NOSHA3-NEXT:    ret
+;
+; SVE2-LABEL: eor3_4x16:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    // kill: def $d1 killed $d1 def $z1
+; SVE2-NEXT:    // kill: def $d2 killed $d2 def $z2
+; SVE2-NEXT:    // kill: def $d0 killed $d0 def $z0
+; SVE2-NEXT:    eor3 z1.d, z1.d, z2.d, z0.d
+; SVE2-NEXT:    fmov d0, d1
+; SVE2-NEXT:    ret
+;
+; SHA3-SVE2-LABEL: eor3_4x16:
+; SHA3-SVE2:       // %bb.0:
+; SHA3-SVE2-NEXT:    // kill: def $d0 killed $d0 def $q0
+; SHA3-SVE2-NEXT:    // kill: def $d2 killed $d2 def $q2
+; SHA3-SVE2-NEXT:    // kill: def $d1 killed $d1 def $q1
+; SHA3-SVE2-NEXT:    eor3 v0.16b, v1.16b, v2.16b, v0.16b
+; SHA3-SVE2-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; SHA3-SVE2-NEXT:    ret
+  %4 = xor <4 x i16> %1, %2
+  %5 = xor <4 x i16> %4, %0
+  ret <4 x i16> %5
+}
+
+define <8 x i8> @eor3_8x8(<8 x i8> %0, <8 x i8> %1, <8 x i8> %2) {
+; SHA3-LABEL: eor3_8x8:
+; SHA3:       // %bb.0:
+; SHA3-NEXT:    // kill: def $d0 killed $d0 def $q0
+; SHA3-NEXT:    // kill: def $d2 killed $d2 def $q2
+; SHA3-NEXT:    // kill: def $d1 killed $d1 def $q1
+; SHA3-NEXT:    eor3 v0.16b, v1.16b, v2.16b, v0.16b
+; SHA3-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; SHA3-NEXT:    ret
+;
+; NOSHA3-LABEL: eor3_8x8:
+; NOSHA3:       // %bb.0:
+; NOSHA3-NEXT:    eor v1.8b, v1.8b, v2.8b
+; NOSHA3-NEXT:    eor v0.8b, v1.8b, v0.8b
+; NOSHA3-NEXT:    ret
+;
+; SVE2-LABEL: eor3_8x8:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    // kill: def $d1 killed $d1 def $z1
+; SVE2-NEXT:    // kill: def $d2 killed $d2 def $z2
+; SVE2-NEXT:    // kill: def $d0 killed $d0 def $z0
+; SVE2-NEXT:    eor3 z1.d, z1.d, z2.d, z0.d
+; SVE2-NEXT:    fmov d0, d1
+; SVE2-NEXT:    ret
+;
+; SHA3-SVE2-LABEL: eor3_8x8:
+; SHA3-SVE2:       // %bb.0:
+; SHA3-SVE2-NEXT:    // kill: def $d0 killed $d0 def $q0
+; SHA3-SVE2-NEXT:    // kill: def $d2 killed $d2 def $q2
+; SHA3-SVE2-NEXT:    // kill: def $d1 killed $d1 def $q1
+; SHA3-SVE2-NEXT:    eor3 v0.16b, v1.16b, v2.16b, v0.16b
+; SHA3-SVE2-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; SHA3-SVE2-NEXT:    ret
+  %4 = xor <8 x i8> %1, %2
+  %5 = xor <8 x i8> %4, %0
+  ret <8 x i8> %5
+}
diff --git a/llvm/test/CodeGen/AArch64/fsh.ll b/llvm/test/CodeGen/AArch64/fsh.ll
index 765f6b77b41a9..7f07ef476b8aa 100644
--- a/llvm/test/CodeGen/AArch64/fsh.ll
+++ b/llvm/test/CodeGen/AArch64/fsh.ll
@@ -510,41 +510,40 @@ define i128 @fshl_i128(i128 %a, i128 %b, i128 %c) {
 ;
 ; CHECK-GI-LABEL: fshl_i128:
 ; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #64 // =0x40
 ; CHECK-GI-NEXT:    and x9, x4, #0x7f
-; CHECK-GI-NEXT:    mov w10, #64 // =0x40
-; CHECK-GI-NEXT:    lsl x14, x3, #63
-; CHECK-GI-NEXT:    sub x12, x10, x9
+; CHECK-GI-NEXT:    mov w10, #127 // =0x7f
+; CHECK-GI-NEXT:    sub x12, x8, x9
 ; CHECK-GI-NEXT:    lsl x13, x1, x9
-; CHECK-GI-NEXT:    mov w8, #127 // =0x7f
+; CHECK-GI-NEXT:    bic x10, x10, x4
 ; CHECK-GI-NEXT:    lsr x12, x0, x12
-; CHECK-GI-NEXT:    bic x8, x8, x4
-; CHECK-GI-NEXT:    sub x15, x9, #64
+; CHECK-GI-NEXT:    sub x14, x9, #64
+; CHECK-GI-NEXT:    lsl x15, x0, x9
+; CHECK-GI-NEXT:    extr x16, x3, x2, #1
 ; CHECK-GI-NEXT:    cmp x9, #64
-; CHECK-GI-NEXT:    lsl x9, x0, x9
-; CHECK-GI-NEXT:    lsl x15, x0, x15
-; CHECK-GI-NEXT:    orr x12, x12, x13
-; CHECK-GI-NEXT:    orr x13, x14, x2, lsr #1
-; CHECK-GI-NEXT:    lsr x14, x3, #1
-; CHECK-GI-NEXT:    sub x10, x10, x8
-; CHECK-GI-NEXT:    sub x16, x8, #64
-; CHECK-GI-NEXT:    csel x9, x9, xzr, lo
-; CHECK-GI-NEXT:    lsr x17, x13, x8
-; CHECK-GI-NEXT:    lsl x10, x14, x10
-; CHECK-GI-NEXT:    csel x12, x12, x15, lo
+; CHECK-GI-NEXT:    sub x8, x8, x10
+; CHECK-GI-NEXT:    orr x9, x12, x13
+; CHECK-GI-NEXT:    lsr x12, x3, #1
+; CHECK-GI-NEXT:    lsl x13, x0, x14
+; CHECK-GI-NEXT:    csel x14, x15, xzr, lo
+; CHECK-GI-NEXT:    sub x15, x10, #64
+; CHECK-GI-NEXT:    lsr x17, x16, x10
+; CHECK-GI-NEXT:    lsl x8, x12, x8
+; CHECK-GI-NEXT:    csel x9, x9, x13, lo
 ; CHECK-GI-NEXT:    tst x4, #0x7f
-; CHECK-GI-NEXT:    lsr x15, x14, x16
+; CHECK-GI-NEXT:    lsr x13, x12, x15
 ; CHECK-GI-NEXT:    mvn x11, x4
-; CHECK-GI-NEXT:    csel x12, x1, x12, eq
-; CHECK-GI-NEXT:    orr x10, x17, x10
-; CHECK-GI-NEXT:    cmp x8, #64
-; CHECK-GI-NEXT:    lsr x14, x14, x8
-; CHECK-GI-NEXT:    csel x10, x10, x15, lo
+; CHECK-GI-NEXT:    csel x9, x1, x9, eq
+; CHECK-GI-NEXT:    orr x8, x17, x8
+; CHECK-GI-NEXT:    cmp x10, #64
+; CHECK-GI-NEXT:    lsr x12, x12, x10
+; CHECK-GI-NEXT:    csel x8, x8, x13, lo
 ; CHECK-GI-NEXT:    tst x11, #0x7f
-; CHECK-GI-NEXT:    csel x10, x13, x10, eq
-; CHECK-GI-NEXT:    cmp x8, #64
-; CHECK-GI-NEXT:    csel x8, x14, xzr, lo
-; CHECK-GI-NEXT:    orr x0, x9, x10
-; CHECK-GI-NEXT:    orr x1, x12, x8
+; CHECK-GI-NEXT:    csel x8, x16, x8, eq
+; CHECK-GI-NEXT:    cmp x10, #64
+; CHECK-GI-NEXT:    csel x10, x12, xzr, lo
+; CHECK-GI-NEXT:    orr x0, x14, x8
+; CHECK-GI-NEXT:    orr x1, x9, x10
 ; CHECK-GI-NEXT:    ret
 entry:
   %d = call i128 @llvm.fshl(i128 %a, i128 %b, i128 %c)
@@ -571,41 +570,40 @@ define i128 @fshr_i128(i128 %a, i128 %b, i128 %c) {
 ;
 ; CHECK-GI-LABEL: fshr_i128:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    lsr x8, x0, #63
-; CHECK-GI-NEXT:    mov w9, #127 // =0x7f
-; CHECK-GI-NEXT:    mov w10, #64 // =0x40
-; CHECK-GI-NEXT:    bic x9, x9, x4
-; CHECK-GI-NEXT:    lsl x11, x0, #1
-; CHECK-GI-NEXT:    and x12, x4, #0x7f
-; CHECK-GI-NEXT:    orr x8, x8, x1, lsl #1
-; CHECK-GI-NEXT:    sub x14, x10, x9
-; CHECK-GI-NEXT:    sub x17, x9, #64
-; CHECK-GI-NEXT:    lsl x15, x11, x9
-; CHECK-GI-NEXT:    lsr x14, x11, x14
-; CHECK-GI-NEXT:    cmp x9, #64
-; CHECK-GI-NEXT:    lsl x16, x8, x9
-; CHECK-GI-NEXT:    sub x9, x10, x12
-; CHECK-GI-NEXT:    lsl x10, x11, x17
-; CHECK-GI-NEXT:    mvn x13, x4
-; CHECK-GI-NEXT:    csel x11, x15, xzr, lo
-; CHECK-GI-NEXT:    sub x15, x12, #64
-; CHECK-GI-NEXT:    orr x14, x14, x16
-; CHECK-GI-NEXT:    lsr x16, x2, x12
-; CHECK-GI-NEXT:    lsl x9, x3, x9
-; CHECK-GI-NEXT:    csel x10, x14, x10, lo
-; CHECK-GI-NEXT:    tst x13, #0x7f
-; CHECK-GI-NEXT:    lsr x13, x3, x15
-; CHECK-GI-NEXT:    csel x8, x8, x10, eq
-; CHECK-GI-NEXT:    orr x9, x16, x9
-; CHECK-GI-NEXT:    cmp x12, #64
-; CHECK-GI-NEXT:    lsr x10, x3, x12
-; CHECK-GI-NEXT:    csel x9, x9, x13, lo
+; CHECK-GI-NEXT:    mov w8, #127 // =0x7f
+; CHECK-GI-NEXT:    lsl x9, x0, #1
+; CHECK-GI-NEXT:    extr x10, x1, x0, #63
+; CHECK-GI-NEXT:    bic x8, x8, x4
+; CHECK-GI-NEXT:    mov w11, #64 // =0x40
+; CHECK-GI-NEXT:    and x14, x4, #0x7f
+; CHECK-GI-NEXT:    sub x12, x11, x8
+; CHECK-GI-NEXT:    lsl x13, x10, x8
+; CHECK-GI-NEXT:    lsl x16, x9, x8
+; CHECK-GI-NEXT:    lsr x12, x9, x12
+; CHECK-GI-NEXT:    sub x17, x8, #64
+; CHECK-GI-NEXT:    cmp x8, #64
+; CHECK-GI-NEXT:    lsl x8, x9, x17
+; CHECK-GI-NEXT:    sub x11, x11, x14
+; CHECK-GI-NEXT:    mvn x15, x4
+; CHECK-GI-NEXT:    orr x12, x12, x13
+; CHECK-GI-NEXT:    csel x9, x16, xzr, lo
+; CHECK-GI-NEXT:    sub x13, x14, #64
+; CHECK-GI-NEXT:    lsr x16, x2, x14
+; CHECK-GI-NEXT:    lsl x11, x3, x11
+; CHECK-GI-NEXT:    csel x8, x12, x8, lo
+; CHECK-GI-NEXT:    tst x15, #0x7f
+; CHECK-GI-NEXT:    lsr x12, x3, x13
+; CHECK-GI-NEXT:    csel x8, x10, x8, eq
+; CHECK-GI-NEXT:    orr x10, x16, x11
+; CHECK-GI-NEXT:    cmp x14, #64
+; CHECK-GI-NEXT:    lsr x11, x3, x14
+; CHECK-GI-NEXT:    csel x10, x10, x12, lo
 ; CHECK-GI-NEXT:    tst x4, #0x7f
-; CHECK-GI-NEXT:    csel x9, x2, x9, eq
-; CHECK-GI-NEXT:    cmp x12, #64
-; CHECK-GI-NEXT:    csel x10, x10, xzr, lo
-; CHECK-GI-NEXT:    orr x0, x11, x9
-; CHECK-GI-NEXT:    orr x1, x8, x10
+; CHECK-GI-NEXT:    csel x10, x2, x10, eq
+; CHECK-GI-NEXT:    cmp x14, #64
+; CHECK-GI-NEXT:    csel x11, x11, xzr, lo
+; CHECK-GI-NEXT:    orr x0, x9, x10
+; CHECK-GI-NEXT:    orr x1, x8, x11
 ; CHECK-GI-NEXT:    ret
 entry:
   %d = call i128 @llvm.fshr(i128 %a, i128 %b, i128 %c)
@@ -720,10 +718,9 @@ define i128 @rotl_i128_c(i128 %a) {
 ;
 ; CHECK-GI-LABEL: rotl_i128_c:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    lsr x8, x0, #61
-; CHECK-GI-NEXT:    lsr x9, x1, #61
-; CHECK-GI-NEXT:    orr x1, x8, x1, lsl #3
-; CHECK-GI-NEXT:    orr x0, x9, x0, lsl #3
+; CHECK-GI-NEXT:    extr x8, x1, x0, #61
+; CHECK-GI-NEXT:    extr x0, x0, x1, #61
+; CHECK-GI-NEXT:    mov x1, x8
 ; CHECK-GI-NEXT:    ret
 entry:
   %d = call i128 @llvm.fshl(i128 %a, i128 %a, i128 3)
@@ -731,20 +728,12 @@ entry:
 }
 
 define i128 @rotr_i128_c(i128 %a) {
-; CHECK-SD-LABEL: rotr_i128_c:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    extr x8, x1, x0, #3
-; CHECK-SD-NEXT:    extr x1, x0, x1, #3
-; CHECK-SD-NEXT:    mov x0, x8
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: rotr_i128_c:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    lsl x8, x1, #61
-; CHECK-GI-NEXT:    lsl x9, x0, #61
-; CHECK-GI-NEXT:    orr x0, x8, x0, lsr #3
-; CHECK-GI-NEXT:    orr x1, x9, x1, lsr #3
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: rotr_i128_c:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    extr x8, x1, x0, #3
+; CHECK-NEXT:    extr x1, x0, x1, #3
+; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    ret
 entry:
   %d = call i128 @llvm.fshr(i128 %a, i128 %a, i128 3)
   ret i128 %d
@@ -868,10 +857,8 @@ define i128 @fshl_i128_c(i128 %a, i128 %b) {
 ;
 ; CHECK-GI-LABEL: fshl_i128_c:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    lsr x8, x0, #61
-; CHECK-GI-NEXT:    lsr x9, x3, #61
-; CHECK-GI-NEXT:    orr x1, x8, x1, lsl #3
-; CHECK-GI-NEXT:    orr x0, x9, x0, lsl #3
+; CHECK-GI-NEXT:    extr x1, x1, x0, #61
+; CHECK-GI-NEXT:    extr x0, x0, x3, #61
 ; CHECK-GI-NEXT:    ret
 entry:
   %d = call i128 @llvm.fshl(i128 %a, i128 %b, i128 3)
@@ -879,21 +866,12 @@ entry:
 }
 
 define i128 @fshr_i128_c(i128 %a, i128 %b) {
-; CHECK-SD-LABEL: fshr_i128_c:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    extr x8, x3, x2, #3
-; CHECK-SD-NEXT:    extr x1, x0, x3, #3
-; CHECK-SD-NEXT:    mov x0, x8
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: fshr_i128_c:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    lsl x8, x3, #61
-; CHECK-GI-NEXT:    lsr x9, x3, #3
-; CHECK-GI-NEXT:    orr x8, x8, x2, lsr #3
-; CHECK-GI-NEXT:    orr x1, x9, x0, lsl #61
-; CHECK-GI-NEXT:    mov x0, x8
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: fshr_i128_c:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    extr x8, x3, x2, #3
+; CHECK-NEXT:    extr x1, x0, x3, #3
+; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    ret
 entry:
   %d = call i128 @llvm.fshr(i128 %a, i128 %b, i128 3)
   ret i128 %d
@@ -3013,75 +2991,73 @@ define <2 x i128> @fshl_v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c) {
 ; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-GI-NEXT:    .cfi_offset w19, -16
 ; CHECK-GI-NEXT:    ldr x11, [sp, #16]
-; CHECK-GI-NEXT:    mov w10, #64 // =0x40
+; CHECK-GI-NEXT:    mov w9, #64 // =0x40
 ; CHECK-GI-NEXT:    ldr x12, [sp, #32]
 ; CHECK-GI-NEXT:    mov w13, #127 // =0x7f
-; CHECK-GI-NEXT:    and x9, x11, #0x7f
+; CHECK-GI-NEXT:    and x8, x11, #0x7f
 ; CHECK-GI-NEXT:    and x14, x12, #0x7f
-; CHECK-GI-NEXT:    mvn x15, x11
-; CHECK-GI-NEXT:    sub x8, x10, x9
-; CHECK-GI-NEXT:    sub x16, x9, #64
-; CHECK-GI-NEXT:    lsl x19, x1, x9
-; CHECK-GI-NEXT:    lsr x18, x0, x8
-; CHECK-GI-NEXT:    lsl x17, x0, x9
-; CHECK-GI-NEXT:    lsl x16, x0, x16
-; CHECK-GI-NEXT:    cmp x9, #64
-; CHECK-GI-NEXT:    bic x0, x13, x11
-; CHECK-GI-NEXT:    mvn x8, x12
-; CHECK-GI-NEXT:    orr x18, x18, x19
-; CHECK-GI-NEXT:    csel x9, x17, xzr, lo
+; CHECK-GI-NEXT:    mvn x18, x11
+; CHECK-GI-NEXT:    sub x10, x9, x8
+; CHECK-GI-NEXT:    sub x15, x8, #64
+; CHECK-GI-NEXT:    lsl x17, x1, x8
+; CHECK-GI-NEXT:    lsr x16, x0, x10
+; CHECK-GI-NEXT:    lsl x15, x0, x15
+; CHECK-GI-NEXT:    cmp x8, #64
+; CHECK-GI-NEXT:    lsl x19, x0, x8
+; CHECK-GI-NEXT:    lsl x0, x3, x14
+; CHECK-GI-NEXT:    mvn x10, x12
+; CHECK-GI-NEXT:    orr x16, x16, x17
 ; CHECK-GI-NEXT:    sub x17, x14, #64
-; CHECK-GI-NEXT:    csel x16, x18, x16, lo
+; CHECK-GI-NEXT:    csel x15, x16, x15, lo
+; CHECK-GI-NEXT:    sub x16, x9, x14
+; CHECK-GI-NEXT:    csel x8, x19, xzr, lo
+; CHECK-GI-NEXT:    lsr x16, x2, x16
 ; CHECK-GI-NEXT:    tst x11, #0x7f
-; CHECK-GI-NEXT:    sub x11, x10, x14
-; CHECK-GI-NEXT:    lsr x11, x2, x11
-; CHECK-GI-NEXT:    lsl x18, x3, x14
-; CHECK-GI-NEXT:    csel x16, x1, x16, eq
-; CHECK-GI-NEXT:    lsl x1, x2, x14
+; CHECK-GI-NEXT:    lsl x19, x2, x14
 ; CHECK-GI-NEXT:    lsl x17, x2, x17
+; CHECK-GI-NEXT:    csel x15, x1, x15, eq
 ; CHECK-GI-NEXT:    cmp x14, #64
-; CHECK-GI-NEXT:    lsl x14, x5, #63
-; CHECK-GI-NEXT:    orr x11, x11, x18
-; CHECK-GI-NEXT:    bic x13, x13, x12
-; CHECK-GI-NEXT:    csel x18, x1, xzr, lo
-; CHECK-GI-NEXT:    csel x11, x11, x17, lo
+; CHECK-GI-NEXT:    orr x16, x16, x0
+; CHECK-GI-NEXT:    bic x11, x13, x11
+; CHECK-GI-NEXT:    csel x14, x19, xzr, lo
+; CHECK-GI-NEXT:    csel x16, x16, x17, lo
 ; CHECK-GI-NEXT:    tst x12, #0x7f
-; CHECK-GI-NEXT:    lsr x12, x5, #1
-; CHECK-GI-NEXT:    orr x14, x14, x4, lsr #1
-; CHECK-GI-NEXT:    lsl x17, x7, #63
-; CHECK-GI-NEXT:    sub x1, x10, x0
-; CHECK-GI-NEXT:    csel x11, x3, x11, eq
-; CHECK-GI-NEXT:    sub x2, x0, #64
-; CHECK-GI-NEXT:    lsr x3, x14, x0
-; CHECK-GI-NEXT:    lsl x1, x12, x1
-; CHECK-GI-NEXT:    lsr x4, x7, #1
-; CHECK-GI-NEXT:    orr x17, x17, x6, lsr #1
-; CHECK-GI-NEXT:    lsr x2, x12, x2
-; CHECK-GI-NEXT:    cmp x0, #64
-; CHECK-GI-NEXT:    orr x1, x3, x1
-; CHECK-GI-NEXT:    sub x10, x10, x13
-; CHECK-GI-NEXT:    lsr x12, x12, x0
-; CHECK-GI-NEXT:    csel x1, x1, x2, lo
-; CHECK-GI-NEXT:    tst x15, #0x7f
-; CHECK-GI-NEXT:    sub x15, x13, #64
-; CHECK-GI-NEXT:    lsr x2, x17, x13
-; CHECK-GI-NEXT:    lsl x10, x4, x10
-; CHECK-GI-NEXT:    csel x14, x14, x1, eq
-; CHECK-GI-NEXT:    cmp x0, #64
-; CHECK-GI-NEXT:    lsr x15, x4, x15
-; CHECK-GI-NEXT:    lsr x0, x4, x13
-; CHECK-GI-NEXT:    csel x12, x12, xzr, lo
-; CHECK-GI-NEXT:    orr x10, x2, x10
-; CHECK-GI-NEXT:    cmp x13, #64
-; CHECK-GI-NEXT:    csel x10, x10, x15, lo
-; CHECK-GI-NEXT:    tst x8, #0x7f
-; CHECK-GI-NEXT:    orr x1, x16, x12
-; CHECK-GI-NEXT:    csel x8, x17, x10, eq
-; CHECK-GI-NEXT:    cmp x13, #64
-; CHECK-GI-NEXT:    csel x10, x0, xzr, lo
-; CHECK-GI-NEXT:    orr x0, x9, x14
-; CHECK-GI-NEXT:    orr x2, x18, x8
-; CHECK-GI-NEXT:    orr x3, x11, x10
+; CHECK-GI-NEXT:    lsr x17, x5, #1
+; CHECK-GI-NEXT:    extr x0, x5, x4, #1
+; CHECK-GI-NEXT:    bic x12, x13, x12
+; CHECK-GI-NEXT:    csel x13, x3, x16, eq
+; CHECK-GI-NEXT:    sub x16, x9, x11
+; CHECK-GI-NEXT:    sub x1, x11, #64
+; CHECK-GI-NEXT:    lsr x3, x7, #1
+; CHECK-GI-NEXT:    lsr x2, x0, x11
+; CHECK-GI-NEXT:    lsl x16, x17, x16
+; CHECK-GI-NEXT:    extr x4, x7, x6, #1
+; CHECK-GI-NEXT:    lsr x1, x17, x1
+; CHECK-GI-NEXT:    cmp x11, #64
+; CHECK-GI-NEXT:    sub x9, x9, x12
+; CHECK-GI-NEXT:    orr x16, x2, x16
+; CHECK-GI-NEXT:    lsr x17, x17, x11
+; CHECK-GI-NEXT:    lsl x9, x3, x9
+; CHECK-GI-NEXT:    csel x16, x16, x1, lo
+; CHECK-GI-NEXT:    tst x18, #0x7f
+; CHECK-GI-NEXT:    sub x18, x12, #64
+; CHECK-GI-NEXT:    lsr x1, x4, x12
+; CHECK-GI-NEXT:    csel x16, x0, x16, eq
+; CHECK-GI-NEXT:    cmp x11, #64
+; CHECK-GI-NEXT:    lsr x11, x3, x18
+; CHECK-GI-NEXT:    csel x17, x17, xzr, lo
+; CHECK-GI-NEXT:    cmp x12, #64
+; CHECK-GI-NEXT:    orr x9, x1, x9
+; CHECK-GI-NEXT:    lsr x18, x3, x12
+; CHECK-GI-NEXT:    orr x0, x8, x16
+; CHECK-GI-NEXT:    csel x9, x9, x11, lo
+; CHECK-GI-NEXT:    tst x10, #0x7f
+; CHECK-GI-NEXT:    orr x1, x15, x17
+; CHECK-GI-NEXT:    csel x9, x4, x9, eq
+; CHECK-GI-NEXT:    cmp x12, #64
+; CHECK-GI-NEXT:    csel x10, x18, xzr, lo
+; CHECK-GI-NEXT:    orr x2, x14, x9
+; CHECK-GI-NEXT:    orr x3, x13, x10
 ; CHECK-GI-NEXT:    ldr x19, [sp], #16 // 8-byte Folded Reload
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -3125,75 +3101,73 @@ define <2 x i128> @fshr_v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c) {
 ; CHECK-GI-LABEL: fshr_v2i128:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    ldr x9, [sp]
-; CHECK-GI-NEXT:    lsl x12, x1, #1
-; CHECK-GI-NEXT:    mov w11, #127 // =0x7f
-; CHECK-GI-NEXT:    mov w14, #64 // =0x40
-; CHECK-GI-NEXT:    lsl x15, x0, #1
+; CHECK-GI-NEXT:    mov w10, #127 // =0x7f
+; CHECK-GI-NEXT:    mov w12, #64 // =0x40
+; CHECK-GI-NEXT:    lsl x13, x0, #1
+; CHECK-GI-NEXT:    extr x14, x1, x0, #63
 ; CHECK-GI-NEXT:    ldr x8, [sp, #16]
-; CHECK-GI-NEXT:    bic x13, x11, x9
-; CHECK-GI-NEXT:    orr x12, x12, x0, lsr #63
-; CHECK-GI-NEXT:    lsl x1, x3, #1
-; CHECK-GI-NEXT:    sub x17, x14, x13
-; CHECK-GI-NEXT:    sub x18, x13, #64
-; CHECK-GI-NEXT:    lsl x3, x15, x13
-; CHECK-GI-NEXT:    lsr x17, x15, x17
-; CHECK-GI-NEXT:    lsl x0, x12, x13
-; CHECK-GI-NEXT:    lsl x15, x15, x18
-; CHECK-GI-NEXT:    bic x11, x11, x8
+; CHECK-GI-NEXT:    bic x11, x10, x9
+; CHECK-GI-NEXT:    mvn x16, x9
+; CHECK-GI-NEXT:    and x15, x9, #0x7f
+; CHECK-GI-NEXT:    sub x17, x12, x11
+; CHECK-GI-NEXT:    sub x18, x11, #64
+; CHECK-GI-NEXT:    lsl x0, x14, x11
+; CHECK-GI-NEXT:    lsr x17, x13, x17
+; CHECK-GI-NEXT:    lsl x1, x13, x11
+; CHECK-GI-NEXT:    lsl x13, x13, x18
+; CHECK-GI-NEXT:    bic x10, x10, x8
 ; CHECK-GI-NEXT:    lsl x18, x2, #1
-; CHECK-GI-NEXT:    cmp x13, #64
+; CHECK-GI-NEXT:    cmp x11, #64
 ; CHECK-GI-NEXT:    orr x17, x17, x0
-; CHECK-GI-NEXT:    orr x13, x1, x2, lsr #63
-; CHECK-GI-NEXT:    mvn x16, x9
-; CHECK-GI-NEXT:    csel x15, x17, x15, lo
-; CHECK-GI-NEXT:    sub x17, x14, x11
-; CHECK-GI-NEXT:    csel x0, x3, xzr, lo
+; CHECK-GI-NEXT:    extr x11, x3, x2, #63
+; CHECK-GI-NEXT:    csel x0, x1, xzr, lo
+; CHECK-GI-NEXT:    csel x13, x17, x13, lo
+; CHECK-GI-NEXT:    sub x17, x12, x10
 ; CHECK-GI-NEXT:    tst x16, #0x7f
-; CHECK-GI-NEXT:    sub x16, x11, #64
+; CHECK-GI-NEXT:    sub x16, x10, #64
 ; CHECK-GI-NEXT:    lsr x17, x18, x17
-; CHECK-GI-NEXT:    lsl x2, x13, x11
-; CHECK-GI-NEXT:    lsl x1, x18, x11
-; CHECK-GI-NEXT:    csel x12, x12, x15, eq
-; CHECK-GI-NEXT:    lsl x15, x18, x16
-; CHECK-GI-NEXT:    and x10, x9, #0x7f
-; CHECK-GI-NEXT:    cmp x11, #64
-; CHECK-GI-NEXT:    mvn x11, x8
+; CHECK-GI-NEXT:    lsl x2, x11, x10
+; CHECK-GI-NEXT:    lsl x1, x18, x10
+; CHECK-GI-NEXT:    csel x13, x14, x13, eq
+; CHECK-GI-NEXT:    lsl x14, x18, x16
+; CHECK-GI-NEXT:    cmp x10, #64
+; CHECK-GI-NEXT:    mvn x10, x8
 ; CHECK-GI-NEXT:    orr x16, x17, x2
 ; CHECK-GI-NEXT:    csel x17, x1, xzr, lo
-; CHECK-GI-NEXT:    csel x15, x16, x15, lo
-; CHECK-GI-NEXT:    tst x11, #0x7f
-; CHECK-GI-NEXT:    sub x11, x14, x10
-; CHECK-GI-NEXT:    sub x16, x10, #64
-; CHECK-GI-NEXT:    lsr x18, x4, x10
-; CHECK-GI-NEXT:    lsl x11, x5, x11
-; CHECK-GI-NEXT:    csel x13, x13, x15, eq
-; CHECK-GI-NEXT:    lsr x15, x5, x16
+; CHECK-GI-NEXT:    csel x14, x16, x14, lo
+; CHECK-GI-NEXT:    tst x10, #0x7f
+; CHECK-GI-NEXT:    sub x10, x12, x15
+; CHECK-GI-NEXT:    sub x16, x15, #64
+; CHECK-GI-NEXT:    lsr x18, x4, x15
+; CHECK-GI-NEXT:    lsl x10, x5, x10
+; CHECK-GI-NEXT:    csel x11, x11, x14, eq
+; CHECK-GI-NEXT:    lsr x14, x5, x16
 ; CHECK-GI-NEXT:    and x1, x8, #0x7f
-; CHECK-GI-NEXT:    orr x11, x18, x11
-; CHECK-GI-NEXT:    cmp x10, #64
-; CHECK-GI-NEXT:    lsr x16, x5, x10
-; CHECK-GI-NEXT:    csel x11, x11, x15, lo
+; CHECK-GI-NEXT:    cmp x15, #64
+; CHECK-GI-NEXT:    lsr x16, x5, x15
+; CHECK-GI-NEXT:    orr x10, x18, x10
+; CHECK-GI-NEXT:    csel x10, x10, x14, lo
 ; CHECK-GI-NEXT:    tst x9, #0x7f
-; CHECK-GI-NEXT:    sub x9, x14, x1
-; CHECK-GI-NEXT:    sub x14, x1, #64
-; CHECK-GI-NEXT:    lsr x15, x6, x1
+; CHECK-GI-NEXT:    sub x9, x12, x1
+; CHECK-GI-NEXT:    sub x12, x1, #64
+; CHECK-GI-NEXT:    lsr x14, x6, x1
 ; CHECK-GI-NEXT:    lsl x9, x7, x9
-; CHECK-GI-NEXT:    csel x11, x4, x11, eq
-; CHECK-GI-NEXT:    cmp x10, #64
-; CHECK-GI-NEXT:    lsr x10, x7, x14
-; CHECK-GI-NEXT:    csel x14, x16, xzr, lo
-; CHECK-GI-NEXT:    orr x9, x15, x9
+; CHECK-GI-NEXT:    csel x10, x4, x10, eq
+; CHECK-GI-NEXT:    cmp x15, #64
+; CHECK-GI-NEXT:    lsr x12, x7, x12
+; CHECK-GI-NEXT:    csel x15, x16, xzr, lo
+; CHECK-GI-NEXT:    orr x9, x14, x9
 ; CHECK-GI-NEXT:    cmp x1, #64
-; CHECK-GI-NEXT:    lsr x15, x7, x1
-; CHECK-GI-NEXT:    csel x9, x9, x10, lo
+; CHECK-GI-NEXT:    lsr x14, x7, x1
+; CHECK-GI-NEXT:    csel x9, x9, x12, lo
 ; CHECK-GI-NEXT:    tst x8, #0x7f
 ; CHECK-GI-NEXT:    csel x8, x6, x9, eq
 ; CHECK-GI-NEXT:    cmp x1, #64
-; CHECK-GI-NEXT:    orr x0, x0, x11
-; CHECK-GI-NEXT:    csel x9, x15, xzr, lo
-; CHECK-GI-NEXT:    orr x1, x12, x14
+; CHECK-GI-NEXT:    orr x0, x0, x10
+; CHECK-GI-NEXT:    csel x9, x14, xzr, lo
+; CHECK-GI-NEXT:    orr x1, x13, x15
 ; CHECK-GI-NEXT:    orr x2, x17, x8
-; CHECK-GI-NEXT:    orr x3, x13, x9
+; CHECK-GI-NEXT:    orr x3, x11, x9
 ; CHECK-GI-NEXT:    ret
 entry:
   %d = call <2 x i128> @llvm.fshr(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c)
@@ -3863,15 +3837,12 @@ define <2 x i128> @rotl_v2i128_c(<2 x i128> %a) {
 ;
 ; CHECK-GI-LABEL: rotl_v2i128_c:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    lsr x8, x1, #61
-; CHECK-GI-NEXT:    lsl x9, x1, #3
-; CHECK-GI-NEXT:    lsl x10, x3, #3
-; CHECK-GI-NEXT:    lsr x11, x3, #61
-; CHECK-GI-NEXT:    orr x8, x8, x0, lsl #3
-; CHECK-GI-NEXT:    orr x1, x9, x0, lsr #61
-; CHECK-GI-NEXT:    orr x3, x10, x2, lsr #61
-; CHECK-GI-NEXT:    orr x2, x11, x2, lsl #3
+; CHECK-GI-NEXT:    extr x8, x0, x1, #61
+; CHECK-GI-NEXT:    extr x9, x3, x2, #61
+; CHECK-GI-NEXT:    extr x1, x1, x0, #61
+; CHECK-GI-NEXT:    extr x2, x2, x3, #61
 ; CHECK-GI-NEXT:    mov x0, x8
+; CHECK-GI-NEXT:    mov x3, x9
 ; CHECK-GI-NEXT:    ret
 entry:
   %d = call <2 x i128> @llvm.fshl(<2 x i128> %a, <2 x i128> %a, <2 x i128> <i128 3, i128 3>)
@@ -3891,14 +3862,12 @@ define <2 x i128> @rotr_v2i128_c(<2 x i128> %a) {
 ;
 ; CHECK-GI-LABEL: rotr_v2i128_c:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    lsl x8, x1, #61
-; CHECK-GI-NEXT:    lsl x9, x3, #61
-; CHECK-GI-NEXT:    lsl x10, x0, #61
-; CHECK-GI-NEXT:    lsl x11, x2, #61
-; CHECK-GI-NEXT:    orr x0, x8, x0, lsr #3
-; CHECK-GI-NEXT:    orr x2, x9, x2, lsr #3
-; CHECK-GI-NEXT:    orr x1, x10, x1, lsr #3
-; CHECK-GI-NEXT:    orr x3, x11, x3, lsr #3
+; CHECK-GI-NEXT:    extr x8, x1, x0, #3
+; CHECK-GI-NEXT:    extr x9, x3, x2, #3
+; CHECK-GI-NEXT:    extr x1, x0, x1, #3
+; CHECK-GI-NEXT:    extr x3, x2, x3, #3
+; CHECK-GI-NEXT:    mov x0, x8
+; CHECK-GI-NEXT:    mov x2, x9
 ; CHECK-GI-NEXT:    ret
 entry:
   %d = call <2 x i128> @llvm.fshr(<2 x i128> %a, <2 x i128> %a, <2 x i128> <i128 3, i128 3>)
@@ -4464,14 +4433,10 @@ define <2 x i128> @fshl_v2i128_c(<2 x i128> %a, <2 x i128> %b) {
 ;
 ; CHECK-GI-LABEL: fshl_v2i128_c:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    lsr x8, x5, #61
-; CHECK-GI-NEXT:    lsl x9, x1, #3
-; CHECK-GI-NEXT:    lsl x10, x3, #3
-; CHECK-GI-NEXT:    lsr x11, x7, #61
-; CHECK-GI-NEXT:    orr x8, x8, x0, lsl #3
-; CHECK-GI-NEXT:    orr x1, x9, x0, lsr #61
-; CHECK-GI-NEXT:    orr x3, x10, x2, lsr #61
-; CHECK-GI-NEXT:    orr x2, x11, x2, lsl #3
+; CHECK-GI-NEXT:    extr x8, x0, x5, #61
+; CHECK-GI-NEXT:    extr x1, x1, x0, #61
+; CHECK-GI-NEXT:    extr x3, x3, x2, #61
+; CHECK-GI-NEXT:    extr x2, x2, x7, #61
 ; CHECK-GI-NEXT:    mov x0, x8
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -4480,29 +4445,15 @@ entry:
 }
 
 define <2 x i128> @fshr_v2i128_c(<2 x i128> %a, <2 x i128> %b) {
-; CHECK-SD-LABEL: fshr_v2i128_c:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    extr x8, x5, x4, #3
-; CHECK-SD-NEXT:    extr x9, x7, x6, #3
-; CHECK-SD-NEXT:    extr x1, x0, x5, #3
-; CHECK-SD-NEXT:    extr x3, x2, x7, #3
-; CHECK-SD-NEXT:    mov x0, x8
-; CHECK-SD-NEXT:    mov x2, x9
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: fshr_v2i128_c:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    lsl x8, x5, #61
-; CHECK-GI-NEXT:    lsl x9, x7, #61
-; CHECK-GI-NEXT:    lsr x10, x5, #3
-; CHECK-GI-NEXT:    lsr x11, x7, #3
-; CHECK-GI-NEXT:    orr x8, x8, x4, lsr #3
-; CHECK-GI-NEXT:    orr x9, x9, x6, lsr #3
-; CHECK-GI-NEXT:    orr x1, x10, x0, lsl #61
-; CHECK-GI-NEXT:    orr x3, x11, x2, lsl #61
-; CHECK-GI-NEXT:    mov x0, x8
-; CHECK-GI-NEXT:    mov x2, x9
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: fshr_v2i128_c:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    extr x8, x5, x4, #3
+; CHECK-NEXT:    extr x9, x7, x6, #3
+; CHECK-NEXT:    extr x1, x0, x5, #3
+; CHECK-NEXT:    extr x3, x2, x7, #3
+; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    mov x2, x9
+; CHECK-NEXT:    ret
 entry:
   %d = call <2 x i128> @llvm.fshr(<2 x i128> %a, <2 x i128> %b, <2 x i128> <i128 3, i128 3>)
   ret <2 x i128> %d
diff --git a/llvm/test/CodeGen/AArch64/funnel-shift.ll b/llvm/test/CodeGen/AArch64/funnel-shift.ll
index f9fd2ad1b5b6c..90fb10258dffb 100644
--- a/llvm/test/CodeGen/AArch64/funnel-shift.ll
+++ b/llvm/test/CodeGen/AArch64/funnel-shift.ll
@@ -85,41 +85,40 @@ define i128 @fshl_i128(i128 %x, i128 %y, i128 %z) nounwind {
 ;
 ; CHECK-GI-LABEL: fshl_i128:
 ; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #64 // =0x40
 ; CHECK-GI-NEXT:    and x9, x4, #0x7f
-; CHECK-GI-NEXT:    mov w10, #64 // =0x40
-; CHECK-GI-NEXT:    lsl x14, x3, #63
-; CHECK-GI-NEXT:    sub x12, x10, x9
+; CHECK-GI-NEXT:    mov w10, #127 // =0x7f
+; CHECK-GI-NEXT:    sub x12, x8, x9
 ; CHECK-GI-NEXT:    lsl x13, x1, x9
-; CHECK-GI-NEXT:    mov w8, #127 // =0x7f
+; CHECK-GI-NEXT:    bic x10, x10, x4
 ; CHECK-GI-NEXT:    lsr x12, x0, x12
-; CHECK-GI-NEXT:    bic x8, x8, x4
-; CHECK-GI-NEXT:    sub x15, x9, #64
+; CHECK-GI-NEXT:    sub x14, x9, #64
+; CHECK-GI-NEXT:    lsl x15, x0, x9
+; CHECK-GI-NEXT:    extr x16, x3, x2, #1
 ; CHECK-GI-NEXT:    cmp x9, #64
-; CHECK-GI-NEXT:    lsl x9, x0, x9
-; CHECK-GI-NEXT:    lsl x15, x0, x15
-; CHECK-GI-NEXT:    orr x12, x12, x13
-; CHECK-GI-NEXT:    orr x13, x14, x2, lsr #1
-; CHECK-GI-NEXT:    lsr x14, x3, #1
-; CHECK-GI-NEXT:    sub x10, x10, x8
-; CHECK-GI-NEXT:    sub x16, x8, #64
-; CHECK-GI-NEXT:    csel x9, x9, xzr, lo
-; CHECK-GI-NEXT:    lsr x17, x13, x8
-; CHECK-GI-NEXT:    lsl x10, x14, x10
-; CHECK-GI-NEXT:    csel x12, x12, x15, lo
+; CHECK-GI-NEXT:    sub x8, x8, x10
+; CHECK-GI-NEXT:    orr x9, x12, x13
+; CHECK-GI-NEXT:    lsr x12, x3, #1
+; CHECK-GI-NEXT:    lsl x13, x0, x14
+; CHECK-GI-NEXT:    csel x14, x15, xzr, lo
+; CHECK-GI-NEXT:    sub x15, x10, #64
+; CHECK-GI-NEXT:    lsr x17, x16, x10
+; CHECK-GI-NEXT:    lsl x8, x12, x8
+; CHECK-GI-NEXT:    csel x9, x9, x13, lo
 ; CHECK-GI-NEXT:    tst x4, #0x7f
-; CHECK-GI-NEXT:    lsr x15, x14, x16
+; CHECK-GI-NEXT:    lsr x13, x12, x15
 ; CHECK-GI-NEXT:    mvn x11, x4
-; CHECK-GI-NEXT:    csel x12, x1, x12, eq
-; CHECK-GI-NEXT:    orr x10, x17, x10
-; CHECK-GI-NEXT:    cmp x8, #64
-; CHECK-GI-NEXT:    lsr x14, x14, x8
-; CHECK-GI-NEXT:    csel x10, x10, x15, lo
+; CHECK-GI-NEXT:    csel x9, x1, x9, eq
+; CHECK-GI-NEXT:    orr x8, x17, x8
+; CHECK-GI-NEXT:    cmp x10, #64
+; CHECK-GI-NEXT:    lsr x12, x12, x10
+; CHECK-GI-NEXT:    csel x8, x8, x13, lo
 ; CHECK-GI-NEXT:    tst x11, #0x7f
-; CHECK-GI-NEXT:    csel x10, x13, x10, eq
-; CHECK-GI-NEXT:    cmp x8, #64
-; CHECK-GI-NEXT:    csel x8, x14, xzr, lo
-; CHECK-GI-NEXT:    orr x0, x9, x10
-; CHECK-GI-NEXT:    orr x1, x12, x8
+; CHECK-GI-NEXT:    csel x8, x16, x8, eq
+; CHECK-GI-NEXT:    cmp x10, #64
+; CHECK-GI-NEXT:    csel x10, x12, xzr, lo
+; CHECK-GI-NEXT:    orr x0, x14, x8
+; CHECK-GI-NEXT:    orr x1, x9, x10
 ; CHECK-GI-NEXT:    ret
   %f = call i128 @llvm.fshl.i128(i128 %x, i128 %y, i128 %z)
   ret i128 %f
diff --git a/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll b/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
index c3fdc7db2abbe..8438f0b03179c 100644
--- a/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
+++ b/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-unknown-unknown < %s | FileCheck %s --check-prefix=CHECK
+; RUN: llc -mtriple=aarch64-unknown-unknown < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64-unknown-unknown -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 ; We are looking for the following pattern here:
 ;   (X & (C l>> Y)) ==/!= 0
@@ -13,12 +14,21 @@
 ; i8 scalar
 
 define i1 @scalar_i8_signbit_eq(i8 %x, i8 %y) nounwind {
-; CHECK-LABEL: scalar_i8_signbit_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsl w8, w0, w1
-; CHECK-NEXT:    tst w8, #0x80
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: scalar_i8_signbit_eq:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    lsl w8, w0, w1
+; CHECK-SD-NEXT:    tst w8, #0x80
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: scalar_i8_signbit_eq:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #128 // =0x80
+; CHECK-GI-NEXT:    and w9, w1, #0xff
+; CHECK-GI-NEXT:    lsr w8, w8, w9
+; CHECK-GI-NEXT:    tst w8, w0
+; CHECK-GI-NEXT:    cset w0, eq
+; CHECK-GI-NEXT:    ret
   %t0 = lshr i8 128, %y
   %t1 = and i8 %t0, %x
   %res = icmp eq i8 %t1, 0
@@ -26,12 +36,21 @@ define i1 @scalar_i8_signbit_eq(i8 %x, i8 %y) nounwind {
 }
 
 define i1 @scalar_i8_lowestbit_eq(i8 %x, i8 %y) nounwind {
-; CHECK-LABEL: scalar_i8_lowestbit_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsl w8, w0, w1
-; CHECK-NEXT:    tst w8, #0x1
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: scalar_i8_lowestbit_eq:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    lsl w8, w0, w1
+; CHECK-SD-NEXT:    tst w8, #0x1
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: scalar_i8_lowestbit_eq:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    and w9, w1, #0xff
+; CHECK-GI-NEXT:    lsr w8, w8, w9
+; CHECK-GI-NEXT:    tst w8, w0
+; CHECK-GI-NEXT:    cset w0, eq
+; CHECK-GI-NEXT:    ret
   %t0 = lshr i8 1, %y
   %t1 = and i8 %t0, %x
   %res = icmp eq i8 %t1, 0
@@ -39,12 +58,21 @@ define i1 @scalar_i8_lowestbit_eq(i8 %x, i8 %y) nounwind {
 }
 
 define i1 @scalar_i8_bitsinmiddle_eq(i8 %x, i8 %y) nounwind {
-; CHECK-LABEL: scalar_i8_bitsinmiddle_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsl w8, w0, w1
-; CHECK-NEXT:    tst w8, #0x18
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: scalar_i8_bitsinmiddle_eq:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    lsl w8, w0, w1
+; CHECK-SD-NEXT:    tst w8, #0x18
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: scalar_i8_bitsinmiddle_eq:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #24 // =0x18
+; CHECK-GI-NEXT:    and w9, w1, #0xff
+; CHECK-GI-NEXT:    lsr w8, w8, w9
+; CHECK-GI-NEXT:    tst w8, w0
+; CHECK-GI-NEXT:    cset w0, eq
+; CHECK-GI-NEXT:    ret
   %t0 = lshr i8 24, %y
   %t1 = and i8 %t0, %x
   %res = icmp eq i8 %t1, 0
@@ -54,12 +82,21 @@ define i1 @scalar_i8_bitsinmiddle_eq(i8 %x, i8 %y) nounwind {
 ; i16 scalar
 
 define i1 @scalar_i16_signbit_eq(i16 %x, i16 %y) nounwind {
-; CHECK-LABEL: scalar_i16_signbit_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsl w8, w0, w1
-; CHECK-NEXT:    tst w8, #0x8000
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: scalar_i16_signbit_eq:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    lsl w8, w0, w1
+; CHECK-SD-NEXT:    tst w8, #0x8000
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: scalar_i16_signbit_eq:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #32768 // =0x8000
+; CHECK-GI-NEXT:    and w9, w1, #0xffff
+; CHECK-GI-NEXT:    lsr w8, w8, w9
+; CHECK-GI-NEXT:    tst w8, w0
+; CHECK-GI-NEXT:    cset w0, eq
+; CHECK-GI-NEXT:    ret
   %t0 = lshr i16 32768, %y
   %t1 = and i16 %t0, %x
   %res = icmp eq i16 %t1, 0
@@ -67,12 +104,21 @@ define i1 @scalar_i16_signbit_eq(i16 %x, i16 %y) nounwind {
 }
 
 define i1 @scalar_i16_lowestbit_eq(i16 %x, i16 %y) nounwind {
-; CHECK-LABEL: scalar_i16_lowestbit_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsl w8, w0, w1
-; CHECK-NEXT:    tst w8, #0x1
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: scalar_i16_lowestbit_eq:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    lsl w8, w0, w1
+; CHECK-SD-NEXT:    tst w8, #0x1
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: scalar_i16_lowestbit_eq:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    and w9, w1, #0xffff
+; CHECK-GI-NEXT:    lsr w8, w8, w9
+; CHECK-GI-NEXT:    tst w8, w0
+; CHECK-GI-NEXT:    cset w0, eq
+; CHECK-GI-NEXT:    ret
   %t0 = lshr i16 1, %y
   %t1 = and i16 %t0, %x
   %res = icmp eq i16 %t1, 0
@@ -80,12 +126,21 @@ define i1 @scalar_i16_lowestbit_eq(i16 %x, i16 %y) nounwind {
 }
 
 define i1 @scalar_i16_bitsinmiddle_eq(i16 %x, i16 %y) nounwind {
-; CHECK-LABEL: scalar_i16_bitsinmiddle_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsl w8, w0, w1
-; CHECK-NEXT:    tst w8, #0xff0
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: scalar_i16_bitsinmiddle_eq:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    lsl w8, w0, w1
+; CHECK-SD-NEXT:    tst w8, #0xff0
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: scalar_i16_bitsinmiddle_eq:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #4080 // =0xff0
+; CHECK-GI-NEXT:    and w9, w1, #0xffff
+; CHECK-GI-NEXT:    lsr w8, w8, w9
+; CHECK-GI-NEXT:    tst w8, w0
+; CHECK-GI-NEXT:    cset w0, eq
+; CHECK-GI-NEXT:    ret
   %t0 = lshr i16 4080, %y
   %t1 = and i16 %t0, %x
   %res = icmp eq i16 %t1, 0
@@ -95,12 +150,20 @@ define i1 @scalar_i16_bitsinmiddle_eq(i16 %x, i16 %y) nounwind {
 ; i32 scalar
 
 define i1 @scalar_i32_signbit_eq(i32 %x, i32 %y) nounwind {
-; CHECK-LABEL: scalar_i32_signbit_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsl w8, w0, w1
-; CHECK-NEXT:    tst w8, #0x80000000
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: scalar_i32_signbit_eq:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    lsl w8, w0, w1
+; CHECK-SD-NEXT:    tst w8, #0x80000000
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: scalar_i32_signbit_eq:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #-2147483648 // =0x80000000
+; CHECK-GI-NEXT:    lsr w8, w8, w1
+; CHECK-GI-NEXT:    tst w8, w0
+; CHECK-GI-NEXT:    cset w0, eq
+; CHECK-GI-NEXT:    ret
   %t0 = lshr i32 2147483648, %y
   %t1 = and i32 %t0, %x
   %res = icmp eq i32 %t1, 0
@@ -108,12 +171,20 @@ define i1 @scalar_i32_signbit_eq(i32 %x, i32 %y) nounwind {
 }
 
 define i1 @scalar_i32_lowestbit_eq(i32 %x, i32 %y) nounwind {
-; CHECK-LABEL: scalar_i32_lowestbit_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsl w8, w0, w1
-; CHECK-NEXT:    tst w8, #0x1
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: scalar_i32_lowestbit_eq:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    lsl w8, w0, w1
+; CHECK-SD-NEXT:    tst w8, #0x1
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: scalar_i32_lowestbit_eq:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    lsr w8, w8, w1
+; CHECK-GI-NEXT:    tst w8, w0
+; CHECK-GI-NEXT:    cset w0, eq
+; CHECK-GI-NEXT:    ret
   %t0 = lshr i32 1, %y
   %t1 = and i32 %t0, %x
   %res = icmp eq i32 %t1, 0
@@ -121,12 +192,20 @@ define i1 @scalar_i32_lowestbit_eq(i32 %x, i32 %y) nounwind {
 }
 
 define i1 @scalar_i32_bitsinmiddle_eq(i32 %x, i32 %y) nounwind {
-; CHECK-LABEL: scalar_i32_bitsinmiddle_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsl w8, w0, w1
-; CHECK-NEXT:    tst w8, #0xffff00
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: scalar_i32_bitsinmiddle_eq:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    lsl w8, w0, w1
+; CHECK-SD-NEXT:    tst w8, #0xffff00
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: scalar_i32_bitsinmiddle_eq:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #16776960 // =0xffff00
+; CHECK-GI-NEXT:    lsr w8, w8, w1
+; CHECK-GI-NEXT:    tst w8, w0
+; CHECK-GI-NEXT:    cset w0, eq
+; CHECK-GI-NEXT:    ret
   %t0 = lshr i32 16776960, %y
   %t1 = and i32 %t0, %x
   %res = icmp eq i32 %t1, 0
@@ -136,12 +215,20 @@ define i1 @scalar_i32_bitsinmiddle_eq(i32 %x, i32 %y) nounwind {
 ; i64 scalar
 
 define i1 @scalar_i64_signbit_eq(i64 %x, i64 %y) nounwind {
-; CHECK-LABEL: scalar_i64_signbit_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsl x8, x0, x1
-; CHECK-NEXT:    tst x8, #0x8000000000000000
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: scalar_i64_signbit_eq:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    lsl x8, x0, x1
+; CHECK-SD-NEXT:    tst x8, #0x8000000000000000
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: scalar_i64_signbit_eq:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov x8, #-9223372036854775808 // =0x8000000000000000
+; CHECK-GI-NEXT:    lsr x8, x8, x1
+; CHECK-GI-NEXT:    tst x8, x0
+; CHECK-GI-NEXT:    cset w0, eq
+; CHECK-GI-NEXT:    ret
   %t0 = lshr i64 9223372036854775808, %y
   %t1 = and i64 %t0, %x
   %res = icmp eq i64 %t1, 0
@@ -149,12 +236,20 @@ define i1 @scalar_i64_signbit_eq(i64 %x, i64 %y) nounwind {
 }
 
 define i1 @scalar_i64_lowestbit_eq(i64 %x, i64 %y) nounwind {
-; CHECK-LABEL: scalar_i64_lowestbit_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsl x8, x0, x1
-; CHECK-NEXT:    tst x8, #0x1
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: scalar_i64_lowestbit_eq:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    lsl x8, x0, x1
+; CHECK-SD-NEXT:    tst x8, #0x1
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: scalar_i64_lowestbit_eq:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    lsr x8, x8, x1
+; CHECK-GI-NEXT:    tst x8, x0
+; CHECK-GI-NEXT:    cset w0, eq
+; CHECK-GI-NEXT:    ret
   %t0 = lshr i64 1, %y
   %t1 = and i64 %t0, %x
   %res = icmp eq i64 %t1, 0
@@ -162,12 +257,20 @@ define i1 @scalar_i64_lowestbit_eq(i64 %x, i64 %y) nounwind {
 }
 
 define i1 @scalar_i64_bitsinmiddle_eq(i64 %x, i64 %y) nounwind {
-; CHECK-LABEL: scalar_i64_bitsinmiddle_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsl x8, x0, x1
-; CHECK-NEXT:    tst x8, #0xffffffff0000
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: scalar_i64_bitsinmiddle_eq:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    lsl x8, x0, x1
+; CHECK-SD-NEXT:    tst x8, #0xffffffff0000
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: scalar_i64_bitsinmiddle_eq:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov x8, #281474976645120 // =0xffffffff0000
+; CHECK-GI-NEXT:    lsr x8, x8, x1
+; CHECK-GI-NEXT:    tst x8, x0
+; CHECK-GI-NEXT:    cset w0, eq
+; CHECK-GI-NEXT:    ret
   %t0 = lshr i64 281474976645120, %y
   %t1 = and i64 %t0, %x
   %res = icmp eq i64 %t1, 0
@@ -179,14 +282,24 @@ define i1 @scalar_i64_bitsinmiddle_eq(i64 %x, i64 %y) nounwind {
 ;------------------------------------------------------------------------------;
 
 define <4 x i1> @vec_4xi32_splat_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
-; CHECK-LABEL: vec_4xi32_splat_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.4s, #1
-; CHECK-NEXT:    ushl v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
-; CHECK-NEXT:    xtn v0.4h, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: vec_4xi32_splat_eq:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi v2.4s, #1
+; CHECK-SD-NEXT:    ushl v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: vec_4xi32_splat_eq:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v2.4s, #1
+; CHECK-GI-NEXT:    neg v1.4s, v1.4s
+; CHECK-GI-NEXT:    ushl v1.4s, v2.4s, v1.4s
+; CHECK-GI-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
   %t0 = lshr <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %y
   %t1 = and <4 x i32> %t0, %x
   %res = icmp eq <4 x i32> %t1, <i32 0, i32 0, i32 0, i32 0>
@@ -211,44 +324,86 @@ define <4 x i1> @vec_4xi32_nonsplat_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
 }
 
 define <4 x i1> @vec_4xi32_nonsplat_undef0_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
-; CHECK-LABEL: vec_4xi32_nonsplat_undef0_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.4s, #1
-; CHECK-NEXT:    ushl v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
-; CHECK-NEXT:    xtn v0.4h, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: vec_4xi32_nonsplat_undef0_eq:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi v2.4s, #1
+; CHECK-SD-NEXT:    ushl v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: vec_4xi32_nonsplat_undef0_eq:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    neg v1.4s, v1.4s
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov v2.s[1], w8
+; CHECK-GI-NEXT:    mov v2.s[3], w8
+; CHECK-GI-NEXT:    ushl v1.4s, v2.4s, v1.4s
+; CHECK-GI-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
   %t0 = lshr <4 x i32> <i32 1, i32 1, i32 undef, i32 1>, %y
   %t1 = and <4 x i32> %t0, %x
   %res = icmp eq <4 x i32> %t1, <i32 0, i32 0, i32 0, i32 0>
   ret <4 x i1> %res
 }
 define <4 x i1> @vec_4xi32_nonsplat_undef1_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
-; CHECK-LABEL: vec_4xi32_nonsplat_undef1_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.4s, #1
-; CHECK-NEXT:    neg v1.4s, v1.4s
-; CHECK-NEXT:    ushl v1.4s, v2.4s, v1.4s
-; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
-; CHECK-NEXT:    xtn v0.4h, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: vec_4xi32_nonsplat_undef1_eq:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi v2.4s, #1
+; CHECK-SD-NEXT:    neg v1.4s, v1.4s
+; CHECK-SD-NEXT:    ushl v1.4s, v2.4s, v1.4s
+; CHECK-SD-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-SD-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: vec_4xi32_nonsplat_undef1_eq:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi d2, #0000000000000000
+; CHECK-GI-NEXT:    movi v3.4s, #1
+; CHECK-GI-NEXT:    neg v1.4s, v1.4s
+; CHECK-GI-NEXT:    mov v2.s[1], wzr
+; CHECK-GI-NEXT:    ushl v1.4s, v3.4s, v1.4s
+; CHECK-GI-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    mov v2.s[3], wzr
+; CHECK-GI-NEXT:    cmeq v0.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
   %t0 = lshr <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %y
   %t1 = and <4 x i32> %t0, %x
   %res = icmp eq <4 x i32> %t1, <i32 0, i32 0, i32 undef, i32 0>
   ret <4 x i1> %res
 }
 define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
-; CHECK-LABEL: vec_4xi32_nonsplat_undef2_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.4s, #1
-; CHECK-NEXT:    neg v1.4s, v1.4s
-; CHECK-NEXT:    ushl v1.4s, v2.4s, v1.4s
-; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
-; CHECK-NEXT:    xtn v0.4h, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: vec_4xi32_nonsplat_undef2_eq:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi v2.4s, #1
+; CHECK-SD-NEXT:    neg v1.4s, v1.4s
+; CHECK-SD-NEXT:    ushl v1.4s, v2.4s, v1.4s
+; CHECK-SD-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-SD-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: vec_4xi32_nonsplat_undef2_eq:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    movi d2, #0000000000000000
+; CHECK-GI-NEXT:    neg v1.4s, v1.4s
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov v3.s[1], w8
+; CHECK-GI-NEXT:    mov v2.s[1], wzr
+; CHECK-GI-NEXT:    mov v3.s[3], w8
+; CHECK-GI-NEXT:    mov v2.s[3], wzr
+; CHECK-GI-NEXT:    ushl v1.4s, v3.4s, v1.4s
+; CHECK-GI-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    cmeq v0.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
   %t0 = lshr <4 x i32> <i32 1, i32 1, i32 undef, i32 1>, %y
   %t1 = and <4 x i32> %t0, %x
   %res = icmp eq <4 x i32> %t1, <i32 0, i32 0, i32 undef, i32 0>
@@ -260,11 +415,20 @@ define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwi
 ;------------------------------------------------------------------------------;
 
 define i1 @scalar_i8_signbit_ne(i8 %x, i8 %y) nounwind {
-; CHECK-LABEL: scalar_i8_signbit_ne:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsl w8, w0, w1
-; CHECK-NEXT:    ubfx w0, w8, #7, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: scalar_i8_signbit_ne:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    lsl w8, w0, w1
+; CHECK-SD-NEXT:    ubfx w0, w8, #7, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: scalar_i8_signbit_ne:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #128 // =0x80
+; CHECK-GI-NEXT:    and w9, w1, #0xff
+; CHECK-GI-NEXT:    lsr w8, w8, w9
+; CHECK-GI-NEXT:    tst w8, w0
+; CHECK-GI-NEXT:    cset w0, ne
+; CHECK-GI-NEXT:    ret
   %t0 = lshr i8 128, %y
   %t1 = and i8 %t0, %x
   %res = icmp ne i8 %t1, 0 ;  we are perfectly happy with 'ne' predicate
@@ -315,14 +479,24 @@ define i1 @scalar_i8_bitsinmiddle_slt(i8 %x, i8 %y) nounwind {
 }
 
 define i1 @scalar_i8_signbit_eq_with_nonzero(i8 %x, i8 %y) nounwind {
-; CHECK-LABEL: scalar_i8_signbit_eq_with_nonzero:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #128 // =0x80
-; CHECK-NEXT:    lsr w8, w8, w1
-; CHECK-NEXT:    and w8, w8, w0
-; CHECK-NEXT:    cmp w8, #1
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: scalar_i8_signbit_eq_with_nonzero:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov w8, #128 // =0x80
+; CHECK-SD-NEXT:    lsr w8, w8, w1
+; CHECK-SD-NEXT:    and w8, w8, w0
+; CHECK-SD-NEXT:    cmp w8, #1
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: scalar_i8_signbit_eq_with_nonzero:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #128 // =0x80
+; CHECK-GI-NEXT:    and w9, w1, #0xff
+; CHECK-GI-NEXT:    lsr w8, w8, w9
+; CHECK-GI-NEXT:    and w8, w8, w0
+; CHECK-GI-NEXT:    cmp w8, #1
+; CHECK-GI-NEXT:    cset w0, eq
+; CHECK-GI-NEXT:    ret
   %t0 = lshr i8 128, %y
   %t1 = and i8 %t0, %x
   %res = icmp eq i8 %t1, 1 ; should be comparing with 0
diff --git a/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-shl-in-eqcmp-zero.ll b/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
index 4a73b10811d29..cc1bf27b8d4b7 100644
--- a/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
+++ b/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-unknown-unknown < %s | FileCheck %s --check-prefix=CHECK
+; RUN: llc -mtriple=aarch64-unknown-unknown < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64-unknown-unknown -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 ; We are looking for the following pattern here:
 ;   (X & (C << Y)) ==/!= 0
@@ -13,13 +14,23 @@
 ; i8 scalar
 
 define i1 @scalar_i8_signbit_eq(i8 %x, i8 %y) nounwind {
-; CHECK-LABEL: scalar_i8_signbit_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xff
-; CHECK-NEXT:    lsr w8, w8, w1
-; CHECK-NEXT:    tst w8, #0x80
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: scalar_i8_signbit_eq:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    and w8, w0, #0xff
+; CHECK-SD-NEXT:    lsr w8, w8, w1
+; CHECK-SD-NEXT:    tst w8, #0x80
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: scalar_i8_signbit_eq:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #-128 // =0xffffff80
+; CHECK-GI-NEXT:    and w9, w1, #0xff
+; CHECK-GI-NEXT:    lsl w8, w8, w9
+; CHECK-GI-NEXT:    and w8, w8, w0
+; CHECK-GI-NEXT:    tst w8, #0xff
+; CHECK-GI-NEXT:    cset w0, eq
+; CHECK-GI-NEXT:    ret
   %t0 = shl i8 128, %y
   %t1 = and i8 %t0, %x
   %res = icmp eq i8 %t1, 0
@@ -27,13 +38,23 @@ define i1 @scalar_i8_signbit_eq(i8 %x, i8 %y) nounwind {
 }
 
 define i1 @scalar_i8_lowestbit_eq(i8 %x, i8 %y) nounwind {
-; CHECK-LABEL: scalar_i8_lowestbit_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xff
-; CHECK-NEXT:    lsr w8, w8, w1
-; CHECK-NEXT:    tst w8, #0x1
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: scalar_i8_lowestbit_eq:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    and w8, w0, #0xff
+; CHECK-SD-NEXT:    lsr w8, w8, w1
+; CHECK-SD-NEXT:    tst w8, #0x1
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: scalar_i8_lowestbit_eq:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    and w9, w1, #0xff
+; CHECK-GI-NEXT:    lsl w8, w8, w9
+; CHECK-GI-NEXT:    and w8, w8, w0
+; CHECK-GI-NEXT:    tst w8, #0xff
+; CHECK-GI-NEXT:    cset w0, eq
+; CHECK-GI-NEXT:    ret
   %t0 = shl i8 1, %y
   %t1 = and i8 %t0, %x
   %res = icmp eq i8 %t1, 0
@@ -41,13 +62,23 @@ define i1 @scalar_i8_lowestbit_eq(i8 %x, i8 %y) nounwind {
 }
 
 define i1 @scalar_i8_bitsinmiddle_eq(i8 %x, i8 %y) nounwind {
-; CHECK-LABEL: scalar_i8_bitsinmiddle_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xff
-; CHECK-NEXT:    lsr w8, w8, w1
-; CHECK-NEXT:    tst w8, #0x18
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: scalar_i8_bitsinmiddle_eq:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    and w8, w0, #0xff
+; CHECK-SD-NEXT:    lsr w8, w8, w1
+; CHECK-SD-NEXT:    tst w8, #0x18
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: scalar_i8_bitsinmiddle_eq:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #24 // =0x18
+; CHECK-GI-NEXT:    and w9, w1, #0xff
+; CHECK-GI-NEXT:    lsl w8, w8, w9
+; CHECK-GI-NEXT:    and w8, w8, w0
+; CHECK-GI-NEXT:    tst w8, #0xff
+; CHECK-GI-NEXT:    cset w0, eq
+; CHECK-GI-NEXT:    ret
   %t0 = shl i8 24, %y
   %t1 = and i8 %t0, %x
   %res = icmp eq i8 %t1, 0
@@ -57,13 +88,23 @@ define i1 @scalar_i8_bitsinmiddle_eq(i8 %x, i8 %y) nounwind {
 ; i16 scalar
 
 define i1 @scalar_i16_signbit_eq(i16 %x, i16 %y) nounwind {
-; CHECK-LABEL: scalar_i16_signbit_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xffff
-; CHECK-NEXT:    lsr w8, w8, w1
-; CHECK-NEXT:    tst w8, #0x8000
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: scalar_i16_signbit_eq:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    and w8, w0, #0xffff
+; CHECK-SD-NEXT:    lsr w8, w8, w1
+; CHECK-SD-NEXT:    tst w8, #0x8000
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: scalar_i16_signbit_eq:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #-32768 // =0xffff8000
+; CHECK-GI-NEXT:    and w9, w1, #0xffff
+; CHECK-GI-NEXT:    lsl w8, w8, w9
+; CHECK-GI-NEXT:    and w8, w8, w0
+; CHECK-GI-NEXT:    tst w8, #0xffff
+; CHECK-GI-NEXT:    cset w0, eq
+; CHECK-GI-NEXT:    ret
   %t0 = shl i16 32768, %y
   %t1 = and i16 %t0, %x
   %res = icmp eq i16 %t1, 0
@@ -71,13 +112,23 @@ define i1 @scalar_i16_signbit_eq(i16 %x, i16 %y) nounwind {
 }
 
 define i1 @scalar_i16_lowestbit_eq(i16 %x, i16 %y) nounwind {
-; CHECK-LABEL: scalar_i16_lowestbit_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xffff
-; CHECK-NEXT:    lsr w8, w8, w1
-; CHECK-NEXT:    tst w8, #0x1
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: scalar_i16_lowestbit_eq:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    and w8, w0, #0xffff
+; CHECK-SD-NEXT:    lsr w8, w8, w1
+; CHECK-SD-NEXT:    tst w8, #0x1
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: scalar_i16_lowestbit_eq:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    and w9, w1, #0xffff
+; CHECK-GI-NEXT:    lsl w8, w8, w9
+; CHECK-GI-NEXT:    and w8, w8, w0
+; CHECK-GI-NEXT:    tst w8, #0xffff
+; CHECK-GI-NEXT:    cset w0, eq
+; CHECK-GI-NEXT:    ret
   %t0 = shl i16 1, %y
   %t1 = and i16 %t0, %x
   %res = icmp eq i16 %t1, 0
@@ -85,13 +136,23 @@ define i1 @scalar_i16_lowestbit_eq(i16 %x, i16 %y) nounwind {
 }
 
 define i1 @scalar_i16_bitsinmiddle_eq(i16 %x, i16 %y) nounwind {
-; CHECK-LABEL: scalar_i16_bitsinmiddle_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xffff
-; CHECK-NEXT:    lsr w8, w8, w1
-; CHECK-NEXT:    tst w8, #0xff0
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: scalar_i16_bitsinmiddle_eq:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    and w8, w0, #0xffff
+; CHECK-SD-NEXT:    lsr w8, w8, w1
+; CHECK-SD-NEXT:    tst w8, #0xff0
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: scalar_i16_bitsinmiddle_eq:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #4080 // =0xff0
+; CHECK-GI-NEXT:    and w9, w1, #0xffff
+; CHECK-GI-NEXT:    lsl w8, w8, w9
+; CHECK-GI-NEXT:    and w8, w8, w0
+; CHECK-GI-NEXT:    tst w8, #0xffff
+; CHECK-GI-NEXT:    cset w0, eq
+; CHECK-GI-NEXT:    ret
   %t0 = shl i16 4080, %y
   %t1 = and i16 %t0, %x
   %res = icmp eq i16 %t1, 0
@@ -101,12 +162,20 @@ define i1 @scalar_i16_bitsinmiddle_eq(i16 %x, i16 %y) nounwind {
 ; i32 scalar
 
 define i1 @scalar_i32_signbit_eq(i32 %x, i32 %y) nounwind {
-; CHECK-LABEL: scalar_i32_signbit_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr w8, w0, w1
-; CHECK-NEXT:    tst w8, #0x80000000
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: scalar_i32_signbit_eq:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    lsr w8, w0, w1
+; CHECK-SD-NEXT:    tst w8, #0x80000000
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: scalar_i32_signbit_eq:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #-2147483648 // =0x80000000
+; CHECK-GI-NEXT:    lsl w8, w8, w1
+; CHECK-GI-NEXT:    tst w8, w0
+; CHECK-GI-NEXT:    cset w0, eq
+; CHECK-GI-NEXT:    ret
   %t0 = shl i32 2147483648, %y
   %t1 = and i32 %t0, %x
   %res = icmp eq i32 %t1, 0
@@ -114,12 +183,20 @@ define i1 @scalar_i32_signbit_eq(i32 %x, i32 %y) nounwind {
 }
 
 define i1 @scalar_i32_lowestbit_eq(i32 %x, i32 %y) nounwind {
-; CHECK-LABEL: scalar_i32_lowestbit_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr w8, w0, w1
-; CHECK-NEXT:    tst w8, #0x1
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: scalar_i32_lowestbit_eq:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    lsr w8, w0, w1
+; CHECK-SD-NEXT:    tst w8, #0x1
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: scalar_i32_lowestbit_eq:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    lsl w8, w8, w1
+; CHECK-GI-NEXT:    tst w8, w0
+; CHECK-GI-NEXT:    cset w0, eq
+; CHECK-GI-NEXT:    ret
   %t0 = shl i32 1, %y
   %t1 = and i32 %t0, %x
   %res = icmp eq i32 %t1, 0
@@ -127,12 +204,20 @@ define i1 @scalar_i32_lowestbit_eq(i32 %x, i32 %y) nounwind {
 }
 
 define i1 @scalar_i32_bitsinmiddle_eq(i32 %x, i32 %y) nounwind {
-; CHECK-LABEL: scalar_i32_bitsinmiddle_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr w8, w0, w1
-; CHECK-NEXT:    tst w8, #0xffff00
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: scalar_i32_bitsinmiddle_eq:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    lsr w8, w0, w1
+; CHECK-SD-NEXT:    tst w8, #0xffff00
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: scalar_i32_bitsinmiddle_eq:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #16776960 // =0xffff00
+; CHECK-GI-NEXT:    lsl w8, w8, w1
+; CHECK-GI-NEXT:    tst w8, w0
+; CHECK-GI-NEXT:    cset w0, eq
+; CHECK-GI-NEXT:    ret
   %t0 = shl i32 16776960, %y
   %t1 = and i32 %t0, %x
   %res = icmp eq i32 %t1, 0
@@ -142,12 +227,20 @@ define i1 @scalar_i32_bitsinmiddle_eq(i32 %x, i32 %y) nounwind {
 ; i64 scalar
 
 define i1 @scalar_i64_signbit_eq(i64 %x, i64 %y) nounwind {
-; CHECK-LABEL: scalar_i64_signbit_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr x8, x0, x1
-; CHECK-NEXT:    tst x8, #0x8000000000000000
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: scalar_i64_signbit_eq:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    lsr x8, x0, x1
+; CHECK-SD-NEXT:    tst x8, #0x8000000000000000
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: scalar_i64_signbit_eq:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov x8, #-9223372036854775808 // =0x8000000000000000
+; CHECK-GI-NEXT:    lsl x8, x8, x1
+; CHECK-GI-NEXT:    tst x8, x0
+; CHECK-GI-NEXT:    cset w0, eq
+; CHECK-GI-NEXT:    ret
   %t0 = shl i64 9223372036854775808, %y
   %t1 = and i64 %t0, %x
   %res = icmp eq i64 %t1, 0
@@ -155,12 +248,20 @@ define i1 @scalar_i64_signbit_eq(i64 %x, i64 %y) nounwind {
 }
 
 define i1 @scalar_i64_lowestbit_eq(i64 %x, i64 %y) nounwind {
-; CHECK-LABEL: scalar_i64_lowestbit_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr x8, x0, x1
-; CHECK-NEXT:    tst x8, #0x1
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: scalar_i64_lowestbit_eq:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    lsr x8, x0, x1
+; CHECK-SD-NEXT:    tst x8, #0x1
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: scalar_i64_lowestbit_eq:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    lsl x8, x8, x1
+; CHECK-GI-NEXT:    tst x8, x0
+; CHECK-GI-NEXT:    cset w0, eq
+; CHECK-GI-NEXT:    ret
   %t0 = shl i64 1, %y
   %t1 = and i64 %t0, %x
   %res = icmp eq i64 %t1, 0
@@ -168,12 +269,20 @@ define i1 @scalar_i64_lowestbit_eq(i64 %x, i64 %y) nounwind {
 }
 
 define i1 @scalar_i64_bitsinmiddle_eq(i64 %x, i64 %y) nounwind {
-; CHECK-LABEL: scalar_i64_bitsinmiddle_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr x8, x0, x1
-; CHECK-NEXT:    tst x8, #0xffffffff0000
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: scalar_i64_bitsinmiddle_eq:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    lsr x8, x0, x1
+; CHECK-SD-NEXT:    tst x8, #0xffffffff0000
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: scalar_i64_bitsinmiddle_eq:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov x8, #281474976645120 // =0xffffffff0000
+; CHECK-GI-NEXT:    lsl x8, x8, x1
+; CHECK-GI-NEXT:    tst x8, x0
+; CHECK-GI-NEXT:    cset w0, eq
+; CHECK-GI-NEXT:    ret
   %t0 = shl i64 281474976645120, %y
   %t1 = and i64 %t0, %x
   %res = icmp eq i64 %t1, 0
@@ -216,42 +325,81 @@ define <4 x i1> @vec_4xi32_nonsplat_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
 }
 
 define <4 x i1> @vec_4xi32_nonsplat_undef0_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
-; CHECK-LABEL: vec_4xi32_nonsplat_undef0_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.4s, #1
-; CHECK-NEXT:    ushl v1.4s, v2.4s, v1.4s
-; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
-; CHECK-NEXT:    xtn v0.4h, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: vec_4xi32_nonsplat_undef0_eq:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi v2.4s, #1
+; CHECK-SD-NEXT:    ushl v1.4s, v2.4s, v1.4s
+; CHECK-SD-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-SD-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: vec_4xi32_nonsplat_undef0_eq:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov v2.s[1], w8
+; CHECK-GI-NEXT:    mov v2.s[3], w8
+; CHECK-GI-NEXT:    ushl v1.4s, v2.4s, v1.4s
+; CHECK-GI-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
   %t0 = shl <4 x i32> <i32 1, i32 1, i32 undef, i32 1>, %y
   %t1 = and <4 x i32> %t0, %x
   %res = icmp eq <4 x i32> %t1, <i32 0, i32 0, i32 0, i32 0>
   ret <4 x i1> %res
 }
 define <4 x i1> @vec_4xi32_nonsplat_undef1_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
-; CHECK-LABEL: vec_4xi32_nonsplat_undef1_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.4s, #1
-; CHECK-NEXT:    ushl v1.4s, v2.4s, v1.4s
-; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
-; CHECK-NEXT:    xtn v0.4h, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: vec_4xi32_nonsplat_undef1_eq:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi v2.4s, #1
+; CHECK-SD-NEXT:    ushl v1.4s, v2.4s, v1.4s
+; CHECK-SD-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-SD-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: vec_4xi32_nonsplat_undef1_eq:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi d3, #0000000000000000
+; CHECK-GI-NEXT:    movi v2.4s, #1
+; CHECK-GI-NEXT:    mov v3.s[1], wzr
+; CHECK-GI-NEXT:    ushl v1.4s, v2.4s, v1.4s
+; CHECK-GI-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    mov v3.s[3], wzr
+; CHECK-GI-NEXT:    cmeq v0.4s, v0.4s, v3.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
   %t0 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %y
   %t1 = and <4 x i32> %t0, %x
   %res = icmp eq <4 x i32> %t1, <i32 0, i32 0, i32 undef, i32 0>
   ret <4 x i1> %res
 }
 define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
-; CHECK-LABEL: vec_4xi32_nonsplat_undef2_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.4s, #1
-; CHECK-NEXT:    ushl v1.4s, v2.4s, v1.4s
-; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
-; CHECK-NEXT:    xtn v0.4h, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: vec_4xi32_nonsplat_undef2_eq:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi v2.4s, #1
+; CHECK-SD-NEXT:    ushl v1.4s, v2.4s, v1.4s
+; CHECK-SD-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-SD-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: vec_4xi32_nonsplat_undef2_eq:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    movi d3, #0000000000000000
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov v2.s[1], w8
+; CHECK-GI-NEXT:    mov v3.s[1], wzr
+; CHECK-GI-NEXT:    mov v2.s[3], w8
+; CHECK-GI-NEXT:    mov v3.s[3], wzr
+; CHECK-GI-NEXT:    ushl v1.4s, v2.4s, v1.4s
+; CHECK-GI-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    cmeq v0.4s, v0.4s, v3.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
   %t0 = shl <4 x i32> <i32 1, i32 1, i32 undef, i32 1>, %y
   %t1 = and <4 x i32> %t0, %x
   %res = icmp eq <4 x i32> %t1, <i32 0, i32 0, i32 undef, i32 0>
@@ -263,12 +411,22 @@ define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwi
 ;------------------------------------------------------------------------------;
 
 define i1 @scalar_i8_signbit_ne(i8 %x, i8 %y) nounwind {
-; CHECK-LABEL: scalar_i8_signbit_ne:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xff
-; CHECK-NEXT:    lsr w8, w8, w1
-; CHECK-NEXT:    lsr w0, w8, #7
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: scalar_i8_signbit_ne:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    and w8, w0, #0xff
+; CHECK-SD-NEXT:    lsr w8, w8, w1
+; CHECK-SD-NEXT:    lsr w0, w8, #7
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: scalar_i8_signbit_ne:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #-128 // =0xffffff80
+; CHECK-GI-NEXT:    and w9, w1, #0xff
+; CHECK-GI-NEXT:    lsl w8, w8, w9
+; CHECK-GI-NEXT:    and w8, w8, w0
+; CHECK-GI-NEXT:    tst w8, #0xff
+; CHECK-GI-NEXT:    cset w0, ne
+; CHECK-GI-NEXT:    ret
   %t0 = shl i8 128, %y
   %t1 = and i8 %t0, %x
   %res = icmp ne i8 %t1, 0 ;  we are perfectly happy with 'ne' predicate
@@ -310,13 +468,24 @@ define i1 @scalar_i32_x_is_const2_eq(i32 %y) nounwind {
 }
 
 define i1 @scalar_i8_bitsinmiddle_slt(i8 %x, i8 %y) nounwind {
-; CHECK-LABEL: scalar_i8_bitsinmiddle_slt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #24 // =0x18
-; CHECK-NEXT:    lsl w8, w8, w1
-; CHECK-NEXT:    and w8, w8, w0
-; CHECK-NEXT:    ubfx w0, w8, #7, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: scalar_i8_bitsinmiddle_slt:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov w8, #24 // =0x18
+; CHECK-SD-NEXT:    lsl w8, w8, w1
+; CHECK-SD-NEXT:    and w8, w8, w0
+; CHECK-SD-NEXT:    ubfx w0, w8, #7, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: scalar_i8_bitsinmiddle_slt:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #24 // =0x18
+; CHECK-GI-NEXT:    and w9, w1, #0xff
+; CHECK-GI-NEXT:    lsl w8, w8, w9
+; CHECK-GI-NEXT:    and w8, w8, w0
+; CHECK-GI-NEXT:    sxtb w8, w8
+; CHECK-GI-NEXT:    cmp w8, #0
+; CHECK-GI-NEXT:    cset w0, mi
+; CHECK-GI-NEXT:    ret
   %t0 = shl i8 24, %y
   %t1 = and i8 %t0, %x
   %res = icmp slt i8 %t1, 0
@@ -324,15 +493,20 @@ define i1 @scalar_i8_bitsinmiddle_slt(i8 %x, i8 %y) nounwind {
 }
 
 define i1 @scalar_i8_signbit_eq_with_nonzero(i8 %x, i8 %y) nounwind {
-; CHECK-LABEL: scalar_i8_signbit_eq_with_nonzero:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-128 // =0xffffff80
-; CHECK-NEXT:    lsl w8, w8, w1
-; CHECK-NEXT:    and w8, w8, w0
-; CHECK-NEXT:    and w8, w8, #0x80
-; CHECK-NEXT:    cmp w8, #1
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: scalar_i8_signbit_eq_with_nonzero:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov w8, #-128 // =0xffffff80
+; CHECK-SD-NEXT:    lsl w8, w8, w1
+; CHECK-SD-NEXT:    and w8, w8, w0
+; CHECK-SD-NEXT:    and w8, w8, #0x80
+; CHECK-SD-NEXT:    cmp w8, #1
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: scalar_i8_signbit_eq_with_nonzero:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w0, wzr
+; CHECK-GI-NEXT:    ret
   %t0 = shl i8 128, %y
   %t1 = and i8 %t0, %x
   %res = icmp eq i8 %t1, 1 ; should be comparing with 0
diff --git a/llvm/test/CodeGen/AArch64/llvm.sincos.ll b/llvm/test/CodeGen/AArch64/llvm.sincos.ll
index f1dcb2a478a0d..21da8645b9b16 100644
--- a/llvm/test/CodeGen/AArch64/llvm.sincos.ll
+++ b/llvm/test/CodeGen/AArch64/llvm.sincos.ll
@@ -215,6 +215,133 @@ define { <2 x half>, <2 x half> } @test_sincos_v2f16(<2 x half> %a) nounwind {
   ret { <2 x half>, <2 x half> } %result
 }
 
+define { <3 x half>, <3 x half> } @test_sincos_v3f16(<3 x half> %a) nounwind {
+; CHECK-LABEL: test_sincos_v3f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #64
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov h1, v0.h[1]
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    add x0, sp, #36
+; CHECK-NEXT:    add x1, sp, #32
+; CHECK-NEXT:    str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-NEXT:    fcvt s0, h1
+; CHECK-NEXT:    bl sincosf
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    add x0, sp, #28
+; CHECK-NEXT:    add x1, sp, #24
+; CHECK-NEXT:    fcvt s0, h0
+; CHECK-NEXT:    bl sincosf
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    add x0, sp, #44
+; CHECK-NEXT:    add x1, sp, #40
+; CHECK-NEXT:    mov h0, v0.h[2]
+; CHECK-NEXT:    fcvt s0, h0
+; CHECK-NEXT:    bl sincosf
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    add x0, sp, #60
+; CHECK-NEXT:    add x1, sp, #56
+; CHECK-NEXT:    mov h0, v0.h[3]
+; CHECK-NEXT:    fcvt s0, h0
+; CHECK-NEXT:    bl sincosf
+; CHECK-NEXT:    ldp s2, s0, [sp, #32]
+; CHECK-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp s3, s1, [sp, #24]
+; CHECK-NEXT:    fcvt h4, s0
+; CHECK-NEXT:    fcvt h2, s2
+; CHECK-NEXT:    fcvt h0, s1
+; CHECK-NEXT:    fcvt h1, s3
+; CHECK-NEXT:    ldp s5, s3, [sp, #40]
+; CHECK-NEXT:    fcvt h3, s3
+; CHECK-NEXT:    mov v0.h[1], v4.h[0]
+; CHECK-NEXT:    fcvt h4, s5
+; CHECK-NEXT:    mov v1.h[1], v2.h[0]
+; CHECK-NEXT:    ldp s5, s2, [sp, #56]
+; CHECK-NEXT:    mov v0.h[2], v3.h[0]
+; CHECK-NEXT:    fcvt h2, s2
+; CHECK-NEXT:    fcvt h3, s5
+; CHECK-NEXT:    mov v1.h[2], v4.h[0]
+; CHECK-NEXT:    mov v0.h[3], v2.h[0]
+; CHECK-NEXT:    mov v1.h[3], v3.h[0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-NEXT:    add sp, sp, #64
+; CHECK-NEXT:    ret
+;
+; NO-LIBCALL-LABEL: test_sincos_v3f16:
+; NO-LIBCALL:       // %bb.0:
+; NO-LIBCALL-NEXT:    sub sp, sp, #80
+; NO-LIBCALL-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NO-LIBCALL-NEXT:    mov h1, v0.h[1]
+; NO-LIBCALL-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; NO-LIBCALL-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; NO-LIBCALL-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; NO-LIBCALL-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; NO-LIBCALL-NEXT:    fcvt s8, h1
+; NO-LIBCALL-NEXT:    fmov s0, s8
+; NO-LIBCALL-NEXT:    bl sinf
+; NO-LIBCALL-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; NO-LIBCALL-NEXT:    fcvt h0, s0
+; NO-LIBCALL-NEXT:    fcvt s9, h1
+; NO-LIBCALL-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; NO-LIBCALL-NEXT:    fmov s0, s9
+; NO-LIBCALL-NEXT:    bl sinf
+; NO-LIBCALL-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; NO-LIBCALL-NEXT:    fcvt h0, s0
+; NO-LIBCALL-NEXT:    mov h1, v1.h[2]
+; NO-LIBCALL-NEXT:    fcvt s10, h1
+; NO-LIBCALL-NEXT:    ldr q1, [sp, #16] // 16-byte Folded Reload
+; NO-LIBCALL-NEXT:    mov v0.h[1], v1.h[0]
+; NO-LIBCALL-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; NO-LIBCALL-NEXT:    fmov s0, s10
+; NO-LIBCALL-NEXT:    bl sinf
+; NO-LIBCALL-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; NO-LIBCALL-NEXT:    fcvt h0, s0
+; NO-LIBCALL-NEXT:    mov h1, v1.h[3]
+; NO-LIBCALL-NEXT:    fcvt s11, h1
+; NO-LIBCALL-NEXT:    ldr q1, [sp, #16] // 16-byte Folded Reload
+; NO-LIBCALL-NEXT:    mov v1.h[2], v0.h[0]
+; NO-LIBCALL-NEXT:    fmov s0, s11
+; NO-LIBCALL-NEXT:    str q1, [sp, #16] // 16-byte Folded Spill
+; NO-LIBCALL-NEXT:    bl sinf
+; NO-LIBCALL-NEXT:    fcvt h0, s0
+; NO-LIBCALL-NEXT:    ldr q1, [sp, #16] // 16-byte Folded Reload
+; NO-LIBCALL-NEXT:    mov v1.h[3], v0.h[0]
+; NO-LIBCALL-NEXT:    fmov s0, s8
+; NO-LIBCALL-NEXT:    str q1, [sp, #16] // 16-byte Folded Spill
+; NO-LIBCALL-NEXT:    bl cosf
+; NO-LIBCALL-NEXT:    fcvt h0, s0
+; NO-LIBCALL-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; NO-LIBCALL-NEXT:    fmov s0, s9
+; NO-LIBCALL-NEXT:    bl cosf
+; NO-LIBCALL-NEXT:    fcvt h0, s0
+; NO-LIBCALL-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; NO-LIBCALL-NEXT:    mov v0.h[1], v1.h[0]
+; NO-LIBCALL-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; NO-LIBCALL-NEXT:    fmov s0, s10
+; NO-LIBCALL-NEXT:    bl cosf
+; NO-LIBCALL-NEXT:    fcvt h0, s0
+; NO-LIBCALL-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; NO-LIBCALL-NEXT:    mov v1.h[2], v0.h[0]
+; NO-LIBCALL-NEXT:    fmov s0, s11
+; NO-LIBCALL-NEXT:    str q1, [sp] // 16-byte Folded Spill
+; NO-LIBCALL-NEXT:    bl cosf
+; NO-LIBCALL-NEXT:    fmov s1, s0
+; NO-LIBCALL-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; NO-LIBCALL-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; NO-LIBCALL-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; NO-LIBCALL-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; NO-LIBCALL-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NO-LIBCALL-NEXT:    fcvt h2, s1
+; NO-LIBCALL-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; NO-LIBCALL-NEXT:    mov v1.h[3], v2.h[0]
+; NO-LIBCALL-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; NO-LIBCALL-NEXT:    add sp, sp, #80
+; NO-LIBCALL-NEXT:    ret
+  %result = call { <3 x half>, <3 x half> } @llvm.sincos.v3f16(<3 x half> %a)
+  ret { <3 x half>, <3 x half> } %result
+}
+
 define { float, float } @test_sincos_f32(float %a) nounwind {
 ; CHECK-LABEL: test_sincos_f32:
 ; CHECK:       // %bb.0:
@@ -493,3 +620,71 @@ define { <2 x double>, <2 x double> } @test_sincos_v2f64(<2 x double> %a) nounwi
   %result = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> %a)
   ret { <2 x double>, <2 x double> } %result
 }
+
+define { <3 x double>, <3 x double> } @test_sincos_v3f64(<3 x double> %a) nounwind {
+; CHECK-LABEL: test_sincos_v3f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #80
+; CHECK-NEXT:    add x0, sp, #16
+; CHECK-NEXT:    add x1, sp, #8
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    fmov d8, d2
+; CHECK-NEXT:    fmov d9, d1
+; CHECK-NEXT:    bl sincos
+; CHECK-NEXT:    fmov d0, d9
+; CHECK-NEXT:    add x0, sp, #32
+; CHECK-NEXT:    add x1, sp, #24
+; CHECK-NEXT:    bl sincos
+; CHECK-NEXT:    fmov d0, d8
+; CHECK-NEXT:    add x0, sp, #72
+; CHECK-NEXT:    add x1, sp, #40
+; CHECK-NEXT:    bl sincos
+; CHECK-NEXT:    ldp d3, d0, [sp, #8]
+; CHECK-NEXT:    ldr d2, [sp, #72]
+; CHECK-NEXT:    ldp d4, d1, [sp, #24]
+; CHECK-NEXT:    ldr d5, [sp, #40]
+; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #80
+; CHECK-NEXT:    ret
+;
+; NO-LIBCALL-LABEL: test_sincos_v3f64:
+; NO-LIBCALL:       // %bb.0:
+; NO-LIBCALL-NEXT:    stp d13, d12, [sp, #-64]! // 16-byte Folded Spill
+; NO-LIBCALL-NEXT:    stp d11, d10, [sp, #16] // 16-byte Folded Spill
+; NO-LIBCALL-NEXT:    fmov d10, d0
+; NO-LIBCALL-NEXT:    stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; NO-LIBCALL-NEXT:    fmov d8, d2
+; NO-LIBCALL-NEXT:    fmov d9, d1
+; NO-LIBCALL-NEXT:    str x30, [sp, #48] // 8-byte Folded Spill
+; NO-LIBCALL-NEXT:    bl sin
+; NO-LIBCALL-NEXT:    fmov d11, d0
+; NO-LIBCALL-NEXT:    fmov d0, d9
+; NO-LIBCALL-NEXT:    bl sin
+; NO-LIBCALL-NEXT:    fmov d12, d0
+; NO-LIBCALL-NEXT:    fmov d0, d8
+; NO-LIBCALL-NEXT:    bl sin
+; NO-LIBCALL-NEXT:    fmov d13, d0
+; NO-LIBCALL-NEXT:    fmov d0, d10
+; NO-LIBCALL-NEXT:    bl cos
+; NO-LIBCALL-NEXT:    fmov d10, d0
+; NO-LIBCALL-NEXT:    fmov d0, d9
+; NO-LIBCALL-NEXT:    bl cos
+; NO-LIBCALL-NEXT:    fmov d9, d0
+; NO-LIBCALL-NEXT:    fmov d0, d8
+; NO-LIBCALL-NEXT:    bl cos
+; NO-LIBCALL-NEXT:    fmov d5, d0
+; NO-LIBCALL-NEXT:    fmov d0, d11
+; NO-LIBCALL-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
+; NO-LIBCALL-NEXT:    fmov d3, d10
+; NO-LIBCALL-NEXT:    fmov d4, d9
+; NO-LIBCALL-NEXT:    fmov d1, d12
+; NO-LIBCALL-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; NO-LIBCALL-NEXT:    fmov d2, d13
+; NO-LIBCALL-NEXT:    ldp d11, d10, [sp, #16] // 16-byte Folded Reload
+; NO-LIBCALL-NEXT:    ldp d13, d12, [sp], #64 // 16-byte Folded Reload
+; NO-LIBCALL-NEXT:    ret
+  %result = call { <3 x double>, <3 x double> } @llvm.sincos.v3f64(<3 x double> %a)
+  ret { <3 x double>, <3 x double> } %result
+}
diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll
index 1cb92e46cbcd1..87b11086e28d5 100644
--- a/llvm/test/CodeGen/AArch64/rem-by-const.ll
+++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll
@@ -559,20 +559,18 @@ define i128 @ui128_7(i128 %a, i128 %b) {
 ; CHECK-GI-NEXT:    add x8, x8, x10
 ; CHECK-GI-NEXT:    subs x10, x0, x9
 ; CHECK-GI-NEXT:    sbc x11, x1, x8
-; CHECK-GI-NEXT:    lsl x12, x11, #63
+; CHECK-GI-NEXT:    extr x10, x11, x10, #1
 ; CHECK-GI-NEXT:    lsr x11, x11, #1
-; CHECK-GI-NEXT:    orr x10, x12, x10, lsr #1
 ; CHECK-GI-NEXT:    adds x9, x10, x9
+; CHECK-GI-NEXT:    mov w10, #7 // =0x7
 ; CHECK-GI-NEXT:    adc x8, x11, x8
-; CHECK-GI-NEXT:    lsl x10, x8, #62
+; CHECK-GI-NEXT:    extr x9, x8, x9, #2
 ; CHECK-GI-NEXT:    lsr x8, x8, #2
-; CHECK-GI-NEXT:    orr x9, x10, x9, lsr #2
-; CHECK-GI-NEXT:    mov w10, #7 // =0x7
-; CHECK-GI-NEXT:    lsl x12, x8, #3
 ; CHECK-GI-NEXT:    umulh x10, x9, x10
 ; CHECK-GI-NEXT:    lsl x11, x9, #3
-; CHECK-GI-NEXT:    sub x8, x12, x8
+; CHECK-GI-NEXT:    lsl x12, x8, #3
 ; CHECK-GI-NEXT:    sub x9, x11, x9
+; CHECK-GI-NEXT:    sub x8, x12, x8
 ; CHECK-GI-NEXT:    subs x0, x0, x9
 ; CHECK-GI-NEXT:    add x8, x8, x10
 ; CHECK-GI-NEXT:    sbc x1, x1, x8
@@ -640,10 +638,9 @@ define i128 @ui128_100(i128 %a, i128 %b) {
 ; CHECK-GI-NEXT:    add x10, x11, x12
 ; CHECK-GI-NEXT:    add x8, x8, x14
 ; CHECK-GI-NEXT:    add x8, x8, x10
-; CHECK-GI-NEXT:    lsl x10, x8, #60
-; CHECK-GI-NEXT:    lsr x8, x8, #4
-; CHECK-GI-NEXT:    orr x9, x10, x9, lsr #4
 ; CHECK-GI-NEXT:    mov w10, #100 // =0x64
+; CHECK-GI-NEXT:    extr x9, x8, x9, #4
+; CHECK-GI-NEXT:    lsr x8, x8, #4
 ; CHECK-GI-NEXT:    umulh x11, x9, x10
 ; CHECK-GI-NEXT:    mul x9, x9, x10
 ; CHECK-GI-NEXT:    madd x8, x8, x10, x11
@@ -3317,36 +3314,32 @@ define <2 x i128> @uv2i128_7(<2 x i128> %d, <2 x i128> %e) {
 ; CHECK-GI-NEXT:    sbc x14, x1, x12
 ; CHECK-GI-NEXT:    add x8, x8, x13
 ; CHECK-GI-NEXT:    subs x13, x2, x10
-; CHECK-GI-NEXT:    lsl x15, x14, #63
-; CHECK-GI-NEXT:    sbc x16, x3, x8
+; CHECK-GI-NEXT:    extr x9, x14, x9, #1
+; CHECK-GI-NEXT:    sbc x15, x3, x8
 ; CHECK-GI-NEXT:    lsr x14, x14, #1
-; CHECK-GI-NEXT:    orr x9, x15, x9, lsr #1
-; CHECK-GI-NEXT:    lsl x15, x16, #63
-; CHECK-GI-NEXT:    orr x13, x15, x13, lsr #1
+; CHECK-GI-NEXT:    extr x13, x15, x13, #1
 ; CHECK-GI-NEXT:    adds x9, x9, x11
-; CHECK-GI-NEXT:    lsr x11, x16, #1
+; CHECK-GI-NEXT:    lsr x11, x15, #1
 ; CHECK-GI-NEXT:    adc x12, x14, x12
 ; CHECK-GI-NEXT:    adds x10, x13, x10
-; CHECK-GI-NEXT:    lsl x13, x12, #62
-; CHECK-GI-NEXT:    lsr x12, x12, #2
-; CHECK-GI-NEXT:    adc x8, x11, x8
-; CHECK-GI-NEXT:    lsl x11, x8, #62
-; CHECK-GI-NEXT:    orr x9, x13, x9, lsr #2
+; CHECK-GI-NEXT:    extr x9, x12, x9, #2
 ; CHECK-GI-NEXT:    mov w13, #7 // =0x7
+; CHECK-GI-NEXT:    adc x8, x11, x8
+; CHECK-GI-NEXT:    lsr x11, x12, #2
+; CHECK-GI-NEXT:    extr x10, x8, x10, #2
+; CHECK-GI-NEXT:    umulh x12, x9, x13
 ; CHECK-GI-NEXT:    lsr x8, x8, #2
-; CHECK-GI-NEXT:    lsl x14, x12, #3
-; CHECK-GI-NEXT:    orr x10, x11, x10, lsr #2
-; CHECK-GI-NEXT:    umulh x11, x9, x13
+; CHECK-GI-NEXT:    lsl x14, x11, #3
 ; CHECK-GI-NEXT:    lsl x15, x9, #3
-; CHECK-GI-NEXT:    sub x12, x14, x12
-; CHECK-GI-NEXT:    lsl x16, x8, #3
 ; CHECK-GI-NEXT:    umulh x13, x10, x13
+; CHECK-GI-NEXT:    lsl x16, x8, #3
+; CHECK-GI-NEXT:    sub x11, x14, x11
 ; CHECK-GI-NEXT:    lsl x14, x10, #3
 ; CHECK-GI-NEXT:    sub x9, x15, x9
 ; CHECK-GI-NEXT:    sub x8, x16, x8
 ; CHECK-GI-NEXT:    subs x0, x0, x9
+; CHECK-GI-NEXT:    add x11, x11, x12
 ; CHECK-GI-NEXT:    sub x10, x14, x10
-; CHECK-GI-NEXT:    add x11, x12, x11
 ; CHECK-GI-NEXT:    sbc x1, x1, x11
 ; CHECK-GI-NEXT:    subs x2, x2, x10
 ; CHECK-GI-NEXT:    add x8, x8, x13
@@ -3394,9 +3387,10 @@ define <2 x i128> @uv2i128_100(<2 x i128> %d, <2 x i128> %e) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov x10, #23593 // =0x5c29
 ; CHECK-GI-NEXT:    mov x8, #62914 // =0xf5c2
-; CHECK-GI-NEXT:    sub x18, x0, x0
+; CHECK-GI-NEXT:    and x5, xzr, #0x1
 ; CHECK-GI-NEXT:    movk x10, #49807, lsl #16
 ; CHECK-GI-NEXT:    movk x8, #23592, lsl #16
+; CHECK-GI-NEXT:    umulh x18, x0, xzr
 ; CHECK-GI-NEXT:    movk x10, #10485, lsl #32
 ; CHECK-GI-NEXT:    movk x8, #49807, lsl #32
 ; CHECK-GI-NEXT:    movk x10, #36700, lsl #48
@@ -3409,84 +3403,81 @@ define <2 x i128> @uv2i128_100(<2 x i128> %d, <2 x i128> %e) {
 ; CHECK-GI-NEXT:    umulh x15, x1, x10
 ; CHECK-GI-NEXT:    cset w12, hs
 ; CHECK-GI-NEXT:    cmn x11, x13
-; CHECK-GI-NEXT:    and x11, x12, #0x1
-; CHECK-GI-NEXT:    umulh x16, x0, x8
-; CHECK-GI-NEXT:    cset w12, hs
+; CHECK-GI-NEXT:    sub x13, x0, x0
 ; CHECK-GI-NEXT:    and x12, x12, #0x1
-; CHECK-GI-NEXT:    add x14, x14, x18
-; CHECK-GI-NEXT:    add x11, x11, x12
-; CHECK-GI-NEXT:    and x12, xzr, #0x1
+; CHECK-GI-NEXT:    umulh x16, x0, x8
+; CHECK-GI-NEXT:    cset w11, hs
+; CHECK-GI-NEXT:    add x13, x14, x13
+; CHECK-GI-NEXT:    and x11, x11, #0x1
+; CHECK-GI-NEXT:    and x14, xzr, #0x1
 ; CHECK-GI-NEXT:    umulh x9, xzr, x10
-; CHECK-GI-NEXT:    adds x14, x14, x15
-; CHECK-GI-NEXT:    and x15, xzr, #0x1
+; CHECK-GI-NEXT:    add x11, x12, x11
+; CHECK-GI-NEXT:    add x12, x5, x14
+; CHECK-GI-NEXT:    adds x13, x13, x15
 ; CHECK-GI-NEXT:    umulh x17, x1, x8
-; CHECK-GI-NEXT:    cset w4, hs
-; CHECK-GI-NEXT:    add x15, x12, x15
-; CHECK-GI-NEXT:    adds x12, x14, x16
-; CHECK-GI-NEXT:    and x4, x4, #0x1
-; CHECK-GI-NEXT:    mul x18, x3, x10
 ; CHECK-GI-NEXT:    cset w14, hs
-; CHECK-GI-NEXT:    adds x12, x12, x11
-; CHECK-GI-NEXT:    add x11, x15, x4
 ; CHECK-GI-NEXT:    and x14, x14, #0x1
-; CHECK-GI-NEXT:    cset w15, hs
-; CHECK-GI-NEXT:    mul x5, x2, x8
-; CHECK-GI-NEXT:    add x11, x11, x14
-; CHECK-GI-NEXT:    and x14, x15, #0x1
-; CHECK-GI-NEXT:    add x17, x9, x17
-; CHECK-GI-NEXT:    add x14, x11, x14
-; CHECK-GI-NEXT:    mov w11, #100 // =0x64
-; CHECK-GI-NEXT:    umulh x13, x0, xzr
-; CHECK-GI-NEXT:    umulh x16, x2, x10
-; CHECK-GI-NEXT:    adds x18, x18, x5
-; CHECK-GI-NEXT:    mul x15, x3, x8
-; CHECK-GI-NEXT:    add x13, x17, x13
-; CHECK-GI-NEXT:    cset w17, hs
-; CHECK-GI-NEXT:    umulh x10, x3, x10
-; CHECK-GI-NEXT:    add x13, x13, x14
-; CHECK-GI-NEXT:    and x17, x17, #0x1
-; CHECK-GI-NEXT:    cmn x18, x16
-; CHECK-GI-NEXT:    sub x18, x2, x2
-; CHECK-GI-NEXT:    umulh x16, x2, x8
+; CHECK-GI-NEXT:    adds x13, x13, x16
+; CHECK-GI-NEXT:    mul x4, x3, x10
+; CHECK-GI-NEXT:    add x12, x12, x14
 ; CHECK-GI-NEXT:    cset w14, hs
-; CHECK-GI-NEXT:    and x14, x14, #0x1
-; CHECK-GI-NEXT:    add x15, x15, x18
+; CHECK-GI-NEXT:    adds x11, x13, x11
+; CHECK-GI-NEXT:    and x13, x14, #0x1
+; CHECK-GI-NEXT:    mul x15, x2, x8
+; CHECK-GI-NEXT:    cset w14, hs
+; CHECK-GI-NEXT:    add x12, x12, x13
+; CHECK-GI-NEXT:    and x13, x14, #0x1
+; CHECK-GI-NEXT:    add x14, x9, x17
+; CHECK-GI-NEXT:    sub x17, x2, x2
+; CHECK-GI-NEXT:    umulh x16, x2, x10
+; CHECK-GI-NEXT:    add x12, x12, x13
+; CHECK-GI-NEXT:    add x13, x14, x18
+; CHECK-GI-NEXT:    add x12, x13, x12
 ; CHECK-GI-NEXT:    and x18, xzr, #0x1
-; CHECK-GI-NEXT:    add x14, x17, x14
+; CHECK-GI-NEXT:    mul x5, x3, x8
+; CHECK-GI-NEXT:    extr x11, x12, x11, #4
+; CHECK-GI-NEXT:    adds x13, x4, x15
+; CHECK-GI-NEXT:    umulh x14, x3, x10
+; CHECK-GI-NEXT:    cset w15, hs
+; CHECK-GI-NEXT:    mov w10, #100 // =0x64
+; CHECK-GI-NEXT:    cmn x13, x16
+; CHECK-GI-NEXT:    and x15, x15, #0x1
+; CHECK-GI-NEXT:    umulh x13, x2, x8
+; CHECK-GI-NEXT:    cset w16, hs
+; CHECK-GI-NEXT:    add x17, x5, x17
+; CHECK-GI-NEXT:    and x16, x16, #0x1
 ; CHECK-GI-NEXT:    umulh x8, x3, x8
+; CHECK-GI-NEXT:    add x15, x15, x16
+; CHECK-GI-NEXT:    adds x14, x17, x14
 ; CHECK-GI-NEXT:    and x17, xzr, #0x1
-; CHECK-GI-NEXT:    adds x10, x15, x10
-; CHECK-GI-NEXT:    add x15, x17, x18
+; CHECK-GI-NEXT:    add x16, x18, x17
 ; CHECK-GI-NEXT:    cset w17, hs
-; CHECK-GI-NEXT:    umulh x18, x2, xzr
+; CHECK-GI-NEXT:    adds x13, x14, x13
+; CHECK-GI-NEXT:    umulh x14, x2, xzr
 ; CHECK-GI-NEXT:    and x17, x17, #0x1
-; CHECK-GI-NEXT:    adds x10, x10, x16
-; CHECK-GI-NEXT:    lsl x16, x13, #60
-; CHECK-GI-NEXT:    add x15, x15, x17
-; CHECK-GI-NEXT:    cset w17, hs
-; CHECK-GI-NEXT:    adds x10, x10, x14
-; CHECK-GI-NEXT:    and x14, x17, #0x1
+; CHECK-GI-NEXT:    cset w18, hs
+; CHECK-GI-NEXT:    adds x13, x13, x15
+; CHECK-GI-NEXT:    add x15, x16, x17
+; CHECK-GI-NEXT:    and x16, x18, #0x1
 ; CHECK-GI-NEXT:    cset w17, hs
 ; CHECK-GI-NEXT:    add x8, x9, x8
-; CHECK-GI-NEXT:    add x14, x15, x14
-; CHECK-GI-NEXT:    and x15, x17, #0x1
-; CHECK-GI-NEXT:    orr x12, x16, x12, lsr #4
-; CHECK-GI-NEXT:    add x9, x14, x15
-; CHECK-GI-NEXT:    add x8, x8, x18
-; CHECK-GI-NEXT:    add x8, x8, x9
-; CHECK-GI-NEXT:    lsr x9, x13, #4
-; CHECK-GI-NEXT:    umulh x14, x12, x11
-; CHECK-GI-NEXT:    lsl x13, x8, #60
+; CHECK-GI-NEXT:    add x15, x15, x16
+; CHECK-GI-NEXT:    and x16, x17, #0x1
+; CHECK-GI-NEXT:    lsr x9, x12, #4
+; CHECK-GI-NEXT:    add x15, x15, x16
+; CHECK-GI-NEXT:    umulh x17, x11, x10
+; CHECK-GI-NEXT:    add x8, x8, x14
+; CHECK-GI-NEXT:    add x8, x8, x15
+; CHECK-GI-NEXT:    mul x11, x11, x10
+; CHECK-GI-NEXT:    extr x12, x8, x13, #4
 ; CHECK-GI-NEXT:    lsr x8, x8, #4
-; CHECK-GI-NEXT:    mul x12, x12, x11
-; CHECK-GI-NEXT:    orr x10, x13, x10, lsr #4
-; CHECK-GI-NEXT:    madd x9, x9, x11, x14
-; CHECK-GI-NEXT:    umulh x13, x10, x11
-; CHECK-GI-NEXT:    subs x0, x0, x12
-; CHECK-GI-NEXT:    mul x10, x10, x11
+; CHECK-GI-NEXT:    madd x9, x9, x10, x17
+; CHECK-GI-NEXT:    umulh x13, x12, x10
+; CHECK-GI-NEXT:    subs x0, x0, x11
+; CHECK-GI-NEXT:    mul x12, x12, x10
 ; CHECK-GI-NEXT:    sbc x1, x1, x9
-; CHECK-GI-NEXT:    madd x8, x8, x11, x13
-; CHECK-GI-NEXT:    subs x2, x2, x10
+; CHECK-GI-NEXT:    madd x8, x8, x10, x13
+; CHECK-GI-NEXT:    subs x2, x2, x12
 ; CHECK-GI-NEXT:    sbc x3, x3, x8
 ; CHECK-GI-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/signbit-test.ll b/llvm/test/CodeGen/AArch64/signbit-test.ll
index c74a934ee09d8..298495bcf5a01 100644
--- a/llvm/test/CodeGen/AArch64/signbit-test.ll
+++ b/llvm/test/CodeGen/AArch64/signbit-test.ll
@@ -1,13 +1,21 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s
+; RUN: llc -mtriple=aarch64--  < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64-- -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 define i64 @test_clear_mask_i64_i32(i64 %x) nounwind {
-; CHECK-LABEL: test_clear_mask_i64_i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, #42 // =0x2a
-; CHECK-NEXT:    cmn w0, #1
-; CHECK-NEXT:    csel x0, x8, x0, gt
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_clear_mask_i64_i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #42 // =0x2a
+; CHECK-SD-NEXT:    cmn w0, #1
+; CHECK-SD-NEXT:    csel x0, x8, x0, gt
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_clear_mask_i64_i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #42 // =0x2a
+; CHECK-GI-NEXT:    tst x0, #0x80000000
+; CHECK-GI-NEXT:    csel x0, x8, x0, eq
+; CHECK-GI-NEXT:    ret
 entry:
   %a = and i64 %x, 2147483648
   %r = icmp eq i64 %a, 0
diff --git a/llvm/test/CodeGen/AArch64/signed-truncation-check.ll b/llvm/test/CodeGen/AArch64/signed-truncation-check.ll
index 7c80f9320faec..fc01c6b2c5471 100644
--- a/llvm/test/CodeGen/AArch64/signed-truncation-check.ll
+++ b/llvm/test/CodeGen/AArch64/signed-truncation-check.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 ; https://bugs.llvm.org/show_bug.cgi?id=38149
 
@@ -19,13 +20,22 @@
 ; ---------------------------------------------------------------------------- ;
 
 define i1 @shifts_eqcmp_i16_i8(i16 %x) nounwind {
-; CHECK-LABEL: shifts_eqcmp_i16_i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtb w8, w0
-; CHECK-NEXT:    and w8, w8, #0xffff
-; CHECK-NEXT:    cmp w8, w0, uxth
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shifts_eqcmp_i16_i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sxtb w8, w0
+; CHECK-SD-NEXT:    and w8, w8, #0xffff
+; CHECK-SD-NEXT:    cmp w8, w0, uxth
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shifts_eqcmp_i16_i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    lsl w8, w0, #8
+; CHECK-GI-NEXT:    sbfx w8, w8, #8, #8
+; CHECK-GI-NEXT:    and w8, w8, #0xffff
+; CHECK-GI-NEXT:    cmp w8, w0, uxth
+; CHECK-GI-NEXT:    cset w0, eq
+; CHECK-GI-NEXT:    ret
   %tmp0 = shl i16 %x, 8 ; 16-8
   %tmp1 = ashr exact i16 %tmp0, 8 ; 16-8
   %tmp2 = icmp eq i16 %tmp1, %x
@@ -97,26 +107,43 @@ define i1 @shifts_eqcmp_i64_i8(i64 %x) nounwind {
 ; ---------------------------------------------------------------------------- ;
 
 define i1 @add_ugecmp_i16_i8(i16 %x) nounwind {
-; CHECK-LABEL: add_ugecmp_i16_i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xffff
-; CHECK-NEXT:    sub w8, w8, #128
-; CHECK-NEXT:    lsr w8, w8, #8
-; CHECK-NEXT:    cmp w8, #254
-; CHECK-NEXT:    cset w0, hi
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: add_ugecmp_i16_i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    and w8, w0, #0xffff
+; CHECK-SD-NEXT:    sub w8, w8, #128
+; CHECK-SD-NEXT:    lsr w8, w8, #8
+; CHECK-SD-NEXT:    cmp w8, #254
+; CHECK-SD-NEXT:    cset w0, hi
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_ugecmp_i16_i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #-128 // =0xffffff80
+; CHECK-GI-NEXT:    mov w9, #65280 // =0xff00
+; CHECK-GI-NEXT:    add w8, w8, w0, uxth
+; CHECK-GI-NEXT:    cmp w8, w9
+; CHECK-GI-NEXT:    cset w0, hs
+; CHECK-GI-NEXT:    ret
   %tmp0 = add i16 %x, -128 ; ~0U << (8-1)
   %tmp1 = icmp uge i16 %tmp0, -256 ; ~0U << 8
   ret i1 %tmp1
 }
 
 define i1 @add_ugecmp_i32_i16_i8(i16 %xx) nounwind {
-; CHECK-LABEL: add_ugecmp_i32_i16_i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xffff
-; CHECK-NEXT:    cmp w8, w8, sxtb
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: add_ugecmp_i32_i16_i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    and w8, w0, #0xffff
+; CHECK-SD-NEXT:    cmp w8, w8, sxtb
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_ugecmp_i32_i16_i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #-128 // =0xffffff80
+; CHECK-GI-NEXT:    add w8, w8, w0, uxth
+; CHECK-GI-NEXT:    cmn w8, #256
+; CHECK-GI-NEXT:    cset w0, hs
+; CHECK-GI-NEXT:    ret
   %x = zext i16 %xx to i32
   %tmp0 = add i32 %x, -128 ; ~0U << (8-1)
   %tmp1 = icmp uge i32 %tmp0, -256 ; ~0U << 8
@@ -124,55 +151,92 @@ define i1 @add_ugecmp_i32_i16_i8(i16 %xx) nounwind {
 }
 
 define i1 @add_ugecmp_i32_i16(i32 %x) nounwind {
-; CHECK-LABEL: add_ugecmp_i32_i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmp w0, w0, sxth
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: add_ugecmp_i32_i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmp w0, w0, sxth
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_ugecmp_i32_i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sub w8, w0, #8, lsl #12 // =32768
+; CHECK-GI-NEXT:    cmn w8, #16, lsl #12 // =65536
+; CHECK-GI-NEXT:    cset w0, hs
+; CHECK-GI-NEXT:    ret
   %tmp0 = add i32 %x, -32768 ; ~0U << (16-1)
   %tmp1 = icmp uge i32 %tmp0, -65536 ; ~0U << 16
   ret i1 %tmp1
 }
 
 define i1 @add_ugecmp_i32_i8(i32 %x) nounwind {
-; CHECK-LABEL: add_ugecmp_i32_i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmp w0, w0, sxtb
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: add_ugecmp_i32_i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmp w0, w0, sxtb
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_ugecmp_i32_i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sub w8, w0, #128
+; CHECK-GI-NEXT:    cmn w8, #256
+; CHECK-GI-NEXT:    cset w0, hs
+; CHECK-GI-NEXT:    ret
   %tmp0 = add i32 %x, -128 ; ~0U << (8-1)
   %tmp1 = icmp uge i32 %tmp0, -256 ; ~0U << 8
   ret i1 %tmp1
 }
 
 define i1 @add_ugecmp_i64_i32(i64 %x) nounwind {
-; CHECK-LABEL: add_ugecmp_i64_i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmp x0, w0, sxtw
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: add_ugecmp_i64_i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmp x0, w0, sxtw
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_ugecmp_i64_i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov x8, #-2147483648 // =0xffffffff80000000
+; CHECK-GI-NEXT:    mov x9, #-4294967296 // =0xffffffff00000000
+; CHECK-GI-NEXT:    add x8, x0, x8
+; CHECK-GI-NEXT:    cmp x8, x9
+; CHECK-GI-NEXT:    cset w0, hs
+; CHECK-GI-NEXT:    ret
   %tmp0 = add i64 %x, -2147483648 ; ~0U << (32-1)
   %tmp1 = icmp uge i64 %tmp0, -4294967296 ; ~0U << 32
   ret i1 %tmp1
 }
 
 define i1 @add_ugecmp_i64_i16(i64 %x) nounwind {
-; CHECK-LABEL: add_ugecmp_i64_i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmp x0, w0, sxth
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: add_ugecmp_i64_i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmp x0, w0, sxth
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_ugecmp_i64_i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sub x8, x0, #8, lsl #12 // =32768
+; CHECK-GI-NEXT:    cmn x8, #16, lsl #12 // =65536
+; CHECK-GI-NEXT:    cset w0, hs
+; CHECK-GI-NEXT:    ret
   %tmp0 = add i64 %x, -32768 ; ~0U << (16-1)
   %tmp1 = icmp uge i64 %tmp0, -65536 ; ~0U << 16
   ret i1 %tmp1
 }
 
 define i1 @add_ugecmp_i64_i8(i64 %x) nounwind {
-; CHECK-LABEL: add_ugecmp_i64_i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmp x0, w0, sxtb
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: add_ugecmp_i64_i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmp x0, w0, sxtb
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_ugecmp_i64_i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sub x8, x0, #128
+; CHECK-GI-NEXT:    cmn x8, #256
+; CHECK-GI-NEXT:    cset w0, hs
+; CHECK-GI-NEXT:    ret
   %tmp0 = add i64 %x, -128 ; ~0U << (8-1)
   %tmp1 = icmp uge i64 %tmp0, -256 ; ~0U << 8
   ret i1 %tmp1
@@ -180,14 +244,23 @@ define i1 @add_ugecmp_i64_i8(i64 %x) nounwind {
 
 ; Slightly more canonical variant
 define i1 @add_ugtcmp_i16_i8(i16 %x) nounwind {
-; CHECK-LABEL: add_ugtcmp_i16_i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xffff
-; CHECK-NEXT:    sub w8, w8, #128
-; CHECK-NEXT:    lsr w8, w8, #8
-; CHECK-NEXT:    cmp w8, #254
-; CHECK-NEXT:    cset w0, hi
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: add_ugtcmp_i16_i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    and w8, w0, #0xffff
+; CHECK-SD-NEXT:    sub w8, w8, #128
+; CHECK-SD-NEXT:    lsr w8, w8, #8
+; CHECK-SD-NEXT:    cmp w8, #254
+; CHECK-SD-NEXT:    cset w0, hi
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_ugtcmp_i16_i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #-128 // =0xffffff80
+; CHECK-GI-NEXT:    mov w9, #65279 // =0xfeff
+; CHECK-GI-NEXT:    add w8, w8, w0, uxth
+; CHECK-GI-NEXT:    cmp w8, w9
+; CHECK-GI-NEXT:    cset w0, hi
+; CHECK-GI-NEXT:    ret
   %tmp0 = add i16 %x, -128 ; ~0U << (8-1)
   %tmp1 = icmp ugt i16 %tmp0, -257 ; ~0U << 8 - 1
   ret i1 %tmp1
@@ -198,68 +271,113 @@ define i1 @add_ugtcmp_i16_i8(i16 %x) nounwind {
 ; ---------------------------------------------------------------------------- ;
 
 define i1 @add_ultcmp_i16_i8(i16 %x) nounwind {
-; CHECK-LABEL: add_ultcmp_i16_i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtb w8, w0
-; CHECK-NEXT:    and w8, w8, #0xffff
-; CHECK-NEXT:    cmp w8, w0, uxth
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: add_ultcmp_i16_i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sxtb w8, w0
+; CHECK-SD-NEXT:    and w8, w8, #0xffff
+; CHECK-SD-NEXT:    cmp w8, w0, uxth
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_ultcmp_i16_i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    add w8, w0, #128
+; CHECK-GI-NEXT:    and w8, w8, #0xffff
+; CHECK-GI-NEXT:    cmp w8, #256
+; CHECK-GI-NEXT:    cset w0, lo
+; CHECK-GI-NEXT:    ret
   %tmp0 = add i16 %x, 128 ; 1U << (8-1)
   %tmp1 = icmp ult i16 %tmp0, 256 ; 1U << 8
   ret i1 %tmp1
 }
 
 define i1 @add_ultcmp_i32_i16(i32 %x) nounwind {
-; CHECK-LABEL: add_ultcmp_i32_i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmp w0, w0, sxth
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: add_ultcmp_i32_i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmp w0, w0, sxth
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_ultcmp_i32_i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    add w8, w0, #8, lsl #12 // =32768
+; CHECK-GI-NEXT:    cmp w8, #16, lsl #12 // =65536
+; CHECK-GI-NEXT:    cset w0, lo
+; CHECK-GI-NEXT:    ret
   %tmp0 = add i32 %x, 32768 ; 1U << (16-1)
   %tmp1 = icmp ult i32 %tmp0, 65536 ; 1U << 16
   ret i1 %tmp1
 }
 
 define i1 @add_ultcmp_i32_i8(i32 %x) nounwind {
-; CHECK-LABEL: add_ultcmp_i32_i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmp w0, w0, sxtb
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: add_ultcmp_i32_i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmp w0, w0, sxtb
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_ultcmp_i32_i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    add w8, w0, #128
+; CHECK-GI-NEXT:    cmp w8, #256
+; CHECK-GI-NEXT:    cset w0, lo
+; CHECK-GI-NEXT:    ret
   %tmp0 = add i32 %x, 128 ; 1U << (8-1)
   %tmp1 = icmp ult i32 %tmp0, 256 ; 1U << 8
   ret i1 %tmp1
 }
 
 define i1 @add_ultcmp_i64_i32(i64 %x) nounwind {
-; CHECK-LABEL: add_ultcmp_i64_i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmp x0, w0, sxtw
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: add_ultcmp_i64_i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmp x0, w0, sxtw
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_ultcmp_i64_i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #-2147483648 // =0x80000000
+; CHECK-GI-NEXT:    mov x9, #4294967296 // =0x100000000
+; CHECK-GI-NEXT:    add x8, x0, x8
+; CHECK-GI-NEXT:    cmp x8, x9
+; CHECK-GI-NEXT:    cset w0, lo
+; CHECK-GI-NEXT:    ret
   %tmp0 = add i64 %x, 2147483648 ; 1U << (32-1)
   %tmp1 = icmp ult i64 %tmp0, 4294967296 ; 1U << 32
   ret i1 %tmp1
 }
 
 define i1 @add_ultcmp_i64_i16(i64 %x) nounwind {
-; CHECK-LABEL: add_ultcmp_i64_i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmp x0, w0, sxth
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: add_ultcmp_i64_i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmp x0, w0, sxth
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_ultcmp_i64_i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    add x8, x0, #8, lsl #12 // =32768
+; CHECK-GI-NEXT:    cmp x8, #16, lsl #12 // =65536
+; CHECK-GI-NEXT:    cset w0, lo
+; CHECK-GI-NEXT:    ret
   %tmp0 = add i64 %x, 32768 ; 1U << (16-1)
   %tmp1 = icmp ult i64 %tmp0, 65536 ; 1U << 16
   ret i1 %tmp1
 }
 
 define i1 @add_ultcmp_i64_i8(i64 %x) nounwind {
-; CHECK-LABEL: add_ultcmp_i64_i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmp x0, w0, sxtb
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: add_ultcmp_i64_i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmp x0, w0, sxtb
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_ultcmp_i64_i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    add x8, x0, #128
+; CHECK-GI-NEXT:    cmp x8, #256
+; CHECK-GI-NEXT:    cset w0, lo
+; CHECK-GI-NEXT:    ret
   %tmp0 = add i64 %x, 128 ; 1U << (8-1)
   %tmp1 = icmp ult i64 %tmp0, 256 ; 1U << 8
   ret i1 %tmp1
@@ -267,13 +385,21 @@ define i1 @add_ultcmp_i64_i8(i64 %x) nounwind {
 
 ; Slightly more canonical variant
 define i1 @add_ulecmp_i16_i8(i16 %x) nounwind {
-; CHECK-LABEL: add_ulecmp_i16_i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtb w8, w0
-; CHECK-NEXT:    and w8, w8, #0xffff
-; CHECK-NEXT:    cmp w8, w0, uxth
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: add_ulecmp_i16_i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sxtb w8, w0
+; CHECK-SD-NEXT:    and w8, w8, #0xffff
+; CHECK-SD-NEXT:    cmp w8, w0, uxth
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_ulecmp_i16_i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    add w8, w0, #128
+; CHECK-GI-NEXT:    and w8, w8, #0xffff
+; CHECK-GI-NEXT:    cmp w8, #255
+; CHECK-GI-NEXT:    cset w0, ls
+; CHECK-GI-NEXT:    ret
   %tmp0 = add i16 %x, 128 ; 1U << (8-1)
   %tmp1 = icmp ule i16 %tmp0, 255 ; (1U << 8) - 1
   ret i1 %tmp1
@@ -284,12 +410,20 @@ define i1 @add_ulecmp_i16_i8(i16 %x) nounwind {
 
 ; Adding not a constant
 define i1 @add_ultcmp_bad_i16_i8_add(i16 %x, i16 %y) nounwind {
-; CHECK-LABEL: add_ultcmp_bad_i16_i8_add:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    add w8, w0, w1
-; CHECK-NEXT:    tst w8, #0xff00
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: add_ultcmp_bad_i16_i8_add:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    add w8, w0, w1
+; CHECK-SD-NEXT:    tst w8, #0xff00
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_ultcmp_bad_i16_i8_add:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    add w8, w0, w1
+; CHECK-GI-NEXT:    and w8, w8, #0xffff
+; CHECK-GI-NEXT:    cmp w8, #256
+; CHECK-GI-NEXT:    cset w0, lo
+; CHECK-GI-NEXT:    ret
   %tmp0 = add i16 %x, %y
   %tmp1 = icmp ult i16 %tmp0, 256 ; 1U << 8
   ret i1 %tmp1
@@ -311,12 +445,20 @@ define i1 @add_ultcmp_bad_i16_i8_cmp(i16 %x, i16 %y) nounwind {
 
 ; Second constant is not larger than the first one
 define i1 @add_ultcmp_bad_i8_i16(i16 %x) nounwind {
-; CHECK-LABEL: add_ultcmp_bad_i8_i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xffff
-; CHECK-NEXT:    add w8, w8, #128
-; CHECK-NEXT:    lsr w0, w8, #16
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: add_ultcmp_bad_i8_i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    and w8, w0, #0xffff
+; CHECK-SD-NEXT:    add w8, w8, #128
+; CHECK-SD-NEXT:    lsr w0, w8, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_ultcmp_bad_i8_i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    and w8, w0, #0xffff
+; CHECK-GI-NEXT:    add w8, w8, #128
+; CHECK-GI-NEXT:    cmp w8, w8, uxth
+; CHECK-GI-NEXT:    cset w0, ne
+; CHECK-GI-NEXT:    ret
   %tmp0 = add i16 %x, 128 ; 1U << (8-1)
   %tmp1 = icmp ult i16 %tmp0, 128 ; 1U << (8-1)
   ret i1 %tmp1
@@ -324,12 +466,20 @@ define i1 @add_ultcmp_bad_i8_i16(i16 %x) nounwind {
 
 ; First constant is not power of two
 define i1 @add_ultcmp_bad_i16_i8_c0notpoweroftwo(i16 %x) nounwind {
-; CHECK-LABEL: add_ultcmp_bad_i16_i8_c0notpoweroftwo:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    add w8, w0, #192
-; CHECK-NEXT:    tst w8, #0xff00
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: add_ultcmp_bad_i16_i8_c0notpoweroftwo:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    add w8, w0, #192
+; CHECK-SD-NEXT:    tst w8, #0xff00
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_ultcmp_bad_i16_i8_c0notpoweroftwo:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    add w8, w0, #192
+; CHECK-GI-NEXT:    and w8, w8, #0xffff
+; CHECK-GI-NEXT:    cmp w8, #256
+; CHECK-GI-NEXT:    cset w0, lo
+; CHECK-GI-NEXT:    ret
   %tmp0 = add i16 %x, 192 ; (1U << (8-1)) + (1U << (8-1-1))
   %tmp1 = icmp ult i16 %tmp0, 256 ; 1U << 8
   ret i1 %tmp1
@@ -351,12 +501,20 @@ define i1 @add_ultcmp_bad_i16_i8_c1notpoweroftwo(i16 %x) nounwind {
 
 ; Magic check fails, 64 << 1 != 256
 define i1 @add_ultcmp_bad_i16_i8_magic(i16 %x) nounwind {
-; CHECK-LABEL: add_ultcmp_bad_i16_i8_magic:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    add w8, w0, #64
-; CHECK-NEXT:    tst w8, #0xff00
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: add_ultcmp_bad_i16_i8_magic:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    add w8, w0, #64
+; CHECK-SD-NEXT:    tst w8, #0xff00
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_ultcmp_bad_i16_i8_magic:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    add w8, w0, #64
+; CHECK-GI-NEXT:    and w8, w8, #0xffff
+; CHECK-GI-NEXT:    cmp w8, #256
+; CHECK-GI-NEXT:    cset w0, lo
+; CHECK-GI-NEXT:    ret
   %tmp0 = add i16 %x, 64 ; 1U << (8-1-1)
   %tmp1 = icmp ult i16 %tmp0, 256 ; 1U << 8
   ret i1 %tmp1
@@ -364,12 +522,20 @@ define i1 @add_ultcmp_bad_i16_i8_magic(i16 %x) nounwind {
 
 ; Bad 'destination type'
 define i1 @add_ultcmp_bad_i16_i4(i16 %x) nounwind {
-; CHECK-LABEL: add_ultcmp_bad_i16_i4:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    add w8, w0, #8
-; CHECK-NEXT:    tst w8, #0xfff0
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: add_ultcmp_bad_i16_i4:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    add w8, w0, #8
+; CHECK-SD-NEXT:    tst w8, #0xfff0
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_ultcmp_bad_i16_i4:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    add w8, w0, #8
+; CHECK-GI-NEXT:    and w8, w8, #0xffff
+; CHECK-GI-NEXT:    cmp w8, #16
+; CHECK-GI-NEXT:    cset w0, lo
+; CHECK-GI-NEXT:    ret
   %tmp0 = add i16 %x, 8 ; 1U << (4-1)
   %tmp1 = icmp ult i16 %tmp0, 16 ; 1U << 4
   ret i1 %tmp1
@@ -377,12 +543,20 @@ define i1 @add_ultcmp_bad_i16_i4(i16 %x) nounwind {
 
 ; Bad storage type
 define i1 @add_ultcmp_bad_i24_i8(i24 %x) nounwind {
-; CHECK-LABEL: add_ultcmp_bad_i24_i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    add w8, w0, #128
-; CHECK-NEXT:    tst w8, #0xffff00
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: add_ultcmp_bad_i24_i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    add w8, w0, #128
+; CHECK-SD-NEXT:    tst w8, #0xffff00
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_ultcmp_bad_i24_i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    add w8, w0, #128
+; CHECK-GI-NEXT:    and w8, w8, #0xffffff
+; CHECK-GI-NEXT:    cmp w8, #256
+; CHECK-GI-NEXT:    cset w0, lo
+; CHECK-GI-NEXT:    ret
   %tmp0 = add i24 %x, 128 ; 1U << (8-1)
   %tmp1 = icmp ult i24 %tmp0, 256 ; 1U << 8
   ret i1 %tmp1
diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
index 2583a93e514a2..5b81f5dafe421 100644
--- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
+++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
@@ -426,3 +426,21 @@ define void @zt0_multiple_private_za_calls(ptr %callee) "aarch64_in_zt0" nounwin
   call void %callee()
   ret void
 }
+
+define void @disable_tailcallopt(ptr %callee) "aarch64_inout_zt0" nounwind {
+; CHECK-COMMON-LABEL: disable_tailcallopt:
+; CHECK-COMMON:       // %bb.0:
+; CHECK-COMMON-NEXT:    sub sp, sp, #80
+; CHECK-COMMON-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    mov x19, sp
+; CHECK-COMMON-NEXT:    str zt0, [x19]
+; CHECK-COMMON-NEXT:    smstop za
+; CHECK-COMMON-NEXT:    blr x0
+; CHECK-COMMON-NEXT:    smstart za
+; CHECK-COMMON-NEXT:    ldr zt0, [x19]
+; CHECK-COMMON-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    add sp, sp, #80
+; CHECK-COMMON-NEXT:    ret
+  tail call void %callee()
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/stackmap.ll b/llvm/test/CodeGen/AArch64/stackmap.ll
index 995d2545c6359..26221d0c26eb2 100644
--- a/llvm/test/CodeGen/AArch64/stackmap.ll
+++ b/llvm/test/CodeGen/AArch64/stackmap.ll
@@ -81,14 +81,14 @@
 ; CHECK-NEXT:   .hword  8
 ; CHECK-NEXT:   .hword  0
 ; CHECK-NEXT:   .hword  0
-; CHECK-NEXT:   .word   65535
+; CHECK-NEXT:   .word   -1
 ; SmallConstant
 ; CHECK-NEXT:   .byte   4
 ; CHECK-NEXT:   .byte   0
 ; CHECK-NEXT:   .hword  8
 ; CHECK-NEXT:   .hword  0
 ; CHECK-NEXT:   .hword  0
-; CHECK-NEXT:   .word   65535
+; CHECK-NEXT:   .word   -1
 ; SmallConstant
 ; CHECK-NEXT:   .byte   4
 ; CHECK-NEXT:   .byte   0
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-loads-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-loads-stores.ll
index becddaea31267..b2ed8de369146 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-loads-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-loads-stores.ll
@@ -1,19 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | not grep ptrue
 ; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
 ; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 ; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
 ; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_2048
 
 target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll
new file mode 100644
index 0000000000000..e11720011af10
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll
@@ -0,0 +1,612 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX12 %s
+
+define i16 @s_add_i16(i16 inreg %a, i16 inreg %b) {
+; GFX7-LABEL: s_add_i16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s16, s16, s17
+; GFX7-NEXT:    v_mov_b32_e32 v0, s16
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_add_i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_add_i32 s16, s16, s17
+; GFX9-NEXT:    v_mov_b32_e32 v0, s16
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: s_add_i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_add_i32 s16, s16, s17
+; GFX8-NEXT:    v_mov_b32_e32 v0, s16
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_add_i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_add_i32 s16, s16, s17
+; GFX10-NEXT:    v_mov_b32_e32 v0, s16
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_add_i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_add_i32 s0, s0, s1
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: s_add_i16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_add_co_i32 s0, s0, s1
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %c = add i16 %a, %b
+  ret i16 %c
+}
+
+define i16 @v_add_i16(i16 %a, i16 %b) {
+; GFX7-LABEL: v_add_i16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_add_i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u16_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_add_i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u16_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_add_i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_nc_u16 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_add_i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_nc_u16 v0.l, v0.l, v1.l
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_add_i16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_add_nc_u16 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %c = add i16 %a, %b
+  ret i16 %c
+}
+
+define i32 @s_add_i32(i32 inreg %a, i32 inreg %b) {
+; GFX7-LABEL: s_add_i32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s16, s16, s17
+; GFX7-NEXT:    v_mov_b32_e32 v0, s16
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_add_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_add_i32 s16, s16, s17
+; GFX9-NEXT:    v_mov_b32_e32 v0, s16
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: s_add_i32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_add_i32 s16, s16, s17
+; GFX8-NEXT:    v_mov_b32_e32 v0, s16
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_add_i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_add_i32 s16, s16, s17
+; GFX10-NEXT:    v_mov_b32_e32 v0, s16
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_add_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_add_i32 s0, s0, s1
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: s_add_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_add_co_i32 s0, s0, s1
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %c = add i32 %a, %b
+  ret i32 %c
+}
+
+define i32 @v_add_i32(i32 %a, i32 %b) {
+; GFX7-LABEL: v_add_i32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_add_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_add_i32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_add_i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_add_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_add_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_add_nc_u32_e32 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %c = add i32 %a, %b
+  ret i32 %c
+}
+
+define <2 x i16> @s_add_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) {
+; GFX7-LABEL: s_add_v2i16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s16, s16, s18
+; GFX7-NEXT:    s_add_i32 s17, s17, s19
+; GFX7-NEXT:    v_mov_b32_e32 v0, s16
+; GFX7-NEXT:    v_mov_b32_e32 v1, s17
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_add_v2i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_lshr_b32 s4, s16, 16
+; GFX9-NEXT:    s_lshr_b32 s5, s17, 16
+; GFX9-NEXT:    s_add_i32 s16, s16, s17
+; GFX9-NEXT:    s_add_i32 s4, s4, s5
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s16, s4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: s_add_v2i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_lshr_b32 s4, s16, 16
+; GFX8-NEXT:    s_lshr_b32 s5, s17, 16
+; GFX8-NEXT:    s_add_i32 s4, s4, s5
+; GFX8-NEXT:    s_add_i32 s16, s16, s17
+; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
+; GFX8-NEXT:    s_and_b32 s5, 0xffff, s16
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX8-NEXT:    s_or_b32 s4, s5, s4
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_add_v2i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_lshr_b32 s4, s16, 16
+; GFX10-NEXT:    s_lshr_b32 s5, s17, 16
+; GFX10-NEXT:    s_add_i32 s16, s16, s17
+; GFX10-NEXT:    s_add_i32 s4, s4, s5
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s16, s4
+; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_add_v2i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX11-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX11-NEXT:    s_add_i32 s0, s0, s1
+; GFX11-NEXT:    s_add_i32 s2, s2, s3
+; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: s_add_v2i16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX12-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX12-NEXT:    s_add_co_i32 s0, s0, s1
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_add_co_i32 s2, s2, s3
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %c = add <2 x i16> %a, %b
+  ret <2 x i16> %c
+}
+
+define <2 x i16> @v_add_v2i16(<2 x i16> %a, <2 x i16> %b) {
+; GFX7-LABEL: v_add_v2i16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_add_v2i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_add_u16 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_add_v2i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u16_e32 v2, v0, v1
+; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_add_v2i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_add_u16 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_add_v2i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_add_u16 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_add_v2i16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_add_u16 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %c = add <2 x i16> %a, %b
+  ret <2 x i16> %c
+}
+
+define i64 @s_add_i64(i64 inreg %a, i64 inreg %b) {
+; GFX7-LABEL: s_add_i64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_add_u32 s4, s16, s18
+; GFX7-NEXT:    s_addc_u32 s5, s17, s19
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_add_i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_add_u32 s4, s16, s18
+; GFX9-NEXT:    s_addc_u32 s5, s17, s19
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: s_add_i64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_add_u32 s4, s16, s18
+; GFX8-NEXT:    s_addc_u32 s5, s17, s19
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_add_i64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_add_u32 s4, s16, s18
+; GFX10-NEXT:    s_addc_u32 s5, s17, s19
+; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_add_i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: s_add_i64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %c = add i64 %a, %b
+  ret i64 %c
+}
+
+define i64 @v_add_i64(i64 %a, i64 %b) {
+; GFX7-LABEL: v_add_i64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_add_i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_add_i64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_add_i64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_add_i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_add_i64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-NEXT:    s_wait_alu 0xfffd
+; GFX12-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %c = add i64 %a, %b
+  ret i64 %c
+}
+
+define void @s_uaddo_uadde(i64 inreg %a, i64 inreg %b, ptr addrspace(1) %res, ptr addrspace(1) %carry) {
+; GFX7-LABEL: s_uaddo_uadde:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_add_u32 s4, s16, s18
+; GFX7-NEXT:    s_addc_u32 s5, s17, s19
+; GFX7-NEXT:    v_mov_b32_e32 v4, s4
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX7-NEXT:    v_mov_b32_e32 v5, s5
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_store_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_uaddo_uadde:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_add_u32 s4, s16, s18
+; GFX9-NEXT:    s_addc_u32 s5, s17, s19
+; GFX9-NEXT:    v_mov_b32_e32 v4, s4
+; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[4:5], off
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: s_uaddo_uadde:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_add_u32 s4, s16, s18
+; GFX8-NEXT:    s_addc_u32 s5, s17, s19
+; GFX8-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v5, s5
+; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    flat_store_dword v[2:3], v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_uaddo_uadde:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_add_u32 s4, s16, s18
+; GFX10-NEXT:    s_addc_u32 s5, s17, s19
+; GFX10-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v4, s4
+; GFX10-NEXT:    v_mov_b32_e32 v5, s5
+; GFX10-NEXT:    v_mov_b32_e32 v6, s6
+; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[4:5], off
+; GFX10-NEXT:    global_store_dword v[2:3], v6, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_uaddo_uadde:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX11-NEXT:    v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
+; GFX11-NEXT:    v_mov_b32_e32 v6, s2
+; GFX11-NEXT:    global_store_b64 v[0:1], v[4:5], off
+; GFX11-NEXT:    global_store_b32 v[2:3], v6, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: s_uaddo_uadde:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_add_co_u32 s0, s0, s2
+; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, s3
+; GFX12-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
+; GFX12-NEXT:    v_mov_b32_e32 v6, s2
+; GFX12-NEXT:    global_store_b64 v[0:1], v[4:5], off
+; GFX12-NEXT:    global_store_b32 v[2:3], v6, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %uaddo = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
+  %add = extractvalue {i64, i1} %uaddo, 0
+  %of = extractvalue {i64, i1} %uaddo, 1
+  %of32 = select i1 %of, i32 1, i32 0
+  store i64 %add, ptr addrspace(1) %res
+  store i32 %of32, ptr addrspace(1) %carry
+  ret void
+}
+
+define void @v_uaddo_uadde(i64 %a, i64 %b, ptr addrspace(1) %res, ptr addrspace(1) %carry) {
+; GFX7-LABEL: v_uaddo_uadde:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_store_dword v2, v[6:7], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_uaddo_uadde:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    global_store_dwordx2 v[4:5], v[0:1], off
+; GFX9-NEXT:    global_store_dword v[6:7], v2, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_uaddo_uadde:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
+; GFX8-NEXT:    flat_store_dword v[6:7], v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_uaddo_uadde:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-NEXT:    global_store_dwordx2 v[4:5], v[0:1], off
+; GFX10-NEXT:    global_store_dword v[6:7], v2, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_uaddo_uadde:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT:    global_store_b64 v[4:5], v[0:1], off
+; GFX11-NEXT:    global_store_b32 v[6:7], v2, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_uaddo_uadde:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-NEXT:    s_wait_alu 0xfffd
+; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-NEXT:    s_wait_alu 0xfffd
+; GFX12-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX12-NEXT:    global_store_b64 v[4:5], v[0:1], off
+; GFX12-NEXT:    global_store_b32 v[6:7], v2, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %uaddo = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
+  %add = extractvalue {i64, i1} %uaddo, 0
+  %of = extractvalue {i64, i1} %uaddo, 1
+  %of32 = select i1 %of, i32 1, i32 0
+  store i64 %add, ptr addrspace(1) %res
+  store i32 %of32, ptr addrspace(1) %carry
+  ret void
+}
+
+declare {i64, i1} @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll
new file mode 100644
index 0000000000000..e440beed1da79
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll
@@ -0,0 +1,165 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-FAKE16 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-TRUE16 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-FAKE16 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-TRUE16 %s
+
+define amdgpu_ps half @fadd_s16_uniform(half inreg %a, half inreg %b) {
+; GFX11-FAKE16-LABEL: fadd_s16_uniform:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    v_add_f16_e64 v0, s0, s1
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-TRUE16-LABEL: fadd_s16_uniform:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    v_add_f16_e64 v0.l, s0, s1
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: fadd_s16_uniform:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_add_f16 s0, s0, s1
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-NEXT:    ; return to shader part epilog
+  %fadd = fadd half %a, %b
+  ret half %fadd
+}
+
+define amdgpu_ps half @fadd_s16_div(half %a, half %b) {
+; GFX11-FAKE16-LABEL: fadd_s16_div:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-TRUE16-LABEL: fadd_s16_div:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: fadd_s16_div:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: fadd_s16_div:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+  %fadd = fadd half %a, %b
+  ret half %fadd
+}
+
+define amdgpu_ps float @fadd_s32_uniform(float inreg %a, float inreg %b) {
+; GFX11-LABEL: fadd_s32_uniform:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_add_f32_e64 v0, s0, s1
+; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: fadd_s32_uniform:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_add_f32 s0, s0, s1
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-NEXT:    ; return to shader part epilog
+  %fadd = fadd float %a, %b
+  ret float %fadd
+}
+
+define amdgpu_ps float @fadd_s32_div(float %a, float %b) {
+; GCN-LABEL: fadd_s32_div:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_add_f32_e32 v0, v0, v1
+; GCN-NEXT:    ; return to shader part epilog
+  %fadd = fadd float %a, %b
+  ret float %fadd
+}
+
+define amdgpu_ps void @fadd_s64_uniform(double inreg %a, double inreg %b, ptr addrspace(1) %ptr) {
+; GFX11-LABEL: fadd_s64_uniform:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_add_f64 v[2:3], s[0:1], s[2:3]
+; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: fadd_s64_uniform:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_add_f64_e64 v[2:3], s[0:1], s[2:3]
+; GFX12-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX12-NEXT:    s_endpgm
+  %fadd = fadd double %a, %b
+  store double %fadd, ptr addrspace(1) %ptr
+  ret void
+}
+
+define amdgpu_ps void @fadd_s64_div(double %a, double %b, ptr addrspace(1) %ptr) {
+; GFX11-LABEL: fadd_s64_div:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX11-NEXT:    global_store_b64 v[4:5], v[0:1], off
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: fadd_s64_div:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_add_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    global_store_b64 v[4:5], v[0:1], off
+; GFX12-NEXT:    s_endpgm
+  %fadd = fadd double %a, %b
+  store double %fadd, ptr addrspace(1) %ptr
+  ret void
+}
+
+define amdgpu_ps <2 x half> @fadd_v2s16_uniform(<2 x half> inreg %a, <2 x half> inreg %b) {
+; GFX11-LABEL: fadd_v2s16_uniform:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_pk_add_f16 v0, s0, s1
+; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: fadd_v2s16_uniform:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX12-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX12-NEXT:    s_add_f16 s0, s0, s1
+; GFX12-NEXT:    s_add_f16 s1, s2, s3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX12-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-NEXT:    ; return to shader part epilog
+  %fadd = fadd <2 x half> %a, %b
+  ret <2 x half> %fadd
+}
+
+define amdgpu_ps <2 x half> @fadd_v2s16_div(<2 x half> %a, <2 x half> %b) {
+; GCN-LABEL: fadd_v2s16_div:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_pk_add_f16 v0, v0, v1
+; GCN-NEXT:    ; return to shader part epilog
+  %fadd = fadd <2 x half> %a, %b
+  ret <2 x half> %fadd
+}
+
+define amdgpu_ps <2 x float> @fadd_v2s32_uniform(<2 x float> inreg %a, <2 x float> inreg %b) {
+; GFX11-LABEL: fadd_v2s32_uniform:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_add_f32_e64 v0, s0, s2
+; GFX11-NEXT:    v_add_f32_e64 v1, s1, s3
+; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: fadd_v2s32_uniform:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_add_f32 s0, s0, s2
+; GFX12-NEXT:    s_add_f32 s1, s1, s3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    ; return to shader part epilog
+  %fadd = fadd <2 x float> %a, %b
+  ret <2 x float> %fadd
+}
+
+define amdgpu_ps <2 x float> @fadd_v2s32_div(<2 x float> %a, <2 x float> %b) {
+; GCN-LABEL: fadd_v2s32_div:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3
+; GCN-NEXT:    ; return to shader part epilog
+  %fadd = fadd <2 x float> %a, %b
+  ret <2 x float> %fadd
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll
new file mode 100644
index 0000000000000..588802cbd56c7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+
+define amdgpu_kernel void @fcmp_uniform_select(float %a, i32 %b, i32 %c, ptr addrspace(1) %out) {
+; GFX7-LABEL: fcmp_uniform_select:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x9
+; GFX7-NEXT:    s_load_dword s3, s[4:5], 0xb
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_f32_e64 s[4:5], s6, 0
+; GFX7-NEXT:    s_or_b64 s[4:5], s[4:5], s[4:5]
+; GFX7-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX7-NEXT:    s_and_b32 s4, s4, 1
+; GFX7-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX7-NEXT:    s_cselect_b32 s3, s7, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s3
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: fcmp_uniform_select:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x34
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_eq_f32_e64 s[4:5], s0, 0
+; GFX8-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX8-NEXT:    s_and_b32 s0, s0, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX8-NEXT:    s_cselect_b32 s0, s1, s6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX11-LABEL: fcmp_uniform_select:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
+; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x34
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_f32_e64 s0, s0, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s0, s0, 1
+; GFX11-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-NEXT:    s_cselect_b32 s0, s1, s6
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    global_store_b32 v1, v0, s[2:3]
+; GFX11-NEXT:    s_endpgm
+  %cmp = fcmp oeq float %a, 0.0
+  %sel = select i1 %cmp, i32 %b, i32 %c
+  store i32 %sel, ptr addrspace(1) %out
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir
new file mode 100644
index 0000000000000..b6652f605be19
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir
@@ -0,0 +1,37 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -mcpu=gfx700 -run-pass=instruction-select %s -o - | FileCheck -check-prefixes=GFX7 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx803 -run-pass=instruction-select %s -o - | FileCheck -check-prefixes=GF8 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select %s -o - | FileCheck -check-prefixes=GFX11 %s
+
+---
+name: test_copy_scc_vcc
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; GFX7-LABEL: name: test_copy_scc_vcc
+    ; GFX7: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+    ; GFX7-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[DEF]], [[DEF]], implicit-def $scc
+    ; GFX7-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $scc
+    ; GFX7-NEXT: $sgpr0 = COPY [[COPY]]
+    ; GFX7-NEXT: S_ENDPGM 0, implicit $sgpr0
+    ;
+    ; GF8-LABEL: name: test_copy_scc_vcc
+    ; GF8: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+    ; GF8-NEXT: S_CMP_LG_U64 [[DEF]], 0, implicit-def $scc
+    ; GF8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $scc
+    ; GF8-NEXT: $sgpr0 = COPY [[COPY]]
+    ; GF8-NEXT: S_ENDPGM 0, implicit $sgpr0
+    ;
+    ; GFX11-LABEL: name: test_copy_scc_vcc
+    ; GFX11: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+    ; GFX11-NEXT: S_CMP_LG_U32 [[DEF]], 0, implicit-def $scc
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $scc
+    ; GFX11-NEXT: $sgpr0 = COPY [[COPY]]
+    ; GFX11-NEXT: S_ENDPGM 0, implicit $sgpr0
+    %0:vcc(s1) = G_IMPLICIT_DEF
+    %1:sgpr(s32) = G_AMDGPU_COPY_SCC_VCC %0
+    $sgpr0 = COPY %1
+    S_ENDPGM 0, implicit $sgpr0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
index 02d0e521e3b00..6facdfdec64ae 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
@@ -104,109 +104,110 @@ define amdgpu_cs <4 x i32> @abs_sgpr_v4i32(<4 x i32> inreg %arg) {
   ret <4 x i32> %res
 }
 
-define amdgpu_cs i16 @abs_vgpr_i16(i16 %arg) {
+define i16 @abs_vgpr_i16(i16 %arg) {
 ; GFX6-LABEL: abs_vgpr_i16:
 ; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, 0, v0
 ; GFX6-NEXT:    v_max_i32_e32 v0, v0, v1
-; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX6-NEXT:    ; return to shader part epilog
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: abs_vgpr_i16:
 ; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u16_e32 v1, 0, v0
 ; GFX8-NEXT:    v_max_i16_e32 v0, v0, v1
-; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX8-NEXT:    ; return to shader part epilog
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: abs_vgpr_i16:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_sub_nc_u16 v1, 0, v0
 ; GFX10-NEXT:    v_max_i16 v0, v0, v1
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-NEXT:    ; return to shader part epilog
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1250-LABEL: abs_vgpr_i16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    v_sub_nc_u16 v1, 0, v0
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NEXT:    v_max_i16 v0, v0, v1
-; GFX1250-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT:    ; return to shader part epilog
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %res = call i16 @llvm.abs.i16(i16 %arg, i1 false)
   ret i16 %res
 }
 
-define amdgpu_cs i32 @abs_vgpr_i32(i32 %arg) {
+define i32 @abs_vgpr_i32(i32 %arg) {
 ; GFX6-LABEL: abs_vgpr_i32:
 ; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, 0, v0
 ; GFX6-NEXT:    v_max_i32_e32 v0, v0, v1
-; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX6-NEXT:    ; return to shader part epilog
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: abs_vgpr_i32:
 ; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 0, v0
 ; GFX8-NEXT:    v_max_i32_e32 v0, v0, v1
-; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX8-NEXT:    ; return to shader part epilog
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: abs_vgpr_i32:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
 ; GFX10-NEXT:    v_max_i32_e32 v0, v0, v1
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-NEXT:    ; return to shader part epilog
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1250-LABEL: abs_vgpr_i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NEXT:    v_max_i32_e32 v0, v0, v1
-; GFX1250-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT:    ; return to shader part epilog
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %res = call i32 @llvm.abs.i32(i32 %arg, i1 false)
   ret i32 %res
 }
 
-define amdgpu_cs i64 @abs_vgpr_i64(i64 %arg) {
+define i64 @abs_vgpr_i64(i64 %arg) {
 ; GFX6-LABEL: abs_vgpr_i64:
 ; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_xor_b32_e32 v1, v1, v2
-; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX6-NEXT:    ; return to shader part epilog
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: abs_vgpr_i64:
 ; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_xor_b32_e32 v1, v1, v2
-; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX8-NEXT:    ; return to shader part epilog
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: abs_vgpr_i64:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
 ; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_xor_b32_e32 v1, v1, v2
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX10-NEXT:    ; return to shader part epilog
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1250-LABEL: abs_vgpr_i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_mov_b32_e32 v3, v2
@@ -214,17 +215,15 @@ define amdgpu_cs i64 @abs_vgpr_i64(i64 %arg) {
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1250-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; GFX1250-NEXT:    v_xor_b32_e32 v1, v1, v2
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX1250-NEXT:    ; return to shader part epilog
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %res = call i64 @llvm.abs.i64(i64 %arg, i1 false)
   ret i64 %res
 }
 
-define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
+define <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
 ; GFX6-LABEL: abs_vgpr_v4i32:
 ; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 0, v0
 ; GFX6-NEXT:    v_max_i32_e32 v0, v0, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 0, v1
@@ -233,14 +232,11 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
 ; GFX6-NEXT:    v_max_i32_e32 v2, v2, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 0, v3
 ; GFX6-NEXT:    v_max_i32_e32 v3, v3, v4
-; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX6-NEXT:    v_readfirstlane_b32 s2, v2
-; GFX6-NEXT:    v_readfirstlane_b32 s3, v3
-; GFX6-NEXT:    ; return to shader part epilog
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: abs_vgpr_v4i32:
 ; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 0, v0
 ; GFX8-NEXT:    v_max_i32_e32 v0, v0, v4
 ; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 0, v1
@@ -249,14 +245,11 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
 ; GFX8-NEXT:    v_max_i32_e32 v2, v2, v4
 ; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 0, v3
 ; GFX8-NEXT:    v_max_i32_e32 v3, v3, v4
-; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX8-NEXT:    v_readfirstlane_b32 s2, v2
-; GFX8-NEXT:    v_readfirstlane_b32 s3, v3
-; GFX8-NEXT:    ; return to shader part epilog
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: abs_vgpr_v4i32:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v4, 0, v0
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v5, 0, v1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 0, v2
@@ -265,14 +258,12 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
 ; GFX10-NEXT:    v_max_i32_e32 v1, v1, v5
 ; GFX10-NEXT:    v_max_i32_e32 v2, v2, v6
 ; GFX10-NEXT:    v_max_i32_e32 v3, v3, v7
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
-; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
-; GFX10-NEXT:    ; return to shader part epilog
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1250-LABEL: abs_vgpr_v4i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    v_dual_sub_nc_u32 v4, 0, v0 :: v_dual_sub_nc_u32 v5, 0, v1
 ; GFX1250-NEXT:    v_dual_sub_nc_u32 v6, 0, v2 :: v_dual_sub_nc_u32 v7, 0, v3
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -281,13 +272,7 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1250-NEXT:    v_max_i32_e32 v2, v2, v6
 ; GFX1250-NEXT:    v_max_i32_e32 v3, v3, v7
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT:    v_readfirstlane_b32 s2, v2
-; GFX1250-NEXT:    v_readfirstlane_b32 s3, v3
-; GFX1250-NEXT:    ; return to shader part epilog
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %arg, i1 false)
   ret <4 x i32> %res
 }
@@ -304,44 +289,43 @@ define amdgpu_cs <2 x i8> @abs_sgpr_v2i8(<2 x i8> inreg %arg) {
   ret <2 x i8> %res
 }
 
-define amdgpu_cs <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) {
+define <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) {
 ; GFX6-LABEL: abs_vgpr_v2i8:
 ; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 8
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 0, v0
 ; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 8
 ; GFX6-NEXT:    v_max_i32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 0, v1
 ; GFX6-NEXT:    v_max_i32_e32 v1, v1, v2
-; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX6-NEXT:    ; return to shader part epilog
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: abs_vgpr_v2i8:
 ; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    v_sub_u16_sdwa v3, v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    v_sub_u16_sdwa v2, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    v_max_i16_sdwa v0, sext(v0), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_max_i16_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX8-NEXT:    ; return to shader part epilog
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: abs_vgpr_v2i8:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_bfe_i32 v0, v0, 0, 8
 ; GFX10-NEXT:    v_bfe_i32 v1, v1, 0, 8
 ; GFX10-NEXT:    v_sub_nc_u16 v2, 0, v0
 ; GFX10-NEXT:    v_sub_nc_u16 v3, 0, v1
 ; GFX10-NEXT:    v_max_i16 v0, v0, v2
 ; GFX10-NEXT:    v_max_i16 v1, v1, v3
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX10-NEXT:    ; return to shader part epilog
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1250-LABEL: abs_vgpr_v2i8:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    v_bfe_i32 v0, v0, 0, 8
 ; GFX1250-NEXT:    v_bfe_i32 v1, v1, 0, 8
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -350,10 +334,7 @@ define amdgpu_cs <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) {
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1250-NEXT:    v_max_i16 v0, v0, v2
 ; GFX1250-NEXT:    v_max_i16 v1, v1, v3
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX1250-NEXT:    ; return to shader part epilog
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %res = call <2 x i8> @llvm.abs.v2i8(<2 x i8> %arg, i1 false)
   ret <2 x i8> %res
 }
@@ -372,9 +353,10 @@ define amdgpu_cs <3 x i8> @abs_sgpr_v3i8(<3 x i8> inreg %arg) {
   ret <3 x i8> %res
 }
 
-define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8>  %arg) {
+define <3 x i8> @abs_vgpr_v3i8(<3 x i8>  %arg) {
 ; GFX6-LABEL: abs_vgpr_v3i8:
 ; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 8
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 0, v0
 ; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 8
@@ -384,13 +366,11 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8>  %arg) {
 ; GFX6-NEXT:    v_max_i32_e32 v1, v1, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 0, v2
 ; GFX6-NEXT:    v_max_i32_e32 v2, v2, v3
-; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX6-NEXT:    v_readfirstlane_b32 s2, v2
-; GFX6-NEXT:    ; return to shader part epilog
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: abs_vgpr_v3i8:
 ; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX8-NEXT:    v_sub_u16_sdwa v4, v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    v_max_i16_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -398,13 +378,11 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8>  %arg) {
 ; GFX8-NEXT:    v_sub_u16_sdwa v3, v3, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    v_max_i16_sdwa v1, sext(v1), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_max_i16_sdwa v2, sext(v2), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX8-NEXT:    v_readfirstlane_b32 s2, v2
-; GFX8-NEXT:    ; return to shader part epilog
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: abs_vgpr_v3i8:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_bfe_i32 v0, v0, 0, 8
 ; GFX10-NEXT:    v_bfe_i32 v1, v1, 0, 8
 ; GFX10-NEXT:    v_bfe_i32 v2, v2, 0, 8
@@ -414,13 +392,12 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8>  %arg) {
 ; GFX10-NEXT:    v_max_i16 v0, v0, v3
 ; GFX10-NEXT:    v_max_i16 v1, v1, v4
 ; GFX10-NEXT:    v_max_i16 v2, v2, v5
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
-; GFX10-NEXT:    ; return to shader part epilog
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1250-LABEL: abs_vgpr_v3i8:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    v_bfe_i32 v0, v0, 0, 8
 ; GFX1250-NEXT:    v_bfe_i32 v1, v1, 0, 8
 ; GFX1250-NEXT:    v_bfe_i32 v2, v2, 0, 8
@@ -433,12 +410,7 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8>  %arg) {
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1250-NEXT:    v_max_i16 v1, v1, v4
 ; GFX1250-NEXT:    v_max_i16 v2, v2, v5
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX1250-NEXT:    v_readfirstlane_b32 s2, v2
-; GFX1250-NEXT:    ; return to shader part epilog
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %res = call <3 x i8> @llvm.abs.v3i8(<3 x i8> %arg, i1 false)
   ret <3 x i8> %res
 }
@@ -485,44 +457,44 @@ define amdgpu_cs <2 x i16> @abs_sgpr_v2i16(<2 x i16> inreg %arg) {
   ret <2 x i16> %res
 }
 
-define amdgpu_cs <2 x i16> @abs_vgpr_v2i16(<2 x i16> %arg) {
+define <2 x i16> @abs_vgpr_v2i16(<2 x i16> %arg) {
 ; GFX6-LABEL: abs_vgpr_v2i16:
 ; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 0, v0
 ; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
 ; GFX6-NEXT:    v_max_i32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 0, v1
 ; GFX6-NEXT:    v_max_i32_e32 v1, v1, v2
-; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX6-NEXT:    ; return to shader part epilog
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: abs_vgpr_v2i16:
 ; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    v_sub_u16_e32 v1, 0, v0
 ; GFX8-NEXT:    v_sub_u16_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_i16_e32 v1, v0, v1
 ; GFX8-NEXT:    v_max_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX8-NEXT:    ; return to shader part epilog
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: abs_vgpr_v2i16:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_i16 v1, 0, v0
 ; GFX10-NEXT:    v_pk_max_i16 v0, v0, v1
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-NEXT:    ; return to shader part epilog
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1250-LABEL: abs_vgpr_v2i16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    v_pk_sub_i16 v1, 0, v0
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NEXT:    v_pk_max_i16 v0, v0, v1
-; GFX1250-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT:    ; return to shader part epilog
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %res = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %arg, i1 false)
   ret <2 x i16> %res
 }
@@ -576,9 +548,10 @@ define amdgpu_cs <3 x i16> @abs_sgpr_v3i16(<3 x i16> inreg %arg) {
   ret <3 x i16> %res
 }
 
-define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) {
+define <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) {
 ; GFX6-LABEL: abs_vgpr_v3i16:
 ; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 0, v0
 ; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
@@ -588,13 +561,11 @@ define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) {
 ; GFX6-NEXT:    v_max_i32_e32 v1, v1, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 0, v2
 ; GFX6-NEXT:    v_max_i32_e32 v2, v2, v3
-; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX6-NEXT:    v_readfirstlane_b32 s2, v2
-; GFX6-NEXT:    ; return to shader part epilog
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: abs_vgpr_v3i16:
 ; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX8-NEXT:    v_sub_u16_e32 v2, 0, v0
 ; GFX8-NEXT:    v_sub_u16_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -603,31 +574,27 @@ define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) {
 ; GFX8-NEXT:    v_max_i16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX8-NEXT:    v_max_i16_e32 v1, v1, v4
-; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX8-NEXT:    ; return to shader part epilog
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: abs_vgpr_v3i16:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_i16 v2, 0, v0
 ; GFX10-NEXT:    v_sub_nc_u16 v3, 0, v1
 ; GFX10-NEXT:    v_pk_max_i16 v0, v0, v2
 ; GFX10-NEXT:    v_max_i16 v1, v1, v3
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX10-NEXT:    ; return to shader part epilog
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1250-LABEL: abs_vgpr_v3i16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    v_pk_sub_i16 v2, 0, v0
 ; GFX1250-NEXT:    v_sub_nc_u16 v3, 0, v1
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1250-NEXT:    v_pk_max_i16 v0, v0, v2
 ; GFX1250-NEXT:    v_max_i16 v1, v1, v3
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX1250-NEXT:    ; return to shader part epilog
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %res = call <3 x i16> @llvm.abs.v3i16(<3 x i16> %arg, i1 false)
   ret <3 x i16> %res
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
index 7714c032d1737..d3e211855d7ed 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
@@ -113,9 +113,9 @@ false:
 define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
 ; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_and_b32 s0, 1, s0
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
-; CHECK-NEXT:    s_cmp_eq_u32 s0, 0
+; CHECK-NEXT:    s_xor_b32 s0, s0, 1
+; CHECK-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB8_2
 ; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
@@ -161,16 +161,17 @@ false:
 define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
 ; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_and_b32 s0, 1, s0
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
+; CHECK-NEXT:    s_xor_b32 s0, s0, 1
+; CHECK-NEXT:    s_xor_b32 s0, s0, 1
+; CHECK-NEXT:    s_and_b32 s0, s0, 1
 ; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
-; CHECK-NEXT:    s_cbranch_scc0 .LBB10_2
-; CHECK-NEXT:  ; %bb.1: ; %false
-; CHECK-NEXT:    s_mov_b32 s0, 33
-; CHECK-NEXT:    s_branch .LBB10_3
-; CHECK-NEXT:  .LBB10_2: ; %true
+; CHECK-NEXT:    s_cbranch_scc1 .LBB10_2
+; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
 ; CHECK-NEXT:    s_branch .LBB10_3
+; CHECK-NEXT:  .LBB10_2: ; %false
+; CHECK-NEXT:    s_mov_b32 s0, 33
+; CHECK-NEXT:    s_branch .LBB10_3
 ; CHECK-NEXT:  .LBB10_3:
   %c = trunc i32 %v to i1
   %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
@@ -208,11 +209,7 @@ false:
 define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
 ; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_cmp_lt_u32 s0, 12
-; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
-; CHECK-NEXT:    s_and_b32 s0, 1, s0
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
-; CHECK-NEXT:    s_cmp_eq_u32 s0, 0
+; CHECK-NEXT:    s_cmp_ge_u32 s0, 12
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB12_2
 ; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
@@ -258,17 +255,13 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
 ; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_cmp_lt_u32 s0, 12
-; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
-; CHECK-NEXT:    s_and_b32 s0, 1, s0
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
-; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
-; CHECK-NEXT:    s_cbranch_scc0 .LBB14_2
-; CHECK-NEXT:  ; %bb.1: ; %false
-; CHECK-NEXT:    s_mov_b32 s0, 33
-; CHECK-NEXT:    s_branch .LBB14_3
-; CHECK-NEXT:  .LBB14_2: ; %true
+; CHECK-NEXT:    s_cbranch_scc1 .LBB14_2
+; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
 ; CHECK-NEXT:    s_branch .LBB14_3
+; CHECK-NEXT:  .LBB14_2: ; %false
+; CHECK-NEXT:    s_mov_b32 s0, 33
+; CHECK-NEXT:    s_branch .LBB14_3
 ; CHECK-NEXT:  .LBB14_3:
   %c = icmp ult i32 %v, 12
   %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
@@ -310,14 +303,12 @@ false:
 define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) {
 ; CHECK-LABEL: branch_uniform_ballot_ne_zero_and:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_cmp_lt_u32 s0, 12
+; CHECK-NEXT:    s_cmp_ge_u32 s0, 12
 ; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
-; CHECK-NEXT:    s_cmp_gt_u32 s1, 34
+; CHECK-NEXT:    s_cmp_le_u32 s1, 34
 ; CHECK-NEXT:    s_cselect_b32 s1, 1, 0
-; CHECK-NEXT:    s_and_b32 s0, s0, s1
-; CHECK-NEXT:    s_and_b32 s0, 1, s0
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
-; CHECK-NEXT:    s_cmp_eq_u32 s0, 0
+; CHECK-NEXT:    s_or_b32 s0, s0, s1
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB16_2
 ; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
@@ -372,16 +363,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
 ; CHECK-NEXT:    s_cmp_gt_u32 s1, 34
 ; CHECK-NEXT:    s_cselect_b32 s1, 1, 0
 ; CHECK-NEXT:    s_and_b32 s0, s0, s1
-; CHECK-NEXT:    s_and_b32 s0, 1, s0
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
 ; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
-; CHECK-NEXT:    s_cbranch_scc0 .LBB18_2
-; CHECK-NEXT:  ; %bb.1: ; %false
-; CHECK-NEXT:    s_mov_b32 s0, 33
-; CHECK-NEXT:    s_branch .LBB18_3
-; CHECK-NEXT:  .LBB18_2: ; %true
+; CHECK-NEXT:    s_cbranch_scc1 .LBB18_2
+; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
 ; CHECK-NEXT:    s_branch .LBB18_3
+; CHECK-NEXT:  .LBB18_2: ; %false
+; CHECK-NEXT:    s_mov_b32 s0, 33
+; CHECK-NEXT:    s_branch .LBB18_3
 ; CHECK-NEXT:  .LBB18_3:
   %v1c = icmp ult i32 %v1, 12
   %v2c = icmp ugt i32 %v2, 34
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
index 7b8166948610b..250fbc7c0f147 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
@@ -116,9 +116,9 @@ false:
 define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
 ; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_and_b32 s0, 1, s0
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
-; CHECK-NEXT:    s_cmp_eq_u64 s[0:1], 0
+; CHECK-NEXT:    s_xor_b32 s0, s0, 1
+; CHECK-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB8_2
 ; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
@@ -164,16 +164,17 @@ false:
 define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
 ; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_and_b32 s0, 1, s0
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
-; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; CHECK-NEXT:    s_cbranch_scc0 .LBB10_2
-; CHECK-NEXT:  ; %bb.1: ; %false
-; CHECK-NEXT:    s_mov_b32 s0, 33
-; CHECK-NEXT:    s_branch .LBB10_3
-; CHECK-NEXT:  .LBB10_2: ; %true
+; CHECK-NEXT:    s_xor_b32 s0, s0, 1
+; CHECK-NEXT:    s_xor_b32 s0, s0, 1
+; CHECK-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
+; CHECK-NEXT:    s_cbranch_scc1 .LBB10_2
+; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
 ; CHECK-NEXT:    s_branch .LBB10_3
+; CHECK-NEXT:  .LBB10_2: ; %false
+; CHECK-NEXT:    s_mov_b32 s0, 33
+; CHECK-NEXT:    s_branch .LBB10_3
 ; CHECK-NEXT:  .LBB10_3:
   %c = trunc i32 %v to i1
   %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
@@ -211,11 +212,7 @@ false:
 define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
 ; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_cmp_lt_u32 s0, 12
-; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
-; CHECK-NEXT:    s_and_b32 s0, 1, s0
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
-; CHECK-NEXT:    s_cmp_eq_u64 s[0:1], 0
+; CHECK-NEXT:    s_cmp_ge_u32 s0, 12
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB12_2
 ; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
@@ -261,17 +258,13 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
 ; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_cmp_lt_u32 s0, 12
-; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
-; CHECK-NEXT:    s_and_b32 s0, 1, s0
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
-; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; CHECK-NEXT:    s_cbranch_scc0 .LBB14_2
-; CHECK-NEXT:  ; %bb.1: ; %false
-; CHECK-NEXT:    s_mov_b32 s0, 33
-; CHECK-NEXT:    s_branch .LBB14_3
-; CHECK-NEXT:  .LBB14_2: ; %true
+; CHECK-NEXT:    s_cbranch_scc1 .LBB14_2
+; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
 ; CHECK-NEXT:    s_branch .LBB14_3
+; CHECK-NEXT:  .LBB14_2: ; %false
+; CHECK-NEXT:    s_mov_b32 s0, 33
+; CHECK-NEXT:    s_branch .LBB14_3
 ; CHECK-NEXT:  .LBB14_3:
   %c = icmp ult i32 %v, 12
   %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
@@ -313,14 +306,12 @@ false:
 define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) {
 ; CHECK-LABEL: branch_uniform_ballot_ne_zero_and:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_cmp_lt_u32 s0, 12
+; CHECK-NEXT:    s_cmp_ge_u32 s0, 12
 ; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
-; CHECK-NEXT:    s_cmp_gt_u32 s1, 34
+; CHECK-NEXT:    s_cmp_le_u32 s1, 34
 ; CHECK-NEXT:    s_cselect_b32 s1, 1, 0
-; CHECK-NEXT:    s_and_b32 s0, s0, s1
-; CHECK-NEXT:    s_and_b32 s0, 1, s0
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
-; CHECK-NEXT:    s_cmp_eq_u64 s[0:1], 0
+; CHECK-NEXT:    s_or_b32 s0, s0, s1
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB16_2
 ; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
@@ -375,16 +366,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
 ; CHECK-NEXT:    s_cmp_gt_u32 s1, 34
 ; CHECK-NEXT:    s_cselect_b32 s1, 1, 0
 ; CHECK-NEXT:    s_and_b32 s0, s0, s1
-; CHECK-NEXT:    s_and_b32 s0, 1, s0
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
-; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; CHECK-NEXT:    s_cbranch_scc0 .LBB18_2
-; CHECK-NEXT:  ; %bb.1: ; %false
-; CHECK-NEXT:    s_mov_b32 s0, 33
-; CHECK-NEXT:    s_branch .LBB18_3
-; CHECK-NEXT:  .LBB18_2: ; %true
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
+; CHECK-NEXT:    s_cbranch_scc1 .LBB18_2
+; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
 ; CHECK-NEXT:    s_branch .LBB18_3
+; CHECK-NEXT:  .LBB18_2: ; %false
+; CHECK-NEXT:    s_mov_b32 s0, 33
+; CHECK-NEXT:    s_branch .LBB18_3
 ; CHECK-NEXT:  .LBB18_3:
   %v1c = icmp ult i32 %v1, 12
   %v2c = icmp ugt i32 %v2, 34
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll
index 221e2fd4f00f7..09e1fca3f2677 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll
@@ -1200,7 +1200,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3
 ; GFX6-NEXT:    s_mov_b32 s5, s7
 ; GFX6-NEXT:    s_mov_b32 s6, s8
 ; GFX6-NEXT:    s_mov_b32 s7, s9
-; GFX6-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc
+; GFX6-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: atomic_cmpswap_i32_1d_no_return:
@@ -1213,7 +1213,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3
 ; GFX8-NEXT:    s_mov_b32 s5, s7
 ; GFX8-NEXT:    s_mov_b32 s6, s8
 ; GFX8-NEXT:    s_mov_b32 s7, s9
-; GFX8-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc
+; GFX8-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX900-LABEL: atomic_cmpswap_i32_1d_no_return:
@@ -1226,7 +1226,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3
 ; GFX900-NEXT:    s_mov_b32 s5, s7
 ; GFX900-NEXT:    s_mov_b32 s6, s8
 ; GFX900-NEXT:    s_mov_b32 s7, s9
-; GFX900-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc
+; GFX900-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm
 ; GFX900-NEXT:    s_endpgm
 ;
 ; GFX90A-LABEL: atomic_cmpswap_i32_1d_no_return:
@@ -1239,7 +1239,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3
 ; GFX90A-NEXT:    s_mov_b32 s5, s7
 ; GFX90A-NEXT:    s_mov_b32 s6, s8
 ; GFX90A-NEXT:    s_mov_b32 s7, s9
-; GFX90A-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc
+; GFX90A-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX10PLUS-LABEL: atomic_cmpswap_i32_1d_no_return:
@@ -1252,7 +1252,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3
 ; GFX10PLUS-NEXT:    s_mov_b32 s5, s7
 ; GFX10PLUS-NEXT:    s_mov_b32 s6, s8
 ; GFX10PLUS-NEXT:    s_mov_b32 s7, s9
-; GFX10PLUS-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc
+; GFX10PLUS-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm
 ; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_cmpswap_i32_1d_no_return:
@@ -1265,7 +1265,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
-; GFX12-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %cmp, i32 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -3194,7 +3194,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6
 ; GFX6-NEXT:    s_mov_b32 s5, s7
 ; GFX6-NEXT:    s_mov_b32 s6, s8
 ; GFX6-NEXT:    s_mov_b32 s7, s9
-; GFX6-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc
+; GFX6-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: atomic_cmpswap_i64_1d_no_return:
@@ -3207,7 +3207,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6
 ; GFX8-NEXT:    s_mov_b32 s5, s7
 ; GFX8-NEXT:    s_mov_b32 s6, s8
 ; GFX8-NEXT:    s_mov_b32 s7, s9
-; GFX8-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc
+; GFX8-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX900-LABEL: atomic_cmpswap_i64_1d_no_return:
@@ -3220,7 +3220,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6
 ; GFX900-NEXT:    s_mov_b32 s5, s7
 ; GFX900-NEXT:    s_mov_b32 s6, s8
 ; GFX900-NEXT:    s_mov_b32 s7, s9
-; GFX900-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc
+; GFX900-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm
 ; GFX900-NEXT:    s_endpgm
 ;
 ; GFX90A-LABEL: atomic_cmpswap_i64_1d_no_return:
@@ -3233,7 +3233,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6
 ; GFX90A-NEXT:    s_mov_b32 s5, s7
 ; GFX90A-NEXT:    s_mov_b32 s6, s8
 ; GFX90A-NEXT:    s_mov_b32 s7, s9
-; GFX90A-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc
+; GFX90A-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX10PLUS-LABEL: atomic_cmpswap_i64_1d_no_return:
@@ -3246,7 +3246,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6
 ; GFX10PLUS-NEXT:    s_mov_b32 s5, s7
 ; GFX10PLUS-NEXT:    s_mov_b32 s6, s8
 ; GFX10PLUS-NEXT:    s_mov_b32 s7, s9
-; GFX10PLUS-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc
+; GFX10PLUS-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
 ; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_cmpswap_i64_1d_no_return:
@@ -3259,7 +3259,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
-; GFX12-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.cmpswap.1d.i64.i32(i64 %cmp, i64 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.mir
index 292fa4be1ca1d..4f160b6cb4b1b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.mir
@@ -25,6 +25,7 @@ body: |
     ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V2_V1_si]].sub0
     ; GFX6-NEXT: $vgpr0 = COPY [[COPY3]]
     ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+    ;
     ; GFX8-LABEL: name: atomic_cmpswap_i32_1d
     ; GFX8: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2
     ; GFX8-NEXT: {{  $}}
@@ -35,6 +36,7 @@ body: |
     ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V2_V1_vi]].sub0
     ; GFX8-NEXT: $vgpr0 = COPY [[COPY3]]
     ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+    ;
     ; GFX10-LABEL: name: atomic_cmpswap_i32_1d
     ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
@@ -45,6 +47,7 @@ body: |
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx10_]].sub0
     ; GFX10-NEXT: $vgpr0 = COPY [[COPY3]]
     ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+    ;
     ; GFX11-LABEL: name: atomic_cmpswap_i32_1d
     ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2
     ; GFX11-NEXT: {{  $}}
@@ -55,6 +58,7 @@ body: |
     ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx11_]].sub0
     ; GFX11-NEXT: $vgpr0 = COPY [[COPY3]]
     ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+    ;
     ; GFX12-LABEL: name: atomic_cmpswap_i32_1d
     ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2
     ; GFX12-NEXT: {{  $}}
@@ -89,39 +93,43 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX6-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V2_V1_si:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_si [[COPY1]], [[COPY2]], [[COPY]], 3, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8)
+    ; GFX6-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V2_V1_si [[COPY1]], [[COPY2]], [[COPY]], 3, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8)
     ; GFX6-NEXT: S_ENDPGM 0
+    ;
     ; GFX8-LABEL: name: atomic_cmpswap_i32_1d_no_return
     ; GFX8: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX8-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V1_V1_vi:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_vi [[COPY1]], [[COPY2]], [[COPY]], 3, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8)
+    ; GFX8-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V2_V1_vi [[COPY1]], [[COPY2]], [[COPY]], 3, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8)
     ; GFX8-NEXT: S_ENDPGM 0
+    ;
     ; GFX10-LABEL: name: atomic_cmpswap_i32_1d_no_return
     ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx10_:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx10 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8)
+    ; GFX10-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V2_V1_gfx10 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8)
     ; GFX10-NEXT: S_ENDPGM 0
+    ;
     ; GFX11-LABEL: name: atomic_cmpswap_i32_1d_no_return
     ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx11_:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx11 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8)
+    ; GFX11-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V2_V1_gfx11 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8)
     ; GFX11-NEXT: S_ENDPGM 0
+    ;
     ; GFX12-LABEL: name: atomic_cmpswap_i32_1d_no_return
     ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
     ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX12-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx12_:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx12 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 1, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8)
+    ; GFX12-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V2_V1_gfx12 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8)
     ; GFX12-NEXT: S_ENDPGM 0
     %0:sgpr(<8 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
     %1:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1
@@ -150,6 +158,7 @@ body: |
     ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V4_V1_si]].sub0_sub1
     ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[COPY3]]
     ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0_vgpr1
+    ;
     ; GFX8-LABEL: name: atomic_cmpswap_i64_1d
     ; GFX8: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4
     ; GFX8-NEXT: {{  $}}
@@ -160,6 +169,7 @@ body: |
     ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V4_V1_vi]].sub0_sub1
     ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[COPY3]]
     ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0_vgpr1
+    ;
     ; GFX10-LABEL: name: atomic_cmpswap_i64_1d
     ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4
     ; GFX10-NEXT: {{  $}}
@@ -170,6 +180,7 @@ body: |
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx10_]].sub0_sub1
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[COPY3]]
     ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0_vgpr1
+    ;
     ; GFX11-LABEL: name: atomic_cmpswap_i64_1d
     ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4
     ; GFX11-NEXT: {{  $}}
@@ -180,6 +191,7 @@ body: |
     ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx11_]].sub0_sub1
     ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[COPY3]]
     ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0_vgpr1
+    ;
     ; GFX12-LABEL: name: atomic_cmpswap_i64_1d
     ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4
     ; GFX12-NEXT: {{  $}}
@@ -214,39 +226,43 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
     ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX6-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_si:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_si [[COPY1]], [[COPY2]], [[COPY]], 15, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8)
+    ; GFX6-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V4_V1_si [[COPY1]], [[COPY2]], [[COPY]], 15, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8)
     ; GFX6-NEXT: S_ENDPGM 0
+    ;
     ; GFX8-LABEL: name: atomic_cmpswap_i64_1d_no_return
     ; GFX8: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
     ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX8-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_vi:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_vi [[COPY1]], [[COPY2]], [[COPY]], 15, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8)
+    ; GFX8-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V4_V1_vi [[COPY1]], [[COPY2]], [[COPY]], 15, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8)
     ; GFX8-NEXT: S_ENDPGM 0
+    ;
     ; GFX10-LABEL: name: atomic_cmpswap_i64_1d_no_return
     ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX10-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx10_:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx10 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8)
+    ; GFX10-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V4_V1_gfx10 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8)
     ; GFX10-NEXT: S_ENDPGM 0
+    ;
     ; GFX11-LABEL: name: atomic_cmpswap_i64_1d_no_return
     ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
     ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX11-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx11_:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx11 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8)
+    ; GFX11-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V4_V1_gfx11 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8)
     ; GFX11-NEXT: S_ENDPGM 0
+    ;
     ; GFX12-LABEL: name: atomic_cmpswap_i64_1d_no_return
     ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
     ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
     ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX12-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx12_:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx12 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 1, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8)
+    ; GFX12-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V4_V1_gfx12 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8)
     ; GFX12-NEXT: S_ENDPGM 0
     %0:sgpr(<8 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
     %1:vgpr(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.mir
new file mode 100644
index 0000000000000..097372a957461
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.mir
@@ -0,0 +1,524 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=amdgpu-regbankselect,amdgpu-regbanklegalize %s -verify-machineinstrs -o - | FileCheck %s
+---
+name: add_s16_ss
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1
+    ; CHECK-LABEL: name: add_s16_ss
+    ; CHECK: liveins: $sgpr0, $sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32)
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s16)
+    ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC1]](s16)
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[ANYEXT]], [[ANYEXT1]]
+    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC [[ADD]](s32)
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s16) = G_AND [[TRUNC2]], [[TRUNC2]]
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = COPY $sgpr1
+    %2:_(s16) = G_TRUNC %0
+    %3:_(s16) = G_TRUNC %1
+    %4:_(s16) = G_ADD %2, %3
+    %5:_(s16) = G_AND %4, %4
+...
+
+---
+name: add_s16_sv
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $vgpr0
+    ; CHECK-LABEL: name: add_s16_sv
+    ; CHECK: liveins: $sgpr0, $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32)
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s16) = COPY [[TRUNC]](s16)
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s16) = G_ADD [[COPY2]], [[TRUNC1]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s16) = G_AND [[ADD]], [[ADD]]
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = COPY $vgpr0
+    %2:_(s16) = G_TRUNC %0
+    %3:_(s16) = G_TRUNC %1
+    %4:_(s16) = G_ADD %2, %3
+    %5:_(s16) = G_AND %4, %4
+...
+
+---
+name: add_s16_vs
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $vgpr0
+    ; CHECK-LABEL: name: add_s16_vs
+    ; CHECK: liveins: $sgpr0, $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32)
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s16) = COPY [[TRUNC1]](s16)
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s16) = G_ADD [[TRUNC]], [[COPY2]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s16) = G_AND [[ADD]], [[ADD]]
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $sgpr0
+    %2:_(s16) = G_TRUNC %0
+    %3:_(s16) = G_TRUNC %1
+    %4:_(s16) = G_ADD %2, %3
+    %5:_(s16) = G_AND %4, %4
+...
+
+---
+name: add_s16_vv
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+    ; CHECK-LABEL: name: add_s16_vv
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32)
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s16) = G_ADD [[TRUNC]], [[TRUNC1]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s16) = G_AND [[ADD]], [[ADD]]
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s16) = G_TRUNC %0
+    %3:_(s16) = G_TRUNC %1
+    %4:_(s16) = G_ADD %2, %3
+    %5:_(s16) = G_AND %4, %4
+...
+
+---
+name: add_s32_ss
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1
+    ; CHECK-LABEL: name: add_s32_ss
+    ; CHECK: liveins: $sgpr0, $sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[ADD]], [[ADD]]
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = COPY $sgpr1
+    %2:_(s32) = G_ADD %0, %1
+    %3:_(s32) = G_AND %2, %2
+...
+
+---
+name: add_s32_sv
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $vgpr0
+    ; CHECK-LABEL: name: add_s32_sv
+    ; CHECK: liveins: $sgpr0, $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY2]], [[COPY1]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[ADD]], [[ADD]]
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = COPY $vgpr0
+    %2:_(s32) = G_ADD %0, %1
+    %3:_(s32) = G_AND %2, %2
+...
+
+---
+name: add_s32_vs
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $vgpr0
+    ; CHECK-LABEL: name: add_s32_vs
+    ; CHECK: liveins: $sgpr0, $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY]], [[COPY2]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[ADD]], [[ADD]]
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $sgpr0
+    %2:_(s32) = G_ADD %0, %1
+    %3:_(s32) = G_AND %2, %2
+...
+
+---
+name: add_s32_vv
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+    ; CHECK-LABEL: name: add_s32_vv
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[ADD]], [[ADD]]
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = G_ADD %0, %1
+    %3:_(s32) = G_AND %2, %2
+...
+
+---
+name: add_s64_ss
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
+    ; CHECK-LABEL: name: add_s64_ss
+    ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr2_sgpr3
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s64) = G_ADD [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 255
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s64) = G_AND [[ADD]], [[ADD]]
+    %0:_(s64) = COPY $sgpr0_sgpr1
+    %1:_(s64) = COPY $sgpr2_sgpr3
+    %2:_(s64) = G_ADD %0, %1
+    %3:_(s64) = G_CONSTANT i64 255
+    %4:_(s64) = G_AND %2, %2
+...
+
+---
+name: add_s64_sv
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1, $vgpr0_vgpr1
+    ; CHECK-LABEL: name: add_s64_sv
+    ; CHECK: liveins: $sgpr0_sgpr1, $vgpr0_vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s64) = COPY [[COPY]](s64)
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s64) = G_ADD [[COPY2]], [[COPY1]]
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[ADD]](s64)
+    ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[ADD]](s64)
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[UV]], [[UV2]]
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:vgpr(s32) = G_AND [[UV1]], [[UV3]]
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[AND]](s32), [[AND1]](s32)
+    %0:_(s64) = COPY $sgpr0_sgpr1
+    %1:_(s64) = COPY $vgpr0_vgpr1
+    %2:_(s64) = G_ADD %0, %1
+    %3:_(s64) = G_AND %2, %2
+...
+
+---
+name: add_s64_vs
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1, $vgpr0_vgpr1
+    ; CHECK-LABEL: name: add_s64_vs
+    ; CHECK: liveins: $sgpr0_sgpr1, $vgpr0_vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s64) = COPY [[COPY1]](s64)
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s64) = G_ADD [[COPY]], [[COPY2]]
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[ADD]](s64)
+    ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[ADD]](s64)
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[UV]], [[UV2]]
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:vgpr(s32) = G_AND [[UV1]], [[UV3]]
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[AND]](s32), [[AND1]](s32)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s64) = COPY $sgpr0_sgpr1
+    %2:_(s64) = G_ADD %0, %1
+    %3:_(s64) = G_AND %2, %2
+...
+
+---
+name: add_s64_vv
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; CHECK-LABEL: name: add_s64_vv
+    ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s64) = G_ADD [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[ADD]](s64)
+    ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[ADD]](s64)
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[UV]], [[UV2]]
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:vgpr(s32) = G_AND [[UV1]], [[UV3]]
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[AND]](s32), [[AND1]](s32)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s64) = COPY $vgpr2_vgpr3
+    %2:_(s64) = G_ADD %0, %1
+    %3:_(s64) = G_AND %2, %2
+...
+
+---
+name: uaddo_s32_ss
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1
+    ; CHECK-LABEL: name: uaddo_s32_ss
+    ; CHECK: liveins: $sgpr0, $sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+    ; CHECK-NEXT: [[UADDO:%[0-9]+]]:sgpr(s32), [[UADDO1:%[0-9]+]]:sgpr(s32) = G_UADDO [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[UADDO1]], [[C]]
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND]](s32), [[C]], [[C1]]
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[SELECT]], [[UADDO]]
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = COPY $sgpr1
+    %2:_(s32), %3:_(s1) = G_UADDO %0, %1
+    %4:_(s32) = G_ZEXT %3
+    %5:_(s32) = G_AND %4, %2
+...
+
+---
+name: uaddo_s32_sv
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $vgpr1
+    ; CHECK-LABEL: name: uaddo_s32_sv
+    ; CHECK: liveins: $sgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
+    ; CHECK-NEXT: [[UADDO:%[0-9]+]]:vgpr(s32), [[UADDO1:%[0-9]+]]:vcc(s1) = G_UADDO [[COPY2]], [[COPY1]]
+    ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[UADDO1]](s1), [[C]], [[C1]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[UADDO]], [[SELECT]]
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32), %3:_(s1) = G_UADDO %0, %1
+    %4:_(s32) = G_ZEXT %3
+    %5:_(s32) = G_AND %2, %4
+...
+
+---
+name: uaddo_s32_vs
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $sgpr1
+    ; CHECK-LABEL: name: uaddo_s32_vs
+    ; CHECK: liveins: $vgpr0, $sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
+    ; CHECK-NEXT: [[UADDO:%[0-9]+]]:vgpr(s32), [[UADDO1:%[0-9]+]]:vcc(s1) = G_UADDO [[COPY]], [[COPY2]]
+    ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[UADDO1]](s1), [[C]], [[C1]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[UADDO]], [[SELECT]]
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $sgpr1
+    %2:_(s32), %3:_(s1) = G_UADDO %0, %1
+    %4:_(s32) = G_ZEXT %3
+    %5:_(s32) = G_AND %2, %4
+...
+
+---
+name: uaddo_s32_vv
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+    ; CHECK-LABEL: name: uaddo_s32_vv
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[UADDO:%[0-9]+]]:vgpr(s32), [[UADDO1:%[0-9]+]]:vcc(s1) = G_UADDO [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[UADDO1]](s1), [[C]], [[C1]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[UADDO]], [[SELECT]]
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32), %3:_(s1) = G_UADDO %0, %1
+    %4:_(s32) = G_ZEXT %3
+    %5:_(s32) = G_AND %2, %4
+...
+
+---
+name: uadde_s32_ss
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2
+    ; CHECK-LABEL: name: uadde_s32_ss
+    ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[COPY2]], [[C]]
+    ; CHECK-NEXT: [[UADDE:%[0-9]+]]:sgpr(s32), [[UADDE1:%[0-9]+]]:sgpr(s32) = G_UADDE [[COPY]], [[COPY1]], [[AND]]
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[UADDE1]], [[C]]
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND1]](s32), [[C]], [[C1]]
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:sgpr(s32) = G_AND [[UADDE]], [[SELECT]]
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = COPY $sgpr1
+    %2:_(s32) = COPY $sgpr2
+    %3:_(s1) = G_TRUNC %2
+    %4:_(s32), %5:_(s1) = G_UADDE %0, %1, %3
+    %6:_(s32) = G_ZEXT %5
+    %7:_(s32) = G_AND %4, %6
+...
+
+---
+name: uadde_s32_sv
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $vgpr1, $sgpr2
+    ; CHECK-LABEL: name: uadde_s32_sv
+    ; CHECK: liveins: $sgpr0, $vgpr1, $sgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
+    ; CHECK-NEXT: [[AMDGPU_COPY_VCC_SCC:%[0-9]+]]:vcc(s1) = G_AMDGPU_COPY_VCC_SCC [[COPY2]](s32)
+    ; CHECK-NEXT: [[UADDE:%[0-9]+]]:vgpr(s32), [[UADDE1:%[0-9]+]]:vcc(s1) = G_UADDE [[COPY3]], [[COPY1]], [[AMDGPU_COPY_VCC_SCC]]
+    ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[UADDE1]](s1), [[C]], [[C1]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[UADDE]], [[SELECT]]
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $sgpr2
+    %3:_(s1) = G_TRUNC %2
+    %4:_(s32), %5:_(s1) = G_UADDE %0, %1, %3
+    %6:_(s32) = G_ZEXT %5
+    %7:_(s32) = G_AND %4, %6
+...
+
+---
+name: uadde_s32_vs
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $sgpr1, $sgpr2
+    ; CHECK-LABEL: name: uadde_s32_vs
+    ; CHECK: liveins: $vgpr0, $sgpr1, $sgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
+    ; CHECK-NEXT: [[AMDGPU_COPY_VCC_SCC:%[0-9]+]]:vcc(s1) = G_AMDGPU_COPY_VCC_SCC [[COPY2]](s32)
+    ; CHECK-NEXT: [[UADDE:%[0-9]+]]:vgpr(s32), [[UADDE1:%[0-9]+]]:vcc(s1) = G_UADDE [[COPY]], [[COPY3]], [[AMDGPU_COPY_VCC_SCC]]
+    ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[UADDE1]](s1), [[C]], [[C1]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[UADDE]], [[SELECT]]
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $sgpr1
+    %2:_(s32) = COPY $sgpr2
+    %3:_(s1) = G_TRUNC %2
+    %4:_(s32), %5:_(s1) = G_UADDE %0, %1, %3
+    %6:_(s32) = G_ZEXT %5
+    %7:_(s32) = G_AND %4, %6
+...
+
+---
+name: uadde_s32_vv
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-LABEL: name: uadde_s32_vv
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
+    ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[COPY2]], [[C]]
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[AND]](s32), [[C1]]
+    ; CHECK-NEXT: [[UADDE:%[0-9]+]]:vgpr(s32), [[UADDE1:%[0-9]+]]:vcc(s1) = G_UADDE [[COPY]], [[COPY1]], [[ICMP]]
+    ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[UADDE1]](s1), [[C]], [[C1]]
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:vgpr(s32) = G_AND [[UADDE]], [[SELECT]]
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(s1) = G_TRUNC %2
+    %4:_(s32), %5:_(s1) = G_UADDE %0, %1, %3
+    %6:_(s32) = G_ZEXT %5
+    %7:_(s32) = G_AND %4, %6
+...
+
+---
+name: uadde_s32_ss_scc_use
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2
+    ; CHECK-LABEL: name: uadde_s32_ss_scc_use
+    ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[COPY2]], [[C]]
+    ; CHECK-NEXT: [[UADDE:%[0-9]+]]:sgpr(s32), [[UADDE1:%[0-9]+]]:sgpr(s32) = G_UADDE [[COPY]], [[COPY1]], [[AND]]
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[UADDE1]], [[C]]
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND1]](s32), [[C]], [[C1]]
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:sgpr(s32) = G_AND [[UADDE]], [[SELECT]]
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = COPY $sgpr1
+    %2:_(s32) = COPY $sgpr2
+    %3:_(s1) = G_TRUNC %2
+    %4:_(s32), %5:_(s1) = G_UADDE %0, %1, %3
+    %6:_(s32) = G_ZEXT %5
+    %8:_(s32) = G_AND %4, %6
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.s16.mir
index 54ee69fcb2204..30c958fcb192a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.s16.mir
@@ -1,6 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=amdgpu-regbankselect,amdgpu-regbanklegalize %s -verify-machineinstrs -o - | FileCheck %s
 ---
 name: add_s16_ss
 legalized: true
@@ -19,13 +18,13 @@ body: |
     ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC1]](s16)
     ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[ANYEXT]], [[ANYEXT1]]
     ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC [[ADD]](s32)
-    ; CHECK-NEXT: S_ENDPGM 0, implicit [[TRUNC2]](s16)
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s16) = G_AND [[TRUNC2]], [[TRUNC2]]
     %0:_(s32) = COPY $sgpr0
     %1:_(s32) = COPY $sgpr1
     %2:_(s16) = G_TRUNC %0
     %3:_(s16) = G_TRUNC %1
     %4:_(s16) = G_ADD %2, %3
-    S_ENDPGM 0, implicit %4
+    %5:_(s16) = G_AND %4, %4
 ...
 
 ---
@@ -44,13 +43,13 @@ body: |
     ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32)
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s16) = COPY [[TRUNC]](s16)
     ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s16) = G_ADD [[COPY2]], [[TRUNC1]]
-    ; CHECK-NEXT: S_ENDPGM 0, implicit [[ADD]](s16)
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s16) = G_AND [[ADD]], [[ADD]]
     %0:_(s32) = COPY $sgpr0
     %1:_(s32) = COPY $vgpr0
     %2:_(s16) = G_TRUNC %0
     %3:_(s16) = G_TRUNC %1
     %4:_(s16) = G_ADD %2, %3
-    S_ENDPGM 0, implicit %4
+    %5:_(s16) = G_AND %4, %4
 ...
 
 ---
@@ -69,13 +68,13 @@ body: |
     ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32)
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s16) = COPY [[TRUNC1]](s16)
     ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s16) = G_ADD [[TRUNC]], [[COPY2]]
-    ; CHECK-NEXT: S_ENDPGM 0, implicit [[ADD]](s16)
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s16) = G_AND [[ADD]], [[ADD]]
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $sgpr0
     %2:_(s16) = G_TRUNC %0
     %3:_(s16) = G_TRUNC %1
     %4:_(s16) = G_ADD %2, %3
-    S_ENDPGM 0, implicit %4
+    %5:_(s16) = G_AND %4, %4
 ...
 
 ---
@@ -93,11 +92,11 @@ body: |
     ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32)
     ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32)
     ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s16) = G_ADD [[TRUNC]], [[TRUNC1]]
-    ; CHECK-NEXT: S_ENDPGM 0, implicit [[ADD]](s16)
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s16) = G_AND [[ADD]], [[ADD]]
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s16) = G_TRUNC %0
     %3:_(s16) = G_TRUNC %1
     %4:_(s16) = G_ADD %2, %3
-    S_ENDPGM 0, implicit %4
+    %5:_(s16) = G_AND %4, %4
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.v2s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.v2s16.mir
index 97018fac13a87..01eb39111b0ab 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.v2s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.v2s16.mir
@@ -1,6 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-regbankselect,amdgpu-regbanklegalize %s -verify-machineinstrs -o - | FileCheck %s
 
 ---
 name: add_v2s16_ss
@@ -18,16 +17,19 @@ body: |
     ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
     ; CHECK-NEXT: [[LSHR:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST]], [[C]](s32)
     ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY1]](<2 x s16>)
-    ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
-    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
     ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[BITCAST]], [[BITCAST1]]
     ; CHECK-NEXT: [[ADD1:%[0-9]+]]:sgpr(s32) = G_ADD [[LSHR]], [[LSHR1]]
     ; CHECK-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ADD]](s32), [[ADD1]](s32)
-    ; CHECK-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s16) = G_CONSTANT i16 255
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C1]](s16)
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(<2 x s16>) = G_AND [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR]]
     %0:_(<2 x s16>) = COPY $sgpr0
     %1:_(<2 x s16>) = COPY $sgpr1
     %2:_(<2 x s16>) = G_ADD %0, %1
-    S_ENDPGM 0, implicit %2
+    %3:_(s16) = G_CONSTANT i16 255
+    %4:_(<2 x s16>) = G_BUILD_VECTOR %3, %3
+    %5:_(<2 x s16>) = G_AND %2, %4
 ...
 
 ---
@@ -44,11 +46,11 @@ body: |
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(<2 x s16>) = COPY [[COPY]](<2 x s16>)
     ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(<2 x s16>) = G_ADD [[COPY2]], [[COPY1]]
-    ; CHECK-NEXT: S_ENDPGM 0, implicit [[ADD]](<2 x s16>)
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(<2 x s16>) = G_AND [[ADD]], [[ADD]]
     %0:_(<2 x s16>) = COPY $sgpr0
     %1:_(<2 x s16>) = COPY $vgpr0
     %2:_(<2 x s16>) = G_ADD %0, %1
-    S_ENDPGM 0, implicit %2
+    %3:_(<2 x s16>) = G_AND %2, %2
 ...
 
 ---
@@ -65,9 +67,11 @@ body: |
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(<2 x s16>) = COPY [[COPY1]](<2 x s16>)
     ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(<2 x s16>) = G_ADD [[COPY]], [[COPY2]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(<2 x s16>) = G_AND [[ADD]], [[ADD]]
     %0:_(<2 x s16>) = COPY $vgpr0
     %1:_(<2 x s16>) = COPY $sgpr0
     %2:_(<2 x s16>) = G_ADD %0, %1
+    %3:_(<2 x s16>) = G_AND %2, %2
 ...
 
 ---
@@ -83,9 +87,9 @@ body: |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr1
     ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(<2 x s16>) = G_ADD [[COPY]], [[COPY1]]
-    ; CHECK-NEXT: S_ENDPGM 0, implicit [[ADD]](<2 x s16>)
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(<2 x s16>) = G_AND [[ADD]], [[ADD]]
     %0:_(<2 x s16>) = COPY $vgpr0
     %1:_(<2 x s16>) = COPY $vgpr1
     %2:_(<2 x s16>) = G_ADD %0, %1
-    S_ENDPGM 0, implicit %2
+    %3:_(<2 x s16>) = G_AND %2, %2
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext.mir
index 7378c9366ec36..e0e783e7a62f9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext.mir
@@ -77,10 +77,14 @@ body: |
     ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
     ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND]](s32), [[C1]], [[C2]]
     ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[SELECT]](s32)
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:sgpr(s16) = G_CONSTANT i16 255
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s16) = G_AND [[TRUNC]], [[C3]]
     %0:_(s32) = COPY $sgpr0
     %1:_(s32) = COPY $sgpr1
     %2:_(s1) = G_ICMP intpred(eq), %0, %1
     %3:_(s16) = G_SEXT %2
+    %4:_(s16) = G_CONSTANT i16 255
+    %5:_(s16) = G_AND %3, %4
 ...
 
 ---
@@ -215,9 +219,13 @@ body: |
     ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
     ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND]](s32), [[C1]], [[C2]]
     ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[SELECT]](s32)
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:sgpr(s16) = G_CONSTANT i16 255
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s16) = G_AND [[TRUNC]], [[C3]]
     %0:_(s32) = COPY $sgpr0
     %1:_(s1) = G_TRUNC %0
     %2:_(s16) = G_SEXT %1
+    %3:_(s16) = G_CONSTANT i16 255
+    %4:_(s16) = G_AND %2, %3
 ...
 
 ---
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sub.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sub.mir
index b0199d3ad5cd1..e3c01c0e7fcb4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sub.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sub.mir
@@ -1,5 +1,107 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-regbankselect,amdgpu-regbanklegalize %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-regbankselect,amdgpu-regbanklegalize %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+
+---
+name: sub_s16_ss
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1
+    ; CHECK-LABEL: name: sub_s16_ss
+    ; CHECK: liveins: $sgpr0, $sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32)
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s16)
+    ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC1]](s16)
+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:sgpr(s32) = G_SUB [[ANYEXT]], [[ANYEXT1]]
+    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC [[SUB]](s32)
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s16) = G_AND [[TRUNC2]], [[TRUNC2]]
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = COPY $sgpr1
+    %2:_(s16) = G_TRUNC %0
+    %3:_(s16) = G_TRUNC %1
+    %4:_(s16) = G_SUB %2, %3
+    %6:_(s16) = G_AND %4, %4
+...
+
+---
+name: sub_s16_sv
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $vgpr0
+    ; CHECK-LABEL: name: sub_s16_sv
+    ; CHECK: liveins: $sgpr0, $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32)
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s16) = COPY [[TRUNC]](s16)
+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:vgpr(s16) = G_SUB [[COPY2]], [[TRUNC1]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s16) = G_AND [[SUB]], [[SUB]]
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = COPY $vgpr0
+    %2:_(s16) = G_TRUNC %0
+    %3:_(s16) = G_TRUNC %1
+    %4:_(s16) = G_SUB %2, %3
+    %6:_(s16) = G_AND %4, %4
+...
+
+---
+name: sub_s16_vs
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $vgpr0
+    ; CHECK-LABEL: name: sub_s16_vs
+    ; CHECK: liveins: $sgpr0, $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32)
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s16) = COPY [[TRUNC1]](s16)
+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:vgpr(s16) = G_SUB [[TRUNC]], [[COPY2]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s16) = G_AND [[SUB]], [[SUB]]
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $sgpr0
+    %2:_(s16) = G_TRUNC %0
+    %3:_(s16) = G_TRUNC %1
+    %4:_(s16) = G_SUB %2, %3
+    %6:_(s16) = G_AND %4, %4
+...
+
+---
+name: sub_s16_vv
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+    ; CHECK-LABEL: name: sub_s16_vv
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32)
+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:vgpr(s16) = G_SUB [[TRUNC]], [[TRUNC1]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s16) = G_AND [[SUB]], [[SUB]]
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s16) = G_TRUNC %0
+    %3:_(s16) = G_TRUNC %1
+    %4:_(s16) = G_SUB %2, %3
+    %6:_(s16) = G_AND %4, %4
+...
 
 ---
 name: sub_s32_ss
@@ -14,9 +116,11 @@ body: |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
     ; CHECK-NEXT: [[SUB:%[0-9]+]]:sgpr(s32) = G_SUB [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[SUB]], [[SUB]]
     %0:_(s32) = COPY $sgpr0
     %1:_(s32) = COPY $sgpr1
     %2:_(s32) = G_SUB %0, %1
+    %4:_(s32) = G_AND %2, %2
 ...
 
 ---
@@ -33,9 +137,11 @@ body: |
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
     ; CHECK-NEXT: [[SUB:%[0-9]+]]:vgpr(s32) = G_SUB [[COPY2]], [[COPY1]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[SUB]], [[SUB]]
     %0:_(s32) = COPY $sgpr0
     %1:_(s32) = COPY $vgpr0
     %2:_(s32) = G_SUB %0, %1
+    %4:_(s32) = G_AND %2, %2
 ...
 
 ---
@@ -52,9 +158,11 @@ body: |
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
     ; CHECK-NEXT: [[SUB:%[0-9]+]]:vgpr(s32) = G_SUB [[COPY]], [[COPY2]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[SUB]], [[SUB]]
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $sgpr0
     %2:_(s32) = G_SUB %0, %1
+    %4:_(s32) = G_AND %2, %2
 ...
 
 ---
@@ -70,7 +178,376 @@ body: |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
     ; CHECK-NEXT: [[SUB:%[0-9]+]]:vgpr(s32) = G_SUB [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[SUB]], [[SUB]]
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s32) = G_SUB %0, %1
+    %4:_(s32) = G_AND %2, %2
+...
+
+---
+name: sub_v2s16_ss
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1
+    ; CHECK-LABEL: name: sub_v2s16_ss
+    ; CHECK: liveins: $sgpr0, $sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr1
+    ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY]](<2 x s16>)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY1]](<2 x s16>)
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:sgpr(s32) = G_SUB [[BITCAST]], [[BITCAST1]]
+    ; CHECK-NEXT: [[SUB1:%[0-9]+]]:sgpr(s32) = G_SUB [[LSHR]], [[LSHR1]]
+    ; CHECK-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SUB]](s32), [[SUB1]](s32)
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(<2 x s16>) = G_AND [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC]]
+    %0:_(<2 x s16>) = COPY $sgpr0
+    %1:_(<2 x s16>) = COPY $sgpr1
+    %2:_(<2 x s16>) = G_SUB %0, %1
+    %5:_(<2 x s16>) = G_AND %2, %2
+...
+
+---
+name: sub_v2s16_sv
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $vgpr0
+    ; CHECK-LABEL: name: sub_v2s16_sv
+    ; CHECK: liveins: $sgpr0, $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(<2 x s16>) = COPY [[COPY]](<2 x s16>)
+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:vgpr(<2 x s16>) = G_SUB [[COPY2]], [[COPY1]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(<2 x s16>) = G_AND [[SUB]], [[SUB]]
+    %0:_(<2 x s16>) = COPY $sgpr0
+    %1:_(<2 x s16>) = COPY $vgpr0
+    %2:_(<2 x s16>) = G_SUB %0, %1
+    %5:_(<2 x s16>) = G_AND %2, %2
+...
+
+---
+name: sub_v2s16_vs
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $vgpr0
+    ; CHECK-LABEL: name: sub_v2s16_vs
+    ; CHECK: liveins: $sgpr0, $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(<2 x s16>) = COPY [[COPY1]](<2 x s16>)
+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:vgpr(<2 x s16>) = G_SUB [[COPY]], [[COPY2]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(<2 x s16>) = G_AND [[SUB]], [[SUB]]
+    %0:_(<2 x s16>) = COPY $vgpr0
+    %1:_(<2 x s16>) = COPY $sgpr0
+    %2:_(<2 x s16>) = G_SUB %0, %1
+    %5:_(<2 x s16>) = G_AND %2, %2
+...
+
+---
+name: sub_v2s16_vv
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+    ; CHECK-LABEL: name: sub_v2s16_vv
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr1
+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:vgpr(<2 x s16>) = G_SUB [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(<2 x s16>) = G_AND [[SUB]], [[SUB]]
+    %0:_(<2 x s16>) = COPY $vgpr0
+    %1:_(<2 x s16>) = COPY $vgpr1
+    %2:_(<2 x s16>) = G_SUB %0, %1
+    %5:_(<2 x s16>) = G_AND %2, %2
+...
+
+---
+name: sub_s64_ss
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1, $sgpr0_sgpr1
+    ; CHECK-LABEL: name: sub_s64_ss
+    ; CHECK: liveins: $sgpr0_sgpr1, $sgpr0_sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1
+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:sgpr(s64) = G_SUB [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s64) = G_AND [[SUB]], [[SUB]]
+    %0:_(s64) = COPY $sgpr0_sgpr1
+    %1:_(s64) = COPY $sgpr0_sgpr1
+    %2:_(s64) = G_SUB %0, %1
+    %4:_(s64) = G_AND %2, %2
+...
+
+---
+name: sub_s64_sv
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1, $vgpr0_vgpr1
+    ; CHECK-LABEL: name: sub_s64_sv
+    ; CHECK: liveins: $sgpr0_sgpr1, $vgpr0_vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s64) = COPY [[COPY]](s64)
+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:vgpr(s64) = G_SUB [[COPY2]], [[COPY1]]
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[SUB]](s64)
+    ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[SUB]](s64)
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[UV]], [[UV2]]
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:vgpr(s32) = G_AND [[UV1]], [[UV3]]
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[AND]](s32), [[AND1]](s32)
+    %0:_(s64) = COPY $sgpr0_sgpr1
+    %1:_(s64) = COPY $vgpr0_vgpr1
+    %2:_(s64) = G_SUB %0, %1
+    %4:_(s64) = G_AND %2, %2
+...
+
+---
+name: sub_s64_vs
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1, $vgpr0_vgpr1
+    ; CHECK-LABEL: name: sub_s64_vs
+    ; CHECK: liveins: $sgpr0_sgpr1, $vgpr0_vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s64) = COPY [[COPY1]](s64)
+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:vgpr(s64) = G_SUB [[COPY]], [[COPY2]]
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[SUB]](s64)
+    ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[SUB]](s64)
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[UV]], [[UV2]]
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:vgpr(s32) = G_AND [[UV1]], [[UV3]]
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[AND]](s32), [[AND1]](s32)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s64) = COPY $sgpr0_sgpr1
+    %2:_(s64) = G_SUB %0, %1
+    %4:_(s64) = G_AND %2, %2
+...
+
+---
+name: sub_s64_vv
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; CHECK-LABEL: name: sub_s64_vv
+    ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:vgpr(s64) = G_SUB [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[SUB]](s64)
+    ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[SUB]](s64)
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[UV]], [[UV2]]
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:vgpr(s32) = G_AND [[UV1]], [[UV3]]
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[AND]](s32), [[AND1]](s32)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s64) = COPY $vgpr2_vgpr3
+    %2:_(s64) = G_SUB %0, %1
+    %4:_(s64) = G_AND %2, %2
+...
+
+---
+name: usubo_s32_ss
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1
+    ; CHECK-LABEL: name: usubo_s32_ss
+    ; CHECK: liveins: $sgpr0, $sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+    ; CHECK-NEXT: [[USUBO:%[0-9]+]]:sgpr(s32), [[USUBO1:%[0-9]+]]:sgpr(s32) = G_USUBO [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[USUBO]], [[USUBO]]
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = COPY $sgpr1
+    %2:_(s32), %3:_(s1) = G_USUBO %0, %1
+    %5:_(s32) = G_AND %2, %2
+...
+
+---
+name: usubo_s32_sv
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $vgpr1
+    ; CHECK-LABEL: name: usubo_s32_sv
+    ; CHECK: liveins: $sgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
+    ; CHECK-NEXT: [[USUBO:%[0-9]+]]:vgpr(s32), [[USUBO1:%[0-9]+]]:vcc(s1) = G_USUBO [[COPY2]], [[COPY1]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[USUBO]], [[USUBO]]
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32), %3:_(s1) = G_USUBO %0, %1
+    %5:_(s32) = G_AND %2, %2
+...
+
+---
+name: usubo_s32_vs
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $sgpr1
+    ; CHECK-LABEL: name: usubo_s32_vs
+    ; CHECK: liveins: $vgpr0, $sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
+    ; CHECK-NEXT: [[USUBO:%[0-9]+]]:vgpr(s32), [[USUBO1:%[0-9]+]]:vcc(s1) = G_USUBO [[COPY]], [[COPY2]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[USUBO]], [[USUBO]]
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $sgpr1
+    %2:_(s32), %3:_(s1) = G_USUBO %0, %1
+    %5:_(s32) = G_AND %2, %2
+...
+
+---
+name: usubo_s32_vv
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+    ; CHECK-LABEL: name: usubo_s32_vv
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[USUBO:%[0-9]+]]:vgpr(s32), [[USUBO1:%[0-9]+]]:vcc(s1) = G_USUBO [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[USUBO]], [[USUBO]]
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32), %3:_(s1) = G_USUBO %0, %1
+    %5:_(s32) = G_AND %2, %2
+...
+
+---
+name: usube_s32_ss
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2
+    ; CHECK-LABEL: name: usube_s32_ss
+    ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[COPY2]], [[C]]
+    ; CHECK-NEXT: [[USUBE:%[0-9]+]]:sgpr(s32), [[USUBE1:%[0-9]+]]:sgpr(s32) = G_USUBE [[COPY]], [[COPY1]], [[AND]]
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[USUBE]], [[USUBE]]
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = COPY $sgpr1
+    %2:_(s32) = COPY $sgpr2
+    %3:_(s1) = G_TRUNC %2
+    %4:_(s32), %5:_(s1) = G_USUBE %0, %1, %3
+    %7:_(s32) = G_AND %4, %4
+...
+
+---
+name: usube_s32_sv
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $vgpr1, $sgpr2
+    ; CHECK-LABEL: name: usube_s32_sv
+    ; CHECK: liveins: $sgpr0, $vgpr1, $sgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
+    ; CHECK-NEXT: [[AMDGPU_COPY_VCC_SCC:%[0-9]+]]:vcc(s1) = G_AMDGPU_COPY_VCC_SCC [[COPY2]](s32)
+    ; CHECK-NEXT: [[USUBE:%[0-9]+]]:vgpr(s32), [[USUBE1:%[0-9]+]]:vcc(s1) = G_USUBE [[COPY3]], [[COPY1]], [[AMDGPU_COPY_VCC_SCC]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[USUBE]], [[USUBE]]
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $sgpr2
+    %3:_(s1) = G_TRUNC %2
+    %4:_(s32), %5:_(s1) = G_USUBE %0, %1, %3
+    %7:_(s32) = G_AND %4, %4
+...
+
+---
+name: usube_s32_vs
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $sgpr1, $sgpr2
+    ; CHECK-LABEL: name: usube_s32_vs
+    ; CHECK: liveins: $vgpr0, $sgpr1, $sgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
+    ; CHECK-NEXT: [[AMDGPU_COPY_VCC_SCC:%[0-9]+]]:vcc(s1) = G_AMDGPU_COPY_VCC_SCC [[COPY2]](s32)
+    ; CHECK-NEXT: [[USUBE:%[0-9]+]]:vgpr(s32), [[USUBE1:%[0-9]+]]:vcc(s1) = G_USUBE [[COPY]], [[COPY3]], [[AMDGPU_COPY_VCC_SCC]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[USUBE]], [[USUBE]]
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $sgpr1
+    %2:_(s32) = COPY $sgpr2
+    %3:_(s1) = G_TRUNC %2
+    %4:_(s32), %5:_(s1) = G_USUBE %0, %1, %3
+    %7:_(s32) = G_AND %4, %4
+...
+
+---
+name: usube_s32_vv
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-LABEL: name: usube_s32_vv
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
+    ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[COPY2]], [[C]]
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[AND]](s32), [[C1]]
+    ; CHECK-NEXT: [[USUBE:%[0-9]+]]:vgpr(s32), [[USUBE1:%[0-9]+]]:vcc(s1) = G_USUBE [[COPY]], [[COPY1]], [[ICMP]]
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:vgpr(s32) = G_AND [[USUBE]], [[USUBE]]
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(s1) = G_TRUNC %2
+    %4:_(s32), %5:_(s1) = G_USUBE %0, %1, %3
+    %7:_(s32) = G_AND %4, %4
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-zext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-zext.mir
index 088c20a3137f7..d4baa5fb864fc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-zext.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-zext.mir
@@ -73,10 +73,14 @@ body: |
     ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
     ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND]](s32), [[C]], [[C1]]
     ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[SELECT]](s32)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s16) = G_CONSTANT i16 255
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s16) = G_AND [[TRUNC]], [[C2]]
     %0:_(s32) = COPY $sgpr0
     %1:_(s32) = COPY $sgpr1
     %2:_(s1) = G_ICMP intpred(eq), %0, %1
     %3:_(s16) = G_ZEXT %2
+    %4:_(s16) = G_CONSTANT i16 255
+    %5:_(s16) = G_AND %3, %4
 ...
 
 ---
@@ -209,9 +213,13 @@ body: |
     ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
     ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND]](s32), [[C]], [[C1]]
     ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[SELECT]](s32)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s16) = G_CONSTANT i16 255
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s16) = G_AND [[TRUNC]], [[C2]]
     %0:_(s32) = COPY $sgpr0
     %1:_(s1) = G_TRUNC %0
     %2:_(s16) = G_ZEXT %1
+    %3:_(s16) = G_CONSTANT i16 255
+    %4:_(s16) = G_AND %2, %3
 ...
 
 ---
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.ll
new file mode 100644
index 0000000000000..8b5958daac168
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.ll
@@ -0,0 +1,535 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX12 %s
+
+define i16 @s_sub_i16(i16 inreg %a, i16 inreg %b) {
+; GFX7-LABEL: s_sub_i16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_sub_i32 s4, s16, s17
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_sub_i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_sub_i32 s4, s16, s17
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: s_sub_i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_sub_i32 s4, s16, s17
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_sub_i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_sub_i32 s4, s16, s17
+; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_sub_i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_sub_i32 s0, s0, s1
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: s_sub_i16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_sub_co_i32 s0, s0, s1
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %c = sub i16 %a, %b
+  ret i16 %c
+}
+
+define i16 @v_sub_i16(i16 %a, i16 %b) {
+; GFX7-LABEL: v_sub_i16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_sub_i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_sub_u16_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_sub_i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_sub_i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_sub_nc_u16 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_sub_i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_sub_nc_u16 v0.l, v0.l, v1.l
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_sub_i16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_sub_nc_u16 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %c = sub i16 %a, %b
+  ret i16 %c
+}
+
+define i32 @s_sub_i32(i32 inreg %a, i32 inreg %b) {
+; GFX7-LABEL: s_sub_i32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_sub_i32 s4, s16, s17
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_sub_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_sub_i32 s4, s16, s17
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: s_sub_i32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_sub_i32 s4, s16, s17
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_sub_i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_sub_i32 s4, s16, s17
+; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_sub_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_sub_i32 s0, s0, s1
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: s_sub_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_sub_co_i32 s0, s0, s1
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %c = sub i32 %a, %b
+  ret i32 %c
+}
+
+define i32 @v_sub_i32(i32 %a, i32 %b) {
+; GFX7-LABEL: v_sub_i32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_sub_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_sub_i32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_sub_i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_sub_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_sub_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_sub_nc_u32_e32 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %c = sub i32 %a, %b
+  ret i32 %c
+}
+
+; TODO: Add test for s_sub_v2i16. Instruction selector currently fails
+; to handle G_UNMERGE_VALUES.
+
+define <2 x i16> @v_sub_v2i16(<2 x i16> %a, <2 x i16> %b) {
+; GFX7-LABEL: v_sub_v2i16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_sub_v2i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_sub_v2i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_sub_u16_e32 v2, v0, v1
+; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_sub_v2i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_sub_i16 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_sub_v2i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_sub_i16 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_sub_v2i16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_sub_i16 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %c = sub <2 x i16> %a, %b
+  ret <2 x i16> %c
+}
+
+define i64 @s_sub_i64(i64 inreg %a, i64 inreg %b) {
+; GFX7-LABEL: s_sub_i64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_sub_u32 s4, s16, s18
+; GFX7-NEXT:    s_subb_u32 s5, s17, s19
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_sub_i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_sub_u32 s4, s16, s18
+; GFX9-NEXT:    s_subb_u32 s5, s17, s19
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: s_sub_i64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_sub_u32 s4, s16, s18
+; GFX8-NEXT:    s_subb_u32 s5, s17, s19
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_sub_i64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_sub_u32 s4, s16, s18
+; GFX10-NEXT:    s_subb_u32 s5, s17, s19
+; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_sub_i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_sub_u32 s0, s0, s2
+; GFX11-NEXT:    s_subb_u32 s1, s1, s3
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: s_sub_i64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_sub_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %c = sub i64 %a, %b
+  ret i64 %c
+}
+
+define i64 @v_sub_i64(i64 %a, i64 %b) {
+; GFX7-LABEL: v_sub_i64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_sub_i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_sub_i64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_sub_i64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_sub_i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_sub_i64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
+; GFX12-NEXT:    s_wait_alu 0xfffd
+; GFX12-NEXT:    v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %c = sub i64 %a, %b
+  ret i64 %c
+}
+
+define void @s_usubo_usube(i64 inreg %a, i64 inreg %b, ptr addrspace(1) %res, ptr addrspace(1) %carry) {
+; GFX7-LABEL: s_usubo_usube:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_sub_u32 s4, s16, s18
+; GFX7-NEXT:    s_subb_u32 s5, s17, s19
+; GFX7-NEXT:    v_mov_b32_e32 v4, s4
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX7-NEXT:    v_mov_b32_e32 v5, s5
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_store_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_usubo_usube:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_sub_u32 s4, s16, s18
+; GFX9-NEXT:    s_subb_u32 s5, s17, s19
+; GFX9-NEXT:    v_mov_b32_e32 v4, s4
+; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[4:5], off
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: s_usubo_usube:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_sub_u32 s4, s16, s18
+; GFX8-NEXT:    s_subb_u32 s5, s17, s19
+; GFX8-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v5, s5
+; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    flat_store_dword v[2:3], v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_usubo_usube:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_sub_u32 s4, s16, s18
+; GFX10-NEXT:    s_subb_u32 s5, s17, s19
+; GFX10-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v4, s4
+; GFX10-NEXT:    v_mov_b32_e32 v5, s5
+; GFX10-NEXT:    v_mov_b32_e32 v6, s6
+; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[4:5], off
+; GFX10-NEXT:    global_store_dword v[2:3], v6, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_usubo_usube:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_sub_u32 s0, s0, s2
+; GFX11-NEXT:    s_subb_u32 s1, s1, s3
+; GFX11-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX11-NEXT:    v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
+; GFX11-NEXT:    v_mov_b32_e32 v6, s2
+; GFX11-NEXT:    global_store_b64 v[0:1], v[4:5], off
+; GFX11-NEXT:    global_store_b32 v[2:3], v6, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: s_usubo_usube:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_sub_co_u32 s0, s0, s2
+; GFX12-NEXT:    s_sub_co_ci_u32 s1, s1, s3
+; GFX12-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
+; GFX12-NEXT:    v_mov_b32_e32 v6, s2
+; GFX12-NEXT:    global_store_b64 v[0:1], v[4:5], off
+; GFX12-NEXT:    global_store_b32 v[2:3], v6, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %usubo = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
+  %sub = extractvalue {i64, i1} %usubo, 0
+  %of = extractvalue {i64, i1} %usubo, 1
+  %of32 = select i1 %of, i32 1, i32 0
+  store i64 %sub, ptr addrspace(1) %res
+  store i32 %of32, ptr addrspace(1) %carry
+  ret void
+}
+
+define void @v_usubo_usube(i64 %a, i64 %b, ptr addrspace(1) %res, ptr addrspace(1) %carry) {
+; GFX7-LABEL: v_usubo_usube:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_store_dword v2, v[6:7], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_usubo_usube:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    global_store_dwordx2 v[4:5], v[0:1], off
+; GFX9-NEXT:    global_store_dword v[6:7], v2, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_usubo_usube:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
+; GFX8-NEXT:    flat_store_dword v[6:7], v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_usubo_usube:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-NEXT:    global_store_dwordx2 v[4:5], v[0:1], off
+; GFX10-NEXT:    global_store_dword v[6:7], v2, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_usubo_usube:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT:    global_store_b64 v[4:5], v[0:1], off
+; GFX11-NEXT:    global_store_b32 v[6:7], v2, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_usubo_usube:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
+; GFX12-NEXT:    s_wait_alu 0xfffd
+; GFX12-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-NEXT:    s_wait_alu 0xfffd
+; GFX12-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX12-NEXT:    global_store_b64 v[4:5], v[0:1], off
+; GFX12-NEXT:    global_store_b32 v[6:7], v2, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %usubo = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
+  %sub = extractvalue {i64, i1} %usubo, 0
+  %of = extractvalue {i64, i1} %usubo, 1
+  %of32 = select i1 %of, i32 1, i32 0
+  store i64 %sub, ptr addrspace(1) %res
+  store i32 %of32, ptr addrspace(1) %carry
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsic.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsic.ll
new file mode 100644
index 0000000000000..34d4c519851d4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsic.ll
@@ -0,0 +1,173 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100  -o - %s | FileCheck %s
+define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %out) {
+; CHECK-LABEL: readfirstlane_with_readfirstlane:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 5
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT:    s_endpgm
+  %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 5)
+  %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1)
+  store i32 %v2, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) {
+; CHECK-LABEL: readfirstlane_with_readlane:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT:    v_bfe_u32 v1, v0, 10, 10
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT:    v_readfirstlane_b32 s2, v1
+; CHECK-NEXT:    v_readlane_b32 s2, v0, s2
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT:    s_endpgm
+  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+  %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+  %v1 = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
+  %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1)
+  store i32 %v2, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @readlane_with_firstlane(ptr addrspace(1) %out) {
+; CHECK-LABEL: readlane_with_firstlane:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT:    v_readfirstlane_b32 s2, v0
+; CHECK-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT:    s_endpgm
+  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+  %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 %tidx)
+  %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 3)
+  store i32 %v2, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @readlane_readlane(ptr addrspace(1) %out) {
+; CHECK-LABEL: readlane_readlane:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT:    v_bfe_u32 v1, v0, 10, 10
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT:    v_readfirstlane_b32 s2, v1
+; CHECK-NEXT:    v_readlane_b32 s2, v0, s2
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT:    s_endpgm
+  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+  %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+  %v1 = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
+  %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 2)
+  store i32 %v2, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src) {
+; CHECK-LABEL: permlane64_uniform:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; CHECK-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-NEXT:    global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT:    s_endpgm
+  %v = call i32 @llvm.amdgcn.permlane64(i32 %src)
+  store i32 %v, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @permlane64_nonuniform(i32 addrspace(1)* %out) {
+; CHECK-LABEL: permlane64_nonuniform:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT:    v_permlane64_b32 v1, v0
+; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT:    s_endpgm
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %v = call i32 @llvm.amdgcn.permlane64(i32 %tid)
+  %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  store i32 %v, i32 addrspace(1)* %out_ptr
+  ret void
+}
+
+define amdgpu_kernel void @permlane64_nonuniform_expression(i32 addrspace(1)* %out) {
+; CHECK-LABEL: permlane64_nonuniform_expression:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; CHECK-NEXT:    v_add_nc_u32_e32 v1, 1, v0
+; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT:    v_permlane64_b32 v1, v1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT:    s_endpgm
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid2 = add i32 %tid, 1
+  %v = call i32 @llvm.amdgcn.permlane64(i32 %tid2)
+  %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  store i32 %v, i32 addrspace(1)* %out_ptr
+  ret void
+}
+
+define protected amdgpu_kernel void @trivial_waterfall_eq_zero(ptr addrspace(1) %out) {
+; CHECK-LABEL: trivial_waterfall_eq_zero:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 5
+; CHECK-NEXT:    s_mov_b32 s2, 0
+; CHECK-NEXT:    s_branch .LBB7_2
+; CHECK-NEXT:  .LBB7_1: ; %Flow
+; CHECK-NEXT:    ; in Loop: Header=BB7_2 Depth=1
+; CHECK-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s2
+; CHECK-NEXT:    s_mov_b32 s2, -1
+; CHECK-NEXT:    s_cbranch_vccz .LBB7_4
+; CHECK-NEXT:  .LBB7_2: ; %while
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; CHECK-NEXT:    s_mov_b32 s2, -1
+; CHECK-NEXT:    s_cbranch_vccnz .LBB7_1
+; CHECK-NEXT:  ; %bb.3: ; %if
+; CHECK-NEXT:    ; in Loop: Header=BB7_2 Depth=1
+; CHECK-NEXT:    s_mov_b32 s2, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT:    s_branch .LBB7_1
+; CHECK-NEXT:  .LBB7_4: ; %exit
+; CHECK-NEXT:    s_endpgm
+entry:
+  br label %while
+
+while:
+  %done = phi i1 [ 0, %entry ], [ 1, %if ]
+  %not_done = xor i1 %done, true
+  %ballot = tail call i64 @llvm.amdgcn.ballot.i64(i1 %not_done)
+  %is_done = icmp eq i64 %ballot, 0 ; in this case is_done = !not_done
+  br i1 %is_done, label %exit, label %if
+
+if:
+  store i32 5, ptr addrspace(1) %out
+  br label %while
+
+exit:
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll
index 6c4f504f3456c..c962c05d24ad0 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -amdgpu-enable-uniform-intrinsic-combine=0 -O3 -S < %s | FileCheck %s -check-prefix=CURRENT-CHECK
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine -S < %s | FileCheck %s -check-prefix=PASS-CHECK
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -amdgpu-uniform-intrinsic-combine -S < %s | FileCheck %s -check-prefix=PASS-CHECK
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -O3 -S < %s | FileCheck %s -check-prefix=O3-CHECK
 
 define protected amdgpu_kernel void @trivial_waterfall_eq_zero(ptr addrspace(1) %out) {
@@ -23,7 +24,9 @@ define protected amdgpu_kernel void @trivial_waterfall_eq_zero(ptr addrspace(1)
 ; PASS-CHECK:       [[WHILE]]:
 ; PASS-CHECK-NEXT:    [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
 ; PASS-CHECK-NEXT:    [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
+; PASS-CHECK-NEXT:    [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]])
 ; PASS-CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true
+; PASS-CHECK-NEXT:    [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0
 ; PASS-CHECK-NEXT:    br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]]
 ; PASS-CHECK:       [[IF]]:
 ; PASS-CHECK-NEXT:    store i32 5, ptr addrspace(1) [[OUT]], align 4
@@ -75,7 +78,9 @@ define protected amdgpu_kernel void @trivial_waterfall_eq_zero_swap_op(ptr addrs
 ; PASS-CHECK:       [[WHILE]]:
 ; PASS-CHECK-NEXT:    [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
 ; PASS-CHECK-NEXT:    [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
+; PASS-CHECK-NEXT:    [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]])
 ; PASS-CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true
+; PASS-CHECK-NEXT:    [[IS_DONE:%.*]] = icmp eq i64 0, [[BALLOT]]
 ; PASS-CHECK-NEXT:    br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]]
 ; PASS-CHECK:       [[IF]]:
 ; PASS-CHECK-NEXT:    store i32 5, ptr addrspace(1) [[OUT]], align 4
@@ -126,6 +131,8 @@ define protected amdgpu_kernel void @trivial_waterfall_ne_zero(ptr addrspace(1)
 ; PASS-CHECK-NEXT:    br label %[[WHILE:.*]]
 ; PASS-CHECK:       [[WHILE]]:
 ; PASS-CHECK-NEXT:    [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
+; PASS-CHECK-NEXT:    [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[DONE]])
+; PASS-CHECK-NEXT:    [[IS_DONE:%.*]] = icmp ne i64 0, [[BALLOT]]
 ; PASS-CHECK-NEXT:    br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]]
 ; PASS-CHECK:       [[IF]]:
 ; PASS-CHECK-NEXT:    store i32 5, ptr addrspace(1) [[OUT]], align 4
@@ -175,6 +182,8 @@ define protected amdgpu_kernel void @trivial_waterfall_ne_zero_swap(ptr addrspac
 ; PASS-CHECK-NEXT:    br label %[[WHILE:.*]]
 ; PASS-CHECK:       [[WHILE]]:
 ; PASS-CHECK-NEXT:    [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
+; PASS-CHECK-NEXT:    [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[DONE]])
+; PASS-CHECK-NEXT:    [[IS_DONE:%.*]] = icmp ne i64 [[BALLOT]], 0
 ; PASS-CHECK-NEXT:    br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]]
 ; PASS-CHECK:       [[IF]]:
 ; PASS-CHECK-NEXT:    store i32 5, ptr addrspace(1) [[OUT]], align 4
@@ -225,7 +234,9 @@ define protected amdgpu_kernel void @trivial_uniform_waterfall(ptr addrspace(1)
 ; PASS-CHECK:       [[WHILE]]:
 ; PASS-CHECK-NEXT:    [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[NEW_DONE:%.*]], %[[TAIL:.*]] ]
 ; PASS-CHECK-NEXT:    [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
+; PASS-CHECK-NEXT:    [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]])
 ; PASS-CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true
+; PASS-CHECK-NEXT:    [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0
 ; PASS-CHECK-NEXT:    br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF:.*]]
 ; PASS-CHECK:       [[IF]]:
 ; PASS-CHECK-NEXT:    [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 0, 0
@@ -292,7 +303,9 @@ define protected amdgpu_kernel void @uniform_waterfall(ptr addrspace(1) %out, i3
 ; PASS-CHECK:       [[WHILE]]:
 ; PASS-CHECK-NEXT:    [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[NEW_DONE:%.*]], %[[TAIL:.*]] ]
 ; PASS-CHECK-NEXT:    [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
+; PASS-CHECK-NEXT:    [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]])
 ; PASS-CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true
+; PASS-CHECK-NEXT:    [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0
 ; PASS-CHECK-NEXT:    br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF:.*]]
 ; PASS-CHECK:       [[IF]]:
 ; PASS-CHECK-NEXT:    [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 [[MYMASK]], [[MYMASK]]
@@ -359,7 +372,9 @@ define protected amdgpu_kernel void @trivial_waterfall_eq_zero_i32(ptr addrspace
 ; PASS-CHECK:       [[WHILE]]:
 ; PASS-CHECK-NEXT:    [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
 ; PASS-CHECK-NEXT:    [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
+; PASS-CHECK-NEXT:    [[BALLOT:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 [[NOT_DONE]])
 ; PASS-CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true
+; PASS-CHECK-NEXT:    [[IS_DONE:%.*]] = icmp eq i32 [[BALLOT]], 0
 ; PASS-CHECK-NEXT:    br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]]
 ; PASS-CHECK:       [[IF]]:
 ; PASS-CHECK-NEXT:    store i32 5, ptr addrspace(1) [[OUT]], align 4
@@ -410,6 +425,8 @@ define protected amdgpu_kernel void @trivial_waterfall_ne_zero_i32(ptr addrspace
 ; PASS-CHECK-NEXT:    br label %[[WHILE:.*]]
 ; PASS-CHECK:       [[WHILE]]:
 ; PASS-CHECK-NEXT:    [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
+; PASS-CHECK-NEXT:    [[BALLOT:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 [[DONE]])
+; PASS-CHECK-NEXT:    [[IS_DONE:%.*]] = icmp ne i32 0, [[BALLOT]]
 ; PASS-CHECK-NEXT:    br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]]
 ; PASS-CHECK:       [[IF]]:
 ; PASS-CHECK-NEXT:    store i32 5, ptr addrspace(1) [[OUT]], align 4
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
index aa11574517520..a7e828c95d69f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -amdgpu-enable-uniform-intrinsic-combine=0 -O3 -S < %s | FileCheck %s -check-prefix=CURRENT-CHECK
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -amdgpu-uniform-intrinsic-combine -S < %s | FileCheck %s -check-prefix=PASS-CHECK
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine -S < %s | FileCheck %s -check-prefix=PASS-CHECK
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine,dce -S < %s | FileCheck %s -check-prefix=DCE-CHECK
 
@@ -595,6 +596,8 @@ define amdgpu_kernel void @ballot_i32(i32 %v, ptr addrspace(1) %out) {
 ; PASS-CHECK-LABEL: define amdgpu_kernel void @ballot_i32(
 ; PASS-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
 ; PASS-CHECK-NEXT:    [[C:%.*]] = trunc i32 [[V]] to i1
+; PASS-CHECK-NEXT:    [[BALLOT:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[C]])
+; PASS-CHECK-NEXT:    [[BALLOT_NE_ZERO:%.*]] = icmp ne i32 [[BALLOT]], 0
 ; PASS-CHECK-NEXT:    store i1 [[C]], ptr addrspace(1) [[OUT]], align 1
 ; PASS-CHECK-NEXT:    ret void
 ;
@@ -623,6 +626,8 @@ define amdgpu_kernel void @ballot_i64(i32 %v, ptr addrspace(1) %out) {
 ; PASS-CHECK-LABEL: define amdgpu_kernel void @ballot_i64(
 ; PASS-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
 ; PASS-CHECK-NEXT:    [[C:%.*]] = trunc i32 [[V]] to i1
+; PASS-CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[C]])
+; PASS-CHECK-NEXT:    [[BALLOT_NE_ZERO:%.*]] = icmp ne i64 [[BALLOT]], 0
 ; PASS-CHECK-NEXT:    store i1 [[C]], ptr addrspace(1) [[OUT]], align 1
 ; PASS-CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-temporal-divergence.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-temporal-divergence.ll
index 2fde3e3759f47..792926154f7a8 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-temporal-divergence.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-temporal-divergence.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine -S < %s | FileCheck %s -check-prefix=PASS-CHECK
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -amdgpu-uniform-intrinsic-combine -S < %s | FileCheck %s -check-prefix=PASS-CHECK
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine,instcombine,early-cse,simplifycfg -S < %s | FileCheck %s -check-prefix=COMB-CHECK
 
 ; This should not be optimized
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll
index db32135939a5d..b8f084d5f82ad 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll
@@ -4,24 +4,14 @@
 define amdgpu_gs i32 @main() {
 ; CHECK-LABEL: main:
 ; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_bitcmp1_b32 0, 0
 ; CHECK-NEXT:    s_mov_b32 s0, 0
-; CHECK-NEXT:    s_cselect_b32 s1, -1, 0
-; CHECK-NEXT:    s_or_saveexec_b32 s2, -1
-; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s1
-; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; CHECK-NEXT:    v_readfirstlane_b32 s1, v0
-; CHECK-NEXT:    s_mov_b32 exec_lo, s2
-; CHECK-NEXT:    s_or_b32 s0, s0, s1
-; CHECK-NEXT:    s_wait_alu 0xfffe
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; CHECK-NEXT:    s_bitcmp1_b32 s0, 0
 ; CHECK-NEXT:    s_cselect_b32 s0, -1, 0
-; CHECK-NEXT:    s_wait_alu 0xfffe
 ; CHECK-NEXT:    s_xor_b32 s0, s0, -1
-; CHECK-NEXT:    s_wait_alu 0xfffe
-; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
-; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; CHECK-NEXT:    v_readfirstlane_b32 s0, v1
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
 ; CHECK-NEXT:    s_wait_alu 0xf1ff
 ; CHECK-NEXT:    ; return to shader part epilog
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
index 3aa36635a0ab6..704ea37117f32 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
@@ -9,11 +9,11 @@
 ; RUN:   | FileCheck -check-prefix=GCN-O3 %s
 
 
-; GCN-O0: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O0>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require<uniformity>,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
+; GCN-O0: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O0>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require<uniformity>,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
 
-; GCN-O2: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O2>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
+; GCN-O2: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O2>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt,amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
 
-; GCN-O3: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O3>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
+; GCN-O3: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O3>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt,amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
 
 define void @empty() {
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 6e5212580ba2e..ee6caab6f25cd 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -31,6 +31,11 @@
 ; GCN-O0-NEXT:    AMDGPU Remove Incompatible Functions
 ; GCN-O0-NEXT:    AMDGPU Printf lowering
 ; GCN-O0-NEXT:    Lower ctors and dtors for AMDGPU
+; GCN-O0-NEXT:    FunctionPass Manager
+; GCN-O0-NEXT:      Dominator Tree Construction
+; GCN-O0-NEXT:      Cycle Info Analysis
+; GCN-O0-NEXT:      Uniformity Analysis
+; GCN-O0-NEXT:      AMDGPU Uniform Intrinsic Combine
 ; GCN-O0-NEXT:    Expand variadic functions
 ; GCN-O0-NEXT:    AMDGPU Inline All Functions
 ; GCN-O0-NEXT:    Inliner for always_inline functions
@@ -179,6 +184,11 @@
 ; GCN-O1-NEXT:    AMDGPU Remove Incompatible Functions
 ; GCN-O1-NEXT:    AMDGPU Printf lowering
 ; GCN-O1-NEXT:    Lower ctors and dtors for AMDGPU
+; GCN-O1-NEXT:    FunctionPass Manager
+; GCN-O1-NEXT:      Dominator Tree Construction
+; GCN-O1-NEXT:      Cycle Info Analysis
+; GCN-O1-NEXT:      Uniformity Analysis
+; GCN-O1-NEXT:      AMDGPU Uniform Intrinsic Combine
 ; GCN-O1-NEXT:    Expand variadic functions
 ; GCN-O1-NEXT:    AMDGPU Inline All Functions
 ; GCN-O1-NEXT:    Inliner for always_inline functions
@@ -466,6 +476,11 @@
 ; GCN-O1-OPTS-NEXT:    AMDGPU Remove Incompatible Functions
 ; GCN-O1-OPTS-NEXT:    AMDGPU Printf lowering
 ; GCN-O1-OPTS-NEXT:    Lower ctors and dtors for AMDGPU
+; GCN-O1-OPTS-NEXT:    FunctionPass Manager
+; GCN-O1-OPTS-NEXT:      Dominator Tree Construction
+; GCN-O1-OPTS-NEXT:      Cycle Info Analysis
+; GCN-O1-OPTS-NEXT:      Uniformity Analysis
+; GCN-O1-OPTS-NEXT:      AMDGPU Uniform Intrinsic Combine
 ; GCN-O1-OPTS-NEXT:    Expand variadic functions
 ; GCN-O1-OPTS-NEXT:    AMDGPU Inline All Functions
 ; GCN-O1-OPTS-NEXT:    Inliner for always_inline functions
@@ -783,6 +798,10 @@
 ; GCN-O2-NEXT:    Lower ctors and dtors for AMDGPU
 ; GCN-O2-NEXT:    FunctionPass Manager
 ; GCN-O2-NEXT:      AMDGPU Image Intrinsic Optimizer
+; GCN-O2-NEXT:      Dominator Tree Construction
+; GCN-O2-NEXT:      Cycle Info Analysis
+; GCN-O2-NEXT:      Uniformity Analysis
+; GCN-O2-NEXT:      AMDGPU Uniform Intrinsic Combine
 ; GCN-O2-NEXT:    Expand variadic functions
 ; GCN-O2-NEXT:    AMDGPU Inline All Functions
 ; GCN-O2-NEXT:    Inliner for always_inline functions
@@ -1104,6 +1123,10 @@
 ; GCN-O3-NEXT:    Lower ctors and dtors for AMDGPU
 ; GCN-O3-NEXT:    FunctionPass Manager
 ; GCN-O3-NEXT:      AMDGPU Image Intrinsic Optimizer
+; GCN-O3-NEXT:      Dominator Tree Construction
+; GCN-O3-NEXT:      Cycle Info Analysis
+; GCN-O3-NEXT:      Uniformity Analysis
+; GCN-O3-NEXT:      AMDGPU Uniform Intrinsic Combine
 ; GCN-O3-NEXT:    Expand variadic functions
 ; GCN-O3-NEXT:    AMDGPU Inline All Functions
 ; GCN-O3-NEXT:    Inliner for always_inline functions
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
index e00e1f13b2b77..c1f3a12dba578 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
@@ -110,9 +110,8 @@ false:
 define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
 ; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_and_b32 s0, s0, 1
-; CHECK-NEXT:    v_cmp_ne_u32_e64 vcc_lo, s0, 0
-; CHECK-NEXT:    s_cbranch_vccz .LBB8_2
+; CHECK-NEXT:    s_bitcmp0_b32 s0, 0
+; CHECK-NEXT:    s_cbranch_scc1 .LBB8_2
 ; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
 ; CHECK-NEXT:    s_branch .LBB8_3
@@ -156,15 +155,16 @@ false:
 define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
 ; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_and_b32 s0, s0, 1
-; CHECK-NEXT:    v_cmp_ne_u32_e64 vcc_lo, s0, 0
-; CHECK-NEXT:    s_cbranch_vccz .LBB10_2
-; CHECK-NEXT:  ; %bb.1: ; %false
-; CHECK-NEXT:    s_mov_b32 s0, 33
-; CHECK-NEXT:    s_branch .LBB10_3
-; CHECK-NEXT:  .LBB10_2: ; %true
+; CHECK-NEXT:    s_bitcmp1_b32 s0, 0
+; CHECK-NEXT:    s_cselect_b32 s0, -1, 0
+; CHECK-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
+; CHECK-NEXT:    s_cbranch_vccnz .LBB10_2
+; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
 ; CHECK-NEXT:    s_branch .LBB10_3
+; CHECK-NEXT:  .LBB10_2: ; %false
+; CHECK-NEXT:    s_mov_b32 s0, 33
+; CHECK-NEXT:    s_branch .LBB10_3
 ; CHECK-NEXT:  .LBB10_3:
   %c = trunc i32 %v to i1
   %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
@@ -201,8 +201,8 @@ false:
 define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
 ; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    v_cmp_lt_u32_e64 vcc_lo, s0, 12
-; CHECK-NEXT:    s_cbranch_vccz .LBB12_2
+; CHECK-NEXT:    s_cmp_gt_u32 s0, 11
+; CHECK-NEXT:    s_cbranch_scc1 .LBB12_2
 ; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
 ; CHECK-NEXT:    s_branch .LBB12_3
@@ -245,14 +245,14 @@ false:
 define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
 ; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    v_cmp_lt_u32_e64 vcc_lo, s0, 12
-; CHECK-NEXT:    s_cbranch_vccz .LBB14_2
-; CHECK-NEXT:  ; %bb.1: ; %false
-; CHECK-NEXT:    s_mov_b32 s0, 33
-; CHECK-NEXT:    s_branch .LBB14_3
-; CHECK-NEXT:  .LBB14_2: ; %true
+; CHECK-NEXT:    s_cmp_lt_u32 s0, 12
+; CHECK-NEXT:    s_cbranch_scc1 .LBB14_2
+; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
 ; CHECK-NEXT:    s_branch .LBB14_3
+; CHECK-NEXT:  .LBB14_2: ; %false
+; CHECK-NEXT:    s_mov_b32 s0, 33
+; CHECK-NEXT:    s_branch .LBB14_3
 ; CHECK-NEXT:  .LBB14_3:
   %c = icmp ult i32 %v, 12
   %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
@@ -293,13 +293,13 @@ false:
 define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) {
 ; CHECK-LABEL: branch_uniform_ballot_ne_zero_and:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_cmp_lt_u32 s0, 12
+; CHECK-NEXT:    s_cmp_gt_u32 s0, 11
 ; CHECK-NEXT:    s_cselect_b32 s0, -1, 0
-; CHECK-NEXT:    s_cmp_gt_u32 s1, 34
+; CHECK-NEXT:    s_cmp_lt_u32 s1, 35
 ; CHECK-NEXT:    s_cselect_b32 s1, -1, 0
-; CHECK-NEXT:    s_and_b32 s0, s0, s1
-; CHECK-NEXT:    s_and_b32 s0, s0, exec_lo
-; CHECK-NEXT:    s_cbranch_scc0 .LBB16_2
+; CHECK-NEXT:    s_or_b32 s0, s0, s1
+; CHECK-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
+; CHECK-NEXT:    s_cbranch_vccnz .LBB16_2
 ; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
 ; CHECK-NEXT:    s_branch .LBB16_3
@@ -353,14 +353,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
 ; CHECK-NEXT:    s_cmp_gt_u32 s1, 34
 ; CHECK-NEXT:    s_cselect_b32 s1, -1, 0
 ; CHECK-NEXT:    s_and_b32 s0, s0, s1
-; CHECK-NEXT:    s_and_b32 s0, s0, exec_lo
-; CHECK-NEXT:    s_cbranch_scc0 .LBB18_2
-; CHECK-NEXT:  ; %bb.1: ; %false
-; CHECK-NEXT:    s_mov_b32 s0, 33
-; CHECK-NEXT:    s_branch .LBB18_3
-; CHECK-NEXT:  .LBB18_2: ; %true
+; CHECK-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
+; CHECK-NEXT:    s_cbranch_vccnz .LBB18_2
+; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
 ; CHECK-NEXT:    s_branch .LBB18_3
+; CHECK-NEXT:  .LBB18_2: ; %false
+; CHECK-NEXT:    s_mov_b32 s0, 33
+; CHECK-NEXT:    s_branch .LBB18_3
 ; CHECK-NEXT:  .LBB18_3:
   %v1c = icmp ult i32 %v1, 12
   %v2c = icmp ugt i32 %v2, 34
@@ -591,3 +591,24 @@ exit:
   store i32 %ballot, ptr addrspace(1) %out
   ret void
 }
+
+define amdgpu_cs i32 @compare_bfloats(bfloat %x, bfloat %y) {
+; GFX10-LABEL: compare_bfloats:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s0, v0, v1
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: compare_bfloats:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_mov_b16_e32 v2.l, 0
+; GFX11-NEXT:    v_mov_b16_e32 v2.h, v1.l
+; GFX11-NEXT:    v_mov_b16_e32 v1.h, v0.l
+; GFX11-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX11-NEXT:    v_cmp_gt_f32_e64 s0, v1, v2
+; GFX11-NEXT:    ; return to shader part epilog
+  %cmp = fcmp ogt bfloat %x, %y
+  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
+  ret i32 %ballot
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
index b4adf7f641550..827a01ff33d02 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
@@ -113,9 +113,8 @@ false:
 define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
 ; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_and_b32 s0, s0, 1
-; CHECK-NEXT:    v_cmp_ne_u32_e64 vcc, s0, 0
-; CHECK-NEXT:    s_cbranch_vccz .LBB8_2
+; CHECK-NEXT:    s_bitcmp0_b32 s0, 0
+; CHECK-NEXT:    s_cbranch_scc1 .LBB8_2
 ; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
 ; CHECK-NEXT:    s_branch .LBB8_3
@@ -159,15 +158,16 @@ false:
 define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
 ; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_and_b32 s0, s0, 1
-; CHECK-NEXT:    v_cmp_ne_u32_e64 vcc, s0, 0
-; CHECK-NEXT:    s_cbranch_vccz .LBB10_2
-; CHECK-NEXT:  ; %bb.1: ; %false
-; CHECK-NEXT:    s_mov_b32 s0, 33
-; CHECK-NEXT:    s_branch .LBB10_3
-; CHECK-NEXT:  .LBB10_2: ; %true
+; CHECK-NEXT:    s_bitcmp1_b32 s0, 0
+; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; CHECK-NEXT:    s_cbranch_vccnz .LBB10_2
+; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
 ; CHECK-NEXT:    s_branch .LBB10_3
+; CHECK-NEXT:  .LBB10_2: ; %false
+; CHECK-NEXT:    s_mov_b32 s0, 33
+; CHECK-NEXT:    s_branch .LBB10_3
 ; CHECK-NEXT:  .LBB10_3:
   %c = trunc i32 %v to i1
   %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
@@ -204,8 +204,8 @@ false:
 define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
 ; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    v_cmp_lt_u32_e64 vcc, s0, 12
-; CHECK-NEXT:    s_cbranch_vccz .LBB12_2
+; CHECK-NEXT:    s_cmp_gt_u32 s0, 11
+; CHECK-NEXT:    s_cbranch_scc1 .LBB12_2
 ; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
 ; CHECK-NEXT:    s_branch .LBB12_3
@@ -248,14 +248,14 @@ false:
 define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
 ; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    v_cmp_lt_u32_e64 vcc, s0, 12
-; CHECK-NEXT:    s_cbranch_vccz .LBB14_2
-; CHECK-NEXT:  ; %bb.1: ; %false
-; CHECK-NEXT:    s_mov_b32 s0, 33
-; CHECK-NEXT:    s_branch .LBB14_3
-; CHECK-NEXT:  .LBB14_2: ; %true
+; CHECK-NEXT:    s_cmp_lt_u32 s0, 12
+; CHECK-NEXT:    s_cbranch_scc1 .LBB14_2
+; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
 ; CHECK-NEXT:    s_branch .LBB14_3
+; CHECK-NEXT:  .LBB14_2: ; %false
+; CHECK-NEXT:    s_mov_b32 s0, 33
+; CHECK-NEXT:    s_branch .LBB14_3
 ; CHECK-NEXT:  .LBB14_3:
   %c = icmp ult i32 %v, 12
   %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
@@ -296,13 +296,13 @@ false:
 define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) {
 ; CHECK-LABEL: branch_uniform_ballot_ne_zero_and:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_cmp_lt_u32 s0, 12
+; CHECK-NEXT:    s_cmp_gt_u32 s0, 11
 ; CHECK-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; CHECK-NEXT:    s_cmp_gt_u32 s1, 34
+; CHECK-NEXT:    s_cmp_lt_u32 s1, 35
 ; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; CHECK-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
-; CHECK-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; CHECK-NEXT:    s_cbranch_scc0 .LBB16_2
+; CHECK-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
+; CHECK-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; CHECK-NEXT:    s_cbranch_vccnz .LBB16_2
 ; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
 ; CHECK-NEXT:    s_branch .LBB16_3
@@ -356,14 +356,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
 ; CHECK-NEXT:    s_cmp_gt_u32 s1, 34
 ; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; CHECK-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
-; CHECK-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; CHECK-NEXT:    s_cbranch_scc0 .LBB18_2
-; CHECK-NEXT:  ; %bb.1: ; %false
-; CHECK-NEXT:    s_mov_b32 s0, 33
-; CHECK-NEXT:    s_branch .LBB18_3
-; CHECK-NEXT:  .LBB18_2: ; %true
+; CHECK-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; CHECK-NEXT:    s_cbranch_vccnz .LBB18_2
+; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
 ; CHECK-NEXT:    s_branch .LBB18_3
+; CHECK-NEXT:  .LBB18_2: ; %false
+; CHECK-NEXT:    s_mov_b32 s0, 33
+; CHECK-NEXT:    s_branch .LBB18_3
 ; CHECK-NEXT:  .LBB18_3:
   %v1c = icmp ult i32 %v1, 12
   %v2c = icmp ugt i32 %v2, 34
@@ -557,3 +557,15 @@ exit:
   store i64 %ballot, ptr addrspace(1) %out
   ret void
 }
+
+define amdgpu_cs i64 @compare_bfloats(bfloat %x, bfloat %y) {
+; CHECK-LABEL: compare_bfloats:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; CHECK-NEXT:    v_cmp_gt_f32_e64 s[0:1], v0, v1
+; CHECK-NEXT:    ; return to shader part epilog
+  %cmp = fcmp ogt bfloat %x, %y
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  ret i64 %ballot
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll
index 49607e320bd0a..83f0229aea326 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll
@@ -92,8 +92,7 @@ define amdgpu_ps void @atomic_swap_1d_agpr_noret(<8 x i32> inreg %rsrc, i32 %s)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a0
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_accvgpr_read_b32 v1, a0
-; GFX90A-NEXT:    image_atomic_swap v1, v0, s[0:7] dmask:0x1 unorm glc
+; GFX90A-NEXT:    image_atomic_swap a0, v0, s[0:7] dmask:0x1 unorm
 ; GFX90A-NEXT:    s_endpgm
   %data = call i32 asm "; def $0", "=a"()
   %unused = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -106,8 +105,7 @@ define amdgpu_ps void @atomic_add_2d_agpr_noret(<8 x i32> inreg %rsrc, i32 %s, i
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a0
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_accvgpr_read_b32 v2, a0
-; GFX90A-NEXT:    image_atomic_add v2, v[0:1], s[0:7] dmask:0x1 unorm glc
+; GFX90A-NEXT:    image_atomic_add a0, v[0:1], s[0:7] dmask:0x1 unorm
 ; GFX90A-NEXT:    s_endpgm
   %data = call i32 asm "; def $0", "=a"()
   %unused = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
@@ -123,9 +121,7 @@ define amdgpu_ps void @atomic_cmpswap_1d_agpr_noret(<8 x i32> inreg %rsrc, i32 %
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a1
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_accvgpr_read_b32 v2, a0
-; GFX90A-NEXT:    v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT:    image_atomic_cmpswap v[2:3], v0, s[0:7] dmask:0x3 unorm glc
+; GFX90A-NEXT:    image_atomic_cmpswap a[0:1], v0, s[0:7] dmask:0x3 unorm
 ; GFX90A-NEXT:    s_endpgm
   %cmp = call i32 asm "; def $0", "=a"()
   %swap = call i32 asm "; def $0", "=a"()
@@ -139,9 +135,7 @@ define amdgpu_ps void @atomic_swap_1d_i64_agpr_noret(<8 x i32> inreg %rsrc, i32
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT:    v_accvgpr_read_b32 v2, a0
-; GFX90A-NEXT:    image_atomic_swap v[2:3], v0, s[0:7] dmask:0x3 unorm glc
+; GFX90A-NEXT:    image_atomic_swap a[0:1], v0, s[0:7] dmask:0x3 unorm
 ; GFX90A-NEXT:    s_endpgm
   %data = call i64 asm "; def $0", "=a"()
   %unused = call i64 @llvm.amdgcn.image.atomic.swap.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -154,14 +148,10 @@ define amdgpu_ps void @atomic_cmpswap_1d_64_agpr_noret(<8 x i32> inreg %rsrc, i3
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT:    v_accvgpr_read_b32 v2, a0
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def a[0:1]
+; GFX90A-NEXT:    ; def a[2:3]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_accvgpr_read_b32 v5, a1
-; GFX90A-NEXT:    v_accvgpr_read_b32 v4, a0
-; GFX90A-NEXT:    image_atomic_cmpswap v[2:5], v0, s[0:7] dmask:0xf unorm glc
+; GFX90A-NEXT:    image_atomic_cmpswap a[0:3], v0, s[0:7] dmask:0xf unorm
 ; GFX90A-NEXT:    s_endpgm
   %cmp = call i64 asm "; def $0", "=a"()
   %swap = call i64 asm "; def $0", "=a"()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.noret.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.noret.ll
new file mode 100644
index 0000000000000..6c58a1a30bd4c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.noret.ll
@@ -0,0 +1,581 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX10PLUS-GISE %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX10PLUS %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-GISE %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
+
+define amdgpu_ps void @atomic_swap_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; GFX10PLUS-GISE-LABEL: atomic_swap_1d:
+; GFX10PLUS-GISE:       ; %bb.0:
+; GFX10PLUS-GISE-NEXT:    image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-GISE-NEXT:    s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_swap_1d:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_swap_1d:
+; GFX12-GISE:       ; %bb.0:
+; GFX12-GISE-NEXT:    image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-GISE-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_swap_1d:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT:    s_endpgm
+  %v = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @atomic_swap_1d_i64(<8 x i32> inreg %rsrc, i64 %data, i32 %s) {
+; GFX10PLUS-GISE-LABEL: atomic_swap_1d_i64:
+; GFX10PLUS-GISE:       ; %bb.0:
+; GFX10PLUS-GISE-NEXT:    image_atomic_swap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-GISE-NEXT:    s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_swap_1d_i64:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    image_atomic_swap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_swap_1d_i64:
+; GFX12-GISE:       ; %bb.0:
+; GFX12-GISE-NEXT:    image_atomic_swap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D
+; GFX12-GISE-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_swap_1d_i64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    image_atomic_swap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT:    s_endpgm
+  %v = call i64 @llvm.amdgcn.image.atomic.swap.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @atomic_swap_1d_float(<8 x i32> inreg %rsrc, float %data, i32 %s) {
+; GFX10PLUS-GISE-LABEL: atomic_swap_1d_float:
+; GFX10PLUS-GISE:       ; %bb.0:
+; GFX10PLUS-GISE-NEXT:    image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-GISE-NEXT:    s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_swap_1d_float:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_swap_1d_float:
+; GFX12-GISE:       ; %bb.0:
+; GFX12-GISE-NEXT:    image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-GISE-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_swap_1d_float:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT:    s_endpgm
+  %v = call float @llvm.amdgcn.image.atomic.swap.1d.f32.i32(float %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @atomic_add_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; GFX10PLUS-GISE-LABEL: atomic_add_1d:
+; GFX10PLUS-GISE:       ; %bb.0:
+; GFX10PLUS-GISE-NEXT:    image_atomic_add v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-GISE-NEXT:    s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_add_1d:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    image_atomic_add v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_add_1d:
+; GFX12-GISE:       ; %bb.0:
+; GFX12-GISE-NEXT:    image_atomic_add_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-GISE-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_add_1d:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    image_atomic_add_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT:    s_endpgm
+  %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @atomic_sub_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; GFX10PLUS-GISE-LABEL: atomic_sub_1d:
+; GFX10PLUS-GISE:       ; %bb.0:
+; GFX10PLUS-GISE-NEXT:    image_atomic_sub v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-GISE-NEXT:    s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_sub_1d:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    image_atomic_sub v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_sub_1d:
+; GFX12-GISE:       ; %bb.0:
+; GFX12-GISE-NEXT:    image_atomic_sub_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-GISE-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_sub_1d:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    image_atomic_sub_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT:    s_endpgm
+  %v = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @atomic_smin_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; GFX10PLUS-GISE-LABEL: atomic_smin_1d:
+; GFX10PLUS-GISE:       ; %bb.0:
+; GFX10PLUS-GISE-NEXT:    image_atomic_smin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-GISE-NEXT:    s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_smin_1d:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    image_atomic_smin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_smin_1d:
+; GFX12-GISE:       ; %bb.0:
+; GFX12-GISE-NEXT:    image_atomic_min_int v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-GISE-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_smin_1d:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    image_atomic_min_int v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT:    s_endpgm
+  %v = call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @atomic_umin_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; GFX10PLUS-GISE-LABEL: atomic_umin_1d:
+; GFX10PLUS-GISE:       ; %bb.0:
+; GFX10PLUS-GISE-NEXT:    image_atomic_umin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-GISE-NEXT:    s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_umin_1d:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    image_atomic_umin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_umin_1d:
+; GFX12-GISE:       ; %bb.0:
+; GFX12-GISE-NEXT:    image_atomic_min_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-GISE-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_umin_1d:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    image_atomic_min_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT:    s_endpgm
+  %v = call i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @atomic_smax_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; GFX10PLUS-GISE-LABEL: atomic_smax_1d:
+; GFX10PLUS-GISE:       ; %bb.0:
+; GFX10PLUS-GISE-NEXT:    image_atomic_smax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-GISE-NEXT:    s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_smax_1d:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    image_atomic_smax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_smax_1d:
+; GFX12-GISE:       ; %bb.0:
+; GFX12-GISE-NEXT:    image_atomic_max_int v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-GISE-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_smax_1d:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    image_atomic_max_int v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT:    s_endpgm
+  %v = call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @atomic_umax_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; GFX10PLUS-GISE-LABEL: atomic_umax_1d:
+; GFX10PLUS-GISE:       ; %bb.0:
+; GFX10PLUS-GISE-NEXT:    image_atomic_umax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-GISE-NEXT:    s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_umax_1d:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    image_atomic_umax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_umax_1d:
+; GFX12-GISE:       ; %bb.0:
+; GFX12-GISE-NEXT:    image_atomic_max_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-GISE-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_umax_1d:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    image_atomic_max_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT:    s_endpgm
+  %v = call i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @atomic_and_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; GFX10PLUS-GISE-LABEL: atomic_and_1d:
+; GFX10PLUS-GISE:       ; %bb.0:
+; GFX10PLUS-GISE-NEXT:    image_atomic_and v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-GISE-NEXT:    s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_and_1d:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    image_atomic_and v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_and_1d:
+; GFX12-GISE:       ; %bb.0:
+; GFX12-GISE-NEXT:    image_atomic_and v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-GISE-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_and_1d:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    image_atomic_and v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT:    s_endpgm
+  %v = call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @atomic_or_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; GFX10PLUS-GISE-LABEL: atomic_or_1d:
+; GFX10PLUS-GISE:       ; %bb.0:
+; GFX10PLUS-GISE-NEXT:    image_atomic_or v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-GISE-NEXT:    s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_or_1d:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    image_atomic_or v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_or_1d:
+; GFX12-GISE:       ; %bb.0:
+; GFX12-GISE-NEXT:    image_atomic_or v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-GISE-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_or_1d:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    image_atomic_or v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT:    s_endpgm
+  %v = call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @atomic_xor_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; GFX10PLUS-GISE-LABEL: atomic_xor_1d:
+; GFX10PLUS-GISE:       ; %bb.0:
+; GFX10PLUS-GISE-NEXT:    image_atomic_xor v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-GISE-NEXT:    s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_xor_1d:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    image_atomic_xor v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_xor_1d:
+; GFX12-GISE:       ; %bb.0:
+; GFX12-GISE-NEXT:    image_atomic_xor v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-GISE-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_xor_1d:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    image_atomic_xor v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT:    s_endpgm
+  %v = call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @atomic_inc_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; GFX10PLUS-GISE-LABEL: atomic_inc_1d:
+; GFX10PLUS-GISE:       ; %bb.0:
+; GFX10PLUS-GISE-NEXT:    image_atomic_inc v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-GISE-NEXT:    s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_inc_1d:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    image_atomic_inc v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_inc_1d:
+; GFX12-GISE:       ; %bb.0:
+; GFX12-GISE-NEXT:    image_atomic_inc_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-GISE-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_inc_1d:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    image_atomic_inc_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT:    s_endpgm
+  %v = call i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @atomic_dec_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; GFX10PLUS-GISE-LABEL: atomic_dec_1d:
+; GFX10PLUS-GISE:       ; %bb.0:
+; GFX10PLUS-GISE-NEXT:    image_atomic_dec v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-GISE-NEXT:    s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_dec_1d:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    image_atomic_dec v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_dec_1d:
+; GFX12-GISE:       ; %bb.0:
+; GFX12-GISE-NEXT:    image_atomic_dec_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-GISE-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_dec_1d:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    image_atomic_dec_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT:    s_endpgm
+  %v = call i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @atomic_cmpswap_1d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %swap, i32 %s) {
+; GFX10PLUS-GISE-LABEL: atomic_cmpswap_1d:
+; GFX10PLUS-GISE:       ; %bb.0:
+; GFX10PLUS-GISE-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-GISE-NEXT:    s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_cmpswap_1d:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_cmpswap_1d:
+; GFX12-GISE:       ; %bb.0:
+; GFX12-GISE-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D
+; GFX12-GISE-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_cmpswap_1d:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT:    s_endpgm
+  %v = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %cmp, i32 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @atomic_cmpswap_1d_64(<8 x i32> inreg %rsrc, i64 %cmp, i64 %swap, i32 %s) {
+; GFX10PLUS-GISE-LABEL: atomic_cmpswap_1d_64:
+; GFX10PLUS-GISE:       ; %bb.0:
+; GFX10PLUS-GISE-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-GISE-NEXT:    s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_cmpswap_1d_64:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_cmpswap_1d_64:
+; GFX12-GISE:       ; %bb.0:
+; GFX12-GISE-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX12-GISE-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_cmpswap_1d_64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT:    s_endpgm
+  %v = call i64 @llvm.amdgcn.image.atomic.cmpswap.1d.i64.i32(i64 %cmp, i64 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @atomic_add_2d(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t) {
+; GFX10PLUS-GISE-LABEL: atomic_add_2d:
+; GFX10PLUS-GISE:       ; %bb.0:
+; GFX10PLUS-GISE-NEXT:    image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm
+; GFX10PLUS-GISE-NEXT:    s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_add_2d:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_add_2d:
+; GFX12-GISE:       ; %bb.0:
+; GFX12-GISE-NEXT:    image_atomic_add_uint v0, [v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX12-GISE-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_add_2d:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    image_atomic_add_uint v0, [v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX12-NEXT:    s_endpgm
+  %v = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @atomic_add_3d(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %r) {
+; GFX10PLUS-GISE-LABEL: atomic_add_3d:
+; GFX10PLUS-GISE:       ; %bb.0:
+; GFX10PLUS-GISE-NEXT:    image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D unorm
+; GFX10PLUS-GISE-NEXT:    s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_add_3d:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D unorm
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_add_3d:
+; GFX12-GISE:       ; %bb.0:
+; GFX12-GISE-NEXT:    image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D
+; GFX12-GISE-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_add_3d:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D
+; GFX12-NEXT:    s_endpgm
+  %v = call i32 @llvm.amdgcn.image.atomic.add.3d.i32.i32(i32 %data, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @atomic_add_cube(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %face) {
+; GFX10PLUS-GISE-LABEL: atomic_add_cube:
+; GFX10PLUS-GISE:       ; %bb.0:
+; GFX10PLUS-GISE-NEXT:    image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE unorm
+; GFX10PLUS-GISE-NEXT:    s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_add_cube:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE unorm
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_add_cube:
+; GFX12-GISE:       ; %bb.0:
+; GFX12-GISE-NEXT:    image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE
+; GFX12-GISE-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_add_cube:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE
+; GFX12-NEXT:    s_endpgm
+  %v = call i32 @llvm.amdgcn.image.atomic.add.cube.i32.i32(i32 %data, i32 %s, i32 %t, i32 %face, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @atomic_add_1darray(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %slice) {
+; GFX10PLUS-GISE-LABEL: atomic_add_1darray:
+; GFX10PLUS-GISE:       ; %bb.0:
+; GFX10PLUS-GISE-NEXT:    image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY unorm
+; GFX10PLUS-GISE-NEXT:    s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_add_1darray:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY unorm
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_add_1darray:
+; GFX12-GISE:       ; %bb.0:
+; GFX12-GISE-NEXT:    image_atomic_add_uint v0, [v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY
+; GFX12-GISE-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_add_1darray:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    image_atomic_add_uint v0, [v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY
+; GFX12-NEXT:    s_endpgm
+  %v = call i32 @llvm.amdgcn.image.atomic.add.1darray.i32.i32(i32 %data, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @atomic_add_2darray(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %slice) {
+; GFX10PLUS-GISE-LABEL: atomic_add_2darray:
+; GFX10PLUS-GISE:       ; %bb.0:
+; GFX10PLUS-GISE-NEXT:    image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY unorm
+; GFX10PLUS-GISE-NEXT:    s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_add_2darray:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY unorm
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_add_2darray:
+; GFX12-GISE:       ; %bb.0:
+; GFX12-GISE-NEXT:    image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX12-GISE-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_add_2darray:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX12-NEXT:    s_endpgm
+  %v = call i32 @llvm.amdgcn.image.atomic.add.2darray.i32.i32(i32 %data, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @atomic_add_2dmsaa(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %fragid) {
+; GFX10PLUS-GISE-LABEL: atomic_add_2dmsaa:
+; GFX10PLUS-GISE:       ; %bb.0:
+; GFX10PLUS-GISE-NEXT:    image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+; GFX10PLUS-GISE-NEXT:    s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_add_2dmsaa:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_add_2dmsaa:
+; GFX12-GISE:       ; %bb.0:
+; GFX12-GISE-NEXT:    image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA
+; GFX12-GISE-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_add_2dmsaa:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA
+; GFX12-NEXT:    s_endpgm
+  %v = call i32 @llvm.amdgcn.image.atomic.add.2dmsaa.i32.i32(i32 %data, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @atomic_add_2darraymsaa(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
+; GFX10PLUS-GISE-LABEL: atomic_add_2darraymsaa:
+; GFX10PLUS-GISE:       ; %bb.0:
+; GFX10PLUS-GISE-NEXT:    image_atomic_add v0, v[1:4], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm
+; GFX10PLUS-GISE-NEXT:    s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_add_2darraymsaa:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    image_atomic_add v0, v[1:4], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_add_2darraymsaa:
+; GFX12-GISE:       ; %bb.0:
+; GFX12-GISE-NEXT:    image_atomic_add_uint v0, [v1, v2, v3, v4], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY
+; GFX12-GISE-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_add_2darraymsaa:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    image_atomic_add_uint v0, [v1, v2, v3, v4], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY
+; GFX12-NEXT:    s_endpgm
+  %v = call i32 @llvm.amdgcn.image.atomic.add.2darraymsaa.i32.i32(i32 %data, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @atomic_add_1d_slc(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; GFX10PLUS-GISE-LABEL: atomic_add_1d_slc:
+; GFX10PLUS-GISE:       ; %bb.0:
+; GFX10PLUS-GISE-NEXT:    image_atomic_add v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm slc
+; GFX10PLUS-GISE-NEXT:    s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_add_1d_slc:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    image_atomic_add v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm slc
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_add_1d_slc:
+; GFX12-GISE:       ; %bb.0:
+; GFX12-GISE-NEXT:    image_atomic_add_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT
+; GFX12-GISE-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_add_1d_slc:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    image_atomic_add_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT
+; GFX12-NEXT:    s_endpgm
+  %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 2)
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll
index 3d1d6c87eb98d..0ba62e49cabc3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll
@@ -41,15 +41,13 @@ main_body:
 define amdgpu_ps float @atomic_pk_add_f16_1d_v2_noret(<8 x i32> inreg %rsrc, <2 x half> %data, i32 %s) {
 ; GFX12-SDAG-LABEL: atomic_pk_add_f16_1d_v2_noret:
 ; GFX12-SDAG:       ; %bb.0: ; %main_body
-; GFX12-SDAG-NEXT:    image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT:    image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: atomic_pk_add_f16_1d_v2_noret:
 ; GFX12-GISEL:       ; %bb.0: ; %main_body
-; GFX12-GISEL-NEXT:    image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
 main_body:
@@ -79,15 +77,13 @@ main_body:
 define amdgpu_ps float @atomic_pk_add_f16_1d_v4_noret(<8 x i32> inreg %rsrc, <4 x half> %data, i32 %s) {
 ; GFX12-SDAG-LABEL: atomic_pk_add_f16_1d_v4_noret:
 ; GFX12-SDAG:       ; %bb.0: ; %main_body
-; GFX12-SDAG-NEXT:    image_atomic_pk_add_f16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT:    image_atomic_pk_add_f16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: atomic_pk_add_f16_1d_v4_noret:
 ; GFX12-GISEL:       ; %bb.0: ; %main_body
-; GFX12-GISEL-NEXT:    image_atomic_pk_add_f16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    image_atomic_pk_add_f16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
 main_body:
@@ -126,15 +122,13 @@ main_body:
 define amdgpu_ps float @atomic_pk_add_bf16_1d_v2_noret(<8 x i32> inreg %rsrc, <2 x bfloat> %data, i32 %s) {
 ; GFX12-SDAG-LABEL: atomic_pk_add_bf16_1d_v2_noret:
 ; GFX12-SDAG:       ; %bb.0: ; %main_body
-; GFX12-SDAG-NEXT:    image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT:    image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: atomic_pk_add_bf16_1d_v2_noret:
 ; GFX12-GISEL:       ; %bb.0: ; %main_body
-; GFX12-GISEL-NEXT:    image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
 main_body:
@@ -173,15 +167,13 @@ main_body:
 define amdgpu_ps float @atomic_pk_add_bf16_1d_v4_noret(<8 x i32> inreg %rsrc, <4 x bfloat> %data, i32 %s) {
 ; GFX12-SDAG-LABEL: atomic_pk_add_bf16_1d_v4_noret:
 ; GFX12-SDAG:       ; %bb.0: ; %main_body
-; GFX12-SDAG-NEXT:    image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT:    image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: atomic_pk_add_bf16_1d_v4_noret:
 ; GFX12-GISEL:       ; %bb.0: ; %main_body
-; GFX12-GISEL-NEXT:    image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
 main_body:
@@ -192,15 +184,13 @@ main_body:
 define amdgpu_ps float @atomic_pk_add_bf16_1d_v4_nt(<8 x i32> inreg %rsrc, <4 x bfloat> %data, i32 %s) {
 ; GFX12-SDAG-LABEL: atomic_pk_add_bf16_1d_v4_nt:
 ; GFX12-SDAG:       ; %bb.0: ; %main_body
-; GFX12-SDAG-NEXT:    image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT_RETURN
-; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT:    image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: atomic_pk_add_bf16_1d_v4_nt:
 ; GFX12-GISEL:       ; %bb.0: ; %main_body
-; GFX12-GISEL-NEXT:    image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT_RETURN
-; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
 main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
index 6dd2258420998..39191d242574f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
@@ -23,10 +23,8 @@ define amdgpu_kernel void @test_s_i32(ptr addrspace(1) %out, i32 %src0) {
 ; GFX11-SDAG-NEXT:    s_load_b32 s2, s[4:5], 0x2c
 ; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_permlane64_b32 v0, v0
-; GFX11-SDAG-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: test_s_i32:
@@ -36,8 +34,6 @@ define amdgpu_kernel void @test_s_i32(ptr addrspace(1) %out, i32 %src0) {
 ; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_permlane64_b32 v0, v0
 ; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-GISEL-NEXT:    s_endpgm
   %v = call i32 @llvm.amdgcn.permlane64.i32(i32 %src0)
@@ -50,12 +46,9 @@ define amdgpu_kernel void @test_s_i64(ptr addrspace(1) %out, i64 %src0) {
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v2, s2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_permlane64_b32 v1, v0
-; GFX11-SDAG-NEXT:    v_permlane64_b32 v0, v2
-; GFX11-SDAG-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: test_s_i64:
@@ -64,9 +57,6 @@ define amdgpu_kernel void @test_s_i64(ptr addrspace(1) %out, i64 %src0) {
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_permlane64_b32 v0, v0
-; GFX11-GISEL-NEXT:    v_permlane64_b32 v1, v1
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-GISEL-NEXT:    s_endpgm
   %v = call i64 @llvm.amdgcn.permlane64.i64(i64 %src0)
@@ -79,12 +69,9 @@ define amdgpu_kernel void @test_s_f64(ptr addrspace(1) %out, double %src0) {
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v2, s2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_permlane64_b32 v1, v0
-; GFX11-SDAG-NEXT:    v_permlane64_b32 v0, v2
-; GFX11-SDAG-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: test_s_f64:
@@ -93,9 +80,6 @@ define amdgpu_kernel void @test_s_f64(ptr addrspace(1) %out, double %src0) {
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_permlane64_b32 v0, v0
-; GFX11-GISEL-NEXT:    v_permlane64_b32 v1, v1
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-GISEL-NEXT:    s_endpgm
   %v = call double @llvm.amdgcn.permlane64.f64(double %src0)
@@ -116,19 +100,15 @@ define amdgpu_kernel void @test_i_i32(ptr addrspace(1) %out) {
 ; GFX11-SDAG-LABEL: test_i_i32:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, 0x63 :: v_dual_mov_b32 v1, 0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_permlane64_b32 v0, v0
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x63
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: test_i_i32:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, 0x63 :: v_dual_mov_b32 v1, 0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_permlane64_b32 v0, v0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-GISEL-NEXT:    s_endpgm
@@ -141,19 +121,15 @@ define amdgpu_kernel void @test_i_f32(ptr addrspace(1) %out) {
 ; GFX11-SDAG-LABEL: test_i_f32:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, 0x449a5000 :: v_dual_mov_b32 v1, 0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_permlane64_b32 v0, v0
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x449a5000
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: test_i_f32:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, 0x449a5000 :: v_dual_mov_b32 v1, 0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_permlane64_b32 v0, v0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-GISEL-NEXT:    s_endpgm
@@ -166,23 +142,16 @@ define amdgpu_kernel void @test_i_i64(ptr addrspace(1) %out) {
 ; GFX11-SDAG-LABEL: test_i_i64:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0x63
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_permlane64_b32 v1, v2
-; GFX11-SDAG-NEXT:    v_permlane64_b32 v0, v0
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, 0x63
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT:    global_store_b64 v1, v[0:1], s[0:1]
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: test_i_i64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x63
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_permlane64_b32 v0, v0
-; GFX11-GISEL-NEXT:    v_permlane64_b32 v1, v2
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-GISEL-NEXT:    s_endpgm
@@ -195,22 +164,16 @@ define amdgpu_kernel void @test_i_f64(ptr addrspace(1) %out) {
 ; GFX11-SDAG-LABEL: test_i_f64:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0x40934a00
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_permlane64_b32 v1, v0
-; GFX11-SDAG-NEXT:    v_permlane64_b32 v0, v2
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40934a00
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: test_i_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x40934a00
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_permlane64_b32 v0, v2
-; GFX11-GISEL-NEXT:    v_permlane64_b32 v1, v1
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 0x40934a00 :: v_dual_mov_b32 v2, 0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll
index b0149f7de5e85..672b658659824 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll
@@ -6,12 +6,9 @@ define amdgpu_kernel void @test_p0(ptr addrspace(1) %out, ptr %src0) {
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v2, s2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_permlane64_b32 v1, v0
-; GFX11-SDAG-NEXT:    v_permlane64_b32 v0, v2
-; GFX11-SDAG-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-SDAG-NEXT:    s_endpgm
   %v = call ptr @llvm.amdgcn.permlane64.p0(ptr %src0)
   store ptr %v, ptr addrspace(1) %out
@@ -22,21 +19,14 @@ define amdgpu_kernel void @test_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0) {
 ; GFX11-SDAG-LABEL: test_v3p0:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_clause 0x2
-; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x44
 ; GFX11-SDAG-NEXT:    s_load_b64 s[6:7], s[4:5], 0x54
+; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x44
 ; GFX11-SDAG-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s7
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v8, s6
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v7, s0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-NEXT:    v_permlane64_b32 v2, v1
-; GFX11-SDAG-NEXT:    v_permlane64_b32 v1, v4
-; GFX11-SDAG-NEXT:    v_permlane64_b32 v5, v5
-; GFX11-SDAG-NEXT:    v_permlane64_b32 v4, v8
-; GFX11-SDAG-NEXT:    v_permlane64_b32 v3, v0
-; GFX11-SDAG-NEXT:    v_permlane64_b32 v0, v7
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v5, s7
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v1, s1
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-SDAG-NEXT:    s_clause 0x1
 ; GFX11-SDAG-NEXT:    global_store_b64 v6, v[4:5], s[4:5] offset:16
 ; GFX11-SDAG-NEXT:    global_store_b128 v6, v[0:3], s[4:5]
@@ -53,10 +43,8 @@ define amdgpu_kernel void @test_p3(ptr addrspace(1) %out, ptr addrspace(3) %src0
 ; GFX11-SDAG-NEXT:    s_load_b32 s2, s[4:5], 0x2c
 ; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_permlane64_b32 v0, v0
-; GFX11-SDAG-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-SDAG-NEXT:    s_endpgm
   %v = call ptr addrspace(3) @llvm.amdgcn.permlane64.v3p0(ptr addrspace(3) %src0)
   store ptr addrspace(3) %v, ptr addrspace(1) %out
@@ -70,14 +58,9 @@ define amdgpu_kernel void @test_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3
 ; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
 ; GFX11-SDAG-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s0
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_permlane64_b32 v2, v0
-; GFX11-SDAG-NEXT:    v_permlane64_b32 v1, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-SDAG-NEXT:    v_permlane64_b32 v0, v3
-; GFX11-SDAG-NEXT:    global_store_b96 v4, v[0:2], s[4:5]
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s0
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-SDAG-NEXT:    global_store_b96 v3, v[0:2], s[4:5]
 ; GFX11-SDAG-NEXT:    s_endpgm
   %v = call <3 x ptr addrspace(3)> @llvm.amdgcn.permlane64.v3p3(<3 x ptr addrspace(3)> %src0)
   store <3 x ptr addrspace(3)> %v, ptr addrspace(1) %out
@@ -91,10 +74,8 @@ define amdgpu_kernel void @test_p5(ptr addrspace(1) %out, ptr addrspace(5) %src0
 ; GFX11-SDAG-NEXT:    s_load_b32 s2, s[4:5], 0x2c
 ; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_permlane64_b32 v0, v0
-; GFX11-SDAG-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-SDAG-NEXT:    s_endpgm
   %v = call ptr addrspace(5) @llvm.amdgcn.permlane64.p5(ptr addrspace(5) %src0)
   store ptr addrspace(5) %v, ptr addrspace(1) %out
@@ -108,14 +89,9 @@ define amdgpu_kernel void @test_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5
 ; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
 ; GFX11-SDAG-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s0
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_permlane64_b32 v2, v0
-; GFX11-SDAG-NEXT:    v_permlane64_b32 v1, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-SDAG-NEXT:    v_permlane64_b32 v0, v3
-; GFX11-SDAG-NEXT:    global_store_b96 v4, v[0:2], s[4:5]
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s0
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-SDAG-NEXT:    global_store_b96 v3, v[0:2], s[4:5]
 ; GFX11-SDAG-NEXT:    s_endpgm
   %v = call <3 x ptr addrspace(5)> @llvm.amdgcn.permlane64.v3p5(<3 x ptr addrspace(5)> %src0)
   store <3 x ptr addrspace(5)> %v, ptr addrspace(1) %out
@@ -129,10 +105,8 @@ define amdgpu_kernel void @test_p6(ptr addrspace(1) %out, ptr addrspace(6) %src0
 ; GFX11-SDAG-NEXT:    s_load_b32 s2, s[4:5], 0x2c
 ; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_permlane64_b32 v0, v0
-; GFX11-SDAG-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-SDAG-NEXT:    s_endpgm
   %v = call ptr addrspace(6) @llvm.amdgcn.permlane64.p6(ptr addrspace(6) %src0)
   store ptr addrspace(6) %v, ptr addrspace(1) %out
@@ -146,14 +120,9 @@ define amdgpu_kernel void @test_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6
 ; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
 ; GFX11-SDAG-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s0
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_permlane64_b32 v2, v0
-; GFX11-SDAG-NEXT:    v_permlane64_b32 v1, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-SDAG-NEXT:    v_permlane64_b32 v0, v3
-; GFX11-SDAG-NEXT:    global_store_b96 v4, v[0:2], s[4:5]
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s0
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-SDAG-NEXT:    global_store_b96 v3, v[0:2], s[4:5]
 ; GFX11-SDAG-NEXT:    s_endpgm
   %v = call <3 x ptr addrspace(6)> @llvm.amdgcn.permlane64.v3p6(<3 x ptr addrspace(6)> %src0)
   store <3 x ptr addrspace(6)> %v, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index d1ba892d7f7e1..02d29909c661c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -396,8 +396,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_f64(ptr addrspace(1) %out) {
 ;
 ; CHECK-GISEL-LABEL: test_readfirstlane_imm_f64:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    s_mov_b32 s0, 0
-; CHECK-GISEL-NEXT:    s_mov_b32 s1, 0x40400000
+; CHECK-GISEL-NEXT:    s_mov_b64 s[0:1], 0x4040000000000000
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    ; use s[0:1]
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
@@ -456,14 +455,13 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out
 ; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i64:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-GISEL-NEXT:    s_mov_b64 s[2:3], 32
 ; CHECK-GISEL-NEXT:    s_add_i32 s12, s12, s17
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, 32
 ; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -490,15 +488,13 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out
 ; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_f64:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-GISEL-NEXT:    s_mov_b32 s2, 0
 ; CHECK-GISEL-NEXT:    s_add_i32 s12, s12, s17
-; CHECK-GISEL-NEXT:    s_mov_b32 s3, 0x40400000
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, 0x40400000
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -588,17 +584,17 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; CHECK-SDAG-NEXT:    s_add_i32 s12, s12, s17
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s2
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s3
+; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; CHECK-SDAG-NEXT:    s_endpgm
 ;
 ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i64:
@@ -628,17 +624,17 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; CHECK-SDAG-NEXT:    s_add_i32 s12, s12, s17
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s2
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s3
+; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; CHECK-SDAG-NEXT:    s_endpgm
 ;
 ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_f64:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index 7ff5eb46def38..0795f4050b622 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -9,7 +9,7 @@ declare double @llvm.amdgcn.readlane.f64(double, i32) #0
 define amdgpu_kernel void @test_readlane_sreg_sreg_i32(i32 %src0, i32 %src1) #1 {
 ; CHECK-SDAG-LABEL: test_readlane_sreg_sreg_i32:
 ; CHECK-SDAG:       ; %bb.0:
-; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT:    s_load_dword s0, s[8:9], 0x0
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    ; use s0
@@ -18,7 +18,7 @@ define amdgpu_kernel void @test_readlane_sreg_sreg_i32(i32 %src0, i32 %src1) #1
 ;
 ; CHECK-GISEL-LABEL: test_readlane_sreg_sreg_i32:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT:    s_load_dword s0, s[8:9], 0x0
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    ; use s0
@@ -224,14 +224,13 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32
 ; CHECK-GISEL-LABEL: test_readlane_imm_sreg_i64:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-GISEL-NEXT:    s_mov_b64 s[2:3], 32
 ; CHECK-GISEL-NEXT:    s_add_i32 s12, s12, s17
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, 32
 ; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -258,15 +257,13 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32
 ; CHECK-GISEL-LABEL: test_readlane_imm_sreg_f64:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-GISEL-NEXT:    s_mov_b32 s2, 0
 ; CHECK-GISEL-NEXT:    s_add_i32 s12, s12, s17
-; CHECK-GISEL-NEXT:    s_mov_b32 s3, 0x40400000
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, 0x40400000
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -660,17 +657,17 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; CHECK-SDAG-NEXT:    s_add_i32 s12, s12, s17
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s2
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s3
+; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; CHECK-SDAG-NEXT:    s_endpgm
 ;
 ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i64:
@@ -700,17 +697,17 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; CHECK-SDAG-NEXT:    s_add_i32 s12, s12, s17
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s2
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s3
+; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; CHECK-SDAG-NEXT:    s_endpgm
 ;
 ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_f64:
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll
new file mode 100644
index 0000000000000..05a0e39d4a715
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll
@@ -0,0 +1,325 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx1100 -passes=amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
+
+define amdgpu_kernel void @large_array_vectors_small_users(<16 x i8> %in, <16 x i8> %add, ptr addrspace(3) %out) #0 {
+; OPT-LABEL: define amdgpu_kernel void @large_array_vectors_small_users(
+; OPT-SAME: <16 x i8> [[IN:%.*]], <16 x i8> [[ADD:%.*]], ptr addrspace(3) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; OPT-NEXT:  [[ENTRY:.*:]]
+; OPT-NEXT:    [[ALLOCA:%.*]] = freeze <128 x i8> poison
+; OPT-NEXT:    [[TMP0:%.*]] = extractelement <16 x i8> [[IN]], i64 0
+; OPT-NEXT:    [[TMP1:%.*]] = insertelement <128 x i8> [[ALLOCA]], i8 [[TMP0]], i32 0
+; OPT-NEXT:    [[TMP2:%.*]] = extractelement <16 x i8> [[IN]], i64 1
+; OPT-NEXT:    [[TMP3:%.*]] = insertelement <128 x i8> [[TMP1]], i8 [[TMP2]], i32 1
+; OPT-NEXT:    [[TMP4:%.*]] = extractelement <16 x i8> [[IN]], i64 2
+; OPT-NEXT:    [[TMP5:%.*]] = insertelement <128 x i8> [[TMP3]], i8 [[TMP4]], i32 2
+; OPT-NEXT:    [[TMP6:%.*]] = extractelement <16 x i8> [[IN]], i64 3
+; OPT-NEXT:    [[TMP7:%.*]] = insertelement <128 x i8> [[TMP5]], i8 [[TMP6]], i32 3
+; OPT-NEXT:    [[TMP8:%.*]] = extractelement <16 x i8> [[IN]], i64 4
+; OPT-NEXT:    [[TMP9:%.*]] = insertelement <128 x i8> [[TMP7]], i8 [[TMP8]], i32 4
+; OPT-NEXT:    [[TMP10:%.*]] = extractelement <16 x i8> [[IN]], i64 5
+; OPT-NEXT:    [[TMP11:%.*]] = insertelement <128 x i8> [[TMP9]], i8 [[TMP10]], i32 5
+; OPT-NEXT:    [[TMP12:%.*]] = extractelement <16 x i8> [[IN]], i64 6
+; OPT-NEXT:    [[TMP13:%.*]] = insertelement <128 x i8> [[TMP11]], i8 [[TMP12]], i32 6
+; OPT-NEXT:    [[TMP14:%.*]] = extractelement <16 x i8> [[IN]], i64 7
+; OPT-NEXT:    [[TMP15:%.*]] = insertelement <128 x i8> [[TMP13]], i8 [[TMP14]], i32 7
+; OPT-NEXT:    [[TMP16:%.*]] = extractelement <16 x i8> [[IN]], i64 8
+; OPT-NEXT:    [[TMP17:%.*]] = insertelement <128 x i8> [[TMP15]], i8 [[TMP16]], i32 8
+; OPT-NEXT:    [[TMP18:%.*]] = extractelement <16 x i8> [[IN]], i64 9
+; OPT-NEXT:    [[TMP19:%.*]] = insertelement <128 x i8> [[TMP17]], i8 [[TMP18]], i32 9
+; OPT-NEXT:    [[TMP20:%.*]] = extractelement <16 x i8> [[IN]], i64 10
+; OPT-NEXT:    [[TMP21:%.*]] = insertelement <128 x i8> [[TMP19]], i8 [[TMP20]], i32 10
+; OPT-NEXT:    [[TMP22:%.*]] = extractelement <16 x i8> [[IN]], i64 11
+; OPT-NEXT:    [[TMP23:%.*]] = insertelement <128 x i8> [[TMP21]], i8 [[TMP22]], i32 11
+; OPT-NEXT:    [[TMP24:%.*]] = extractelement <16 x i8> [[IN]], i64 12
+; OPT-NEXT:    [[TMP25:%.*]] = insertelement <128 x i8> [[TMP23]], i8 [[TMP24]], i32 12
+; OPT-NEXT:    [[TMP26:%.*]] = extractelement <16 x i8> [[IN]], i64 13
+; OPT-NEXT:    [[TMP27:%.*]] = insertelement <128 x i8> [[TMP25]], i8 [[TMP26]], i32 13
+; OPT-NEXT:    [[TMP28:%.*]] = extractelement <16 x i8> [[IN]], i64 14
+; OPT-NEXT:    [[TMP29:%.*]] = insertelement <128 x i8> [[TMP27]], i8 [[TMP28]], i32 14
+; OPT-NEXT:    [[TMP30:%.*]] = extractelement <16 x i8> [[IN]], i64 15
+; OPT-NEXT:    [[TMP31:%.*]] = insertelement <128 x i8> [[TMP29]], i8 [[TMP30]], i32 15
+; OPT-NEXT:    [[TMP32:%.*]] = extractelement <16 x i8> [[IN]], i64 0
+; OPT-NEXT:    [[TMP33:%.*]] = insertelement <128 x i8> [[TMP31]], i8 [[TMP32]], i32 0
+; OPT-NEXT:    [[TMP34:%.*]] = extractelement <16 x i8> [[IN]], i64 1
+; OPT-NEXT:    [[TMP35:%.*]] = insertelement <128 x i8> [[TMP33]], i8 [[TMP34]], i32 1
+; OPT-NEXT:    [[TMP36:%.*]] = extractelement <16 x i8> [[IN]], i64 2
+; OPT-NEXT:    [[TMP37:%.*]] = insertelement <128 x i8> [[TMP35]], i8 [[TMP36]], i32 2
+; OPT-NEXT:    [[TMP38:%.*]] = extractelement <16 x i8> [[IN]], i64 3
+; OPT-NEXT:    [[TMP39:%.*]] = insertelement <128 x i8> [[TMP37]], i8 [[TMP38]], i32 3
+; OPT-NEXT:    [[TMP40:%.*]] = extractelement <16 x i8> [[IN]], i64 4
+; OPT-NEXT:    [[TMP41:%.*]] = insertelement <128 x i8> [[TMP39]], i8 [[TMP40]], i32 4
+; OPT-NEXT:    [[TMP42:%.*]] = extractelement <16 x i8> [[IN]], i64 5
+; OPT-NEXT:    [[TMP43:%.*]] = insertelement <128 x i8> [[TMP41]], i8 [[TMP42]], i32 5
+; OPT-NEXT:    [[TMP44:%.*]] = extractelement <16 x i8> [[IN]], i64 6
+; OPT-NEXT:    [[TMP45:%.*]] = insertelement <128 x i8> [[TMP43]], i8 [[TMP44]], i32 6
+; OPT-NEXT:    [[TMP46:%.*]] = extractelement <16 x i8> [[IN]], i64 7
+; OPT-NEXT:    [[TMP47:%.*]] = insertelement <128 x i8> [[TMP45]], i8 [[TMP46]], i32 7
+; OPT-NEXT:    [[TMP48:%.*]] = extractelement <16 x i8> [[IN]], i64 8
+; OPT-NEXT:    [[TMP49:%.*]] = insertelement <128 x i8> [[TMP47]], i8 [[TMP48]], i32 8
+; OPT-NEXT:    [[TMP50:%.*]] = extractelement <16 x i8> [[IN]], i64 9
+; OPT-NEXT:    [[TMP51:%.*]] = insertelement <128 x i8> [[TMP49]], i8 [[TMP50]], i32 9
+; OPT-NEXT:    [[TMP52:%.*]] = extractelement <16 x i8> [[IN]], i64 10
+; OPT-NEXT:    [[TMP53:%.*]] = insertelement <128 x i8> [[TMP51]], i8 [[TMP52]], i32 10
+; OPT-NEXT:    [[TMP54:%.*]] = extractelement <16 x i8> [[IN]], i64 11
+; OPT-NEXT:    [[TMP55:%.*]] = insertelement <128 x i8> [[TMP53]], i8 [[TMP54]], i32 11
+; OPT-NEXT:    [[TMP56:%.*]] = extractelement <16 x i8> [[IN]], i64 12
+; OPT-NEXT:    [[TMP57:%.*]] = insertelement <128 x i8> [[TMP55]], i8 [[TMP56]], i32 12
+; OPT-NEXT:    [[TMP58:%.*]] = extractelement <16 x i8> [[IN]], i64 13
+; OPT-NEXT:    [[TMP59:%.*]] = insertelement <128 x i8> [[TMP57]], i8 [[TMP58]], i32 13
+; OPT-NEXT:    [[TMP60:%.*]] = extractelement <16 x i8> [[IN]], i64 14
+; OPT-NEXT:    [[TMP61:%.*]] = insertelement <128 x i8> [[TMP59]], i8 [[TMP60]], i32 14
+; OPT-NEXT:    [[TMP62:%.*]] = extractelement <16 x i8> [[IN]], i64 15
+; OPT-NEXT:    [[TMP63:%.*]] = insertelement <128 x i8> [[TMP61]], i8 [[TMP62]], i32 15
+; OPT-NEXT:    [[TMP64:%.*]] = extractelement <16 x i8> [[IN]], i64 0
+; OPT-NEXT:    [[TMP65:%.*]] = insertelement <128 x i8> [[TMP63]], i8 [[TMP64]], i32 0
+; OPT-NEXT:    [[TMP66:%.*]] = extractelement <16 x i8> [[IN]], i64 1
+; OPT-NEXT:    [[TMP67:%.*]] = insertelement <128 x i8> [[TMP65]], i8 [[TMP66]], i32 1
+; OPT-NEXT:    [[TMP68:%.*]] = extractelement <16 x i8> [[IN]], i64 2
+; OPT-NEXT:    [[TMP69:%.*]] = insertelement <128 x i8> [[TMP67]], i8 [[TMP68]], i32 2
+; OPT-NEXT:    [[TMP70:%.*]] = extractelement <16 x i8> [[IN]], i64 3
+; OPT-NEXT:    [[TMP71:%.*]] = insertelement <128 x i8> [[TMP69]], i8 [[TMP70]], i32 3
+; OPT-NEXT:    [[TMP72:%.*]] = extractelement <16 x i8> [[IN]], i64 4
+; OPT-NEXT:    [[TMP73:%.*]] = insertelement <128 x i8> [[TMP71]], i8 [[TMP72]], i32 4
+; OPT-NEXT:    [[TMP74:%.*]] = extractelement <16 x i8> [[IN]], i64 5
+; OPT-NEXT:    [[TMP75:%.*]] = insertelement <128 x i8> [[TMP73]], i8 [[TMP74]], i32 5
+; OPT-NEXT:    [[TMP76:%.*]] = extractelement <16 x i8> [[IN]], i64 6
+; OPT-NEXT:    [[TMP77:%.*]] = insertelement <128 x i8> [[TMP75]], i8 [[TMP76]], i32 6
+; OPT-NEXT:    [[TMP78:%.*]] = extractelement <16 x i8> [[IN]], i64 7
+; OPT-NEXT:    [[TMP79:%.*]] = insertelement <128 x i8> [[TMP77]], i8 [[TMP78]], i32 7
+; OPT-NEXT:    [[TMP80:%.*]] = extractelement <16 x i8> [[IN]], i64 8
+; OPT-NEXT:    [[TMP81:%.*]] = insertelement <128 x i8> [[TMP79]], i8 [[TMP80]], i32 8
+; OPT-NEXT:    [[TMP82:%.*]] = extractelement <16 x i8> [[IN]], i64 9
+; OPT-NEXT:    [[TMP83:%.*]] = insertelement <128 x i8> [[TMP81]], i8 [[TMP82]], i32 9
+; OPT-NEXT:    [[TMP84:%.*]] = extractelement <16 x i8> [[IN]], i64 10
+; OPT-NEXT:    [[TMP85:%.*]] = insertelement <128 x i8> [[TMP83]], i8 [[TMP84]], i32 10
+; OPT-NEXT:    [[TMP86:%.*]] = extractelement <16 x i8> [[IN]], i64 11
+; OPT-NEXT:    [[TMP87:%.*]] = insertelement <128 x i8> [[TMP85]], i8 [[TMP86]], i32 11
+; OPT-NEXT:    [[TMP88:%.*]] = extractelement <16 x i8> [[IN]], i64 12
+; OPT-NEXT:    [[TMP89:%.*]] = insertelement <128 x i8> [[TMP87]], i8 [[TMP88]], i32 12
+; OPT-NEXT:    [[TMP90:%.*]] = extractelement <16 x i8> [[IN]], i64 13
+; OPT-NEXT:    [[TMP91:%.*]] = insertelement <128 x i8> [[TMP89]], i8 [[TMP90]], i32 13
+; OPT-NEXT:    [[TMP92:%.*]] = extractelement <16 x i8> [[IN]], i64 14
+; OPT-NEXT:    [[TMP93:%.*]] = insertelement <128 x i8> [[TMP91]], i8 [[TMP92]], i32 14
+; OPT-NEXT:    [[TMP94:%.*]] = extractelement <16 x i8> [[IN]], i64 15
+; OPT-NEXT:    [[TMP95:%.*]] = insertelement <128 x i8> [[TMP93]], i8 [[TMP94]], i32 15
+; OPT-NEXT:    [[TMP96:%.*]] = extractelement <16 x i8> [[IN]], i64 0
+; OPT-NEXT:    [[TMP97:%.*]] = insertelement <128 x i8> [[TMP95]], i8 [[TMP96]], i32 0
+; OPT-NEXT:    [[TMP98:%.*]] = extractelement <16 x i8> [[IN]], i64 1
+; OPT-NEXT:    [[TMP99:%.*]] = insertelement <128 x i8> [[TMP97]], i8 [[TMP98]], i32 1
+; OPT-NEXT:    [[TMP100:%.*]] = extractelement <16 x i8> [[IN]], i64 2
+; OPT-NEXT:    [[TMP101:%.*]] = insertelement <128 x i8> [[TMP99]], i8 [[TMP100]], i32 2
+; OPT-NEXT:    [[TMP102:%.*]] = extractelement <16 x i8> [[IN]], i64 3
+; OPT-NEXT:    [[TMP103:%.*]] = insertelement <128 x i8> [[TMP101]], i8 [[TMP102]], i32 3
+; OPT-NEXT:    [[TMP104:%.*]] = extractelement <16 x i8> [[IN]], i64 4
+; OPT-NEXT:    [[TMP105:%.*]] = insertelement <128 x i8> [[TMP103]], i8 [[TMP104]], i32 4
+; OPT-NEXT:    [[TMP106:%.*]] = extractelement <16 x i8> [[IN]], i64 5
+; OPT-NEXT:    [[TMP107:%.*]] = insertelement <128 x i8> [[TMP105]], i8 [[TMP106]], i32 5
+; OPT-NEXT:    [[TMP108:%.*]] = extractelement <16 x i8> [[IN]], i64 6
+; OPT-NEXT:    [[TMP109:%.*]] = insertelement <128 x i8> [[TMP107]], i8 [[TMP108]], i32 6
+; OPT-NEXT:    [[TMP110:%.*]] = extractelement <16 x i8> [[IN]], i64 7
+; OPT-NEXT:    [[TMP111:%.*]] = insertelement <128 x i8> [[TMP109]], i8 [[TMP110]], i32 7
+; OPT-NEXT:    [[TMP112:%.*]] = extractelement <16 x i8> [[IN]], i64 8
+; OPT-NEXT:    [[TMP113:%.*]] = insertelement <128 x i8> [[TMP111]], i8 [[TMP112]], i32 8
+; OPT-NEXT:    [[TMP114:%.*]] = extractelement <16 x i8> [[IN]], i64 9
+; OPT-NEXT:    [[TMP115:%.*]] = insertelement <128 x i8> [[TMP113]], i8 [[TMP114]], i32 9
+; OPT-NEXT:    [[TMP116:%.*]] = extractelement <16 x i8> [[IN]], i64 10
+; OPT-NEXT:    [[TMP117:%.*]] = insertelement <128 x i8> [[TMP115]], i8 [[TMP116]], i32 10
+; OPT-NEXT:    [[TMP118:%.*]] = extractelement <16 x i8> [[IN]], i64 11
+; OPT-NEXT:    [[TMP119:%.*]] = insertelement <128 x i8> [[TMP117]], i8 [[TMP118]], i32 11
+; OPT-NEXT:    [[TMP120:%.*]] = extractelement <16 x i8> [[IN]], i64 12
+; OPT-NEXT:    [[TMP121:%.*]] = insertelement <128 x i8> [[TMP119]], i8 [[TMP120]], i32 12
+; OPT-NEXT:    [[TMP122:%.*]] = extractelement <16 x i8> [[IN]], i64 13
+; OPT-NEXT:    [[TMP123:%.*]] = insertelement <128 x i8> [[TMP121]], i8 [[TMP122]], i32 13
+; OPT-NEXT:    [[TMP124:%.*]] = extractelement <16 x i8> [[IN]], i64 14
+; OPT-NEXT:    [[TMP125:%.*]] = insertelement <128 x i8> [[TMP123]], i8 [[TMP124]], i32 14
+; OPT-NEXT:    [[TMP126:%.*]] = extractelement <16 x i8> [[IN]], i64 15
+; OPT-NEXT:    [[TMP127:%.*]] = insertelement <128 x i8> [[TMP125]], i8 [[TMP126]], i32 15
+; OPT-NEXT:    [[TMP128:%.*]] = extractelement <16 x i8> [[IN]], i64 0
+; OPT-NEXT:    [[TMP129:%.*]] = insertelement <128 x i8> [[TMP127]], i8 [[TMP128]], i32 0
+; OPT-NEXT:    [[TMP130:%.*]] = extractelement <16 x i8> [[IN]], i64 1
+; OPT-NEXT:    [[TMP131:%.*]] = insertelement <128 x i8> [[TMP129]], i8 [[TMP130]], i32 1
+; OPT-NEXT:    [[TMP132:%.*]] = extractelement <16 x i8> [[IN]], i64 2
+; OPT-NEXT:    [[TMP133:%.*]] = insertelement <128 x i8> [[TMP131]], i8 [[TMP132]], i32 2
+; OPT-NEXT:    [[TMP134:%.*]] = extractelement <16 x i8> [[IN]], i64 3
+; OPT-NEXT:    [[TMP135:%.*]] = insertelement <128 x i8> [[TMP133]], i8 [[TMP134]], i32 3
+; OPT-NEXT:    [[TMP136:%.*]] = extractelement <16 x i8> [[IN]], i64 4
+; OPT-NEXT:    [[TMP137:%.*]] = insertelement <128 x i8> [[TMP135]], i8 [[TMP136]], i32 4
+; OPT-NEXT:    [[TMP138:%.*]] = extractelement <16 x i8> [[IN]], i64 5
+; OPT-NEXT:    [[TMP139:%.*]] = insertelement <128 x i8> [[TMP137]], i8 [[TMP138]], i32 5
+; OPT-NEXT:    [[TMP140:%.*]] = extractelement <16 x i8> [[IN]], i64 6
+; OPT-NEXT:    [[TMP141:%.*]] = insertelement <128 x i8> [[TMP139]], i8 [[TMP140]], i32 6
+; OPT-NEXT:    [[TMP142:%.*]] = extractelement <16 x i8> [[IN]], i64 7
+; OPT-NEXT:    [[TMP143:%.*]] = insertelement <128 x i8> [[TMP141]], i8 [[TMP142]], i32 7
+; OPT-NEXT:    [[TMP144:%.*]] = extractelement <16 x i8> [[IN]], i64 8
+; OPT-NEXT:    [[TMP145:%.*]] = insertelement <128 x i8> [[TMP143]], i8 [[TMP144]], i32 8
+; OPT-NEXT:    [[TMP146:%.*]] = extractelement <16 x i8> [[IN]], i64 9
+; OPT-NEXT:    [[TMP147:%.*]] = insertelement <128 x i8> [[TMP145]], i8 [[TMP146]], i32 9
+; OPT-NEXT:    [[TMP148:%.*]] = extractelement <16 x i8> [[IN]], i64 10
+; OPT-NEXT:    [[TMP149:%.*]] = insertelement <128 x i8> [[TMP147]], i8 [[TMP148]], i32 10
+; OPT-NEXT:    [[TMP150:%.*]] = extractelement <16 x i8> [[IN]], i64 11
+; OPT-NEXT:    [[TMP151:%.*]] = insertelement <128 x i8> [[TMP149]], i8 [[TMP150]], i32 11
+; OPT-NEXT:    [[TMP152:%.*]] = extractelement <16 x i8> [[IN]], i64 12
+; OPT-NEXT:    [[TMP153:%.*]] = insertelement <128 x i8> [[TMP151]], i8 [[TMP152]], i32 12
+; OPT-NEXT:    [[TMP154:%.*]] = extractelement <16 x i8> [[IN]], i64 13
+; OPT-NEXT:    [[TMP155:%.*]] = insertelement <128 x i8> [[TMP153]], i8 [[TMP154]], i32 13
+; OPT-NEXT:    [[TMP156:%.*]] = extractelement <16 x i8> [[IN]], i64 14
+; OPT-NEXT:    [[TMP157:%.*]] = insertelement <128 x i8> [[TMP155]], i8 [[TMP156]], i32 14
+; OPT-NEXT:    [[TMP158:%.*]] = extractelement <16 x i8> [[IN]], i64 15
+; OPT-NEXT:    [[TMP159:%.*]] = insertelement <128 x i8> [[TMP157]], i8 [[TMP158]], i32 15
+; OPT-NEXT:    [[TMP160:%.*]] = extractelement <16 x i8> [[IN]], i64 0
+; OPT-NEXT:    [[TMP161:%.*]] = insertelement <128 x i8> [[TMP159]], i8 [[TMP160]], i32 0
+; OPT-NEXT:    [[TMP162:%.*]] = extractelement <16 x i8> [[IN]], i64 1
+; OPT-NEXT:    [[TMP163:%.*]] = insertelement <128 x i8> [[TMP161]], i8 [[TMP162]], i32 1
+; OPT-NEXT:    [[TMP164:%.*]] = extractelement <16 x i8> [[IN]], i64 2
+; OPT-NEXT:    [[TMP165:%.*]] = insertelement <128 x i8> [[TMP163]], i8 [[TMP164]], i32 2
+; OPT-NEXT:    [[TMP166:%.*]] = extractelement <16 x i8> [[IN]], i64 3
+; OPT-NEXT:    [[TMP167:%.*]] = insertelement <128 x i8> [[TMP165]], i8 [[TMP166]], i32 3
+; OPT-NEXT:    [[TMP168:%.*]] = extractelement <16 x i8> [[IN]], i64 4
+; OPT-NEXT:    [[TMP169:%.*]] = insertelement <128 x i8> [[TMP167]], i8 [[TMP168]], i32 4
+; OPT-NEXT:    [[TMP170:%.*]] = extractelement <16 x i8> [[IN]], i64 5
+; OPT-NEXT:    [[TMP171:%.*]] = insertelement <128 x i8> [[TMP169]], i8 [[TMP170]], i32 5
+; OPT-NEXT:    [[TMP172:%.*]] = extractelement <16 x i8> [[IN]], i64 6
+; OPT-NEXT:    [[TMP173:%.*]] = insertelement <128 x i8> [[TMP171]], i8 [[TMP172]], i32 6
+; OPT-NEXT:    [[TMP174:%.*]] = extractelement <16 x i8> [[IN]], i64 7
+; OPT-NEXT:    [[TMP175:%.*]] = insertelement <128 x i8> [[TMP173]], i8 [[TMP174]], i32 7
+; OPT-NEXT:    [[TMP176:%.*]] = extractelement <16 x i8> [[IN]], i64 8
+; OPT-NEXT:    [[TMP177:%.*]] = insertelement <128 x i8> [[TMP175]], i8 [[TMP176]], i32 8
+; OPT-NEXT:    [[TMP178:%.*]] = extractelement <16 x i8> [[IN]], i64 9
+; OPT-NEXT:    [[TMP179:%.*]] = insertelement <128 x i8> [[TMP177]], i8 [[TMP178]], i32 9
+; OPT-NEXT:    [[TMP180:%.*]] = extractelement <16 x i8> [[IN]], i64 10
+; OPT-NEXT:    [[TMP181:%.*]] = insertelement <128 x i8> [[TMP179]], i8 [[TMP180]], i32 10
+; OPT-NEXT:    [[TMP182:%.*]] = extractelement <16 x i8> [[IN]], i64 11
+; OPT-NEXT:    [[TMP183:%.*]] = insertelement <128 x i8> [[TMP181]], i8 [[TMP182]], i32 11
+; OPT-NEXT:    [[TMP184:%.*]] = extractelement <16 x i8> [[IN]], i64 12
+; OPT-NEXT:    [[TMP185:%.*]] = insertelement <128 x i8> [[TMP183]], i8 [[TMP184]], i32 12
+; OPT-NEXT:    [[TMP186:%.*]] = extractelement <16 x i8> [[IN]], i64 13
+; OPT-NEXT:    [[TMP187:%.*]] = insertelement <128 x i8> [[TMP185]], i8 [[TMP186]], i32 13
+; OPT-NEXT:    [[TMP188:%.*]] = extractelement <16 x i8> [[IN]], i64 14
+; OPT-NEXT:    [[TMP189:%.*]] = insertelement <128 x i8> [[TMP187]], i8 [[TMP188]], i32 14
+; OPT-NEXT:    [[TMP190:%.*]] = extractelement <16 x i8> [[IN]], i64 15
+; OPT-NEXT:    [[TMP191:%.*]] = insertelement <128 x i8> [[TMP189]], i8 [[TMP190]], i32 15
+; OPT-NEXT:    [[TMP192:%.*]] = extractelement <16 x i8> [[IN]], i64 0
+; OPT-NEXT:    [[TMP193:%.*]] = insertelement <128 x i8> [[TMP191]], i8 [[TMP192]], i32 0
+; OPT-NEXT:    [[TMP194:%.*]] = extractelement <16 x i8> [[IN]], i64 1
+; OPT-NEXT:    [[TMP195:%.*]] = insertelement <128 x i8> [[TMP193]], i8 [[TMP194]], i32 1
+; OPT-NEXT:    [[TMP196:%.*]] = extractelement <16 x i8> [[IN]], i64 2
+; OPT-NEXT:    [[TMP197:%.*]] = insertelement <128 x i8> [[TMP195]], i8 [[TMP196]], i32 2
+; OPT-NEXT:    [[TMP198:%.*]] = extractelement <16 x i8> [[IN]], i64 3
+; OPT-NEXT:    [[TMP199:%.*]] = insertelement <128 x i8> [[TMP197]], i8 [[TMP198]], i32 3
+; OPT-NEXT:    [[TMP200:%.*]] = extractelement <16 x i8> [[IN]], i64 4
+; OPT-NEXT:    [[TMP201:%.*]] = insertelement <128 x i8> [[TMP199]], i8 [[TMP200]], i32 4
+; OPT-NEXT:    [[TMP202:%.*]] = extractelement <16 x i8> [[IN]], i64 5
+; OPT-NEXT:    [[TMP203:%.*]] = insertelement <128 x i8> [[TMP201]], i8 [[TMP202]], i32 5
+; OPT-NEXT:    [[TMP204:%.*]] = extractelement <16 x i8> [[IN]], i64 6
+; OPT-NEXT:    [[TMP205:%.*]] = insertelement <128 x i8> [[TMP203]], i8 [[TMP204]], i32 6
+; OPT-NEXT:    [[TMP206:%.*]] = extractelement <16 x i8> [[IN]], i64 7
+; OPT-NEXT:    [[TMP207:%.*]] = insertelement <128 x i8> [[TMP205]], i8 [[TMP206]], i32 7
+; OPT-NEXT:    [[TMP208:%.*]] = extractelement <16 x i8> [[IN]], i64 8
+; OPT-NEXT:    [[TMP209:%.*]] = insertelement <128 x i8> [[TMP207]], i8 [[TMP208]], i32 8
+; OPT-NEXT:    [[TMP210:%.*]] = extractelement <16 x i8> [[IN]], i64 9
+; OPT-NEXT:    [[TMP211:%.*]] = insertelement <128 x i8> [[TMP209]], i8 [[TMP210]], i32 9
+; OPT-NEXT:    [[TMP212:%.*]] = extractelement <16 x i8> [[IN]], i64 10
+; OPT-NEXT:    [[TMP213:%.*]] = insertelement <128 x i8> [[TMP211]], i8 [[TMP212]], i32 10
+; OPT-NEXT:    [[TMP214:%.*]] = extractelement <16 x i8> [[IN]], i64 11
+; OPT-NEXT:    [[TMP215:%.*]] = insertelement <128 x i8> [[TMP213]], i8 [[TMP214]], i32 11
+; OPT-NEXT:    [[TMP216:%.*]] = extractelement <16 x i8> [[IN]], i64 12
+; OPT-NEXT:    [[TMP217:%.*]] = insertelement <128 x i8> [[TMP215]], i8 [[TMP216]], i32 12
+; OPT-NEXT:    [[TMP218:%.*]] = extractelement <16 x i8> [[IN]], i64 13
+; OPT-NEXT:    [[TMP219:%.*]] = insertelement <128 x i8> [[TMP217]], i8 [[TMP218]], i32 13
+; OPT-NEXT:    [[TMP220:%.*]] = extractelement <16 x i8> [[IN]], i64 14
+; OPT-NEXT:    [[TMP221:%.*]] = insertelement <128 x i8> [[TMP219]], i8 [[TMP220]], i32 14
+; OPT-NEXT:    [[TMP222:%.*]] = extractelement <16 x i8> [[IN]], i64 15
+; OPT-NEXT:    [[TMP223:%.*]] = insertelement <128 x i8> [[TMP221]], i8 [[TMP222]], i32 15
+; OPT-NEXT:    [[TMP224:%.*]] = extractelement <16 x i8> [[IN]], i64 0
+; OPT-NEXT:    [[TMP225:%.*]] = insertelement <128 x i8> [[TMP223]], i8 [[TMP224]], i32 0
+; OPT-NEXT:    [[TMP226:%.*]] = extractelement <16 x i8> [[IN]], i64 1
+; OPT-NEXT:    [[TMP227:%.*]] = insertelement <128 x i8> [[TMP225]], i8 [[TMP226]], i32 1
+; OPT-NEXT:    [[TMP228:%.*]] = extractelement <16 x i8> [[IN]], i64 2
+; OPT-NEXT:    [[TMP229:%.*]] = insertelement <128 x i8> [[TMP227]], i8 [[TMP228]], i32 2
+; OPT-NEXT:    [[TMP230:%.*]] = extractelement <16 x i8> [[IN]], i64 3
+; OPT-NEXT:    [[TMP231:%.*]] = insertelement <128 x i8> [[TMP229]], i8 [[TMP230]], i32 3
+; OPT-NEXT:    [[TMP232:%.*]] = extractelement <16 x i8> [[IN]], i64 4
+; OPT-NEXT:    [[TMP233:%.*]] = insertelement <128 x i8> [[TMP231]], i8 [[TMP232]], i32 4
+; OPT-NEXT:    [[TMP234:%.*]] = extractelement <16 x i8> [[IN]], i64 5
+; OPT-NEXT:    [[TMP235:%.*]] = insertelement <128 x i8> [[TMP233]], i8 [[TMP234]], i32 5
+; OPT-NEXT:    [[TMP236:%.*]] = extractelement <16 x i8> [[IN]], i64 6
+; OPT-NEXT:    [[TMP237:%.*]] = insertelement <128 x i8> [[TMP235]], i8 [[TMP236]], i32 6
+; OPT-NEXT:    [[TMP238:%.*]] = extractelement <16 x i8> [[IN]], i64 7
+; OPT-NEXT:    [[TMP239:%.*]] = insertelement <128 x i8> [[TMP237]], i8 [[TMP238]], i32 7
+; OPT-NEXT:    [[TMP240:%.*]] = extractelement <16 x i8> [[IN]], i64 8
+; OPT-NEXT:    [[TMP241:%.*]] = insertelement <128 x i8> [[TMP239]], i8 [[TMP240]], i32 8
+; OPT-NEXT:    [[TMP242:%.*]] = extractelement <16 x i8> [[IN]], i64 9
+; OPT-NEXT:    [[TMP243:%.*]] = insertelement <128 x i8> [[TMP241]], i8 [[TMP242]], i32 9
+; OPT-NEXT:    [[TMP244:%.*]] = extractelement <16 x i8> [[IN]], i64 10
+; OPT-NEXT:    [[TMP245:%.*]] = insertelement <128 x i8> [[TMP243]], i8 [[TMP244]], i32 10
+; OPT-NEXT:    [[TMP246:%.*]] = extractelement <16 x i8> [[IN]], i64 11
+; OPT-NEXT:    [[TMP247:%.*]] = insertelement <128 x i8> [[TMP245]], i8 [[TMP246]], i32 11
+; OPT-NEXT:    [[TMP248:%.*]] = extractelement <16 x i8> [[IN]], i64 12
+; OPT-NEXT:    [[TMP249:%.*]] = insertelement <128 x i8> [[TMP247]], i8 [[TMP248]], i32 12
+; OPT-NEXT:    [[TMP250:%.*]] = extractelement <16 x i8> [[IN]], i64 13
+; OPT-NEXT:    [[TMP251:%.*]] = insertelement <128 x i8> [[TMP249]], i8 [[TMP250]], i32 13
+; OPT-NEXT:    [[TMP252:%.*]] = extractelement <16 x i8> [[IN]], i64 14
+; OPT-NEXT:    [[TMP253:%.*]] = insertelement <128 x i8> [[TMP251]], i8 [[TMP252]], i32 14
+; OPT-NEXT:    [[TMP254:%.*]] = extractelement <16 x i8> [[IN]], i64 15
+; OPT-NEXT:    [[TMP255:%.*]] = insertelement <128 x i8> [[TMP253]], i8 [[TMP254]], i32 15
+; OPT-NEXT:    [[TMP256:%.*]] = extractelement <128 x i8> [[TMP255]], i32 80
+; OPT-NEXT:    [[TMP257:%.*]] = insertelement <16 x i8> poison, i8 [[TMP256]], i64 0
+; OPT-NEXT:    [[TMP258:%.*]] = extractelement <128 x i8> [[TMP255]], i32 81
+; OPT-NEXT:    [[TMP259:%.*]] = insertelement <16 x i8> [[TMP257]], i8 [[TMP258]], i64 1
+; OPT-NEXT:    [[TMP260:%.*]] = extractelement <128 x i8> [[TMP255]], i32 82
+; OPT-NEXT:    [[TMP261:%.*]] = insertelement <16 x i8> [[TMP259]], i8 [[TMP260]], i64 2
+; OPT-NEXT:    [[TMP262:%.*]] = extractelement <128 x i8> [[TMP255]], i32 83
+; OPT-NEXT:    [[TMP263:%.*]] = insertelement <16 x i8> [[TMP261]], i8 [[TMP262]], i64 3
+; OPT-NEXT:    [[TMP264:%.*]] = extractelement <128 x i8> [[TMP255]], i32 84
+; OPT-NEXT:    [[TMP265:%.*]] = insertelement <16 x i8> [[TMP263]], i8 [[TMP264]], i64 4
+; OPT-NEXT:    [[TMP266:%.*]] = extractelement <128 x i8> [[TMP255]], i32 85
+; OPT-NEXT:    [[TMP267:%.*]] = insertelement <16 x i8> [[TMP265]], i8 [[TMP266]], i64 5
+; OPT-NEXT:    [[TMP268:%.*]] = extractelement <128 x i8> [[TMP255]], i32 86
+; OPT-NEXT:    [[TMP269:%.*]] = insertelement <16 x i8> [[TMP267]], i8 [[TMP268]], i64 6
+; OPT-NEXT:    [[TMP270:%.*]] = extractelement <128 x i8> [[TMP255]], i32 87
+; OPT-NEXT:    [[TMP271:%.*]] = insertelement <16 x i8> [[TMP269]], i8 [[TMP270]], i64 7
+; OPT-NEXT:    [[TMP272:%.*]] = extractelement <128 x i8> [[TMP255]], i32 88
+; OPT-NEXT:    [[TMP273:%.*]] = insertelement <16 x i8> [[TMP271]], i8 [[TMP272]], i64 8
+; OPT-NEXT:    [[TMP274:%.*]] = extractelement <128 x i8> [[TMP255]], i32 89
+; OPT-NEXT:    [[TMP275:%.*]] = insertelement <16 x i8> [[TMP273]], i8 [[TMP274]], i64 9
+; OPT-NEXT:    [[TMP276:%.*]] = extractelement <128 x i8> [[TMP255]], i32 90
+; OPT-NEXT:    [[TMP277:%.*]] = insertelement <16 x i8> [[TMP275]], i8 [[TMP276]], i64 10
+; OPT-NEXT:    [[TMP278:%.*]] = extractelement <128 x i8> [[TMP255]], i32 91
+; OPT-NEXT:    [[TMP279:%.*]] = insertelement <16 x i8> [[TMP277]], i8 [[TMP278]], i64 11
+; OPT-NEXT:    [[TMP280:%.*]] = extractelement <128 x i8> [[TMP255]], i32 92
+; OPT-NEXT:    [[TMP281:%.*]] = insertelement <16 x i8> [[TMP279]], i8 [[TMP280]], i64 12
+; OPT-NEXT:    [[TMP282:%.*]] = extractelement <128 x i8> [[TMP255]], i32 93
+; OPT-NEXT:    [[TMP283:%.*]] = insertelement <16 x i8> [[TMP281]], i8 [[TMP282]], i64 13
+; OPT-NEXT:    [[TMP284:%.*]] = extractelement <128 x i8> [[TMP255]], i32 94
+; OPT-NEXT:    [[TMP285:%.*]] = insertelement <16 x i8> [[TMP283]], i8 [[TMP284]], i64 14
+; OPT-NEXT:    [[TMP286:%.*]] = extractelement <128 x i8> [[TMP255]], i32 95
+; OPT-NEXT:    [[TMP287:%.*]] = insertelement <16 x i8> [[TMP285]], i8 [[TMP286]], i64 15
+; OPT-NEXT:    [[SUM:%.*]] = add <16 x i8> [[TMP287]], [[ADD]]
+; OPT-NEXT:    store <16 x i8> [[SUM]], ptr addrspace(3) [[OUT]], align 16
+; OPT-NEXT:    ret void
+;
+entry:
+  %alloca = alloca [8 x <16 x i8>], align 16, addrspace(5)
+  %gep0 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 0
+  store <16 x i8> %in, ptr addrspace(5) %gep0, align 16
+  %gep1 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 1
+  store <16 x i8> %in, ptr addrspace(5) %gep0, align 16
+  %gep2 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 2
+  store <16 x i8> %in, ptr addrspace(5) %gep0, align 16
+  %gep3 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 3
+  store <16 x i8> %in, ptr addrspace(5) %gep0, align 16
+  %gep4 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 4
+  store <16 x i8> %in, ptr addrspace(5) %gep0, align 16
+  %gep5 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 5
+  store <16 x i8> %in, ptr addrspace(5) %gep0, align 16
+  %gep6 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 6
+  store <16 x i8> %in, ptr addrspace(5) %gep0, align 16
+  %gep7 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 7
+  store <16 x i8> %in, ptr addrspace(5) %gep0, align 16
+  %load = load <16 x i8>, ptr addrspace(5) %gep5, align 16
+  %sum = add <16 x i8> %load, %add
+  store <16 x i8> %sum, ptr addrspace(3) %out, align 16
+  ret void
+}
+
+attributes #0 = {"amdgpu-waves-per-eu"="2,2"}
diff --git a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll
index f67cbe381bfad..ddb522a82880b 100644
--- a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll
+++ b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll
@@ -1,17 +1,17 @@
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s
 ; -global-isel=1 SI run line skipped since store not yet implemented.
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=MEMTIME -check-prefix=GCN %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=MEMTIME -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=MEMTIME -check-prefix=GCN %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=MEMTIME -check-prefix=GCN %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=MEMTIME -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=MEMTIME -check-prefix=GCN %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GETREG,GETREG-SDAG -check-prefix=GCN %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GETREG,GETREG-GISEL -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GETREG,GETREG-GISEL -check-prefix=GCN %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250 %s
 
 declare i64 @llvm.readcyclecounter() #0
 
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
index c5732531f5423..48ed5c4dedfb2 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
@@ -73,10 +73,10 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
 }
 
 ; CHECK-LABEL: {{^}}excess_soft_clause_reg_pressure:
-; GFX908:    NumSgprs: 64
-; GFX908-GCNTRACKERS:    NumSgprs: 64
+; GFX908:    NumSgprs: 56
+; GFX908-GCNTRACKERS:    NumSgprs: 56
 ; GFX908:    NumVgprs: 43
-; GFX908-GCNTRACKERS:    NumVgprs: 39
+; GFX908-GCNTRACKERS:    NumVgprs: 40
 ; GFX908:    Occupancy: 5
 ; GFX908-GCNTRACKERS:    Occupancy: 6
 
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll
index 586579fcaeb93..ef96944abef0e 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll
@@ -20,38 +20,33 @@ define void @test() {
 ; CHECK-NEXT:    ; in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:  .LBB0_3: ; %bb.3
 ; CHECK-NEXT:    ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    ; implicit-def: $sgpr4
-; CHECK-NEXT:    v_mov_b32_e32 v0, s4
-; CHECK-NEXT:    v_readfirstlane_b32 s6, v0
 ; CHECK-NEXT:    s_mov_b64 s[4:5], -1
-; CHECK-NEXT:    s_mov_b32 s7, 0
-; CHECK-NEXT:    s_cmp_eq_u32 s6, s7
 ; CHECK-NEXT:    ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
 ; CHECK-NEXT:    v_writelane_b32 v1, s4, 0
 ; CHECK-NEXT:    v_writelane_b32 v1, s5, 1
-; CHECK-NEXT:    s_mov_b64 s[10:11], exec
-; CHECK-NEXT:    s_mov_b64 exec, -1
+; CHECK-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    v_accvgpr_write_b32 a0, v1 ; Reload Reuse
-; CHECK-NEXT:    s_mov_b64 exec, s[10:11]
+; CHECK-NEXT:    s_mov_b64 exec, s[8:9]
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB0_5
 ; CHECK-NEXT:  ; %bb.4: ; %bb.4
 ; CHECK-NEXT:    ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    s_or_saveexec_b64 s[10:11], -1
+; CHECK-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; CHECK-NEXT:    v_accvgpr_read_b32 v1, a0 ; Reload Reuse
-; CHECK-NEXT:    s_mov_b64 exec, s[10:11]
+; CHECK-NEXT:    s_mov_b64 exec, s[8:9]
 ; CHECK-NEXT:    s_mov_b64 s[4:5], 0
 ; CHECK-NEXT:    v_writelane_b32 v1, s4, 0
 ; CHECK-NEXT:    v_writelane_b32 v1, s5, 1
-; CHECK-NEXT:    s_or_saveexec_b64 s[10:11], -1
+; CHECK-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    v_accvgpr_write_b32 a0, v1 ; Reload Reuse
-; CHECK-NEXT:    s_mov_b64 exec, s[10:11]
+; CHECK-NEXT:    s_mov_b64 exec, s[8:9]
 ; CHECK-NEXT:  .LBB0_5: ; %Flow
 ; CHECK-NEXT:    ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    s_or_saveexec_b64 s[10:11], -1
+; CHECK-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    v_accvgpr_read_b32 v1, a0 ; Reload Reuse
-; CHECK-NEXT:    s_mov_b64 exec, s[10:11]
+; CHECK-NEXT:    s_mov_b64 exec, s[8:9]
 ; CHECK-NEXT:    v_readlane_b32 s4, v1, 0
 ; CHECK-NEXT:    v_readlane_b32 s5, v1, 1
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/spill_kill_v16.mir b/llvm/test/CodeGen/AMDGPU/spill_kill_v16.mir
index 0c694d9f49e18..69895833efccb 100644
--- a/llvm/test/CodeGen/AMDGPU/spill_kill_v16.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill_kill_v16.mir
@@ -1,5 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -march=amdgcn -verify-machineinstrs -mcpu=gfx1100 -mattr=+real-true16 -run-pass=prologepilog -o - %s | FileCheck -check-prefix=EXPANDED %s
+# RUN: llc -march=amdgcn -verify-machineinstrs -mcpu=gfx1250 -mattr=+real-true16 -run-pass=prologepilog -o - %s | FileCheck -check-prefix=SRAMECC-EXPANDED %s
 
 ---
 name: spill_restore_vgpr16
@@ -31,6 +32,28 @@ body: |
   ; EXPANDED-NEXT:   $vgpr0_lo16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.0, align 4, addrspace 5)
   ; EXPANDED-NEXT:   $vgpr0_hi16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.1, align 4, addrspace 5)
   ; EXPANDED-NEXT:   S_NOP 0, implicit killed renamable $vgpr0_lo16, implicit killed renamable $vgpr0_hi16
+  ;
+  ; SRAMECC-EXPANDED-LABEL: name: spill_restore_vgpr16
+  ; SRAMECC-EXPANDED: bb.0:
+  ; SRAMECC-EXPANDED-NEXT:   successors: %bb.1(0x80000000)
+  ; SRAMECC-EXPANDED-NEXT: {{  $}}
+  ; SRAMECC-EXPANDED-NEXT:   S_NOP 0, implicit-def renamable $vgpr0_lo16, implicit-def renamable $vgpr0_hi16
+  ; SRAMECC-EXPANDED-NEXT:   SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_hi16, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.1, align 4, addrspace 5)
+  ; SRAMECC-EXPANDED-NEXT:   S_NOP 0, implicit renamable $vgpr0_lo16
+  ; SRAMECC-EXPANDED-NEXT:   SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_lo16, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.0, align 4, addrspace 5)
+  ; SRAMECC-EXPANDED-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; SRAMECC-EXPANDED-NEXT: {{  $}}
+  ; SRAMECC-EXPANDED-NEXT: bb.1:
+  ; SRAMECC-EXPANDED-NEXT:   successors: %bb.2(0x80000000)
+  ; SRAMECC-EXPANDED-NEXT: {{  $}}
+  ; SRAMECC-EXPANDED-NEXT:   S_NOP 1
+  ; SRAMECC-EXPANDED-NEXT: {{  $}}
+  ; SRAMECC-EXPANDED-NEXT: bb.2:
+  ; SRAMECC-EXPANDED-NEXT:   $vgpr1 = SCRATCH_LOAD_USHORT_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.0, align 4, addrspace 5)
+  ; SRAMECC-EXPANDED-NEXT:   $vgpr0_lo16 = V_MOV_B16_t16_e64 0, killed $vgpr1_lo16, 0, implicit $exec
+  ; SRAMECC-EXPANDED-NEXT:   $vgpr1 = SCRATCH_LOAD_USHORT_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.1, align 4, addrspace 5)
+  ; SRAMECC-EXPANDED-NEXT:   $vgpr0_hi16 = V_MOV_B16_t16_e64 0, killed $vgpr1_lo16, 0, implicit $exec
+  ; SRAMECC-EXPANDED-NEXT:   S_NOP 0, implicit killed renamable $vgpr0_lo16, implicit killed renamable $vgpr0_hi16
    bb.0:
      successors: %bb.1(0x80000000)
      S_NOP 0, implicit-def renamable $vgpr0_lo16, implicit-def renamable $vgpr0_hi16
@@ -78,6 +101,29 @@ body: |
   ; EXPANDED-NEXT:   $vgpr0_lo16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.0, align 4, addrspace 5)
   ; EXPANDED-NEXT:   $vgpr0_hi16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.1, align 4, addrspace 5)
   ; EXPANDED-NEXT:   S_NOP 0, implicit killed renamable $vgpr0_lo16, implicit killed renamable $vgpr0_hi16
+  ;
+  ; SRAMECC-EXPANDED-LABEL: name: spill_restore_vgpr16_middle_of_block
+  ; SRAMECC-EXPANDED: bb.0:
+  ; SRAMECC-EXPANDED-NEXT:   successors: %bb.1(0x80000000)
+  ; SRAMECC-EXPANDED-NEXT: {{  $}}
+  ; SRAMECC-EXPANDED-NEXT:   S_NOP 0, implicit-def renamable $vgpr0_lo16, implicit-def renamable $vgpr0_hi16
+  ; SRAMECC-EXPANDED-NEXT:   SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_hi16, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.1, align 4, addrspace 5)
+  ; SRAMECC-EXPANDED-NEXT:   S_NOP 0, implicit renamable $vgpr0_lo16
+  ; SRAMECC-EXPANDED-NEXT:   SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_lo16, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.0, align 4, addrspace 5)
+  ; SRAMECC-EXPANDED-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; SRAMECC-EXPANDED-NEXT: {{  $}}
+  ; SRAMECC-EXPANDED-NEXT: bb.1:
+  ; SRAMECC-EXPANDED-NEXT:   successors: %bb.2(0x80000000)
+  ; SRAMECC-EXPANDED-NEXT: {{  $}}
+  ; SRAMECC-EXPANDED-NEXT:   S_NOP 1
+  ; SRAMECC-EXPANDED-NEXT: {{  $}}
+  ; SRAMECC-EXPANDED-NEXT: bb.2:
+  ; SRAMECC-EXPANDED-NEXT:   S_NOP 1
+  ; SRAMECC-EXPANDED-NEXT:   $vgpr1 = SCRATCH_LOAD_USHORT_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.0, align 4, addrspace 5)
+  ; SRAMECC-EXPANDED-NEXT:   $vgpr0_lo16 = V_MOV_B16_t16_e64 0, killed $vgpr1_lo16, 0, implicit $exec
+  ; SRAMECC-EXPANDED-NEXT:   $vgpr1 = SCRATCH_LOAD_USHORT_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.1, align 4, addrspace 5)
+  ; SRAMECC-EXPANDED-NEXT:   $vgpr0_hi16 = V_MOV_B16_t16_e64 0, killed $vgpr1_lo16, 0, implicit $exec
+  ; SRAMECC-EXPANDED-NEXT:   S_NOP 0, implicit killed renamable $vgpr0_lo16, implicit killed renamable $vgpr0_hi16
    bb.0:
      successors: %bb.1(0x80000000)
      S_NOP 0, implicit-def renamable $vgpr0_lo16, implicit-def renamable $vgpr0_hi16
@@ -124,6 +170,27 @@ body: |
   ; EXPANDED-NEXT: bb.2:
   ; EXPANDED-NEXT:   $vgpr0_lo16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.0, align 4, addrspace 5)
   ; EXPANDED-NEXT:   $vgpr0_hi16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.1, align 4, addrspace 5)
+  ;
+  ; SRAMECC-EXPANDED-LABEL: name: spill_restore_vgpr16_end_of_block
+  ; SRAMECC-EXPANDED: bb.0:
+  ; SRAMECC-EXPANDED-NEXT:   successors: %bb.1(0x80000000)
+  ; SRAMECC-EXPANDED-NEXT: {{  $}}
+  ; SRAMECC-EXPANDED-NEXT:   S_NOP 0, implicit-def renamable $vgpr0_lo16, implicit-def renamable $vgpr0_hi16
+  ; SRAMECC-EXPANDED-NEXT:   SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_hi16, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.1, align 4, addrspace 5)
+  ; SRAMECC-EXPANDED-NEXT:   S_NOP 0, implicit renamable $vgpr0_lo16
+  ; SRAMECC-EXPANDED-NEXT:   SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_lo16, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.0, align 4, addrspace 5)
+  ; SRAMECC-EXPANDED-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; SRAMECC-EXPANDED-NEXT: {{  $}}
+  ; SRAMECC-EXPANDED-NEXT: bb.1:
+  ; SRAMECC-EXPANDED-NEXT:   successors: %bb.2(0x80000000)
+  ; SRAMECC-EXPANDED-NEXT: {{  $}}
+  ; SRAMECC-EXPANDED-NEXT:   S_NOP 1
+  ; SRAMECC-EXPANDED-NEXT: {{  $}}
+  ; SRAMECC-EXPANDED-NEXT: bb.2:
+  ; SRAMECC-EXPANDED-NEXT:   $vgpr1 = SCRATCH_LOAD_USHORT_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.0, align 4, addrspace 5)
+  ; SRAMECC-EXPANDED-NEXT:   $vgpr0_lo16 = V_MOV_B16_t16_e64 0, killed $vgpr1_lo16, 0, implicit $exec
+  ; SRAMECC-EXPANDED-NEXT:   $vgpr1 = SCRATCH_LOAD_USHORT_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.1, align 4, addrspace 5)
+  ; SRAMECC-EXPANDED-NEXT:   $vgpr0_hi16 = V_MOV_B16_t16_e64 0, killed $vgpr1_lo16, 0, implicit $exec
    bb.0:
      successors: %bb.1(0x80000000)
      S_NOP 0, implicit-def renamable $vgpr0_lo16, implicit-def renamable $vgpr0_hi16
diff --git a/llvm/test/CodeGen/AMDGPU/spillv16.ll b/llvm/test/CodeGen/AMDGPU/spillv16.ll
index 0e45df223465d..2d54ac8283a3a 100644
--- a/llvm/test/CodeGen/AMDGPU/spillv16.ll
+++ b/llvm/test/CodeGen/AMDGPU/spillv16.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GCN,GCN-TRUE16
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GCN,GCN-FAKE16
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX1250,GFX1250-TRUE16
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX1250,GFX1250-FAKE16
 
 define void @spill_i16_alu() {
 ; GCN-TRUE16-LABEL: spill_i16_alu:
@@ -32,6 +34,41 @@ define void @spill_i16_alu() {
 ; GCN-FAKE16-NEXT:    scratch_store_b16 off, v0, s32 dlc
 ; GCN-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-TRUE16-LABEL: spill_i16_alu:
+; GFX1250-TRUE16:       ; %bb.0: ; %entry
+; GFX1250-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT:    scratch_load_u16 v0, off, s32 scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x7b, v0.l
+; GFX1250-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill
+; GFX1250-TRUE16-NEXT:    s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT:    ;;#ASMSTART
+; GFX1250-TRUE16-NEXT:    ;;#ASMEND
+; GFX1250-TRUE16-NEXT:    scratch_load_u16 v1, off, s32 offset:2 th:TH_LOAD_LU ; 2-byte Folded Reload
+; GFX1250-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v1.l
+; GFX1250-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX1250-TRUE16-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: spill_i16_alu:
+; GFX1250-FAKE16:       ; %bb.0: ; %entry
+; GFX1250-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT:    v_add_nc_u16 v0, 0x7b, v0
+; GFX1250-FAKE16-NEXT:    scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill
+; GFX1250-FAKE16-NEXT:    s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT:    ;;#ASMSTART
+; GFX1250-FAKE16-NEXT:    ;;#ASMEND
+; GFX1250-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload
+; GFX1250-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT:    scratch_store_b16 off, v0, s32 scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT:    s_wait_storecnt 0x0
+; GFX1250-FAKE16-NEXT:    s_set_pc_i64 s[30:31]
 entry:
   %alloca = alloca i16, i32 1, align 4, addrspace(5)
 
@@ -88,6 +125,51 @@ define void @spill_i16_alu_two_vals() {
 ; GCN-FAKE16-NEXT:    scratch_store_b16 off, v0, s32 offset:4 dlc
 ; GCN-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-TRUE16-LABEL: spill_i16_alu_two_vals:
+; GFX1250-TRUE16:       ; %bb.0: ; %entry
+; GFX1250-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT:    scratch_load_u16 v0, off, s32 scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x7b, v0.l
+; GFX1250-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:6 ; 2-byte Folded Spill
+; GFX1250-TRUE16-NEXT:    s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT:    ;;#ASMSTART
+; GFX1250-TRUE16-NEXT:    ;;#ASMEND
+; GFX1250-TRUE16-NEXT:    scratch_load_u16 v0, off, s32 offset:4 scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT:    scratch_load_u16 v1, off, s32 offset:6 th:TH_LOAD_LU ; 2-byte Folded Reload
+; GFX1250-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x7b, v0.l
+; GFX1250-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX1250-TRUE16-NEXT:    scratch_store_d16_hi_b16 off, v0, s32 scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX1250-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:4 scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX1250-TRUE16-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: spill_i16_alu_two_vals:
+; GFX1250-FAKE16:       ; %bb.0: ; %entry
+; GFX1250-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT:    v_add_nc_u16 v0, 0x7b, v0
+; GFX1250-FAKE16-NEXT:    scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
+; GFX1250-FAKE16-NEXT:    s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT:    ;;#ASMSTART
+; GFX1250-FAKE16-NEXT:    ;;#ASMEND
+; GFX1250-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:4 scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT:    scratch_load_b32 v1, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload
+; GFX1250-FAKE16-NEXT:    v_add_nc_u16 v0, 0x7b, v0
+; GFX1250-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT:    scratch_store_b16 off, v1, s32 scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT:    s_wait_storecnt 0x0
+; GFX1250-FAKE16-NEXT:    scratch_store_b16 off, v0, s32 offset:4 scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT:    s_wait_storecnt 0x0
+; GFX1250-FAKE16-NEXT:    s_set_pc_i64 s[30:31]
 entry:
   %alloca = alloca i16, i32 1, align 4, addrspace(5)
   %alloca2 = alloca i16, i32 1, align 4, addrspace(5)
@@ -140,6 +222,22 @@ define void @spill_i16() {
 ; GCN-FAKE16-NEXT:    scratch_store_b16 off, v0, s32 dlc
 ; GCN-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: spill_i16:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    scratch_load_u16 v0, off, s32 scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    ;;#ASMSTART
+; GFX1250-NEXT:    ;;#ASMEND
+; GFX1250-NEXT:    scratch_load_b32 v0, off, s32 offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    scratch_store_b16 off, v0, s32 scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
 entry:
   %alloca = alloca i16, i32 1, align 4, addrspace(5)
 
@@ -183,6 +281,22 @@ define void @spill_half() {
 ; GCN-FAKE16-NEXT:    scratch_store_b16 off, v0, s32 dlc
 ; GCN-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: spill_half:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    scratch_load_u16 v0, off, s32 scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    ;;#ASMSTART
+; GFX1250-NEXT:    ;;#ASMEND
+; GFX1250-NEXT:    scratch_load_b32 v0, off, s32 offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    scratch_store_b16 off, v0, s32 scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
 entry:
   %alloca = alloca half, i32 1, align 4, addrspace(5)
 
@@ -226,6 +340,22 @@ define void @spill_i16_from_v2i16() {
 ; GCN-FAKE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 dlc
 ; GCN-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: spill_i16_from_v2i16:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    scratch_load_u16 v0, off, s32 offset:2 scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    ;;#ASMSTART
+; GFX1250-NEXT:    ;;#ASMEND
+; GFX1250-NEXT:    scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
 entry:
   %alloca = alloca <2 x i16>, i32 2, align 1, addrspace(5)
 
@@ -283,6 +413,54 @@ define void @spill_2xi16_from_v2i16() {
 ; GCN-FAKE16-NEXT:    scratch_store_b16 off, v0, s32 dlc
 ; GCN-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-TRUE16-LABEL: spill_2xi16_from_v2i16:
+; GFX1250-TRUE16:       ; %bb.0: ; %entry
+; GFX1250-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT:    scratch_load_u16 v0, off, s32 offset:2 scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT:    s_clause 0x1
+; GFX1250-TRUE16-NEXT:    scratch_store_b32 off, v0, s32 offset:12
+; GFX1250-TRUE16-NEXT:    scratch_load_u16 v0, off, s32 scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT:    scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
+; GFX1250-TRUE16-NEXT:    s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT:    ;;#ASMSTART
+; GFX1250-TRUE16-NEXT:    ;;#ASMEND
+; GFX1250-TRUE16-NEXT:    scratch_load_b32 v0, off, s32 offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload
+; GFX1250-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX1250-TRUE16-NEXT:    scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload
+; GFX1250-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX1250-TRUE16-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: spill_2xi16_from_v2i16:
+; GFX1250-FAKE16:       ; %bb.0: ; %entry
+; GFX1250-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:2 scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT:    s_clause 0x1
+; GFX1250-FAKE16-NEXT:    scratch_store_b32 off, v0, s32 offset:8
+; GFX1250-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT:    scratch_store_b32 off, v0, s32 offset:12 ; 4-byte Folded Spill
+; GFX1250-FAKE16-NEXT:    s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT:    ;;#ASMSTART
+; GFX1250-FAKE16-NEXT:    ;;#ASMEND
+; GFX1250-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload
+; GFX1250-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT:    s_wait_storecnt 0x0
+; GFX1250-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload
+; GFX1250-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT:    scratch_store_b16 off, v0, s32 scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT:    s_wait_storecnt 0x0
+; GFX1250-FAKE16-NEXT:    s_set_pc_i64 s[30:31]
 entry:
   %alloca = alloca <2 x i16>, i32 2, align 1, addrspace(5)
 
@@ -341,6 +519,47 @@ define void @spill_2xi16_from_v2i16_one_free_reg() {
 ; GCN-FAKE16-NEXT:    scratch_store_b16 off, v0, s32 dlc
 ; GCN-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-TRUE16-LABEL: spill_2xi16_from_v2i16_one_free_reg:
+; GFX1250-TRUE16:       ; %bb.0: ; %entry
+; GFX1250-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT:    scratch_load_u16 v7, off, s32 offset:2 scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT:    scratch_load_u16 v0, off, s32 scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT:    scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
+; GFX1250-TRUE16-NEXT:    s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT:    ;;#ASMSTART
+; GFX1250-TRUE16-NEXT:    ;;#ASMEND
+; GFX1250-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v7.l
+; GFX1250-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX1250-TRUE16-NEXT:    scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload
+; GFX1250-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX1250-TRUE16-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: spill_2xi16_from_v2i16_one_free_reg:
+; GFX1250-FAKE16:       ; %bb.0: ; %entry
+; GFX1250-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT:    scratch_load_u16 v7, off, s32 offset:2 scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT:    scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
+; GFX1250-FAKE16-NEXT:    s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT:    ;;#ASMSTART
+; GFX1250-FAKE16-NEXT:    ;;#ASMEND
+; GFX1250-FAKE16-NEXT:    scratch_store_b16 off, v7, s32 offset:2 scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT:    s_wait_storecnt 0x0
+; GFX1250-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload
+; GFX1250-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT:    scratch_store_b16 off, v0, s32 scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT:    s_wait_storecnt 0x0
+; GFX1250-FAKE16-NEXT:    s_set_pc_i64 s[30:31]
 entry:
   %alloca = alloca <2 x i16>, i32 2, align 1, addrspace(5)
 
@@ -375,6 +594,22 @@ define void @spill_v2i16() {
 ; GCN-NEXT:    scratch_store_b32 off, v0, s32 offset:4 dlc
 ; GCN-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: spill_v2i16:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    scratch_load_b32 v0, off, s32 offset:4 scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    ;;#ASMSTART
+; GFX1250-NEXT:    ;;#ASMEND
+; GFX1250-NEXT:    scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s32 offset:4 scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
 entry:
   %alloca = alloca <2 x i16>, i32 2, align 1, addrspace(5)
 
diff --git a/llvm/test/CodeGen/AMDGPU/spillv16.mir b/llvm/test/CodeGen/AMDGPU/spillv16.mir
index 05569bf394c43..ba2d926eb8883 100644
--- a/llvm/test/CodeGen/AMDGPU/spillv16.mir
+++ b/llvm/test/CodeGen/AMDGPU/spillv16.mir
@@ -1,6 +1,7 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -march=amdgcn -verify-machineinstrs -mcpu=gfx1100 -mattr=+real-true16 -run-pass=regallocfast -o - %s | FileCheck -check-prefix=SPILLED %s
 # RUN: llc -march=amdgcn -verify-machineinstrs -mcpu=gfx1100 -mattr=+real-true16 -run-pass=regallocfast,prologepilog -o - %s | FileCheck -check-prefix=EXPANDED %s
+# RUN: llc -march=amdgcn -verify-machineinstrs -mcpu=gfx1250 -mattr=+real-true16 -run-pass=regallocfast,prologepilog -o - %s | FileCheck -check-prefix=SRAMECC-EXPANDED %s
 
 ---
 name: spill_restore_vgpr16
@@ -46,6 +47,27 @@ body: |
   ; EXPANDED-NEXT:   $vgpr0_lo16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.0, addrspace 5)
   ; EXPANDED-NEXT:   $vgpr0_hi16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 2, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.1, addrspace 5)
   ; EXPANDED-NEXT:   S_NOP 0, implicit killed renamable $vgpr0_lo16, implicit killed renamable $vgpr0_hi16
+  ;
+  ; SRAMECC-EXPANDED-LABEL: name: spill_restore_vgpr16
+  ; SRAMECC-EXPANDED: bb.0:
+  ; SRAMECC-EXPANDED-NEXT:   successors: %bb.1(0x80000000)
+  ; SRAMECC-EXPANDED-NEXT: {{  $}}
+  ; SRAMECC-EXPANDED-NEXT:   S_NOP 0, implicit-def renamable $vgpr0_lo16, implicit-def renamable $vgpr0_hi16
+  ; SRAMECC-EXPANDED-NEXT:   SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_hi16, $sgpr32, 2, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.1, addrspace 5)
+  ; SRAMECC-EXPANDED-NEXT:   SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_lo16, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.0, addrspace 5)
+  ; SRAMECC-EXPANDED-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; SRAMECC-EXPANDED-NEXT: {{  $}}
+  ; SRAMECC-EXPANDED-NEXT: bb.1:
+  ; SRAMECC-EXPANDED-NEXT:   successors: %bb.2(0x80000000)
+  ; SRAMECC-EXPANDED-NEXT: {{  $}}
+  ; SRAMECC-EXPANDED-NEXT:   S_NOP 1
+  ; SRAMECC-EXPANDED-NEXT: {{  $}}
+  ; SRAMECC-EXPANDED-NEXT: bb.2:
+  ; SRAMECC-EXPANDED-NEXT:   $vgpr1 = SCRATCH_LOAD_USHORT_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.0, addrspace 5)
+  ; SRAMECC-EXPANDED-NEXT:   $vgpr0_lo16 = V_MOV_B16_t16_e64 0, killed $vgpr1_lo16, 0, implicit $exec
+  ; SRAMECC-EXPANDED-NEXT:   $vgpr1 = SCRATCH_LOAD_USHORT_SADDR $sgpr32, 2, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.1, addrspace 5)
+  ; SRAMECC-EXPANDED-NEXT:   $vgpr0_hi16 = V_MOV_B16_t16_e64 0, killed $vgpr1_lo16, 0, implicit $exec
+  ; SRAMECC-EXPANDED-NEXT:   S_NOP 0, implicit killed renamable $vgpr0_lo16, implicit killed renamable $vgpr0_hi16
   bb.0:
     S_NOP 0, implicit-def %0:vgpr_16, implicit-def %1:vgpr_16
     S_CBRANCH_SCC1 implicit undef $scc, %bb.1
diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
index 5aafb0f576fb4..364598f7cf6c0 100644
--- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
@@ -31,8 +31,8 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr10
   ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:sgpr_32 = COPY $sgpr8
   ; CHECK-NEXT:   undef [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM [[COPY]], 232, 0 :: (invariant load (s64) from %ir.39, addrspace 4)
-  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %125:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32))
-  ; CHECK-NEXT:   KILL undef %125:sgpr_128
+  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %117:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32))
+  ; CHECK-NEXT:   KILL undef %117:sgpr_128
   ; CHECK-NEXT:   [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 4, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 4, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 4, implicit-def dead $scc
@@ -44,87 +44,85 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   [[S_SUB_I32_1:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 30, implicit-def dead $scc
   ; CHECK-NEXT:   undef [[S_ADD_U32_:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_2]], implicit-def $scc
   ; CHECK-NEXT:   [[S_ADD_U32_:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 16, 0 :: (invariant load (s128) from %ir.81, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 16, 0 :: (invariant load (s128) from %ir.71, addrspace 4)
   ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM undef %74:sreg_64, 0, 0 :: (invariant load (s128) from `ptr addrspace(4) poison`, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 64, 0 :: (invariant load (s128) from %ir.88, addrspace 4)
   ; CHECK-NEXT:   KILL undef %74:sreg_64
   ; CHECK-NEXT:   KILL [[S_ADD_U32_]].sub0, [[S_ADD_U32_]].sub1
   ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX4_IMM]], 0, 0 :: (dereferenceable invariant load (s32))
   ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
   ; CHECK-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = S_MOV_B32 0
-  ; CHECK-NEXT:   [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
-  ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+  ; CHECK-NEXT:   [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %112:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+  ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %87:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
-  ; CHECK-NEXT:   KILL undef %89:sgpr_128
-  ; CHECK-NEXT:   KILL undef %118:sgpr_128
+  ; CHECK-NEXT:   KILL undef %112:sgpr_128
+  ; CHECK-NEXT:   KILL undef %87:sgpr_128
   ; CHECK-NEXT:   [[S_SUB_I32_2:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM1]], 31, implicit-def dead $scc
   ; CHECK-NEXT:   undef [[S_ADD_U32_1:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_]], implicit-def $scc
   ; CHECK-NEXT:   [[S_ADD_U32_1:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
   ; CHECK-NEXT:   undef [[S_ADD_U32_2:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_1]], implicit-def $scc
   ; CHECK-NEXT:   [[S_ADD_U32_2:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   undef [[S_ADD_U32_3:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_2]], implicit-def $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_1]], 64, 0 :: (invariant load (s128) from %ir.87, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_2]], 64, 0 :: (invariant load (s128) from %ir.93, addrspace 4)
-  ; CHECK-NEXT:   KILL [[S_ADD_U32_1]].sub0, [[S_ADD_U32_1]].sub1
+  ; CHECK-NEXT:   [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %148:sreg_32, 31, implicit-def dead $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_3:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], undef %148:sreg_32, implicit-def $scc
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_1]], 64, 0 :: (invariant load (s128) from %ir.77, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_2]], 64, 0 :: (invariant load (s128) from %ir.83, addrspace 4)
   ; CHECK-NEXT:   KILL [[S_ADD_U32_2]].sub0, [[S_ADD_U32_2]].sub1
-  ; CHECK-NEXT:   [[S_ADD_U32_3:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %169:sreg_32, 31, implicit-def dead $scc
-  ; CHECK-NEXT:   undef [[S_ADD_U32_4:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], undef %169:sreg_32, implicit-def $scc
-  ; CHECK-NEXT:   [[S_ADD_U32_4:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   undef [[S_ADD_U32_5:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_]], implicit-def $scc
-  ; CHECK-NEXT:   [[S_ADD_U32_5:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   undef [[S_ADD_U32_6:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_1]], implicit-def $scc
-  ; CHECK-NEXT:   [[S_ADD_U32_6:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   undef [[S_ADD_U32_7:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, undef %169:sreg_32, implicit-def $scc
-  ; CHECK-NEXT:   [[S_ADD_U32_7:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   undef [[S_ADD_U32_8:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_2]], implicit-def $scc
-  ; CHECK-NEXT:   [[S_ADD_U32_8:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   undef [[S_ADD_U32_9:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_]], implicit-def $scc
-  ; CHECK-NEXT:   [[S_ADD_U32_9:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %48:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   undef [[S_ADD_U32_10:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_1]], implicit-def $scc
-  ; CHECK-NEXT:   [[S_ADD_U32_10:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   undef [[S_ADD_U32_11:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_2]], implicit-def $scc
-  ; CHECK-NEXT:   [[S_ADD_U32_11:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   KILL [[S_ADD_U32_1]].sub0, [[S_ADD_U32_1]].sub1
+  ; CHECK-NEXT:   [[S_ADD_U32_3:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_4:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_4:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_5:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_1]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_5:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_6:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, undef %148:sreg_32, implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_6:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_7:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_2]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_7:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_8:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_8:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %48:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_9:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_1]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_9:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_10:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_2]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_10:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
   ; CHECK-NEXT:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_]], 16, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_2]], 16, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32))
-  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], undef %302:sreg_32, 0, 0 :: (dereferenceable invariant load (s32))
+  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], undef %279:sreg_32, 0, 0 :: (dereferenceable invariant load (s32))
   ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32))
   ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_MOV_B32_]], 16, 0 :: (dereferenceable invariant load (s32))
-  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %357:sgpr_128, undef %358:sreg_32, 0, 0 :: (dereferenceable invariant load (s32))
-  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %368:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32))
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.99, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 64, 0 :: (invariant load (s128) from %ir.107, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 0, 0 :: (invariant load (s128) from %ir.112, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 0, 0 :: (invariant load (s128) from %ir.117, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 0, 0 :: (invariant load (s128) from %ir.124, addrspace 4)
-  ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
-  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %352:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32))
-  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %363:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32))
-  ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %334:sgpr_128, undef %335:sreg_32, 0, 0 :: (dereferenceable invariant load (s32))
+  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %345:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32))
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.95, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 0, 0 :: (invariant load (s128) from %ir.100, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 0, 0 :: (invariant load (s128) from %ir.105, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 0, 0 :: (invariant load (s128) from %ir.112, addrspace 4)
+  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %329:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32))
+  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %340:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32))
+  ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+  ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM]], -98, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM1]], -114, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM2]], -130, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ADD_I32_5:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM2]], -178, implicit-def dead $scc
-  ; CHECK-NEXT:   undef [[S_ADD_U32_12:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY10]], [[S_LSHL_B32_]], implicit-def $scc
-  ; CHECK-NEXT:   [[S_ADD_U32_12:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %42:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   undef [[S_ADD_U32_13:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_]], implicit-def $scc
-  ; CHECK-NEXT:   [[S_ADD_U32_13:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   undef [[S_ADD_U32_14:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_1]], implicit-def $scc
-  ; CHECK-NEXT:   [[S_ADD_U32_14:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   undef [[S_ADD_U32_15:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_2]], implicit-def $scc
-  ; CHECK-NEXT:   [[S_ADD_U32_15:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_11:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY10]], [[S_LSHL_B32_]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_11:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %42:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_12:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_12:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_13:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_1]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_13:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_14:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_2]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_14:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
   ; CHECK-NEXT:   [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY12]], 4, implicit-def dead $scc
-  ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+  ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_3]], 16, implicit-def dead $scc
-  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %384:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load (s32))
+  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %361:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load (s32))
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 224, 0 :: (invariant load (s128) from %ir.129, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 224, 0 :: (invariant load (s128) from %ir.145, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 576, 0 :: (invariant load (s128) from %ir.150, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 224, 0 :: (invariant load (s128) from %ir.117, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 224, 0 :: (invariant load (s128) from %ir.133, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 576, 0 :: (invariant load (s128) from %ir.138, addrspace 4)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM6]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 224, 0 :: (invariant load (s128) from %ir.134, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 576, 0 :: (invariant load (s128) from %ir.162, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 224, 0 :: (invariant load (s128) from %ir.140, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 224, 0 :: (invariant load (s128) from %ir.122, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 576, 0 :: (invariant load (s128) from %ir.150, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 224, 0 :: (invariant load (s128) from %ir.128, addrspace 4)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -217, implicit-def dead $scc
@@ -135,49 +133,49 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   [[S_ADD_I32_12:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -329, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ADD_I32_13:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -345, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ADD_I32_14:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM6]], -441, implicit-def dead $scc
-  ; CHECK-NEXT:   undef [[S_ADD_U32_16:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_2]], implicit-def $scc
-  ; CHECK-NEXT:   [[S_ADD_U32_16:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_15:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_2]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_15:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
   ; CHECK-NEXT:   [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY13]], 4, implicit-def dead $scc
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[S_ASHR_I32_4:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc
-  ; CHECK-NEXT:   undef [[S_ADD_U32_17:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_4]], implicit-def $scc
-  ; CHECK-NEXT:   [[S_ADD_U32_17:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_16:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_4]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_16:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc
   ; CHECK-NEXT:   [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 3, implicit-def dead $scc
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc
-  ; CHECK-NEXT:   undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_5]], implicit-def $scc
-  ; CHECK-NEXT:   [[S_ADD_U32_18:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s32) from %ir.273, align 8, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 576, 0 :: (invariant load (s128) from %ir.157, addrspace 4)
+  ; CHECK-NEXT:   undef [[S_ADD_U32_17:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_5]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_17:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_17]], 168, 0 :: (invariant load (s32) from %ir.260, align 8, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 576, 0 :: (invariant load (s128) from %ir.145, addrspace 4)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub3:sgpr_128 = S_MOV_B32 553734060
   ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 -1
   ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_9]], 0, 0 :: (invariant load (s128) from %ir.170, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 0, 0 :: (invariant load (s128) from %ir.158, addrspace 4)
   ; CHECK-NEXT:   [[COPY15:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub1
   ; CHECK-NEXT:   [[COPY15:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM]]
   ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY15]], 0, 0 :: (dereferenceable invariant load (s32))
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM15]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_10]], 0, 0 :: (invariant load (s128) from %ir.178, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_11]], 0, 0 :: (invariant load (s128) from %ir.183, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_9]], 0, 0 :: (invariant load (s128) from %ir.166, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_10]], 0, 0 :: (invariant load (s128) from %ir.171, addrspace 4)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN16:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc
   ; CHECK-NEXT:   [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[S_ASHR_I32_6:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_6]], 31, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc
-  ; CHECK-NEXT:   undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_6]], implicit-def $scc
-  ; CHECK-NEXT:   [[S_ADD_U32_19:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.282, addrspace 4)
+  ; CHECK-NEXT:   undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_6]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_18:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s64) from %ir.269, addrspace 4)
   ; CHECK-NEXT:   [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[BUFFER_LOAD_DWORD_OFFSET3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM18]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_12]], 0, 0 :: (invariant load (s128) from %ir.205, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_13]], 0, 0 :: (invariant load (s128) from %ir.211, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_11]], 0, 0 :: (invariant load (s128) from %ir.193, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_12]], 0, 0 :: (invariant load (s128) from %ir.199, addrspace 4)
   ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_14]], 0, 0 :: (invariant load (s128) from %ir.216, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_15]], 0, 0 :: (invariant load (s128) from %ir.221, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_13]], 0, 0 :: (invariant load (s128) from %ir.204, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_14]], 0, 0 :: (invariant load (s128) from %ir.209, addrspace 4)
   ; CHECK-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM1]].sub1, 65535, implicit-def dead $scc
   ; CHECK-NEXT:   [[COPY16:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0
   ; CHECK-NEXT:   [[COPY16:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_]]
@@ -189,30 +187,30 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN20:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM22]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[S_ASHR_I32_7:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_7]], 31, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc
-  ; CHECK-NEXT:   undef [[S_ADD_U32_20:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_7]], implicit-def $scc
-  ; CHECK-NEXT:   [[S_ADD_U32_20:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_20]], 168, 0 :: (invariant load (s64) from %ir.293, addrspace 4)
+  ; CHECK-NEXT:   undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_7]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_19:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.280, addrspace 4)
   ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
   ; CHECK-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM2]].sub1, 65535, implicit-def dead $scc
   ; CHECK-NEXT:   [[COPY17:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM2]].sub0
   ; CHECK-NEXT:   [[COPY17:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_1]]
   ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY17]], 0, 0 :: (dereferenceable invariant load (s32))
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.256, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %470:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4)
-  ; CHECK-NEXT:   KILL [[S_ADD_U32_16]].sub0, [[S_ADD_U32_16]].sub1
-  ; CHECK-NEXT:   KILL undef %470:sreg_64
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_15]], 160, 0 :: (invariant load (s128) from %ir.244, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %443:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4)
+  ; CHECK-NEXT:   KILL [[S_ADD_U32_15]].sub0, [[S_ADD_U32_15]].sub1
   ; CHECK-NEXT:   KILL [[COPY17]].sub0_sub1_sub2, [[COPY17]].sub3
+  ; CHECK-NEXT:   KILL undef %443:sreg_64
   ; CHECK-NEXT:   [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY14]], 3, implicit-def dead $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 160, 0 :: (invariant load (s128) from %ir.265, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.252, addrspace 4)
   ; CHECK-NEXT:   [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_8]], 31, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ADD_I32_17:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM6]], -469, implicit-def dead $scc
-  ; CHECK-NEXT:   undef [[S_ADD_U32_21:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_8]], implicit-def $scc
-  ; CHECK-NEXT:   [[S_ADD_U32_21:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_21]], 168, 0 :: (invariant load (s32) from %ir.305, align 8, addrspace 4)
+  ; CHECK-NEXT:   undef [[S_ADD_U32_20:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_8]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_20:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_20]], 168, 0 :: (invariant load (s32) from %ir.291, align 8, addrspace 4)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
-  ; CHECK-NEXT:   KILL [[S_LOAD_DWORDX4_IMM24]]
   ; CHECK-NEXT:   KILL [[S_LOAD_DWORDX4_IMM23]]
+  ; CHECK-NEXT:   KILL [[S_LOAD_DWORDX4_IMM24]]
   ; CHECK-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORD_IMM1]], 65535, implicit-def dead $scc
   ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
   ; CHECK-NEXT:   [[COPY18:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_2]]
@@ -224,22 +222,22 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   [[S_ADD_I32_21:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -507, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ADD_I32_22:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -539, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ADD_I32_23:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc
-  ; CHECK-NEXT:   undef [[S_ADD_U32_22:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_]], implicit-def $scc
-  ; CHECK-NEXT:   [[S_ADD_U32_22:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.323, addrspace 4)
-  ; CHECK-NEXT:   undef [[S_ADD_U32_23:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_1]], implicit-def $scc
-  ; CHECK-NEXT:   [[S_ADD_U32_23:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_23]], 96, 0 :: (invariant load (s128) from %ir.329, addrspace 4)
-  ; CHECK-NEXT:   undef [[S_ADD_U32_24:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_2]], implicit-def $scc
-  ; CHECK-NEXT:   [[S_ADD_U32_24:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_24]], 96, 0 :: (invariant load (s128) from %ir.335, addrspace 4)
+  ; CHECK-NEXT:   undef [[S_ADD_U32_21:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_21:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_21]], 96, 0 :: (invariant load (s128) from %ir.309, addrspace 4)
+  ; CHECK-NEXT:   undef [[S_ADD_U32_22:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_1]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_22:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.315, addrspace 4)
+  ; CHECK-NEXT:   undef [[S_ADD_U32_23:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_2]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_23:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_23]], 96, 0 :: (invariant load (s128) from %ir.321, addrspace 4)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
-  ; CHECK-NEXT:   KILL [[S_LOAD_DWORDX4_IMM27]]
   ; CHECK-NEXT:   KILL [[S_LOAD_DWORDX4_IMM25]]
-  ; CHECK-NEXT:   KILL [[V_MOV_B32_e32_]]
   ; CHECK-NEXT:   KILL [[S_LOAD_DWORDX4_IMM26]]
+  ; CHECK-NEXT:   KILL [[V_MOV_B32_e32_]]
+  ; CHECK-NEXT:   KILL [[S_LOAD_DWORDX4_IMM27]]
   ; CHECK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -2, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
   ; CHECK-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -1, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec
   ; CHECK-NEXT:   [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -3, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
@@ -351,13 +349,13 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   [[V_OR_B32_e64_64:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_63]], [[V_ADD_U32_e64_28]], implicit $exec
   ; CHECK-NEXT:   [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -593, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
   ; CHECK-NEXT:   [[V_OR_B32_e64_65:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_64]], [[V_ADD_U32_e64_29]], implicit $exec
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %543:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) poison`, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %516:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) poison`, addrspace 4)
   ; CHECK-NEXT:   [[V_OR_B32_e64_66:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_65]], [[V_ADD_U32_e64_30]], implicit $exec
   ; CHECK-NEXT:   [[S_ADD_I32_24:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM8]], -594, implicit-def dead $scc
   ; CHECK-NEXT:   [[V_OR_B32_e64_67:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_24]], [[V_OR_B32_e64_66]], implicit $exec
   ; CHECK-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 0, [[V_OR_B32_e64_67]], implicit $exec
   ; CHECK-NEXT:   undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec
-  ; CHECK-NEXT:   IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %557:vgpr_32, undef %559:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
+  ; CHECK-NEXT:   IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %530:vgpr_32, undef %532:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
   ; CHECK-NEXT:   S_ENDPGM 0
 .expVert:
   %0 = extractelement <31 x i32> %userData, i64 2
diff --git a/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll b/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll
new file mode 100644
index 0000000000000..22e4a24435f12
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll
@@ -0,0 +1,236 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+
+define i16 @v_underflow_compare_fold_i16(i16 %a, i16 %b) #0 {
+; GFX9-LABEL: v_underflow_compare_fold_i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_sub_u16_e32 v1, v0, v1
+; GFX9-NEXT:    v_min_u16_e32 v0, v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_underflow_compare_fold_i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_sub_nc_u16 v0.h, v0.l, v1.l
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_min_u16 v0.l, v0.h, v0.l
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub i16 %a, %b
+  %cond = call i16 @llvm.umin.i16(i16 %sub, i16 %a)
+  ret i16 %cond
+}
+
+define i32 @v_underflow_compare_fold_i32(i32 %a, i32 %b) #0 {
+; GFX9-LABEL: v_underflow_compare_fold_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_sub_u32_e32 v1, v0, v1
+; GFX9-NEXT:    v_min_u32_e32 v0, v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_underflow_compare_fold_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_min_u32_e32 v0, v1, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub i32 %a, %b
+  %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a)
+  ret i32 %cond
+}
+
+define i32 @v_underflow_compare_fold_i32_commute(i32 %a, i32 %b) #0 {
+; GFX9-LABEL: v_underflow_compare_fold_i32_commute:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_sub_u32_e32 v1, v0, v1
+; GFX9-NEXT:    v_min_u32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_underflow_compare_fold_i32_commute:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_min_u32_e32 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub i32 %a, %b
+  %cond = call i32 @llvm.umin.i32(i32 %a, i32 %sub)
+  ret i32 %cond
+}
+
+define i32 @v_underflow_compare_fold_i32_multi_use(i32 %a, i32 %b, ptr addrspace(1) %ptr) #0 {
+; GFX9-LABEL: v_underflow_compare_fold_i32_multi_use:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_sub_u32_e32 v1, v0, v1
+; GFX9-NEXT:    v_min_u32_e32 v0, v1, v0
+; GFX9-NEXT:    global_store_dword v[2:3], v1, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_underflow_compare_fold_i32_multi_use:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_min_u32_e32 v0, v1, v0
+; GFX11-NEXT:    global_store_b32 v[2:3], v1, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub i32 %a, %b
+  store i32 %sub, ptr addrspace(1) %ptr
+  %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a)
+  ret i32 %cond
+}
+
+define i64 @v_underflow_compare_fold_i64(i64 %a, i64 %b) #0 {
+; GFX9-LABEL: v_underflow_compare_fold_i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, v0, v2
+; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
+; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_underflow_compare_fold_i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_sub_co_u32 v2, vcc_lo, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_cndmask_b32 v1, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub i64 %a, %b
+  %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a)
+  ret i64 %cond
+}
+
+define i64 @v_underflow_compare_fold_i64_commute(i64 %a, i64 %b) #0 {
+; GFX9-LABEL: v_underflow_compare_fold_i64_commute:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, v0, v2
+; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
+; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_underflow_compare_fold_i64_commute:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_sub_co_u32 v2, vcc_lo, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub i64 %a, %b
+  %cond = call i64 @llvm.umin.i64(i64 %a, i64 %sub)
+  ret i64 %cond
+}
+
+define i64 @v_underflow_compare_fold_i64_multi_use(i64 %a, i64 %b, ptr addrspace(1) %ptr) #0 {
+; GFX9-LABEL: v_underflow_compare_fold_i64_multi_use:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, v0, v2
+; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
+; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v[4:5], v[2:3], off
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_underflow_compare_fold_i64_multi_use:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_sub_co_u32 v2, vcc_lo, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX11-NEXT:    global_store_b64 v[4:5], v[2:3], off
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_cndmask_b32 v1, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub i64 %a, %b
+  store i64 %sub, ptr addrspace(1) %ptr
+  %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a)
+  ret i64 %cond
+}
+
+define amdgpu_ps i16 @s_underflow_compare_fold_i16(i16 inreg %a, i16 inreg %b) #0 {
+; GFX9-LABEL: s_underflow_compare_fold_i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_sub_i32 s1, s0, s1
+; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX9-NEXT:    s_min_u32 s0, s1, s0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: s_underflow_compare_fold_i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_sub_i32 s1, s0, s1
+; GFX11-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX11-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_min_u32 s0, s1, s0
+; GFX11-NEXT:    ; return to shader part epilog
+  %sub = sub i16 %a, %b
+  %cond = call i16 @llvm.umin.i16(i16 %sub, i16 %a)
+  ret i16 %cond
+}
+
+define amdgpu_ps i32 @s_underflow_compare_fold_i32(i32 inreg %a, i32 inreg %b) #0 {
+; GFX9-LABEL: s_underflow_compare_fold_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_sub_i32 s1, s0, s1
+; GFX9-NEXT:    s_min_u32 s0, s1, s0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: s_underflow_compare_fold_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_sub_i32 s1, s0, s1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_min_u32 s0, s1, s0
+; GFX11-NEXT:    ; return to shader part epilog
+  %sub = sub i32 %a, %b
+  %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a)
+  ret i32 %cond
+}
+
+define amdgpu_ps i64 @s_underflow_compare_fold_i64(i64 inreg %a, i64 inreg %b) #0 {
+; GFX9-LABEL: s_underflow_compare_fold_i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_sub_u32 s2, s0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    s_subb_u32 s3, s1, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
+; GFX9-NEXT:    s_and_b64 s[4:5], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s1, s3, s1
+; GFX9-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: s_underflow_compare_fold_i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_sub_u32 s2, s0, s2
+; GFX11-NEXT:    s_subb_u32 s3, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_cmp_lt_u64_e64 s4, s[2:3], s[0:1]
+; GFX11-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX11-NEXT:    s_cselect_b32 s1, s3, s1
+; GFX11-NEXT:    ; return to shader part epilog
+  %sub = sub i64 %a, %b
+  %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a)
+  ret i64 %cond
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250-t16.mir b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250-t16.mir
index 8a70a8acd28d3..32cc398740d62 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250-t16.mir
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250-t16.mir
@@ -36,7 +36,7 @@ body:             |
     ; GCN-NEXT: v_add_f16_e64 v128.l /*v384.l*/, v129.l /*v385.l*/, v130.l /*v386.l*/
     $vgpr384_lo16 = V_ADD_F16_t16_e64 0, undef $vgpr385_lo16, 0, undef $vgpr386_lo16, 0, 0, 0, implicit $exec, implicit $mode
 
-    ; GCN-NEXT: s_set_vgpr_msb 0x8a
+    ; GCN-NEXT: s_set_vgpr_msb 0x458a
     ; ASM-SAME:                                         ;  msbs: dst=2 src0=2 src1=2 src2=0
     ; GCN-NEXT: v_add_f16_e64 v0.h /*v512.h*/, v1.h /*v513.h*/, v2.h /*v514.h*/
     $vgpr512_hi16 = V_ADD_F16_t16_e64 0, undef $vgpr513_hi16, 0, undef $vgpr514_hi16, 0, 0, 0, implicit $exec, implicit $mode
@@ -50,7 +50,7 @@ body:             |
     ; GCN-NEXT: v_add_f16_e64 v128.l /*v640.l*/, v129.l /*v641.l*/, v130.l /*v642.l*/
     $vgpr640_lo16 = V_ADD_F16_t16_e64 0, undef $vgpr641_lo16, 0, undef $vgpr642_lo16, 0, 0, 0, implicit $exec, implicit $mode
 
-    ; GCN-NEXT: s_set_vgpr_msb 0xcf
+    ; GCN-NEXT: s_set_vgpr_msb 0x8acf
     ; ASM-SAME:                                         ;  msbs: dst=3 src0=3 src1=3 src2=0
     ; GCN-NEXT: v_add_f16_e64 v0.h /*v768.h*/, v1.h /*v769.h*/, v2.h /*v770.h*/
     $vgpr768_hi16 = V_ADD_F16_t16_e64 0, undef $vgpr769_hi16, 0, undef $vgpr770_hi16, 0, 0, 0, implicit $exec, implicit $mode
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir
index f508df2292e90..7e1c28f8e7bbb 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir
@@ -22,13 +22,13 @@ body:             |
     $vgpr257 = V_MOV_B32_e32 undef $vgpr510, implicit $exec
 
     ; Single bit change
-    ; GCN-NEXT: s_set_vgpr_msb 1
+    ; GCN-NEXT: s_set_vgpr_msb 0x4101
     ; ASM-SAME:                                         ;  msbs: dst=0 src0=1 src1=0 src2=0
     ; GCN-NEXT: v_rcp_f32_e64 v255, v2 /*v258*/
     $vgpr255 = V_RCP_F32_e64 0, undef $vgpr258, 0, 0, implicit $exec, implicit $mode
 
     ; Reset
-    ; GCN-NEXT: s_set_vgpr_msb 0
+    ; GCN-NEXT: s_set_vgpr_msb 0x100
     ; ASM-SAME:                                         ;  msbs: dst=0 src0=0 src1=0 src2=0
     ; GCN-NEXT: v_rcp_f32_e64 v255, v1
     $vgpr255 = V_RCP_F32_e64 0, undef $vgpr1, 0, 0, implicit $exec, implicit $mode
@@ -40,7 +40,7 @@ body:             |
     ; GCN-NEXT: v_add_nc_u32_e32 v0, v253 /*v509*/, v252 /*v508*/
     $vgpr0 = V_ADD_U32_e32 undef $vgpr509, undef $vgpr508, implicit $exec
 
-    ; GCN-NEXT: s_set_vgpr_msb 0x44
+    ; GCN-NEXT: s_set_vgpr_msb 0x544
     ; ASM-SAME:                                         ;  msbs: dst=1 src0=0 src1=1 src2=0
     ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
     ; GCN-NEXT: v_add_f32_e64 v2 /*v258*/, v0, v251 /*v507*/
@@ -48,7 +48,7 @@ body:             |
 
     ; VOP3
 
-    ; GCN-NEXT: s_set_vgpr_msb 0x55
+    ; GCN-NEXT: s_set_vgpr_msb 0x4455
     ; ASM-SAME:                                         ;  msbs: dst=1 src0=1 src1=1 src2=1
     ; GCN-NEXT: v_fma_f32 v3 /*v259*/, v4 /*v260*/, v5 /*v261*/, v6 /*v262*/
     $vgpr259 = V_FMA_F32_e64 0, undef $vgpr260, 0, undef $vgpr261, 0, undef $vgpr262, 0, 0, implicit $exec, implicit $mode
@@ -58,32 +58,32 @@ body:             |
     $vgpr259 = V_FMA_F32_e64 0, undef $vgpr260, 0, undef $vgpr261, 0, undef $vgpr262, 0, 0, implicit $exec, implicit $mode
 
     ; Tuple crossing the 256 boundary
-    ; GCN-NEXT: s_set_vgpr_msb 17
+    ; GCN-NEXT: s_set_vgpr_msb 0x5511
     ; ASM-SAME:                                         ;  msbs: dst=0 src0=1 src1=0 src2=1
     ; GCN-NEXT: v_mqsad_u32_u8 v[254:257], v[2:3] /*v[258:259]*/, v0, v[244:247] /*v[500:503]*/
     $vgpr254_vgpr255_vgpr256_vgpr257 = V_MQSAD_U32_U8_e64 $vgpr258_vgpr259, $vgpr0, undef $vgpr500_vgpr501_vgpr502_vgpr503, 0, implicit $exec
 
     ; DPP/tied operand
-    ; GCN-NEXT: s_set_vgpr_msb 0x45
+    ; GCN-NEXT: s_set_vgpr_msb 0x1145
     ; ASM-SAME:                                         ;  msbs: dst=1 src0=1 src1=1 src2=0
     ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
     ; GCN-NEXT: v_add_nc_u16_e64_dpp v0 /*v256*/, v1 /*v257*/, v2 /*v258*/ quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
     $vgpr256 = V_ADD_NC_U16_fake16_e64_dpp $vgpr256, 0, $vgpr257, 0, undef $vgpr258, 0, 0, 1, 15, 15, 1, implicit $exec
 
-    ; GCN-NEXT: s_set_vgpr_msb 17
+    ; GCN-NEXT: s_set_vgpr_msb 0x4511
     ; ASM-SAME:                                         ;  msbs: dst=0 src0=1 src1=0 src2=1
     ; GCN-NEXT: v_add3_u32_e64_dpp v0, v1 /*v257*/, v0, v2 /*v258*/ quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
     $vgpr0 = V_ADD3_U32_e64_dpp $vgpr0, $vgpr257, $vgpr0, undef $vgpr258, 1, 15, 15, 1, implicit $exec
 
     ; DS (addr, data0, and data1 operands)
 
-    ; GCN-NEXT: s_set_vgpr_msb 20
+    ; GCN-NEXT: s_set_vgpr_msb 0x1114
     ; ASM-SAME:                                         ;  msbs: dst=0 src0=0 src1=1 src2=1
     ; GCN-NEXT: ds_store_2addr_b32 v0, v248 /*v504*/, v249 /*v505*/ offset1:1
     DS_WRITE2_B32_gfx9 $vgpr0, undef $vgpr504, undef $vgpr505, 0, 1, 0, implicit $exec
 
     ; Reset
-    ; GCN-NEXT: s_set_vgpr_msb 0
+    ; GCN-NEXT: s_set_vgpr_msb 0x1400
     ; ASM-SAME:                                         ;  msbs: dst=0 src0=0 src1=0 src2=0
     ; GCN-NEXT: ds_store_2addr_b32 v0, v248, v249 offset1:1
     DS_WRITE2_B32_gfx9 $vgpr0, undef $vgpr248, undef $vgpr249, 0, 1, 0, implicit $exec
@@ -93,13 +93,13 @@ body:             |
     ; GCN-NEXT: ds_load_b32 v0, v255 /*v511*/
     $vgpr0 = DS_READ_B32_gfx9 $vgpr511, 0, 0, implicit $exec
 
-    ; GCN-NEXT: s_set_vgpr_msb 0x44
+    ; GCN-NEXT: s_set_vgpr_msb 0x144
     ; ASM-SAME:                                         ;  msbs: dst=1 src0=0 src1=1 src2=0
     ; GCN-NEXT: ds_add_rtn_u32 v255 /*v511*/, v0, v248 /*v504*/
     $vgpr511 = DS_ADD_RTN_U32_gfx9 $vgpr0, undef $vgpr504, 0, 0, implicit $exec
 
     ; Reset
-    ; GCN-NEXT: s_set_vgpr_msb 0
+    ; GCN-NEXT: s_set_vgpr_msb 0x4400
     ; ASM-SAME:                                         ;  msbs: dst=0 src0=0 src1=0 src2=0
     ; GCN-NEXT: ds_add_rtn_u32 v0, v0, v0
     $vgpr0 = DS_ADD_RTN_U32_gfx9 $vgpr0, $vgpr0, 0, 0, implicit $exec
@@ -111,17 +111,17 @@ body:             |
     ; GCN-NEXT: global_load_b32 v2, v[2:3] /*v[258:259]*/, off
     $vgpr2 = GLOBAL_LOAD_DWORD undef $vgpr258_vgpr259, 0, 0, implicit $exec
 
-    ; GCN-NEXT: s_set_vgpr_msb 64
+    ; GCN-NEXT: s_set_vgpr_msb 0x140
     ; ASM-SAME:                                         ;  msbs: dst=1 src0=0 src1=0 src2=0
     ; GCN-NEXT: global_load_b32 v255 /*v511*/, v0, s[0:1]
     $vgpr511 = GLOBAL_LOAD_DWORD_SADDR undef $sgpr0_sgpr1, $vgpr0, 0, 0, implicit $exec
 
-    ; GCN-NEXT: s_set_vgpr_msb 1
+    ; GCN-NEXT: s_set_vgpr_msb 0x4001
     ; ASM-SAME:                                         ;  msbs: dst=0 src0=1 src1=0 src2=0
     ; GCN-NEXT: scratch_load_u8 v0, v255 /*v511*/, s0
     $vgpr0 = SCRATCH_LOAD_UBYTE_SVS $vgpr511, undef $sgpr0, 0, 0, implicit $exec, implicit $flat_scr
 
-    ; GCN-NEXT: s_set_vgpr_msb 0
+    ; GCN-NEXT: s_set_vgpr_msb 0x100
     ; ASM-SAME:                                         ;  msbs: dst=0 src0=0 src1=0 src2=0
     ; GCN-NEXT: global_store_b32 v[0:1], v2, off
     GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
@@ -135,13 +135,13 @@ body:             |
     ; GCN-NEXT: global_store_b96 v[0:1] /*v[256:257]*/, v[244:246] /*v[500:502]*/, off
     GLOBAL_STORE_DWORDX3 $vgpr256_vgpr257, $vgpr500_vgpr501_vgpr502, 0, 0, implicit $exec
 
-    ; GCN-NEXT: s_set_vgpr_msb 0x44
+    ; GCN-NEXT: s_set_vgpr_msb 0x544
     ; ASM-SAME:                                         ;  msbs: dst=1 src0=0 src1=1 src2=0
     ; GCN-NEXT: flat_atomic_add_u32 v254 /*v510*/, v[0:1], v255 /*v511*/ th:TH_ATOMIC_RETURN
     $vgpr510 = FLAT_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr511, 0, 1, implicit $exec, implicit $flat_scr
 
     ; Reset
-    ; GCN-NEXT: s_set_vgpr_msb 0
+    ; GCN-NEXT: s_set_vgpr_msb 0x4400
     ; ASM-SAME:                                         ;  msbs: dst=0 src0=0 src1=0 src2=0
     ; GCN-NEXT: flat_atomic_add_u32 v0, v[0:1], v255 th:TH_ATOMIC_RETURN
     $vgpr0 = FLAT_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr255, 0, 1, implicit $exec, implicit $flat_scr
@@ -156,12 +156,12 @@ body:             |
     ; GCN-NEXT: buffer_load_b32 v1 /*v257*/, v0, s[8:11], s3 offen
     $vgpr257 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN $vgpr0, undef $sgpr8_sgpr9_sgpr10_sgpr11, undef $sgpr3, 0, 0, 0, implicit $exec
 
-    ; GCN-NEXT: s_set_vgpr_msb 0x41
+    ; GCN-NEXT: s_set_vgpr_msb 0x4041
     ; ASM-SAME:                                         ;  msbs: dst=1 src0=1 src1=0 src2=0
     ; GCN-NEXT: buffer_load_b32 v1 /*v257*/, v0 /*v256*/, s[8:11], s3 offen
     $vgpr257 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN $vgpr256, undef $sgpr8_sgpr9_sgpr10_sgpr11, undef $sgpr3, 0, 0, 0, implicit $exec
 
-    ; GCN-NEXT: s_set_vgpr_msb 0
+    ; GCN-NEXT: s_set_vgpr_msb 0x4100
     ; ASM-SAME:                                         ;  msbs: dst=0 src0=0 src1=0 src2=0
     ; GCN-NEXT: buffer_store_b32 v0, v1, s[0:3], s3 offen
     BUFFER_STORE_DWORD_VBUFFER_OFFEN $vgpr0, $vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr3, 0, 0, 0, implicit $exec
@@ -171,7 +171,7 @@ body:             |
     ; GCN-NEXT: buffer_store_b32 v0 /*v256*/, v1 /*v257*/, s[0:3], s3 offen
     BUFFER_STORE_DWORD_VBUFFER_OFFEN $vgpr256, $vgpr257, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr3, 0, 0, 0, implicit $exec
 
-    ; GCN-NEXT: s_set_vgpr_msb 0
+    ; GCN-NEXT: s_set_vgpr_msb 0x4100
     ; ASM-SAME:                                         ;  msbs: dst=0 src0=0 src1=0 src2=0
     ; GCN-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s3 offen
     BUFFER_ATOMIC_ADD_F32_VBUFFER_OFFEN $vgpr0, $vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr3, 0, 0, implicit $exec
@@ -183,44 +183,44 @@ body:             |
 
     ; VGPRs above 512
 
-    ; GCN-NEXT: s_set_vgpr_msb 0xaa
+    ; GCN-NEXT: s_set_vgpr_msb 0x41aa
     ; ASM-SAME:                                         ;  msbs: dst=2 src0=2 src1=2 src2=2
     ; GCN-NEXT: v_fma_f32 v0 /*v512*/, v1 /*v513*/, v2 /*v514*/, v3 /*v515*/
     $vgpr512 = V_FMA_F32_e64 0, undef $vgpr513, 0, undef $vgpr514, 0, undef $vgpr515, 0, 0, implicit $exec, implicit $mode
 
-    ; GCN-NEXT: s_set_vgpr_msb 0xab
+    ; GCN-NEXT: s_set_vgpr_msb 0xaaab
     ; ASM-SAME:                                         ;  msbs: dst=2 src0=3 src1=2 src2=2
     ; GCN-NEXT: v_fma_f32 v0 /*v512*/, v0 /*v768*/, v2 /*v514*/, v3 /*v515*/
     $vgpr512 = V_FMA_F32_e64 0, undef $vgpr768, 0, undef $vgpr514, 0, undef $vgpr515, 0, 0, implicit $exec, implicit $mode
 
-    ; GCN-NEXT: s_set_vgpr_msb 0xae
+    ; GCN-NEXT: s_set_vgpr_msb 0xabae
     ; ASM-SAME:                                         ;  msbs: dst=2 src0=2 src1=3 src2=2
     ; GCN-NEXT: v_fma_f32 v0 /*v512*/, v1 /*v513*/, v2 /*v770*/, v3 /*v515*/
     $vgpr512 = V_FMA_F32_e64 0, undef $vgpr513, 0, undef $vgpr770, 0, undef $vgpr515, 0, 0, implicit $exec, implicit $mode
 
-    ; GCN-NEXT: s_set_vgpr_msb 0xba
+    ; GCN-NEXT: s_set_vgpr_msb 0xaeba
     ; ASM-SAME:                                         ;  msbs: dst=2 src0=2 src1=2 src2=3
     ; GCN-NEXT: v_fma_f32 v0 /*v512*/, v1 /*v513*/, v2 /*v514*/, v3 /*v771*/
     $vgpr512 = V_FMA_F32_e64 0, undef $vgpr513, 0, undef $vgpr514, 0, undef $vgpr771, 0, 0, implicit $exec, implicit $mode
 
-    ; GCN-NEXT: s_set_vgpr_msb 0xea
+    ; GCN-NEXT: s_set_vgpr_msb 0xbaea
     ; ASM-SAME:                                         ;  msbs: dst=3 src0=2 src1=2 src2=2
     ; GCN-NEXT: v_fma_f32 v255 /*v1023*/, v1 /*v513*/, v2 /*v514*/, v3 /*v515*/
     $vgpr1023 = V_FMA_F32_e64 0, undef $vgpr513, 0, undef $vgpr514, 0, undef $vgpr515, 0, 0, implicit $exec, implicit $mode
 
-    ; GCN-NEXT: s_set_vgpr_msb 0xff
+    ; GCN-NEXT: s_set_vgpr_msb 0xeaff
     ; ASM-SAME:                                         ;  msbs: dst=3 src0=3 src1=3 src2=3
     ; GCN-NEXT: v_fma_f32 v0 /*v768*/, v1 /*v769*/, v2 /*v770*/, v3 /*v771*/
     $vgpr768 = V_FMA_F32_e64 0, undef $vgpr769, 0, undef $vgpr770, 0, undef $vgpr771, 0, 0, implicit $exec, implicit $mode
 
-    ; GCN-NEXT: s_set_vgpr_msb 0x42
+    ; GCN-NEXT: s_set_vgpr_msb 0xff42
     ; ASM-SAME:                                         ;  msbs: dst=1 src0=2 src1=0 src2=0
     ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v0 /*v512*/
     $vgpr256 = V_MOV_B32_e32 undef $vgpr512, implicit $exec
 
     ; Reset
 
-    ; GCN-NEXT: s_set_vgpr_msb 0
+    ; GCN-NEXT: s_set_vgpr_msb 0x4200
     ; ASM-SAME:                                         ;  msbs: dst=0 src0=0 src1=0 src2=0
     ; GCN-NEXT: v_fma_f32 v0, v1, v2, v3
     $vgpr0 = V_FMA_F32_e64 0, undef $vgpr1, 0, undef $vgpr2, 0, undef $vgpr3, 0, 0, implicit $exec, implicit $mode
@@ -232,12 +232,12 @@ body:             |
     ; GCN-NEXT: global_store_b96 v[0:1] /*v[512:513]*/, v[0:2] /*v[512:514]*/, off
     GLOBAL_STORE_DWORDX3 $vgpr512_vgpr513, $vgpr512_vgpr513_vgpr514, 0, 0, implicit $exec
 
-    ; GCN-NEXT: s_set_vgpr_msb 11
+    ; GCN-NEXT: s_set_vgpr_msb 0xa0b
     ; ASM-SAME:                                         ;  msbs: dst=0 src0=3 src1=2 src2=0
     ; GCN-NEXT: global_store_b64 v[254:255] /*v[1022:1023]*/, v[254:255] /*v[766:767]*/, off
     GLOBAL_STORE_DWORDX2 $vgpr1022_vgpr1023, $vgpr766_vgpr767, 0, 0, implicit $exec
 
-    ; GCN-NEXT: s_set_vgpr_msb 0x55
+    ; GCN-NEXT: s_set_vgpr_msb 0xb55
     ; ASM-SAME:                                         ;  msbs: dst=1 src0=1 src1=1 src2=1
     ; GCN-NEXT: v_wmma_f32_16x16x32_bf16 v[14:21] /*v[270:277]*/, v[26:33] /*v[282:289]*/, v[34:41] /*v[290:297]*/, v[14:21] /*v[270:277]*/
     early-clobber $vgpr270_vgpr271_vgpr272_vgpr273_vgpr274_vgpr275_vgpr276_vgpr277 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, undef $vgpr282_vgpr283_vgpr284_vgpr285_vgpr286_vgpr287_vgpr288_vgpr289, 8, undef $vgpr290_vgpr291_vgpr292_vgpr293_vgpr294_vgpr295_vgpr296_vgpr297, 8, killed undef $vgpr270_vgpr271_vgpr272_vgpr273_vgpr274_vgpr275_vgpr276_vgpr277, 0, 0, 0, 0, implicit $exec
@@ -247,6 +247,7 @@ body:             |
 ...
 
 # ASM-LABEL: {{^}}vopd:
+
 # DIS-LABEL: <vopd>:
 ---
 name:            vopd
@@ -262,35 +263,35 @@ body:             |
     ; GCN-NEXT: v_dual_sub_f32 v244 /*v500*/, v1, v2 :: v_dual_mul_f32 v0 /*v256*/, v3, v4
     $vgpr500, $vgpr256 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $vgpr1, undef $vgpr2, undef $vgpr3, undef $vgpr4, implicit $mode, implicit $exec
 
-    ; GCN-NEXT: s_set_vgpr_msb 0x41
+    ; GCN-NEXT: s_set_vgpr_msb 0x4041
     ; GCN-NEXT: v_dual_sub_f32 v244 /*v500*/, s1, v2 :: v_dual_mul_f32 v0 /*v256*/, v44 /*v300*/, v4
     $vgpr500, $vgpr256 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $sgpr1, undef $vgpr2, undef $vgpr300, undef $vgpr4, implicit $mode, implicit $exec
 
-    ; GCN-NEXT: s_set_vgpr_msb 4
+    ; GCN-NEXT: s_set_vgpr_msb 0x4104
     ; GCN-NEXT: v_dual_sub_f32 v255, v1, v44 /*v300*/ :: v_dual_mul_f32 v6, v0, v1 /*v257*/
     $vgpr255, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $vgpr1, undef $vgpr300, undef $vgpr0, $vgpr257, implicit $mode, implicit $exec
 
-    ; GCN-NEXT: s_set_vgpr_msb 1
+    ; GCN-NEXT: s_set_vgpr_msb 0x401
     ; GCN-NEXT: v_dual_sub_f32 v255, 0, v1 :: v_dual_mul_f32 v6, v44 /*v300*/, v3
     $vgpr255, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 0, undef $vgpr1, undef $vgpr300, undef $vgpr3, implicit $mode, implicit $exec
 
-    ; GCN-NEXT: s_set_vgpr_msb 64
+    ; GCN-NEXT: s_set_vgpr_msb 0x140
     ; GCN-NEXT: v_dual_fmamk_f32 v243 /*v499*/, v0, 0xa, v3 :: v_dual_fmac_f32 v0 /*v256*/, v1, v1
     $vgpr499, $vgpr256 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx1250 undef $vgpr0, 10, undef $vgpr3, undef $vgpr1, undef $vgpr1, $vgpr256, implicit $mode, implicit $exec
 
-    ; GCN-NEXT: s_set_vgpr_msb 5
+    ; GCN-NEXT: s_set_vgpr_msb 0x4005
     ; GCN-NEXT: v_dual_mov_b32 v2, v3 /*v259*/ :: v_dual_add_f32 v3, v1 /*v257*/, v2 /*v258*/
     $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx1250 undef $vgpr259, undef $vgpr257, undef $vgpr258, implicit $exec, implicit $mode
 
-    ; GCN-NEXT: s_set_vgpr_msb 0x44
+    ; GCN-NEXT: s_set_vgpr_msb 0x544
     ; GCN-NEXT: v_dual_fmamk_f32 v244 /*v500*/, v0, 0xa, v44 /*v300*/ :: v_dual_fmac_f32 v3 /*v259*/, v1, v1 /*v257*/
     $vgpr500, $vgpr259 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx1250 undef $vgpr0, 10, undef $vgpr300, undef $vgpr1, undef $vgpr257, $vgpr259, implicit $mode, implicit $exec
 
-    ; GCN-NEXT: s_set_vgpr_msb 16
+    ; GCN-NEXT: s_set_vgpr_msb 0x4410
     ; GCN-NEXT: v_dual_fma_f32 v0, v6, v6, v44 /*v300*/ :: v_dual_fma_f32 v1, v4, v5, v45 /*v301*/
     $vgpr0, $vgpr1 = V_DUAL_FMA_F32_e64_X_FMA_F32_e64_e96_gfx1250 0, undef $vgpr6, 0, undef $vgpr6, 0, undef $vgpr300, 0, undef $vgpr4, 0, undef $vgpr5, 0, undef $vgpr301, implicit $mode, implicit $exec
 
-    ; GCN-NEXT: s_set_vgpr_msb 0
+    ; GCN-NEXT: s_set_vgpr_msb 0x1000
     ; GCN-NEXT: v_dual_fmac_f32 v2, v6, v6 :: v_dual_fma_f32 v3, v4, v5, v3
     $vgpr2, $vgpr3 = V_DUAL_FMAC_F32_e32_X_FMA_F32_e64_e96_gfx1250 0, undef $vgpr6, 0, undef $vgpr6, undef $vgpr2, 0, undef $vgpr4, 0, undef $vgpr5, 0, $vgpr3, implicit $mode, implicit $exec
 
@@ -298,7 +299,7 @@ body:             |
     ; GCN-NEXT: v_dual_fma_f32 v244 /*v500*/, v6, v7, v8 :: v_dual_add_f32 v3 /*v259*/, v4, v5
     $vgpr500, $vgpr259 = V_DUAL_FMA_F32_e64_X_ADD_F32_e32_e96_gfx1250 0, undef $vgpr6, 0, undef $vgpr7, 0, undef $vgpr8, 0, undef $vgpr4, 0, undef $vgpr5, implicit $mode, implicit $exec
 
-    ; GCN-NEXT: s_set_vgpr_msb 0xae
+    ; GCN-NEXT: s_set_vgpr_msb 0x40ae
     ; GCN-NEXT: v_dual_fmac_f32 v2 /*v514*/, v6 /*v518*/, v8 /*v776*/ :: v_dual_fma_f32 v3 /*v515*/, v4 /*v516*/, v7 /*v775*/, v3 /*v515*/
     $vgpr514, $vgpr515 = V_DUAL_FMAC_F32_e32_X_FMA_F32_e64_e96_gfx1250 0, undef $vgpr518, 0, undef $vgpr776, undef $vgpr514, 0, undef $vgpr516, 0, undef $vgpr775, 0, $vgpr515, implicit $mode, implicit $exec
 
@@ -319,31 +320,31 @@ body:             |
     ; GCN-NEXT: v_fmaak_f32 v0 /*v256*/, v1 /*v257*/, v2 /*v258*/, 0x1
     $vgpr256 = V_FMAAK_F32 undef $vgpr257, undef $vgpr258, 1, implicit $exec, implicit $mode
 
-    ; GCN-NEXT: s_set_vgpr_msb 5
+    ; GCN-NEXT: s_set_vgpr_msb 0x4505
     ; GCN-NEXT: v_fmaak_f32 v0, v1 /*v257*/, v2 /*v258*/, 0x1
     $vgpr0 = V_FMAAK_F32 undef $vgpr257, undef $vgpr258, 1, implicit $exec, implicit $mode
 
-    ; GCN-NEXT: s_set_vgpr_msb 0x41
+    ; GCN-NEXT: s_set_vgpr_msb 0x541
     ; GCN-NEXT: v_fmaak_f32 v0 /*v256*/, v1 /*v257*/, v2, 0x1
     $vgpr256 = V_FMAAK_F32 undef $vgpr257, undef $vgpr2, 1, implicit $exec, implicit $mode
 
-    ; GCN-NEXT: s_set_vgpr_msb 0x44
+    ; GCN-NEXT: s_set_vgpr_msb 0x4144
     ; GCN-NEXT: v_fmaak_f32 v0 /*v256*/, v1, v2 /*v258*/, 0x1
     $vgpr256 = V_FMAAK_F32 undef $vgpr1, undef $vgpr258, 1, implicit $exec, implicit $mode
 
-    ; GCN-NEXT: s_set_vgpr_msb 0x45
+    ; GCN-NEXT: s_set_vgpr_msb 0x4445
     ; GCN-NEXT: v_fmamk_f32 v0 /*v256*/, v1 /*v257*/, 0x1, v2 /*v258*/
     $vgpr256 = V_FMAMK_F32 undef $vgpr257, 1, undef $vgpr258, implicit $exec, implicit $mode
 
-    ; GCN-NEXT: s_set_vgpr_msb 5
+    ; GCN-NEXT: s_set_vgpr_msb 0x4505
     ; GCN-NEXT: v_fmamk_f32 v0, v1 /*v257*/, 0x1, v2 /*v258*/
     $vgpr0 = V_FMAMK_F32 undef $vgpr257, 1, undef $vgpr258, implicit $exec, implicit $mode
 
-    ; GCN-NEXT: s_set_vgpr_msb 0x41
+    ; GCN-NEXT: s_set_vgpr_msb 0x541
     ; GCN-NEXT: v_fmamk_f32 v0 /*v256*/, v1 /*v257*/, 0x1, v2
     $vgpr256 = V_FMAMK_F32 undef $vgpr257, 1, undef $vgpr2, implicit $exec, implicit $mode
 
-    ; GCN-NEXT: s_set_vgpr_msb 0x44
+    ; GCN-NEXT: s_set_vgpr_msb 0x4144
     ; GCN-NEXT: v_fmamk_f32 v0 /*v256*/, v1, 0x1, v2 /*v258*/
     $vgpr256 = V_FMAMK_F32 undef $vgpr1, 1, undef $vgpr258, implicit $exec, implicit $mode
 
@@ -389,15 +390,15 @@ body:             |
     ; GCN-NEXT: v_lshlrev_b32_e64 v0, v0 /*v256*/, v2
     $vgpr0 = V_LSHLREV_B32_e64 undef $vgpr256, undef $vgpr2, implicit $exec
 
-    ; GCN-NEXT: s_set_vgpr_msb 4
+    ; GCN-NEXT: s_set_vgpr_msb 0x104
     ; GCN-NEXT: v_lshlrev_b32_e64 v0, v1, v0 /*v256*/
     $vgpr0 = V_LSHLREV_B32_e64 undef $vgpr1, undef $vgpr256, implicit $exec
 
-    ; GCN-NEXT: s_set_vgpr_msb 1
+    ; GCN-NEXT: s_set_vgpr_msb 0x401
     ; GCN-NEXT: v_subrev_nc_u32_e32 v0, v0 /*v256*/, v2
     $vgpr0 = V_SUBREV_U32_e32 undef $vgpr256, undef $vgpr2, implicit $exec
 
-    ; GCN-NEXT: s_set_vgpr_msb 4
+    ; GCN-NEXT: s_set_vgpr_msb 0x104
     ; GCN-NEXT: v_subrev_nc_u32_e32 v0, v1, v0 /*v256*/
     $vgpr0 = V_SUBREV_U32_e32 undef $vgpr1, undef $vgpr256, implicit $exec
 
@@ -417,7 +418,7 @@ body:             |
     ; GCN-NEXT: v_fma_f32 v3 /*v259*/, v4 /*v260*/, v5 /*v261*/, v6 /*v262*/
     $vgpr259 = V_FMA_F32_e64 0, undef $vgpr260, 0, undef $vgpr261, 0, undef $vgpr262, 0, 0, implicit $exec, implicit $mode
 
-    ; GCN-NEXT: s_set_vgpr_msb 0
+    ; GCN-NEXT: s_set_vgpr_msb 0x5500
     ; GCN-NEXT: v_add_nc_u32_e32 v0, v1, v2
     $vgpr0 = V_ADD_U32_e32 undef $vgpr1, undef $vgpr2, implicit $exec
 
@@ -431,7 +432,7 @@ body:             |
     ; GCN-NEXT: v_add_nc_u32_e32 v0 /*v256*/, v1, v2
     $vgpr256 = V_ADD_U32_e32 undef $vgpr1, undef $vgpr2, implicit $exec
 
-    ; GCN-NEXT: s_set_vgpr_msb 0
+    ; GCN-NEXT: s_set_vgpr_msb 0x4000
     ; GCN-NEXT: v_fma_f32 v3, v4, v5, s2
     $vgpr3 = V_FMA_F32_e64 0, undef $vgpr4, 0, undef $vgpr5, 0, undef $sgpr2, 0, 0, implicit $exec, implicit $mode
 
@@ -439,17 +440,17 @@ body:             |
     ; GCN-NEXT: v_fma_f32 v3, v4 /*v260*/, v5, 1
     $vgpr3 = V_FMA_F32_e64 0, undef $vgpr260, 0, undef $vgpr5, 0, 1, 0, 0, implicit $exec, implicit $mode
 
-    ; GCN-NEXT: s_set_vgpr_msb 4
+    ; GCN-NEXT: s_set_vgpr_msb 0x104
     ; GCN-NEXT: v_mov_b32_e32 v0, v1
     $vgpr0 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
 
     ; GCN-NEXT: v_add_nc_u32_e32 v2, v1, v3 /*v259*/
     $vgpr2 = V_ADD_U32_e32 undef $vgpr1, undef $vgpr259, implicit $exec
 
-    ; GCN-NEXT: s_set_vgpr_msb 1
+    ; GCN-NEXT: s_set_vgpr_msb 0x401
     ; GCN-NEXT: v_mov_b32_e32 v0, v0 /*v256*/
     ; GCN-NEXT: v_add_nc_u32_e32 v1, v1 /*v257*/, v1
-    ; GCN-NEXT: s_set_vgpr_msb 5
+    ; GCN-NEXT: s_set_vgpr_msb 0x105
     ; GCN-NEXT: v_add_nc_u32_e32 v2, v2 /*v258*/, v2 /*v258*/
     $vgpr0 = V_MOV_B32_e32 undef $vgpr256, implicit $exec
     $vgpr1 = V_ADD_U32_e32 undef $vgpr257, undef $vgpr1, implicit $exec
@@ -478,16 +479,18 @@ body:             |
     ; ASM: .LBB{{.*_1}}:
     ; GCN-NEXT: s_set_vgpr_msb 64
     ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
+    ; GCN-NEXT: s_set_vgpr_msb 0x4000
     $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
 
-    ; No mode switch on fall through
+    ; Reset on fallthrough block end
 
   bb.2:
     ; ASM-NEXT: %bb.2:
-    ; GCN-NEXT: s_nop 0
-    ; GCN-NEXT: s_set_vgpr_msb 0
+    ; GCN-NEXT: s_set_vgpr_msb 64
+    ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
+    ; GCN-NEXT: s_set_vgpr_msb 0x4000
     ; GCN-NEXT: s_branch
-    S_NOP 0
+    $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
     S_BRANCH %bb.3
 
     ; Reset mode on terminator
@@ -496,7 +499,7 @@ body:             |
     ; ASM: .LBB{{.*_3}}:
     ; GCN-NEXT: s_set_vgpr_msb 64
     ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
-    ; GCN-NEXT: s_set_vgpr_msb 0
+    ; GCN-NEXT: s_set_vgpr_msb 0x4000
     ; GCN-NEXT: s_swap_pc_i64
     $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
     $exec = S_SWAPPC_B64 undef $sgpr0_sgpr1
@@ -518,7 +521,7 @@ body:             |
     ; GCN-NEXT: v_mov_b32_e32 v0, v1
     ; GCN-NEXT: s_set_vgpr_msb 64
     ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
-    ; GCN-NEXT: s_set_vgpr_msb 0
+    ; GCN-NEXT: s_set_vgpr_msb 0x4000
     ; GCN-NEXT: s_set_pc_i64
     $vgpr0 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
     $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
@@ -538,7 +541,7 @@ body:             |
     ; ASM-NEXT: %bb.7:
     ; GCN-NEXT: s_set_vgpr_msb 64
     ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
-    ; GCN-NEXT: s_set_vgpr_msb 0
+    ; GCN-NEXT: s_set_vgpr_msb 0x4000
     ; ASM-NEXT: ; return to shader part epilog
     $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
     SI_RETURN_TO_EPILOG undef $vgpr0, implicit-def $exec
@@ -556,7 +559,7 @@ body:             |
     ; ASM-NEXT: %bb.9:
     ; GCN-NEXT: s_set_vgpr_msb 64
     ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
-    ; GCN-NEXT: s_set_vgpr_msb 0
+    ; GCN-NEXT: s_set_vgpr_msb 0x4000
     ; GCN-NEXT: s_set_pc_i64
     $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
     S_SETPC_B64_return undef $sgpr0_sgpr1, implicit-def $exec
@@ -574,13 +577,14 @@ body:             |
     ; ASM: %bb.0:
     ; GCN-NEXT: s_set_vgpr_msb 64
     ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v0
+    ; GCN-NEXT: s_set_vgpr_msb 0x4000
     $vgpr256 = V_MOV_B32_e32 undef $vgpr0, implicit $exec
 
   bb.1:
     ; ASM: .LBB{{[0-9]+}}_1:
     ; GCN-NEXT: s_set_vgpr_msb 64
     ; GCN-NEXT: v_mov_b32_e32 v1 /*v257*/, v1
-    ; GCN-NEXT: s_set_vgpr_msb 0
+    ; GCN-NEXT: s_set_vgpr_msb 0x4000
     ; GCN-NEXT: s_cbranch_scc0
     $vgpr257 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
     S_CBRANCH_SCC0 %bb.1, undef implicit $scc
@@ -604,7 +608,7 @@ body:             |
     ; ASM: %bb.0:
     ; GCN-NEXT: s_set_vgpr_msb 64
     ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
-    ; GCN-NEXT: s_set_vgpr_msb 0
+    ; GCN-NEXT: s_set_vgpr_msb 0x4000
     ; ASM:      def v0
     ; GCN-NOT:  s_set_vgpr_msb
     ; ASM:      use v0
@@ -638,7 +642,7 @@ body:             |
     ; GCN-NEXT: s_set_vgpr_msb 64
     ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
     ; GCN-NEXT: s_nop 0
-    ; GCN-NEXT: s_set_vgpr_msb 1
+    ; GCN-NEXT: s_set_vgpr_msb 0x4001
     ; GCN-NEXT: v_mov_b32_e32 v1, v0 /*v256*/
     BUNDLE implicit-def $vgpr256 {
       $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
@@ -680,7 +684,7 @@ body:             |
 
     ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
     ; GCN-NEXT: v_mov_b32_e32 v1 /*v257*/, v1
-    ; GCN-NEXT: s_set_vgpr_msb 0
+    ; GCN-NEXT: s_set_vgpr_msb 0x4000
     ; GCN-NEXT: v_mov_b32_e32 v2, v1
     ; GCN-NEXT: v_mov_b32_e32 v3, v1
     BUNDLE implicit-def $vgpr256, implicit-def $vgpr257, implicit-def $vgpr2, implicit-def $vgpr3, implicit undef $vgpr1 {
@@ -709,7 +713,7 @@ body:             |
 
     ; GCN-NEXT: s_clause 0x3e
     ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
-    ; GCN-NEXT: s_set_vgpr_msb 0
+    ; GCN-NEXT: s_set_vgpr_msb 0x4000
     ; GCN-NEXT: v_mov_b32_e32 v1, v1
     ; GCN-NEXT: v_mov_b32_e32 v2, v1
     ; GCN-COUNT-60: v_mov_b32_e32 v1, v1
@@ -823,7 +827,7 @@ body:             |
     ; GCN-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[210:217], v[244:259] /*v[500:515]*/, v[244:259] /*v[500:515]*/, v[10:17], v1, v2
     $vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr1, undef $vgpr2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 
-    ; GCN-NEXT: s_set_vgpr_msb 0
+    ; GCN-NEXT: s_set_vgpr_msb 0x500
     ; GCN-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[210:217], v[100:115], v[100:115], v[10:17], v1, v2
     $vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115, undef $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr1, undef $vgpr2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 
@@ -835,11 +839,11 @@ body:             |
     ; GCN-NEXT: v_wmma_ld_scale16_paired_b64 v[0:1], v[2:3]
     V_WMMA_LD_SCALE16_PAIRED_B64 undef $vgpr0_vgpr1, undef $vgpr2_vgpr3, 0, 0, 0, 0, 0, 0, implicit $exec
 
-    ; GCN-NEXT: s_set_vgpr_msb 5
+    ; GCN-NEXT: s_set_vgpr_msb 0x105
     ; GCN-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[210:217], v[244:259] /*v[500:515]*/, v[244:259] /*v[500:515]*/, v[10:17], v[0:1], v[2:3]
     $vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE16_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr0_vgpr1, undef $vgpr2_vgpr3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 
-    ; GCN-NEXT: s_set_vgpr_msb 0
+    ; GCN-NEXT: s_set_vgpr_msb 0x500
     ; GCN-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[210:217], v[100:115], v[100:115], v[10:17], v[0:1], v[2:3]
     $vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE16_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115, undef $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr0_vgpr1, undef $vgpr2_vgpr3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
index db49339ea1f78..9c16b3c8a3f86 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
@@ -22,8 +22,6 @@
 ; GFX9-DAG: s_mov_b32 s[[DESC3:[0-9]+]], 0xe00000
 
 ; OFFREG is offset system SGPR
-; GCN: buffer_store_dword {{v[0-9]+}}, off, s[[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Spill
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s[[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Reload
 ; GCN: NumVgprs: 256
 ; GCN: ScratchSize: 640
 
diff --git a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
index 1b8e126f19ae1..a1381ecad81e2 100644
--- a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
@@ -945,7 +945,6 @@ body: |
     $vgpr0 = V_MOV_B32_e32 0, implicit $exec
 ...
 
-# FIXME: Missing S_WAIT_XCNT before overwriting vgpr0.
 ---
 name: wait_kmcnt_with_outstanding_vmem_2
 tracksRegLiveness: true
@@ -971,6 +970,7 @@ body: |
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   S_WAIT_KMCNT 0
   ; GCN-NEXT:   $sgpr2 = S_MOV_B32 $sgpr2
+  ; GCN-NEXT:   S_WAIT_XCNT 0
   ; GCN-NEXT:   $vgpr0 = V_MOV_B32_e32 0, implicit $exec
   bb.0:
     liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
@@ -985,6 +985,180 @@ body: |
     $vgpr0 = V_MOV_B32_e32 0, implicit $exec
 ...
 
+---
+name: wait_kmcnt_and_wait_loadcnt
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  ; GCN-LABEL: name: wait_kmcnt_and_wait_loadcnt
+  ; GCN: bb.0:
+  ; GCN-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; GCN-NEXT:   liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+  ; GCN-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit $scc
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.1:
+  ; GCN-NEXT:   successors: %bb.2(0x80000000)
+  ; GCN-NEXT:   liveins: $vgpr0_vgpr1, $sgpr2
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.2:
+  ; GCN-NEXT:   liveins: $sgpr2
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   S_WAIT_KMCNT 0
+  ; GCN-NEXT:   $sgpr2 = S_MOV_B32 $sgpr2
+  ; GCN-NEXT:   S_WAIT_LOADCNT 0
+  ; GCN-NEXT:   $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+  bb.0:
+    liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+    S_CBRANCH_SCC1 %bb.2, implicit $scc
+  bb.1:
+    liveins: $vgpr0_vgpr1, $sgpr2
+    $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+  bb.2:
+    liveins: $sgpr2
+    $sgpr2 = S_MOV_B32 $sgpr2
+    $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+...
+
+---
+name: implicit_handling_of_pending_vmem_group
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  ; GCN-LABEL: name: implicit_handling_of_pending_vmem_group
+  ; GCN: bb.0:
+  ; GCN-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; GCN-NEXT:   liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+  ; GCN-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit $scc
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.1:
+  ; GCN-NEXT:   successors: %bb.2(0x80000000)
+  ; GCN-NEXT:   liveins: $vgpr0_vgpr1, $sgpr2
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.2:
+  ; GCN-NEXT:   liveins: $sgpr0_sgpr1, $sgpr2
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   S_WAIT_KMCNT 0
+  ; GCN-NEXT:   $sgpr2 = S_MOV_B32 $sgpr2
+  ; GCN-NEXT:   $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+  ; GCN-NEXT:   $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GCN-NEXT:   S_WAIT_XCNT 0
+  ; GCN-NEXT:   $sgpr0 = S_MOV_B32 $sgpr0
+  bb.0:
+    liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+    S_CBRANCH_SCC1 %bb.2, implicit $scc
+  bb.1:
+    liveins: $vgpr0_vgpr1, $sgpr2
+    $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+  bb.2:
+    liveins: $sgpr0_sgpr1, $sgpr2
+    $sgpr2 = S_MOV_B32 $sgpr2
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $sgpr0 = S_MOV_B32 $sgpr0
+...
+
+---
+name: pending_vmem_event_between_block
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  ; GCN-LABEL: name: pending_vmem_event_between_block
+  ; GCN: bb.0:
+  ; GCN-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; GCN-NEXT:   liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+  ; GCN-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit $scc
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.1:
+  ; GCN-NEXT:   successors: %bb.2(0x80000000)
+  ; GCN-NEXT:   liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr2
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   $vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+  ; GCN-NEXT:   $vgpr5 = GLOBAL_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.2:
+  ; GCN-NEXT:   liveins: $sgpr0_sgpr1, $sgpr2, $vgpr2
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   S_WAIT_KMCNT 0
+  ; GCN-NEXT:   $sgpr2 = S_MOV_B32 $sgpr2
+  ; GCN-NEXT:   S_WAIT_XCNT 1
+  ; GCN-NEXT:   $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+  ; GCN-NEXT:   S_WAIT_XCNT 0
+  ; GCN-NEXT:   $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+  ; GCN-NEXT:   $sgpr0 = S_MOV_B32 $sgpr0
+  bb.0:
+    liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+    S_CBRANCH_SCC1 %bb.2, implicit $scc
+  bb.1:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr2
+    $vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+    $vgpr5 = GLOBAL_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec
+  bb.2:
+    liveins: $sgpr0_sgpr1, $sgpr2, $vgpr2
+    $sgpr2 = S_MOV_B32 $sgpr2
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+    $sgpr0 = S_MOV_B32 $sgpr0
+...
+
+---
+name: flushing_vmem_cnt_on_block_entry
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  ; GCN-LABEL: name: flushing_vmem_cnt_on_block_entry
+  ; GCN: bb.0:
+  ; GCN-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; GCN-NEXT:   liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+  ; GCN-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit $scc
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.1:
+  ; GCN-NEXT:   successors: %bb.2(0x80000000)
+  ; GCN-NEXT:   liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr2
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   $vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+  ; GCN-NEXT:   $vgpr5 = GLOBAL_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.2:
+  ; GCN-NEXT:   liveins: $sgpr0_sgpr1, $sgpr2, $vgpr2
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   S_WAIT_XCNT 0
+  ; GCN-NEXT:   $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+  ; GCN-NEXT:   $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+  ; GCN-NEXT:   $sgpr0 = S_MOV_B32 $sgpr0
+  bb.0:
+    liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+    S_CBRANCH_SCC1 %bb.2, implicit $scc
+  bb.1:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr2
+    $vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+    $vgpr5 = GLOBAL_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec
+  bb.2:
+    liveins: $sgpr0_sgpr1, $sgpr2, $vgpr2
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+    $sgpr0 = S_MOV_B32 $sgpr0
+...
+
 ---
 name: wait_loadcnt_with_outstanding_smem
 tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
index a42c8ac706d27..75817105e74fd 100644
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
@@ -3182,7 +3182,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v253 /*v509*/, s33 offset:1592
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v254 /*v510*/, s33 offset:1596
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v255 /*v511*/, s33 offset:1600
-; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 8 ; msbs: dst=0 src0=0 src1=2 src2=0
+; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 0x408 ; msbs: dst=0 src0=0 src1=2 src2=0
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v0 /*v512*/, s33 offset:1604
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v1 /*v513*/, s33 offset:1608
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v2 /*v514*/, s33 offset:1612
@@ -3443,7 +3443,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v253 /*v765*/, s33 offset:2616
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v254 /*v766*/, s33 offset:2620
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v255 /*v767*/, s33 offset:2624
-; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 12 ; msbs: dst=0 src0=0 src1=3 src2=0
+; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 0x80c ; msbs: dst=0 src0=0 src1=3 src2=0
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v0 /*v768*/, s33 offset:2628
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v1 /*v769*/, s33 offset:2632
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v2 /*v770*/, s33 offset:2636
@@ -3706,7 +3706,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v255 /*v1023*/, s33 offset:3648
 ; GFX1250-DAGISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-DAGISEL-NEXT:    s_mov_b32 exec_lo, -1
-; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 0xc00 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-DAGISEL-NEXT:    v_writelane_b32 v40, s0, 3
@@ -4135,7 +4135,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v253 /*v509*/, off, s33 offset:1592
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v254 /*v510*/, off, s33 offset:1596
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v255 /*v511*/, off, s33 offset:1600
-; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 0x80 ; msbs: dst=2 src0=0 src1=0 src2=0
+; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 0x4080 ; msbs: dst=2 src0=0 src1=0 src2=0
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v0 /*v512*/, off, s33 offset:1604
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v1 /*v513*/, off, s33 offset:1608
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v2 /*v514*/, off, s33 offset:1612
@@ -4396,7 +4396,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v253 /*v765*/, off, s33 offset:2616
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v254 /*v766*/, off, s33 offset:2620
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v255 /*v767*/, off, s33 offset:2624
-; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 0xc0 ; msbs: dst=3 src0=0 src1=0 src2=0
+; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 0x80c0 ; msbs: dst=3 src0=0 src1=0 src2=0
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v0 /*v768*/, off, s33 offset:2628
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v1 /*v769*/, off, s33 offset:2632
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v2 /*v770*/, off, s33 offset:2636
@@ -4661,7 +4661,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    s_mov_b32 exec_lo, s4
 ; GFX1250-DAGISEL-NEXT:    s_mov_b32 s33, s0
 ; GFX1250-DAGISEL-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 0xc000 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-DAGISEL-NEXT:    s_set_pc_i64 s[30:31]
   %ret = call amdgpu_gfx <2 x half>(<2 x half>, <2 x half>) @gfx_callee(<2 x half> %y, <2 x half> %x) convergent
   ret <2 x half> %ret
@@ -6346,7 +6346,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v253 /*v509*/, s32 offset:1588
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v254 /*v510*/, s32 offset:1592
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v255 /*v511*/, s32 offset:1596
-; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 8 ; msbs: dst=0 src0=0 src1=2 src2=0
+; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 0x408 ; msbs: dst=0 src0=0 src1=2 src2=0
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v0 /*v512*/, s32 offset:1600
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v1 /*v513*/, s32 offset:1604
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v2 /*v514*/, s32 offset:1608
@@ -6607,7 +6607,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v253 /*v765*/, s32 offset:2612
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v254 /*v766*/, s32 offset:2616
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v255 /*v767*/, s32 offset:2620
-; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 12 ; msbs: dst=0 src0=0 src1=3 src2=0
+; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 0x80c ; msbs: dst=0 src0=0 src1=3 src2=0
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v0 /*v768*/, s32 offset:2624
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v1 /*v769*/, s32 offset:2628
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v2 /*v770*/, s32 offset:2632
@@ -6872,7 +6872,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GFX1250-DAGISEL-NEXT:    s_mov_b32 exec_lo, -1
 ; GFX1250-DAGISEL-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1250-DAGISEL-NEXT:    s_mov_b64 s[36:37], gfx_callee@abs64
-; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 0xc00 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-DAGISEL-NEXT:    v_swap_b32 v0, v1
 ; GFX1250-DAGISEL-NEXT:    s_xor_b32 exec_lo, s0, -1
 ; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
@@ -7283,7 +7283,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v253 /*v509*/, off, s32 offset:1588
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v254 /*v510*/, off, s32 offset:1592
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v255 /*v511*/, off, s32 offset:1596
-; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 0x80 ; msbs: dst=2 src0=0 src1=0 src2=0
+; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 0x4080 ; msbs: dst=2 src0=0 src1=0 src2=0
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v0 /*v512*/, off, s32 offset:1600
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v1 /*v513*/, off, s32 offset:1604
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v2 /*v514*/, off, s32 offset:1608
@@ -7544,7 +7544,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v253 /*v765*/, off, s32 offset:2612
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v254 /*v766*/, off, s32 offset:2616
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v255 /*v767*/, off, s32 offset:2620
-; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 0xc0 ; msbs: dst=3 src0=0 src1=0 src2=0
+; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 0x80c0 ; msbs: dst=3 src0=0 src1=0 src2=0
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v0 /*v768*/, off, s32 offset:2624
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v1 /*v769*/, off, s32 offset:2628
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v2 /*v770*/, off, s32 offset:2632
@@ -7807,7 +7807,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v255 /*v1023*/, off, s32 offset:3644
 ; GFX1250-DAGISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 0xc000 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-DAGISEL-NEXT:    s_set_pc_i64 s[36:37]
   %ret = tail call amdgpu_gfx <2 x half>(<2 x half>, <2 x half>) @gfx_callee(<2 x half> %y, <2 x half> %x) convergent
   ret <2 x half> %ret
@@ -9657,7 +9657,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v253 /*v509*/, s33 offset:1600
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v254 /*v510*/, s33 offset:1604
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v255 /*v511*/, s33 offset:1608
-; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 8 ; msbs: dst=0 src0=0 src1=2 src2=0
+; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 0x408 ; msbs: dst=0 src0=0 src1=2 src2=0
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v0 /*v512*/, s33 offset:1612
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v1 /*v513*/, s33 offset:1616
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v2 /*v514*/, s33 offset:1620
@@ -9918,7 +9918,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v253 /*v765*/, s33 offset:2624
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v254 /*v766*/, s33 offset:2628
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v255 /*v767*/, s33 offset:2632
-; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 12 ; msbs: dst=0 src0=0 src1=3 src2=0
+; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 0x80c ; msbs: dst=0 src0=0 src1=3 src2=0
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v0 /*v768*/, s33 offset:2636
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v1 /*v769*/, s33 offset:2640
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v2 /*v770*/, s33 offset:2644
@@ -10181,7 +10181,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v255 /*v1023*/, s33 offset:3656
 ; GFX1250-DAGISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-DAGISEL-NEXT:    s_mov_b32 exec_lo, -1
-; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 0xc00 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-DAGISEL-NEXT:    s_clause 0x2
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v42, s33
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v40, s33 offset:164
@@ -10616,7 +10616,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v253 /*v509*/, off, s33 offset:1600
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v254 /*v510*/, off, s33 offset:1604
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v255 /*v511*/, off, s33 offset:1608
-; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 0x80 ; msbs: dst=2 src0=0 src1=0 src2=0
+; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 0x4080 ; msbs: dst=2 src0=0 src1=0 src2=0
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v0 /*v512*/, off, s33 offset:1612
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v1 /*v513*/, off, s33 offset:1616
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v2 /*v514*/, off, s33 offset:1620
@@ -10877,7 +10877,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v253 /*v765*/, off, s33 offset:2624
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v254 /*v766*/, off, s33 offset:2628
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v255 /*v767*/, off, s33 offset:2632
-; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 0xc0 ; msbs: dst=3 src0=0 src1=0 src2=0
+; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 0x80c0 ; msbs: dst=3 src0=0 src1=0 src2=0
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v0 /*v768*/, off, s33 offset:2636
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v1 /*v769*/, off, s33 offset:2640
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v2 /*v770*/, off, s33 offset:2644
@@ -11142,7 +11142,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    s_mov_b32 exec_lo, s4
 ; GFX1250-DAGISEL-NEXT:    s_mov_b32 s33, s0
 ; GFX1250-DAGISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 0xc000 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-DAGISEL-NEXT:    s_set_pc_i64 s[30:31]
   %ret = call float(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @callee, <8 x float> %x) convergent
   store float %ret, ptr %p
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index ad8dcd3888e9f..21f0c008366a9 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -3477,13 +3477,10 @@ define amdgpu_gs void @wqm_init_exec_wwm() {
 ; GFX9-W64-NEXT:    s_mov_b64 exec, 0
 ; GFX9-W64-NEXT:    s_mov_b32 s1, 0
 ; GFX9-W64-NEXT:    s_mov_b32 s0, s1
-; GFX9-W64-NEXT:    s_cmp_lg_u64 exec, 0
-; GFX9-W64-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GFX9-W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX9-W64-NEXT:    s_cmp_eq_u64 s[0:1], 0
 ; GFX9-W64-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX9-W64-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
-; GFX9-W64-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s[0:1]
-; GFX9-W64-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-W64-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
 ; GFX9-W64-NEXT:    exp mrt0 off, off, off, off
 ; GFX9-W64-NEXT:    s_endpgm
 ;
@@ -3491,14 +3488,11 @@ define amdgpu_gs void @wqm_init_exec_wwm() {
 ; GFX10-W32:       ; %bb.0:
 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, 0
 ; GFX10-W32-NEXT:    s_mov_b32 s1, 0
-; GFX10-W32-NEXT:    s_cmp_lg_u64 exec, 0
+; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-W32-NEXT:    s_mov_b32 s0, s1
-; GFX10-W32-NEXT:    s_cselect_b32 s2, -1, 0
-; GFX10-W32-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX10-W32-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-W32-NEXT:    s_cmp_eq_u64 s[0:1], 0
 ; GFX10-W32-NEXT:    s_cselect_b32 s0, -1, 0
-; GFX10-W32-NEXT:    s_xor_b32 s0, s2, s0
-; GFX10-W32-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s0
+; GFX10-W32-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s0
 ; GFX10-W32-NEXT:    exp mrt0 off, off, off, off
 ; GFX10-W32-NEXT:    s_endpgm
   call void @llvm.amdgcn.init.exec(i64 0)
diff --git a/llvm/test/CodeGen/ARM/fp-intrinsics.ll b/llvm/test/CodeGen/ARM/fp-intrinsics.ll
index 93b6a58a22b6c..cb87508d53342 100644
--- a/llvm/test/CodeGen/ARM/fp-intrinsics.ll
+++ b/llvm/test/CodeGen/ARM/fp-intrinsics.ll
@@ -76,7 +76,6 @@ define i32 @fptosi_f32(float %x) #0 {
 ; CHECK-NOSP: bl __aeabi_f2iz
 ; CHECK-NOSP: bl __aeabi_f2iz
 ; CHECK-SP: vcvt.s32.f32
-; FIXME-CHECK-SP: vcvt.s32.f32
 define void @fptosi_f32_twice(float %arg, ptr %ptr) #0 {
 entry:
   %conv = call i32 @llvm.experimental.constrained.fptosi.i32.f32(float %arg, metadata !"fpexcept.strict") #0
@@ -146,6 +145,80 @@ define float @tan_f32(float %x) #0 {
   ret float %val
 }
 
+; CHECK-LABEL: acos_f32:
+; CHECK: bl acosf
+define float @acos_f32(float %x, float %y) #0 {
+  %val = call float @llvm.experimental.constrained.acos.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %val
+}
+
+; CHECK-LABEL: asin_f32:
+; CHECK: bl asinf
+define float @asin_f32(float %x, float %y) #0 {
+  %val = call float @llvm.experimental.constrained.asin.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %val
+}
+
+; CHECK-LABEL: atan_f32:
+; CHECK: bl atanf
+define float @atan_f32(float %x, float %y) #0 {
+  %val = call float @llvm.experimental.constrained.atan.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %val
+}
+
+; CHECK-LABEL: cosh_f32:
+; CHECK: bl coshf
+define float @cosh_f32(float %x, float %y) #0 {
+  %val = call float @llvm.experimental.constrained.cosh.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %val
+}
+
+; CHECK-LABEL: sinh_f32:
+; CHECK: bl sinhf
+define float @sinh_f32(float %x, float %y) #0 {
+  %val = call float @llvm.experimental.constrained.sinh.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %val
+}
+
+; CHECK-LABEL: tanh_f32:
+; CHECK: bl tanhf
+define float @tanh_f32(float %x, float %y) #0 {
+  %val = call float @llvm.experimental.constrained.tanh.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %val
+}
+
+; CHECK-LABEL: fmuladd_f32:
+; CHECK-SP: vfma.f32
+; CHECK-NOSP: bl __aeabi_fmul
+; CHECK-NOSP: bl __aeabi_fadd
+define float @fmuladd_f32(float %x, float %y, float %z) #0 {
+  %val = call float @llvm.experimental.constrained.fmuladd.f32(float %x, float %y, float %z, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %val
+}
+
+; CHECK-LABEL: ldexp_f32:
+; CHECK: bl ldexpf
+define float @ldexp_f32(float %x, i32 %y) #0 {
+  %val = call float @llvm.experimental.constrained.ldexp.f32.i32(float %x, i32 %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %val
+}
+
+; CHECK-LABEL: roundeven_f32:
+; CHECK-SP-V8: vrintn.f32
+; CHECK-NOSP: bl roundevenf
+define float @roundeven_f32(float %x) #0 {
+  %val = call float @llvm.experimental.constrained.roundeven.f32(float %x, metadata !"fpexcept.strict") #0
+  ret float %val
+}
+
+; CHECK-LABEL: uitofp_f32_i32:
+; CHECK-NOSP: bl __aeabi_ui2f
+; FIXME-CHECK-SP: vcvt.f32.f64
+define float @uitofp_f32_i32(i32 %x) #0 {
+  %val = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %val
+}
+
 ; CHECK-LABEL: atan2_f32:
 ; CHECK: bl atan2f
 define float @atan2_f32(float %x, float %y) #0 {
@@ -617,6 +690,80 @@ define double @tan_f64(double %x) #0 {
   ret double %val
 }
 
+; CHECK-LABEL: acos_f64:
+; CHECK: bl acos
+define double @acos_f64(double %x, double %y) #0 {
+  %val = call double @llvm.experimental.constrained.acos.f64(double %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret double %val
+}
+
+; CHECK-LABEL: asin_f64:
+; CHECK: bl asin
+define double @asin_f64(double %x, double %y) #0 {
+  %val = call double @llvm.experimental.constrained.asin.f64(double %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret double %val
+}
+
+; CHECK-LABEL: atan_f64:
+; CHECK: bl atan
+define double @atan_f64(double %x, double %y) #0 {
+  %val = call double @llvm.experimental.constrained.atan.f64(double %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret double %val
+}
+
+; CHECK-LABEL: cosh_f64:
+; CHECK: bl cosh
+define double @cosh_f64(double %x, double %y) #0 {
+  %val = call double @llvm.experimental.constrained.cosh.f64(double %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret double %val
+}
+
+; CHECK-LABEL: sinh_f64:
+; CHECK: bl sinh
+define double @sinh_f64(double %x, double %y) #0 {
+  %val = call double @llvm.experimental.constrained.sinh.f64(double %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret double %val
+}
+
+; CHECK-LABEL: tanh_f64:
+; CHECK: bl tanh
+define double @tanh_f64(double %x, double %y) #0 {
+  %val = call double @llvm.experimental.constrained.tanh.f64(double %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret double %val
+}
+
+; CHECK-LABEL: fmuladd_f64:
+; CHECK-DP: vfma.f64
+; CHECK-NODP: bl __aeabi_dmul
+; CHECK-NODP: bl __aeabi_dadd
+define double @fmuladd_f64(double %x, double %y, double %z) #0 {
+  %val = call double @llvm.experimental.constrained.fmuladd.f64(double %x, double %y, double %z, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret double %val
+}
+
+; CHECK-LABEL: ldexp_f64:
+; CHECK: bl ldexp
+define double @ldexp_f64(double %x, i32 %y) #0 {
+  %val = call double @llvm.experimental.constrained.ldexp.f64.i32(double %x, i32 %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret double %val
+}
+
+; CHECK-LABEL: roundeven_f64:
+; CHECK-DP-V8: vrintn.f64
+; CHECK-NODP: bl roundeven
+define double @roundeven_f64(double %x) #0 {
+  %val = call double @llvm.experimental.constrained.roundeven.f64(double %x, metadata !"fpexcept.strict") #0
+  ret double %val
+}
+
+; CHECK-LABEL: uitofp_f64_i32:
+; CHECK-NOSP: bl __aeabi_ui2d
+; FIXME-CHECK-SP: vsub.f64
+define double @uitofp_f64_i32(i32 %x) #0 {
+  %val = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret double %val
+}
+
 ; CHECK-LABEL: atan2_f64:
 ; CHECK: bl atan2
 define double @atan2_f64(double %x, double %y) #0 {
@@ -1052,6 +1199,16 @@ declare float @llvm.experimental.constrained.powi.f32(float, i32, metadata, meta
 declare float @llvm.experimental.constrained.sin.f32(float, metadata, metadata)
 declare float @llvm.experimental.constrained.cos.f32(float, metadata, metadata)
 declare float @llvm.experimental.constrained.tan.f32(float, metadata, metadata)
+declare float @llvm.experimental.constrained.acos.f32(float, metadata, metadata)
+declare float @llvm.experimental.constrained.asin.f32(float, metadata, metadata)
+declare float @llvm.experimental.constrained.atan.f32(float, metadata, metadata)
+declare float @llvm.experimental.constrained.cosh.f32(float, metadata, metadata)
+declare float @llvm.experimental.constrained.sinh.f32(float, metadata, metadata)
+declare float @llvm.experimental.constrained.tanh.f32(float, metadata, metadata)
+declare float @llvm.experimental.constrained.fmuladd.f32(float, float, float, metadata, metadata)
+declare float @llvm.experimental.constrained.ldexp.f32.i32(float, i32, metadata, metadata)
+declare float @llvm.experimental.constrained.roundeven.f32(float, metadata)
+declare float @llvm.experimental.constrained.uitofp.f32.i32(i32, metadata, metadata)
 declare float @llvm.experimental.constrained.atan2.f32(float, float, metadata, metadata)
 declare float @llvm.experimental.constrained.pow.f32(float, float, metadata, metadata)
 declare float @llvm.experimental.constrained.log.f32(float, metadata, metadata)
@@ -1087,6 +1244,16 @@ declare double @llvm.experimental.constrained.powi.f64(double, i32, metadata, me
 declare double @llvm.experimental.constrained.sin.f64(double, metadata, metadata)
 declare double @llvm.experimental.constrained.cos.f64(double, metadata, metadata)
 declare double @llvm.experimental.constrained.tan.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.acos.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.asin.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.atan.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.cosh.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.sinh.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.tanh.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.fmuladd.f64(double, double, double, metadata, metadata)
+declare double @llvm.experimental.constrained.ldexp.f64.i32(double, i32, metadata, metadata)
+declare double @llvm.experimental.constrained.roundeven.f64(double, metadata)
+declare double @llvm.experimental.constrained.uitofp.f64.i32(i32, metadata, metadata)
 declare double @llvm.experimental.constrained.atan2.f64(double, double, metadata, metadata)
 declare double @llvm.experimental.constrained.pow.f64(double, double, metadata, metadata)
 declare double @llvm.experimental.constrained.log.f64(double, metadata, metadata)
diff --git a/llvm/test/CodeGen/ARM/fp16-fullfp16.ll b/llvm/test/CodeGen/ARM/fp16-fullfp16.ll
index 200b14bae56ed..b4060d5fdb574 100644
--- a/llvm/test/CodeGen/ARM/fp16-fullfp16.ll
+++ b/llvm/test/CodeGen/ARM/fp16-fullfp16.ll
@@ -98,12 +98,18 @@ define i32 @test_fptosi_i32(ptr %p) {
   ret i32 %r
 }
 
-; FIXME
-;define i64 @test_fptosi_i64(ptr %p) {
-;  %a = load half, ptr %p, align 2
-;  %r = fptosi half %a to i64
-;  ret i64 %r
-;}
+define i64 @test_fptosi_i64(ptr %p) {
+; CHECK-LABEL: test_fptosi_i64:
+; CHECK:         .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    ldrh r0, [r0]
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    bl __fixhfdi
+; CHECK-NEXT:    pop {r11, pc}
+  %a = load half, ptr %p, align 2
+  %r = fptosi half %a to i64
+  ret i64 %r
+}
 
 define i32 @test_fptoui_i32(ptr %p) {
 ; CHECK-LABEL: test_fptoui_i32:
@@ -116,12 +122,18 @@ define i32 @test_fptoui_i32(ptr %p) {
   ret i32 %r
 }
 
-; FIXME
-;define i64 @test_fptoui_i64(ptr %p) {
-;  %a = load half, ptr %p, align 2
-;  %r = fptoui half %a to i64
-;  ret i64 %r
-;}
+define i64 @test_fptoui_i64(ptr %p) {
+; CHECK-LABEL: test_fptoui_i64:
+; CHECK:         .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    ldrh r0, [r0]
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    bl __fixunshfdi
+; CHECK-NEXT:    pop {r11, pc}
+  %a = load half, ptr %p, align 2
+  %r = fptoui half %a to i64
+  ret i64 %r
+}
 
 define void @test_sitofp_i32(i32 %a, ptr %p) {
 ; CHECK-LABEL: test_sitofp_i32:
@@ -145,19 +157,31 @@ define void @test_uitofp_i32(i32 %a, ptr %p) {
   ret void
 }
 
-; FIXME
-;define void @test_sitofp_i64(i64 %a, ptr %p) {
-;  %r = sitofp i64 %a to half
-;  store half %r, ptr %p
-;  ret void
-;}
+define void @test_sitofp_i64(i64 %a, ptr %p) {
+; CHECK-LABEL: test_sitofp_i64:
+; CHECK:         .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    mov r4, r2
+; CHECK-NEXT:    bl __floatdihf
+; CHECK-NEXT:    vstr.16 s0, [r4]
+; CHECK-NEXT:    pop {r4, pc}
+  %r = sitofp i64 %a to half
+  store half %r, ptr %p
+  ret void
+}
 
-; FIXME
-;define void @test_uitofp_i64(i64 %a, ptr %p) {
-;  %r = uitofp i64 %a to half
-;  store half %r, ptr %p
-;  ret void
-;}
+define void @test_uitofp_i64(i64 %a, ptr %p) {
+; CHECK-LABEL: test_uitofp_i64:
+; CHECK:         .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    mov r4, r2
+; CHECK-NEXT:    bl __floatundihf
+; CHECK-NEXT:    vstr.16 s0, [r4]
+; CHECK-NEXT:    pop {r4, pc}
+  %r = uitofp i64 %a to half
+  store half %r, ptr %p
+  ret void
+}
 
 define void @test_fptrunc_float(float %f, ptr %p) {
 ; CHECK-LABEL: test_fptrunc_float:
@@ -613,6 +637,902 @@ define void @test_fmuladd(ptr %p, ptr %q, ptr %r) {
   ret void
 }
 
+; Half-precision intrinsics
+
+define half @add_f16(half %x, half %y) #0 {
+; CHECK-LABEL: add_f16:
+; CHECK:         vadd.f16 s0, s0, s1
+; CHECK-NEXT:    bx lr
+  %val = call half @llvm.experimental.constrained.fadd.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @sub_f16(half %x, half %y) #0 {
+; CHECK-LABEL: sub_f16:
+; CHECK:         vsub.f16 s0, s0, s1
+; CHECK-NEXT:    bx lr
+  %val = call half @llvm.experimental.constrained.fsub.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @mul_f16(half %x, half %y) #0 {
+; CHECK-LABEL: mul_f16:
+; CHECK:         vmul.f16 s0, s0, s1
+; CHECK-NEXT:    bx lr
+  %val = call half @llvm.experimental.constrained.fmul.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @div_f16(half %x, half %y) #0 {
+; CHECK-LABEL: div_f16:
+; CHECK:         vdiv.f16 s0, s0, s1
+; CHECK-NEXT:    bx lr
+  %val = call half @llvm.experimental.constrained.fdiv.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @frem_f16(half %x, half %y) #0 {
+; CHECK-LABEL: frem_f16:
+; CHECK:         .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-NEXT:    vcvtb.f32.f16 s1, s1
+; CHECK-NEXT:    bl fmodf
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-NEXT:    pop {r11, pc}
+  %val = call half @llvm.experimental.constrained.frem.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @fma_f16(half %x, half %y, half %z) #0 {
+; CHECK-LABEL: fma_f16:
+; CHECK:         vfma.f16 s2, s0, s1
+; CHECK-NEXT:    vmov.f32 s0, s2
+; CHECK-NEXT:    bx lr
+  %val = call half @llvm.experimental.constrained.fma.f16(half %x, half %y, half %z, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @fmuladd_f16(half %x, half %y, half %z) #0 {
+; CHECK-LABEL: fmuladd_f16:
+; CHECK:         vfma.f16 s2, s0, s1
+; CHECK-NEXT:    vmov.f32 s0, s2
+; CHECK-NEXT:    bx lr
+  %val = call half @llvm.experimental.constrained.fmuladd.f16(half %x, half %y, half %z, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define i32 @fptosi_i32_f16(half %x) #0 {
+; CHECK-LABEL: fptosi_i32_f16:
+; CHECK:         vcvt.s32.f16 s0, s0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    bx lr
+  %val = call i32 @llvm.experimental.constrained.fptosi.i32.f16(half %x, metadata !"fpexcept.strict") #0
+  ret i32 %val
+}
+
+define i32 @fptoui_i32_f16(half %x) #0 {
+; CHECK-LABEL: fptoui_i32_f16:
+; CHECK:         vcvt.s32.f16 s0, s0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    bx lr
+  %val = call i32 @llvm.experimental.constrained.fptoui.i32.f16(half %x, metadata !"fpexcept.strict") #0
+  ret i32 %val
+}
+
+define i64 @fptosi_i64_f16(half %x) #0 {
+; CHECK-LABEL: fptosi_i64_f16:
+; CHECK:         .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    vmov.f16 r0, s0
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    bl __fixhfdi
+; CHECK-NEXT:    pop {r11, pc}
+  %val = call i64 @llvm.experimental.constrained.fptosi.i64.f16(half %x, metadata !"fpexcept.strict") #0
+  ret i64 %val
+}
+
+define i64 @fptoui_i64_f16(half %x) #0 {
+; CHECK-LABEL: fptoui_i64_f16:
+; CHECK:         .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    vmov.f16 r0, s0
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    bl __fixunshfdi
+; CHECK-NEXT:    pop {r11, pc}
+  %val = call i64 @llvm.experimental.constrained.fptoui.i64.f16(half %x, metadata !"fpexcept.strict") #0
+  ret i64 %val
+}
+
+define half @sitofp_f16_i32(i32 %x) #0 {
+; CHECK-LABEL: sitofp_f16_i32:
+; CHECK:         .pad #8
+; CHECK-NEXT:    sub sp, sp, #8
+; CHECK-NEXT:    movw r1, #0
+; CHECK-NEXT:    eor r0, r0, #-2147483648
+; CHECK-NEXT:    movt r1, #17200
+; CHECK-NEXT:    str r0, [sp]
+; CHECK-NEXT:    str r1, [sp, #4]
+; CHECK-NEXT:    vldr d16, .LCPI57_0
+; CHECK-NEXT:    vldr d17, [sp]
+; CHECK-NEXT:    vsub.f64 d16, d17, d16
+; CHECK-NEXT:    vcvtb.f16.f64 s0, d16
+; CHECK-NEXT:    add sp, sp, #8
+; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    .p2align 3
+; CHECK-NEXT:  .LCPI57_0:
+; CHECK-NEXT:    .long 2147483648
+; CHECK-NEXT:    .long 1127219200
+  %val = call half @llvm.experimental.constrained.sitofp.f16.i32(i32 %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @uitofp_f16_i32(i32 %x) #0 {
+; CHECK-LABEL: uitofp_f16_i32:
+; CHECK:         .pad #8
+; CHECK-NEXT:    sub sp, sp, #8
+; CHECK-NEXT:    movw r1, #0
+; CHECK-NEXT:    str r0, [sp]
+; CHECK-NEXT:    movt r1, #17200
+; CHECK-NEXT:    vldr d16, .LCPI58_0
+; CHECK-NEXT:    str r1, [sp, #4]
+; CHECK-NEXT:    vldr d17, [sp]
+; CHECK-NEXT:    vsub.f64 d16, d17, d16
+; CHECK-NEXT:    vcvtb.f16.f64 s0, d16
+; CHECK-NEXT:    add sp, sp, #8
+; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    .p2align 3
+; CHECK-NEXT:  .LCPI58_0:
+; CHECK-NEXT:    .long 0
+; CHECK-NEXT:    .long 1127219200
+  %val = call half @llvm.experimental.constrained.uitofp.f16.i32(i32 %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @sitofp_f16_i64(i64 %x) #0 {
+; CHECK-LABEL: sitofp_f16_i64:
+; CHECK:         .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    bl __floatdihf
+; CHECK-NEXT:    pop {r11, pc}
+  %val = call half @llvm.experimental.constrained.sitofp.f16.i64(i64 %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @uitofp_f16_i64(i64 %x) #0 {
+; CHECK-LABEL: uitofp_f16_i64:
+; CHECK:         .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    bl __floatundihf
+; CHECK-NEXT:    pop {r11, pc}
+  %val = call half @llvm.experimental.constrained.uitofp.f16.i64(i64 %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @sitofp_f16_i128(i128 %x) #0 {
+; CHECK-LABEL: sitofp_f16_i128:
+; CHECK:         .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    bl __floattihf
+; CHECK-NEXT:    pop {r11, pc}
+  %val = call half @llvm.experimental.constrained.sitofp.f16.i128(i128 %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @uitofp_f16_i128(i128 %x) #0 {
+; CHECK-LABEL: uitofp_f16_i128:
+; CHECK:         .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    bl __floatuntihf
+; CHECK-NEXT:    pop {r11, pc}
+  %val = call half @llvm.experimental.constrained.uitofp.f16.i128(i128 %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @sqrt_f16(half %x) #0 {
+; CHECK-LABEL: sqrt_f16:
+; CHECK:         vsqrt.f16 s0, s0
+; CHECK-NEXT:    bx lr
+  %val = call half @llvm.experimental.constrained.sqrt.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @powi_f16(half %x, i32 %y) #0 {
+; CHECK-LABEL: powi_f16:
+; CHECK:         .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-NEXT:    bl __powisf2
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-NEXT:    pop {r11, pc}
+  %val = call half @llvm.experimental.constrained.powi.f16(half %x, i32 %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @sin_f16(half %x) #0 {
+; CHECK-LABEL: sin_f16:
+; CHECK:         .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-NEXT:    bl sinf
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-NEXT:    pop {r11, pc}
+  %val = call half @llvm.experimental.constrained.sin.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @cos_f16(half %x) #0 {
+; CHECK-LABEL: cos_f16:
+; CHECK:         .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-NEXT:    bl cosf
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-NEXT:    pop {r11, pc}
+  %val = call half @llvm.experimental.constrained.cos.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @tan_f16(half %x) #0 {
+; CHECK-LABEL: tan_f16:
+; CHECK:         .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-NEXT:    bl tanf
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-NEXT:    pop {r11, pc}
+  %val = call half @llvm.experimental.constrained.tan.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @asin_f16(half %x) #0 {
+; CHECK-LABEL: asin_f16:
+; CHECK:         .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-NEXT:    bl asinf
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-NEXT:    pop {r11, pc}
+  %val = call half @llvm.experimental.constrained.asin.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @acos_f16(half %x) #0 {
+; CHECK-LABEL: acos_f16:
+; CHECK:         .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-NEXT:    bl acosf
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-NEXT:    pop {r11, pc}
+  %val = call half @llvm.experimental.constrained.acos.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @atan_f16(half %x) #0 {
+; CHECK-LABEL: atan_f16:
+; CHECK:         .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-NEXT:    bl atanf
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-NEXT:    pop {r11, pc}
+  %val = call half @llvm.experimental.constrained.atan.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @atan2_f16(half %x, half %y) #0 {
+; CHECK-LABEL: atan2_f16:
+; CHECK:         .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-NEXT:    vcvtb.f32.f16 s1, s1
+; CHECK-NEXT:    bl atan2f
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-NEXT:    pop {r11, pc}
+  %val = call half @llvm.experimental.constrained.atan2.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @sinh_f16(half %x) #0 {
+; CHECK-LABEL: sinh_f16:
+; CHECK:         .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-NEXT:    bl sinhf
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-NEXT:    pop {r11, pc}
+  %val = call half @llvm.experimental.constrained.sinh.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @cosh_f16(half %x) #0 {
+; CHECK-LABEL: cosh_f16:
+; CHECK:         .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-NEXT:    bl coshf
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-NEXT:    pop {r11, pc}
+  %val = call half @llvm.experimental.constrained.cosh.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @tanh_f16(half %x) #0 {
+; CHECK-LABEL: tanh_f16:
+; CHECK:         .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-NEXT:    bl tanhf
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-NEXT:    pop {r11, pc}
+  %val = call half @llvm.experimental.constrained.tanh.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @pow_f16(half %x, half %y) #0 {
+; CHECK-LABEL: pow_f16:
+; CHECK:         .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-NEXT:    vcvtb.f32.f16 s1, s1
+; CHECK-NEXT:    bl powf
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-NEXT:    pop {r11, pc}
+  %val = call half @llvm.experimental.constrained.pow.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @log_f16(half %x) #0 {
+; CHECK-LABEL: log_f16:
+; CHECK:         .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-NEXT:    bl logf
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-NEXT:    pop {r11, pc}
+  %val = call half @llvm.experimental.constrained.log.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @log10_f16(half %x) #0 {
+; CHECK-LABEL: log10_f16:
+; CHECK:         .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-NEXT:    bl log10f
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-NEXT:    pop {r11, pc}
+  %val = call half @llvm.experimental.constrained.log10.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @log2_f16(half %x) #0 {
+; CHECK-LABEL: log2_f16:
+; CHECK:         .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-NEXT:    bl log2f
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-NEXT:    pop {r11, pc}
+  %val = call half @llvm.experimental.constrained.log2.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @exp_f16(half %x) #0 {
+; CHECK-LABEL: exp_f16:
+; CHECK:         .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-NEXT:    bl expf
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-NEXT:    pop {r11, pc}
+  %val = call half @llvm.experimental.constrained.exp.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @exp2_f16(half %x) #0 {
+; CHECK-LABEL: exp2_f16:
+; CHECK:         .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-NEXT:    bl exp2f
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-NEXT:    pop {r11, pc}
+  %val = call half @llvm.experimental.constrained.exp2.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @rint_f16(half %x) #0 {
+; CHECK-LABEL: rint_f16:
+; CHECK:         vrintx.f16 s0, s0
+; CHECK-NEXT:    bx lr
+  %val = call half @llvm.experimental.constrained.rint.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @nearbyint_f16(half %x) #0 {
+; CHECK-LABEL: nearbyint_f16:
+; CHECK:         vrintr.f16 s0, s0
+; CHECK-NEXT:    bx lr
+  %val = call half @llvm.experimental.constrained.nearbyint.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define i32 @lrint_f16(half %x) #0 {
+; CHECK-LABEL: lrint_f16:
+; CHECK:         .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-NEXT:    bl lrintf
+; CHECK-NEXT:    pop {r11, pc}
+  %val = call i32 @llvm.experimental.constrained.lrint.i32.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret i32 %val
+}
+
+define i64 @llrint_f16(half %x) #0 {
+; CHECK-LABEL: llrint_f16:
+; CHECK:         .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-NEXT:    bl llrintf
+; CHECK-NEXT:    pop {r11, pc}
+  %val = call i64 @llvm.experimental.constrained.llrint.i64.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret i64 %val
+}
+
+define half @maxnum_f16(half %x, half %y) #0 {
+; CHECK-LABEL: maxnum_f16:
+; CHECK:         vmaxnm.f16 s0, s0, s1
+; CHECK-NEXT:    bx lr
+  %val = call half @llvm.experimental.constrained.maxnum.f16(half %x, half %y, metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @minnum_f16(half %x, half %y) #0 {
+; CHECK-LABEL: minnum_f16:
+; CHECK:         vminnm.f16 s0, s0, s1
+; CHECK-NEXT:    bx lr
+  %val = call half @llvm.experimental.constrained.minnum.f16(half %x, half %y, metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @ceil_f16(half %x) #0 {
+; CHECK-LABEL: ceil_f16:
+; CHECK:         vrintp.f16 s0, s0
+; CHECK-NEXT:    bx lr
+  %val = call half @llvm.experimental.constrained.ceil.f16(half %x, metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @floor_f16(half %x) #0 {
+; CHECK-LABEL: floor_f16:
+; CHECK:         vrintm.f16 s0, s0
+; CHECK-NEXT:    bx lr
+  %val = call half @llvm.experimental.constrained.floor.f16(half %x, metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define i32 @lround_f16(half %x) #0 {
+; CHECK-LABEL: lround_f16:
+; CHECK:         .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-NEXT:    bl lroundf
+; CHECK-NEXT:    pop {r11, pc}
+  %val = call i32 @llvm.experimental.constrained.lround.i32.f16(half %x, metadata !"fpexcept.strict") #0
+  ret i32 %val
+}
+
+define i64 @llround_f16(half %x) #0 {
+; CHECK-LABEL: llround_f16:
+; CHECK:         .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-NEXT:    bl llroundf
+; CHECK-NEXT:    pop {r11, pc}
+  %val = call i64 @llvm.experimental.constrained.llround.i64.f16(half %x, metadata !"fpexcept.strict") #0
+  ret i64 %val
+}
+
+define half @round_f16(half %x) #0 {
+; CHECK-LABEL: round_f16:
+; CHECK:         vrinta.f16 s0, s0
+; CHECK-NEXT:    bx lr
+  %val = call half @llvm.experimental.constrained.round.f16(half %x, metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @roundeven_f16(half %x) #0 {
+; CHECK-LABEL: roundeven_f16:
+; CHECK:         vrintn.f16 s0, s0
+; CHECK-NEXT:    bx lr
+  %val = call half @llvm.experimental.constrained.roundeven.f16(half %x, metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @trunc_f16(half %x) #0 {
+; CHECK-LABEL: trunc_f16:
+; CHECK:         vrintz.f16 s0, s0
+; CHECK-NEXT:    bx lr
+  %val = call half @llvm.experimental.constrained.trunc.f16(half %x, metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define half @ldexp_f16(half %x, i32 %y) #0 {
+; CHECK-LABEL: ldexp_f16:
+; CHECK:         .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-NEXT:    bl ldexpf
+; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-NEXT:    pop {r11, pc}
+  %val = call half @llvm.experimental.constrained.ldexp.f16.i32(half %x, i32 %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define i32 @fcmp_olt_f16(half %a, half %b) #0 {
+; CHECK-LABEL: fcmp_olt_f16:
+; CHECK:         vcmp.f16 s0, s1
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    movwmi r0, #1
+; CHECK-NEXT:    bx lr
+  %cmp = call i1 @llvm.experimental.constrained.fcmp.f16(half %a, half %b, metadata !"olt", metadata !"fpexcept.strict") #0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @fcmp_ole_f16(half %a, half %b) #0 {
+; CHECK-LABEL: fcmp_ole_f16:
+; CHECK:         vcmp.f16 s0, s1
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    movwls r0, #1
+; CHECK-NEXT:    bx lr
+  %cmp = call i1 @llvm.experimental.constrained.fcmp.f16(half %a, half %b, metadata !"ole", metadata !"fpexcept.strict") #0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @fcmp_ogt_f16(half %a, half %b) #0 {
+; CHECK-LABEL: fcmp_ogt_f16:
+; CHECK:         vcmp.f16 s0, s1
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    movwgt r0, #1
+; CHECK-NEXT:    bx lr
+  %cmp = call i1 @llvm.experimental.constrained.fcmp.f16(half %a, half %b, metadata !"ogt", metadata !"fpexcept.strict") #0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @fcmp_oge_f16(half %a, half %b) #0 {
+; CHECK-LABEL: fcmp_oge_f16:
+; CHECK:         vcmp.f16 s0, s1
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    movwge r0, #1
+; CHECK-NEXT:    bx lr
+  %cmp = call i1 @llvm.experimental.constrained.fcmp.f16(half %a, half %b, metadata !"oge", metadata !"fpexcept.strict") #0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @fcmp_oeq_f16(half %a, half %b) #0 {
+; CHECK-LABEL: fcmp_oeq_f16:
+; CHECK:         vcmp.f16 s0, s1
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    movweq r0, #1
+; CHECK-NEXT:    bx lr
+  %cmp = call i1 @llvm.experimental.constrained.fcmp.f16(half %a, half %b, metadata !"oeq", metadata !"fpexcept.strict") #0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @fcmp_one_f16(half %a, half %b) #0 {
+; CHECK-LABEL: fcmp_one_f16:
+; CHECK:         vcmp.f16 s0, s1
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    movwmi r0, #1
+; CHECK-NEXT:    movwgt r0, #1
+; CHECK-NEXT:    bx lr
+  %cmp = call i1 @llvm.experimental.constrained.fcmp.f16(half %a, half %b, metadata !"one", metadata !"fpexcept.strict") #0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @fcmp_ult_f16(half %a, half %b) #0 {
+; CHECK-LABEL: fcmp_ult_f16:
+; CHECK:         vcmp.f16 s0, s1
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    movwlt r0, #1
+; CHECK-NEXT:    bx lr
+  %cmp = call i1 @llvm.experimental.constrained.fcmp.f16(half %a, half %b, metadata !"ult", metadata !"fpexcept.strict") #0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @fcmp_ule_f16(half %a, half %b) #0 {
+; CHECK-LABEL: fcmp_ule_f16:
+; CHECK:         vcmp.f16 s0, s1
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    movwle r0, #1
+; CHECK-NEXT:    bx lr
+  %cmp = call i1 @llvm.experimental.constrained.fcmp.f16(half %a, half %b, metadata !"ule", metadata !"fpexcept.strict") #0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @fcmp_ugt_f16(half %a, half %b) #0 {
+; CHECK-LABEL: fcmp_ugt_f16:
+; CHECK:         vcmp.f16 s0, s1
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    movwhi r0, #1
+; CHECK-NEXT:    bx lr
+  %cmp = call i1 @llvm.experimental.constrained.fcmp.f16(half %a, half %b, metadata !"ugt", metadata !"fpexcept.strict") #0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @fcmp_uge_f16(half %a, half %b) #0 {
+; CHECK-LABEL: fcmp_uge_f16:
+; CHECK:         vcmp.f16 s0, s1
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    movwpl r0, #1
+; CHECK-NEXT:    bx lr
+  %cmp = call i1 @llvm.experimental.constrained.fcmp.f16(half %a, half %b, metadata !"uge", metadata !"fpexcept.strict") #0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @fcmp_ueq_f16(half %a, half %b) #0 {
+; CHECK-LABEL: fcmp_ueq_f16:
+; CHECK:         vcmp.f16 s0, s1
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    movweq r0, #1
+; CHECK-NEXT:    movwvs r0, #1
+; CHECK-NEXT:    bx lr
+  %cmp = call i1 @llvm.experimental.constrained.fcmp.f16(half %a, half %b, metadata !"ueq", metadata !"fpexcept.strict") #0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @fcmp_une_f16(half %a, half %b) #0 {
+; CHECK-LABEL: fcmp_une_f16:
+; CHECK:         vcmp.f16 s0, s1
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    movwne r0, #1
+; CHECK-NEXT:    bx lr
+  %cmp = call i1 @llvm.experimental.constrained.fcmp.f16(half %a, half %b, metadata !"une", metadata !"fpexcept.strict") #0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @fcmps_olt_f16(half %a, half %b) #0 {
+; CHECK-LABEL: fcmps_olt_f16:
+; CHECK:         vcmpe.f16 s0, s1
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    movwmi r0, #1
+; CHECK-NEXT:    bx lr
+  %cmp = call i1 @llvm.experimental.constrained.fcmps.f16(half %a, half %b, metadata !"olt", metadata !"fpexcept.strict") #0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @fcmps_ole_f16(half %a, half %b) #0 {
+; CHECK-LABEL: fcmps_ole_f16:
+; CHECK:         vcmpe.f16 s0, s1
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    movwls r0, #1
+; CHECK-NEXT:    bx lr
+  %cmp = call i1 @llvm.experimental.constrained.fcmps.f16(half %a, half %b, metadata !"ole", metadata !"fpexcept.strict") #0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @fcmps_ogt_f16(half %a, half %b) #0 {
+; CHECK-LABEL: fcmps_ogt_f16:
+; CHECK:         vcmpe.f16 s0, s1
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    movwgt r0, #1
+; CHECK-NEXT:    bx lr
+  %cmp = call i1 @llvm.experimental.constrained.fcmps.f16(half %a, half %b, metadata !"ogt", metadata !"fpexcept.strict") #0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @fcmps_oge_f16(half %a, half %b) #0 {
+; CHECK-LABEL: fcmps_oge_f16:
+; CHECK:         vcmpe.f16 s0, s1
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    movwge r0, #1
+; CHECK-NEXT:    bx lr
+  %cmp = call i1 @llvm.experimental.constrained.fcmps.f16(half %a, half %b, metadata !"oge", metadata !"fpexcept.strict") #0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @fcmps_oeq_f16(half %a, half %b) #0 {
+; CHECK-LABEL: fcmps_oeq_f16:
+; CHECK:         vcmpe.f16 s0, s1
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    movweq r0, #1
+; CHECK-NEXT:    bx lr
+  %cmp = call i1 @llvm.experimental.constrained.fcmps.f16(half %a, half %b, metadata !"oeq", metadata !"fpexcept.strict") #0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @fcmps_one_f16(half %a, half %b) #0 {
+; CHECK-LABEL: fcmps_one_f16:
+; CHECK:         vcmpe.f16 s0, s1
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    movwmi r0, #1
+; CHECK-NEXT:    movwgt r0, #1
+; CHECK-NEXT:    bx lr
+  %cmp = call i1 @llvm.experimental.constrained.fcmps.f16(half %a, half %b, metadata !"one", metadata !"fpexcept.strict") #0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @fcmps_ult_f16(half %a, half %b) #0 {
+; CHECK-LABEL: fcmps_ult_f16:
+; CHECK:         vcmpe.f16 s0, s1
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    movwlt r0, #1
+; CHECK-NEXT:    bx lr
+  %cmp = call i1 @llvm.experimental.constrained.fcmps.f16(half %a, half %b, metadata !"ult", metadata !"fpexcept.strict") #0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @fcmps_ule_f16(half %a, half %b) #0 {
+; CHECK-LABEL: fcmps_ule_f16:
+; CHECK:         vcmpe.f16 s0, s1
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    movwle r0, #1
+; CHECK-NEXT:    bx lr
+  %cmp = call i1 @llvm.experimental.constrained.fcmps.f16(half %a, half %b, metadata !"ule", metadata !"fpexcept.strict") #0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @fcmps_ugt_f16(half %a, half %b) #0 {
+; CHECK-LABEL: fcmps_ugt_f16:
+; CHECK:         vcmpe.f16 s0, s1
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    movwhi r0, #1
+; CHECK-NEXT:    bx lr
+  %cmp = call i1 @llvm.experimental.constrained.fcmps.f16(half %a, half %b, metadata !"ugt", metadata !"fpexcept.strict") #0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @fcmps_uge_f16(half %a, half %b) #0 {
+; CHECK-LABEL: fcmps_uge_f16:
+; CHECK:         vcmpe.f16 s0, s1
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    movwpl r0, #1
+; CHECK-NEXT:    bx lr
+  %cmp = call i1 @llvm.experimental.constrained.fcmps.f16(half %a, half %b, metadata !"uge", metadata !"fpexcept.strict") #0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @fcmps_ueq_f16(half %a, half %b) #0 {
+; CHECK-LABEL: fcmps_ueq_f16:
+; CHECK:         vcmpe.f16 s0, s1
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    movweq r0, #1
+; CHECK-NEXT:    movwvs r0, #1
+; CHECK-NEXT:    bx lr
+  %cmp = call i1 @llvm.experimental.constrained.fcmps.f16(half %a, half %b, metadata !"ueq", metadata !"fpexcept.strict") #0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @fcmps_une_f16(half %a, half %b) #0 {
+; CHECK-LABEL: fcmps_une_f16:
+; CHECK:         vcmpe.f16 s0, s1
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    movwne r0, #1
+; CHECK-NEXT:    bx lr
+  %cmp = call i1 @llvm.experimental.constrained.fcmps.f16(half %a, half %b, metadata !"une", metadata !"fpexcept.strict") #0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+
+; Intrinsics to convert between floating-point types
+
+define half @fptrunc_f16_f32(float %x) #0 {
+; CHECK-LABEL: fptrunc_f16_f32:
+; CHECK:         vcvtb.f16.f32 s0, s0
+; CHECK-NEXT:    bx lr
+  %val = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
+define float @fpext_f32_f16(half %x) #0 {
+; CHECK-LABEL: fpext_f32_f16:
+; CHECK:         vcvtb.f32.f16 s0, s0
+; CHECK-NEXT:    bx lr
+  %val = call float @llvm.experimental.constrained.fpext.f32.f16(half %x, metadata !"fpexcept.strict") #0
+  ret float %val
+}
+
+
+attributes #0 = { strictfp }
+
+declare half @llvm.experimental.constrained.fadd.f16(half, half, metadata, metadata)
+declare half @llvm.experimental.constrained.fsub.f16(half, half, metadata, metadata)
+declare half @llvm.experimental.constrained.fmul.f16(half, half, metadata, metadata)
+declare half @llvm.experimental.constrained.fdiv.f16(half, half, metadata, metadata)
+declare half @llvm.experimental.constrained.frem.f16(half, half, metadata, metadata)
+declare half @llvm.experimental.constrained.fma.f16(half, half, half, metadata, metadata)
+declare half @llvm.experimental.constrained.fmuladd.f16(half, half, half, metadata, metadata)
+declare i32 @llvm.experimental.constrained.fptosi.i32.f16(half, metadata)
+declare i32 @llvm.experimental.constrained.fptoui.i32.f16(half, metadata)
+declare i64 @llvm.experimental.constrained.fptosi.i64.f16(half, metadata)
+declare i64 @llvm.experimental.constrained.fptoui.i64.f16(half, metadata)
+declare half @llvm.experimental.constrained.sitofp.f16.i32(i32, metadata, metadata)
+declare half @llvm.experimental.constrained.uitofp.f16.i32(i32, metadata, metadata)
+declare half @llvm.experimental.constrained.sitofp.f16.i64(i64, metadata, metadata)
+declare half @llvm.experimental.constrained.uitofp.f16.i64(i64, metadata, metadata)
+declare half @llvm.experimental.constrained.sitofp.f16.i128(i128, metadata, metadata)
+declare half @llvm.experimental.constrained.uitofp.f16.i128(i128, metadata, metadata)
+declare half @llvm.experimental.constrained.sqrt.f16(half, metadata, metadata)
+declare half @llvm.experimental.constrained.powi.f16(half, i32, metadata, metadata)
+declare half @llvm.experimental.constrained.sin.f16(half, metadata, metadata)
+declare half @llvm.experimental.constrained.cos.f16(half, metadata, metadata)
+declare half @llvm.experimental.constrained.tan.f16(half, metadata, metadata)
+declare half @llvm.experimental.constrained.pow.f16(half, half, metadata, metadata)
+declare half @llvm.experimental.constrained.log.f16(half, metadata, metadata)
+declare half @llvm.experimental.constrained.log10.f16(half, metadata, metadata)
+declare half @llvm.experimental.constrained.log2.f16(half, metadata, metadata)
+declare half @llvm.experimental.constrained.exp.f16(half, metadata, metadata)
+declare half @llvm.experimental.constrained.exp2.f16(half, metadata, metadata)
+declare half @llvm.experimental.constrained.rint.f16(half, metadata, metadata)
+declare half @llvm.experimental.constrained.nearbyint.f16(half, metadata, metadata)
+declare i32 @llvm.experimental.constrained.lrint.i32.f16(half, metadata, metadata)
+declare i64 @llvm.experimental.constrained.llrint.i64.f16(half, metadata, metadata)
+declare half @llvm.experimental.constrained.maxnum.f16(half, half, metadata)
+declare half @llvm.experimental.constrained.minnum.f16(half, half, metadata)
+declare half @llvm.experimental.constrained.ceil.f16(half, metadata)
+declare half @llvm.experimental.constrained.floor.f16(half, metadata)
+declare i32 @llvm.experimental.constrained.lround.i32.f16(half, metadata)
+declare i64 @llvm.experimental.constrained.llround.i64.f16(half, metadata)
+declare half @llvm.experimental.constrained.round.f16(half, metadata)
+declare half @llvm.experimental.constrained.roundeven.f16(half, metadata)
+declare half @llvm.experimental.constrained.trunc.f16(half, metadata)
+declare i1 @llvm.experimental.constrained.fcmps.f16(half, half, metadata, metadata)
+declare i1 @llvm.experimental.constrained.fcmp.f16(half, half, metadata, metadata)
+
+declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata)
+declare float @llvm.experimental.constrained.fpext.f32.f16(half, metadata)
+
+
 declare half @llvm.sqrt.f16(half %a)
 declare half @llvm.powi.f16.i32(half %a, i32 %b)
 declare half @llvm.sin.f16(half %a)
diff --git a/llvm/test/CodeGen/ARM/strict-fp-func.ll b/llvm/test/CodeGen/ARM/strict-fp-func.ll
new file mode 100644
index 0000000000000..39bb2b46bdac5
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/strict-fp-func.ll
@@ -0,0 +1,13 @@
+; RUN: llc -mtriple arm-none-eabi -stop-after=finalize-isel %s -o - | FileCheck %s
+
+define float @func_02(float %x, float %y) strictfp nounwind {
+  %call = call float @func_01(float %x) strictfp
+  %res = call float @llvm.experimental.constrained.fadd.f32(float %call, float %y, metadata !"round.dynamic", metadata !"fpexcept.ignore") strictfp
+  ret float %res
+}
+; CHECK-LABEL: name: func_02
+; CHECK:       BL @func_01, {{.*}}, implicit-def $fpscr_rm
+
+
+declare float @func_01(float)
+declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata)
diff --git a/llvm/test/CodeGen/ARM/strict-fp-int-promote.ll b/llvm/test/CodeGen/ARM/strict-fp-int-promote.ll
new file mode 100644
index 0000000000000..6e5b58974fc50
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/strict-fp-int-promote.ll
@@ -0,0 +1,159 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple armv7-- -mattr=+vfp4 -O0 -o - %s | FileCheck %s
+; RUN: llc -mtriple armv7-- -mattr=+vfp4 -O3 -o - %s | FileCheck %s --check-prefix=CHECK-O3
+
+declare float @llvm.experimental.constrained.sitofp.f32.i32(i32, metadata, metadata)
+declare float @llvm.experimental.constrained.sitofp.f32.i16(i16, metadata, metadata)
+declare i1 @llvm.experimental.constrained.fcmp.f32(float, float, metadata, metadata)
+declare float @llvm.experimental.constrained.uitofp.f32.i16(i16, metadata, metadata)
+
+define i32 @test(i32 %a, i16 %b) #0 {
+; CHECK-LABEL: test:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    mov r2, r0
+; CHECK-NEXT:    sxth r0, r1
+; CHECK-NEXT:    movw r1, #0
+; CHECK-NEXT:    movt r1, #17200
+; CHECK-NEXT:    str r1, [sp, #4]
+; CHECK-NEXT:    eor r2, r2, #-2147483648
+; CHECK-NEXT:    str r2, [sp]
+; CHECK-NEXT:    vldr d16, [sp]
+; CHECK-NEXT:    vldr d17, .LCPI0_0
+; CHECK-NEXT:    vsub.f64 d16, d16, d17
+; CHECK-NEXT:    vcvt.f32.f64 s0, d16
+; CHECK-NEXT:    str r1, [sp, #12]
+; CHECK-NEXT:    eor r0, r0, #-2147483648
+; CHECK-NEXT:    str r0, [sp, #8]
+; CHECK-NEXT:    vldr d16, [sp, #8]
+; CHECK-NEXT:    vsub.f64 d16, d16, d17
+; CHECK-NEXT:    vcvt.f32.f64 s2, d16
+; CHECK-NEXT:    vcmp.f32 s0, s2
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    movweq r0, #1
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    .p2align 3
+; CHECK-NEXT:  @ %bb.1:
+; CHECK-NEXT:  .LCPI0_0:
+; CHECK-NEXT:    .long 2147483648 @ double 4503601774854144
+; CHECK-NEXT:    .long 1127219200
+;
+; CHECK-O3-LABEL: test:
+; CHECK-O3:       @ %bb.0: @ %entry
+; CHECK-O3-NEXT:    sub sp, sp, #16
+; CHECK-O3-NEXT:    sxth r1, r1
+; CHECK-O3-NEXT:    movw r2, #0
+; CHECK-O3-NEXT:    movt r2, #17200
+; CHECK-O3-NEXT:    str r2, [sp, #4]
+; CHECK-O3-NEXT:    eor r0, r0, #-2147483648
+; CHECK-O3-NEXT:    str r0, [sp]
+; CHECK-O3-NEXT:    vldr d16, [sp]
+; CHECK-O3-NEXT:    vldr d17, .LCPI0_0
+; CHECK-O3-NEXT:    vsub.f64 d16, d16, d17
+; CHECK-O3-NEXT:    vcvt.f32.f64 s0, d16
+; CHECK-O3-NEXT:    str r2, [sp, #12]
+; CHECK-O3-NEXT:    eor r0, r1, #-2147483648
+; CHECK-O3-NEXT:    str r0, [sp, #8]
+; CHECK-O3-NEXT:    vldr d16, [sp, #8]
+; CHECK-O3-NEXT:    vsub.f64 d16, d16, d17
+; CHECK-O3-NEXT:    vcvt.f32.f64 s2, d16
+; CHECK-O3-NEXT:    vcmp.f32 s0, s2
+; CHECK-O3-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-O3-NEXT:    mov r0, #0
+; CHECK-O3-NEXT:    movweq r0, #1
+; CHECK-O3-NEXT:    add sp, sp, #16
+; CHECK-O3-NEXT:    bx lr
+; CHECK-O3-NEXT:    .p2align 3
+; CHECK-O3-NEXT:  @ %bb.1:
+; CHECK-O3-NEXT:  .LCPI0_0:
+; CHECK-O3-NEXT:    .long 2147483648 @ double 4503601774854144
+; CHECK-O3-NEXT:    .long 1127219200
+entry:
+  %conv = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  %conv1 = call float @llvm.experimental.constrained.sitofp.f32.i16(i16 %b, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  %cmp = call i1 @llvm.experimental.constrained.fcmp.f32(float %conv, float %conv1, metadata !"oeq", metadata !"fpexcept.strict") #1
+  %conv2 = zext i1 %cmp to i32
+  ret i32 %conv2
+}
+
+define i32 @test2(i32 %a, i16 %b) #0 {
+; CHECK-LABEL: test2:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    mov r2, r0
+; CHECK-NEXT:    uxth r0, r1
+; CHECK-NEXT:    movw r1, #0
+; CHECK-NEXT:    movt r1, #17200
+; CHECK-NEXT:    str r1, [sp, #4]
+; CHECK-NEXT:    eor r2, r2, #-2147483648
+; CHECK-NEXT:    str r2, [sp]
+; CHECK-NEXT:    vldr d16, [sp]
+; CHECK-NEXT:    vldr d17, .LCPI1_0
+; CHECK-NEXT:    vsub.f64 d16, d16, d17
+; CHECK-NEXT:    vcvt.f32.f64 s0, d16
+; CHECK-NEXT:    str r1, [sp, #12]
+; CHECK-NEXT:    str r0, [sp, #8]
+; CHECK-NEXT:    vldr d16, [sp, #8]
+; CHECK-NEXT:    vldr d17, .LCPI1_1
+; CHECK-NEXT:    vsub.f64 d16, d16, d17
+; CHECK-NEXT:    vcvt.f32.f64 s2, d16
+; CHECK-NEXT:    vcmp.f32 s0, s2
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    movweq r0, #1
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    .p2align 3
+; CHECK-NEXT:  @ %bb.1:
+; CHECK-NEXT:  .LCPI1_0:
+; CHECK-NEXT:    .long 2147483648 @ double 4503601774854144
+; CHECK-NEXT:    .long 1127219200
+; CHECK-NEXT:  .LCPI1_1:
+; CHECK-NEXT:    .long 0 @ double 4503599627370496
+; CHECK-NEXT:    .long 1127219200
+;
+; CHECK-O3-LABEL: test2:
+; CHECK-O3:       @ %bb.0: @ %entry
+; CHECK-O3-NEXT:    sub sp, sp, #16
+; CHECK-O3-NEXT:    uxth r1, r1
+; CHECK-O3-NEXT:    movw r2, #0
+; CHECK-O3-NEXT:    movt r2, #17200
+; CHECK-O3-NEXT:    str r2, [sp, #4]
+; CHECK-O3-NEXT:    eor r0, r0, #-2147483648
+; CHECK-O3-NEXT:    str r0, [sp]
+; CHECK-O3-NEXT:    vldr d16, [sp]
+; CHECK-O3-NEXT:    vldr d17, .LCPI1_0
+; CHECK-O3-NEXT:    vsub.f64 d16, d16, d17
+; CHECK-O3-NEXT:    vcvt.f32.f64 s0, d16
+; CHECK-O3-NEXT:    str r2, [sp, #12]
+; CHECK-O3-NEXT:    str r1, [sp, #8]
+; CHECK-O3-NEXT:    vldr d16, [sp, #8]
+; CHECK-O3-NEXT:    vldr d17, .LCPI1_1
+; CHECK-O3-NEXT:    vsub.f64 d16, d16, d17
+; CHECK-O3-NEXT:    vcvt.f32.f64 s2, d16
+; CHECK-O3-NEXT:    vcmp.f32 s0, s2
+; CHECK-O3-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-O3-NEXT:    mov r0, #0
+; CHECK-O3-NEXT:    movweq r0, #1
+; CHECK-O3-NEXT:    add sp, sp, #16
+; CHECK-O3-NEXT:    bx lr
+; CHECK-O3-NEXT:    .p2align 3
+; CHECK-O3-NEXT:  @ %bb.1:
+; CHECK-O3-NEXT:  .LCPI1_0:
+; CHECK-O3-NEXT:    .long 2147483648 @ double 4503601774854144
+; CHECK-O3-NEXT:    .long 1127219200
+; CHECK-O3-NEXT:  .LCPI1_1:
+; CHECK-O3-NEXT:    .long 0 @ double 4503599627370496
+; CHECK-O3-NEXT:    .long 1127219200
+entry:
+  %conv = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  %conv1 = call float @llvm.experimental.constrained.uitofp.f32.i16(i16 %b, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  %cmp = call i1 @llvm.experimental.constrained.fcmp.f32(float %conv, float %conv1, metadata !"oeq", metadata !"fpexcept.strict") #1
+  %conv2 = zext i1 %cmp to i32
+  ret i32 %conv2
+}
+
+attributes #0 = { strictfp noinline optnone }
+attributes #1 = { strictfp }
diff --git a/llvm/test/CodeGen/ARM/strict-fp-ops.ll b/llvm/test/CodeGen/ARM/strict-fp-ops.ll
new file mode 100644
index 0000000000000..608ab0716e0df
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/strict-fp-ops.ll
@@ -0,0 +1,202 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple armv7-- -mattr=+vfp4 %s -o - | FileCheck %s
+
+
+; Div whose result is unused should be removed unless we have strict exceptions
+
+define void @unused_div(float %x, float %y) {
+; CHECK-LABEL: unused_div:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    bx lr
+entry:
+  %add = fdiv float %x, %y
+  ret void
+}
+
+define void @unused_div_fpexcept_strict(float %x, float %y) #0 {
+; CHECK-LABEL: unused_div_fpexcept_strict:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov s0, r1
+; CHECK-NEXT:    vmov s2, r0
+; CHECK-NEXT:    vdiv.f32 s0, s2, s0
+; CHECK-NEXT:    bx lr
+entry:
+  %add = call float @llvm.experimental.constrained.fdiv.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret void
+}
+
+define void @unused_div_round_dynamic(float %x, float %y) #0 {
+; CHECK-LABEL: unused_div_round_dynamic:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    bx lr
+entry:
+  %add = call float @llvm.experimental.constrained.fdiv.f32(float %x, float %y, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
+  ret void
+}
+
+
+; Machine CSE should eliminate the second add unless we have strict exceptions
+
+define float @add_twice(float %x, float %y, i32 %n) {
+; CHECK-LABEL: add_twice:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov s0, r1
+; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    vmov s2, r0
+; CHECK-NEXT:    vadd.f32 s0, s2, s0
+; CHECK-NEXT:    vmul.f32 s2, s0, s0
+; CHECK-NEXT:    vmoveq.f32 s2, s0
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    bx lr
+entry:
+  %add = fadd float %x, %y
+  %tobool.not = icmp eq i32 %n, 0
+  br i1 %tobool.not, label %if.end, label %if.then
+
+if.then:
+  %add1 = fadd float %x, %y
+  %mul = fmul float %add, %add1
+  br label %if.end
+
+if.end:
+  %a.0 = phi float [ %mul, %if.then ], [ %add, %entry ]
+  ret float %a.0
+}
+
+define float @add_twice_fpexcept_strict(float %x, float %y, i32 %n) #0 {
+; CHECK-LABEL: add_twice_fpexcept_strict:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov s2, r1
+; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    vmov s4, r0
+; CHECK-NEXT:    vadd.f32 s0, s4, s2
+; CHECK-NEXT:    vaddne.f32 s2, s4, s2
+; CHECK-NEXT:    vmulne.f32 s0, s0, s2
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    bx lr
+entry:
+  %add = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  %tobool.not = icmp eq i32 %n, 0
+  br i1 %tobool.not, label %if.end, label %if.then
+
+if.then:
+  %add1 = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  %mul = call float @llvm.experimental.constrained.fmul.f32(float %add, float %add1, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  br label %if.end
+
+if.end:
+  %a.0 = phi float [ %mul, %if.then ], [ %add, %entry ]
+  ret float %a.0
+}
+
+define float @add_twice_round_dynamic(float %x, float %y, i32 %n) #0 {
+; CHECK-LABEL: add_twice_round_dynamic:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov s0, r1
+; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    vmov s2, r0
+; CHECK-NEXT:    vadd.f32 s0, s2, s0
+; CHECK-NEXT:    vmulne.f32 s0, s0, s0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    bx lr
+entry:
+  %add = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
+  %tobool.not = icmp eq i32 %n, 0
+  br i1 %tobool.not, label %if.end, label %if.then
+
+if.then:
+  %add1 = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
+  %mul = call float @llvm.experimental.constrained.fmul.f32(float %add, float %add1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
+  br label %if.end
+
+if.end:
+  %a.0 = phi float [ %mul, %if.then ], [ %add, %entry ]
+  ret float %a.0
+}
+
+; Two adds separated by llvm.set.rounding should be preserved when rounding is
+; dynamic (as they may give different results) or when we have strict exceptions
+; (the llvm.set.rounding is irrelevant, but both could trap).
+
+define float @set_rounding(float %x, float %y) {
+; CHECK-LABEL: set_rounding:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmrs r2, fpscr
+; CHECK-NEXT:    vmov s2, r0
+; CHECK-NEXT:    vmov s0, r1
+; CHECK-NEXT:    vadd.f32 s0, s2, s0
+; CHECK-NEXT:    vsub.f32 s0, s0, s0
+; CHECK-NEXT:    orr r0, r2, #12582912
+; CHECK-NEXT:    vmsr fpscr, r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmrs r1, fpscr
+; CHECK-NEXT:    bic r1, r1, #12582912
+; CHECK-NEXT:    vmsr fpscr, r1
+; CHECK-NEXT:    bx lr
+entry:
+  %add1 = fadd float %x, %y
+  call void @llvm.set.rounding(i32 0)
+  %add2 = fadd float %x, %y
+  call void @llvm.set.rounding(i32 1)
+  %sub = fsub float %add1, %add2
+  ret float %sub
+}
+
+define float @set_rounding_fpexcept_strict(float %x, float %y) #0 {
+; CHECK-LABEL: set_rounding_fpexcept_strict:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov s0, r1
+; CHECK-NEXT:    vmov s2, r0
+; CHECK-NEXT:    vadd.f32 s4, s2, s0
+; CHECK-NEXT:    vmrs r0, fpscr
+; CHECK-NEXT:    orr r0, r0, #12582912
+; CHECK-NEXT:    vmsr fpscr, r0
+; CHECK-NEXT:    vadd.f32 s0, s2, s0
+; CHECK-NEXT:    vmrs r0, fpscr
+; CHECK-NEXT:    bic r0, r0, #12582912
+; CHECK-NEXT:    vmsr fpscr, r0
+; CHECK-NEXT:    vsub.f32 s0, s4, s0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    bx lr
+entry:
+  %add1 = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  call void @llvm.set.rounding(i32 0) #0
+  %add2 = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  call void @llvm.set.rounding(i32 1) #0
+  %sub = call float @llvm.experimental.constrained.fsub.f32(float %add1, float %add2, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %sub
+}
+
+define float @set_rounding_round_dynamic(float %x, float %y) #0 {
+; CHECK-LABEL: set_rounding_round_dynamic:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov s2, r0
+; CHECK-NEXT:    vmrs r0, fpscr
+; CHECK-NEXT:    vmov s0, r1
+; CHECK-NEXT:    vadd.f32 s4, s2, s0
+; CHECK-NEXT:    orr r0, r0, #12582912
+; CHECK-NEXT:    vmsr fpscr, r0
+; CHECK-NEXT:    vmrs r0, fpscr
+; CHECK-NEXT:    vadd.f32 s0, s2, s0
+; CHECK-NEXT:    bic r0, r0, #12582912
+; CHECK-NEXT:    vmsr fpscr, r0
+; CHECK-NEXT:    vsub.f32 s0, s4, s0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    bx lr
+entry:
+  %add1 = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
+  call void @llvm.set.rounding(i32 0) #0
+  %add2 = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
+  call void @llvm.set.rounding(i32 1) #0
+  %sub = call float @llvm.experimental.constrained.fsub.f32(float %add1, float %add2, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
+  ret float %sub
+}
+
+declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata)
+declare float @llvm.experimental.constrained.fsub.f32(float, float, metadata, metadata)
+declare float @llvm.experimental.constrained.fmul.f32(float, float, metadata, metadata)
+declare float @llvm.experimental.constrained.fdiv.f32(float, float, metadata, metadata)
+declare i32 @llvm.get.rounding()
+declare void @llvm.set.rounding(i32)
+
+attributes #0 = { strictfp }
diff --git a/llvm/test/CodeGen/ARM/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/ARM/strictfp_f16_abi_promote.ll
new file mode 100644
index 0000000000000..5906c796d2751
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/strictfp_f16_abi_promote.ll
@@ -0,0 +1,270 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=armv7-- < %s | FileCheck -check-prefix=NOFP16 %s
+
+declare void @f16_user(half)
+declare half @f16_result()
+
+declare void @v2f16_user(<2 x half>)
+declare <2 x half> @v2f16_result()
+
+declare void @v4f16_user(<4 x half>)
+declare <4 x half> @v4f16_result()
+
+declare void @v8f16_user(<8 x half>)
+declare <8 x half> @v8f16_result()
+
+define void @f16_arg(half %arg, ptr %ptr) #0 {
+; NOFP16-LABEL: f16_arg:
+; NOFP16:       @ %bb.0:
+; NOFP16-NEXT:    push {r4, lr}
+; NOFP16-NEXT:    uxth r0, r0
+; NOFP16-NEXT:    mov r4, r1
+; NOFP16-NEXT:    bl __gnu_h2f_ieee
+; NOFP16-NEXT:    str r0, [r4]
+; NOFP16-NEXT:    pop {r4, pc}
+  %fpext = call float @llvm.experimental.constrained.fpext.f32.f16(half %arg, metadata !"fpexcept.strict")
+  store float %fpext, ptr %ptr
+  ret void
+}
+
+define void @v2f16_arg(<2 x half> %arg, ptr %ptr) #0 {
+; NOFP16-LABEL: v2f16_arg:
+; NOFP16:       @ %bb.0:
+; NOFP16-NEXT:    push {r4, r5, r11, lr}
+; NOFP16-NEXT:    vpush {d8}
+; NOFP16-NEXT:    mov r5, r0
+; NOFP16-NEXT:    uxth r0, r1
+; NOFP16-NEXT:    mov r4, r2
+; NOFP16-NEXT:    bl __gnu_h2f_ieee
+; NOFP16-NEXT:    uxth r1, r5
+; NOFP16-NEXT:    vmov s17, r0
+; NOFP16-NEXT:    mov r0, r1
+; NOFP16-NEXT:    bl __gnu_h2f_ieee
+; NOFP16-NEXT:    vmov s16, r0
+; NOFP16-NEXT:    vstr d8, [r4]
+; NOFP16-NEXT:    vpop {d8}
+; NOFP16-NEXT:    pop {r4, r5, r11, pc}
+  %fpext = call <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half> %arg, metadata !"fpexcept.strict")
+  store <2 x float> %fpext, ptr %ptr
+  ret void
+}
+
+define void @v3f16_arg(<3 x half> %arg, ptr %ptr) #0 {
+; NOFP16-LABEL: v3f16_arg:
+; NOFP16:       @ %bb.0:
+; NOFP16-NEXT:    push {r4, r5, r6, lr}
+; NOFP16-NEXT:    vpush {d8}
+; NOFP16-NEXT:    mov r6, r0
+; NOFP16-NEXT:    uxth r0, r1
+; NOFP16-NEXT:    mov r4, r3
+; NOFP16-NEXT:    mov r5, r2
+; NOFP16-NEXT:    bl __gnu_h2f_ieee
+; NOFP16-NEXT:    uxth r1, r6
+; NOFP16-NEXT:    vmov s17, r0
+; NOFP16-NEXT:    mov r0, r1
+; NOFP16-NEXT:    bl __gnu_h2f_ieee
+; NOFP16-NEXT:    vmov s16, r0
+; NOFP16-NEXT:    uxth r0, r5
+; NOFP16-NEXT:    vst1.32 {d8}, [r4:64]!
+; NOFP16-NEXT:    bl __gnu_h2f_ieee
+; NOFP16-NEXT:    str r0, [r4]
+; NOFP16-NEXT:    vpop {d8}
+; NOFP16-NEXT:    pop {r4, r5, r6, pc}
+  %fpext = call <3 x float> @llvm.experimental.constrained.fpext.v3f32.v3f16(<3 x half> %arg, metadata !"fpexcept.strict")
+  store <3 x float> %fpext, ptr %ptr
+  ret void
+}
+
+define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 {
+; NOFP16-LABEL: v4f16_arg:
+; NOFP16:       @ %bb.0:
+; NOFP16-NEXT:    push {r4, r5, r6, r7, r11, lr}
+; NOFP16-NEXT:    vpush {d8, d9}
+; NOFP16-NEXT:    mov r6, r0
+; NOFP16-NEXT:    uxth r0, r1
+; NOFP16-NEXT:    mov r4, r3
+; NOFP16-NEXT:    mov r5, r2
+; NOFP16-NEXT:    bl __gnu_h2f_ieee
+; NOFP16-NEXT:    mov r7, r0
+; NOFP16-NEXT:    uxth r0, r4
+; NOFP16-NEXT:    bl __gnu_h2f_ieee
+; NOFP16-NEXT:    vmov s19, r0
+; NOFP16-NEXT:    uxth r0, r5
+; NOFP16-NEXT:    ldr r4, [sp, #40]
+; NOFP16-NEXT:    bl __gnu_h2f_ieee
+; NOFP16-NEXT:    vmov s18, r0
+; NOFP16-NEXT:    uxth r0, r6
+; NOFP16-NEXT:    vmov s17, r7
+; NOFP16-NEXT:    bl __gnu_h2f_ieee
+; NOFP16-NEXT:    vmov s16, r0
+; NOFP16-NEXT:    vst1.64 {d8, d9}, [r4]
+; NOFP16-NEXT:    vpop {d8, d9}
+; NOFP16-NEXT:    pop {r4, r5, r6, r7, r11, pc}
+  %fpext = call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %arg, metadata !"fpexcept.strict")
+  store <4 x float> %fpext, ptr %ptr
+  ret void
+}
+
+ define half @f16_return(float %arg) #0 {
+; NOFP16-LABEL: f16_return:
+; NOFP16:       @ %bb.0:
+; NOFP16-NEXT:    push {r11, lr}
+; NOFP16-NEXT:    bl __gnu_f2h_ieee
+; NOFP16-NEXT:    pop {r11, pc}
+   %fptrunc = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+   ret half %fptrunc
+ }
+
+ define <2 x half> @v2f16_return(<2 x float> %arg) #0 {
+; NOFP16-LABEL: v2f16_return:
+; NOFP16:       @ %bb.0:
+; NOFP16-NEXT:    push {r11, lr}
+; NOFP16-NEXT:    vpush {d8}
+; NOFP16-NEXT:    sub sp, sp, #8
+; NOFP16-NEXT:    vmov d8, r0, r1
+; NOFP16-NEXT:    vmov r0, s17
+; NOFP16-NEXT:    bl __gnu_f2h_ieee
+; NOFP16-NEXT:    vmov r1, s16
+; NOFP16-NEXT:    strh r0, [sp, #6]
+; NOFP16-NEXT:    mov r0, r1
+; NOFP16-NEXT:    bl __gnu_f2h_ieee
+; NOFP16-NEXT:    strh r0, [sp, #4]
+; NOFP16-NEXT:    add r0, sp, #4
+; NOFP16-NEXT:    vld1.32 {d16[0]}, [r0:32]
+; NOFP16-NEXT:    vmovl.u16 q8, d16
+; NOFP16-NEXT:    vmov.32 r0, d16[0]
+; NOFP16-NEXT:    vmov.32 r1, d16[1]
+; NOFP16-NEXT:    add sp, sp, #8
+; NOFP16-NEXT:    vpop {d8}
+; NOFP16-NEXT:    pop {r11, pc}
+   %fptrunc = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+   ret <2 x half> %fptrunc
+ }
+
+ define <3 x half> @v3f16_return(<3 x float> %arg) #0 {
+; NOFP16-LABEL: v3f16_return:
+; NOFP16:       @ %bb.0:
+; NOFP16-NEXT:    push {r4, r5, r6, lr}
+; NOFP16-NEXT:    vmov d1, r2, r3
+; NOFP16-NEXT:    mov r5, r0
+; NOFP16-NEXT:    vmov d0, r0, r1
+; NOFP16-NEXT:    mov r4, r1
+; NOFP16-NEXT:    vmov r0, s2
+; NOFP16-NEXT:    bl __gnu_f2h_ieee
+; NOFP16-NEXT:    uxth r6, r0
+; NOFP16-NEXT:    mov r0, r4
+; NOFP16-NEXT:    bl __gnu_f2h_ieee
+; NOFP16-NEXT:    mov r4, r0
+; NOFP16-NEXT:    mov r0, r5
+; NOFP16-NEXT:    bl __gnu_f2h_ieee
+; NOFP16-NEXT:    pkhbt r0, r0, r4, lsl #16
+; NOFP16-NEXT:    vmov d16, r0, r6
+; NOFP16-NEXT:    vmov.u16 r0, d16[0]
+; NOFP16-NEXT:    vmov.u16 r1, d16[1]
+; NOFP16-NEXT:    vmov.u16 r2, d16[2]
+; NOFP16-NEXT:    pop {r4, r5, r6, pc}
+   %fptrunc = call <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+   ret <3 x half> %fptrunc
+ }
+
+ define <4 x half> @v4f16_return(<4 x float> %arg) #0 {
+; NOFP16-LABEL: v4f16_return:
+; NOFP16:       @ %bb.0:
+; NOFP16-NEXT:    push {r4, r5, r11, lr}
+; NOFP16-NEXT:    vpush {d8, d9}
+; NOFP16-NEXT:    vmov d8, r2, r3
+; NOFP16-NEXT:    vmov d9, r0, r1
+; NOFP16-NEXT:    vmov r2, s17
+; NOFP16-NEXT:    mov r0, r2
+; NOFP16-NEXT:    bl __gnu_f2h_ieee
+; NOFP16-NEXT:    mov r4, r0
+; NOFP16-NEXT:    vmov r0, s16
+; NOFP16-NEXT:    bl __gnu_f2h_ieee
+; NOFP16-NEXT:    vmov r1, s19
+; NOFP16-NEXT:    pkhbt r5, r0, r4, lsl #16
+; NOFP16-NEXT:    mov r0, r1
+; NOFP16-NEXT:    bl __gnu_f2h_ieee
+; NOFP16-NEXT:    mov r4, r0
+; NOFP16-NEXT:    vmov r0, s18
+; NOFP16-NEXT:    bl __gnu_f2h_ieee
+; NOFP16-NEXT:    pkhbt r0, r0, r4, lsl #16
+; NOFP16-NEXT:    vmov d16, r0, r5
+; NOFP16-NEXT:    vmov.u16 r0, d16[0]
+; NOFP16-NEXT:    vmov.u16 r1, d16[1]
+; NOFP16-NEXT:    vmov.u16 r2, d16[2]
+; NOFP16-NEXT:    vmov.u16 r3, d16[3]
+; NOFP16-NEXT:    vpop {d8, d9}
+; NOFP16-NEXT:    pop {r4, r5, r11, pc}
+   %fptrunc = call <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+   ret <4 x half> %fptrunc
+ }
+
+define void @outgoing_v4f16_return(ptr %ptr) #0 {
+; NOFP16-LABEL: outgoing_v4f16_return:
+; NOFP16:       @ %bb.0:
+; NOFP16-NEXT:    push {r4, lr}
+; NOFP16-NEXT:    mov r4, r0
+; NOFP16-NEXT:    bl v4f16_result
+; NOFP16-NEXT:    strh r3, [r4, #6]
+; NOFP16-NEXT:    strh r2, [r4, #4]
+; NOFP16-NEXT:    strh r1, [r4, #2]
+; NOFP16-NEXT:    strh r0, [r4]
+; NOFP16-NEXT:    pop {r4, pc}
+  %val = call <4 x half> @v4f16_result() #0
+  store <4 x half> %val, ptr %ptr
+  ret void
+}
+
+define void @outgoing_v8f16_return(ptr %ptr) #0 {
+; NOFP16-LABEL: outgoing_v8f16_return:
+; NOFP16:       @ %bb.0:
+; NOFP16-NEXT:    push {r4, r10, r11, lr}
+; NOFP16-NEXT:    add r11, sp, #8
+; NOFP16-NEXT:    sub sp, sp, #16
+; NOFP16-NEXT:    bfc sp, #0, #4
+; NOFP16-NEXT:    mov r4, r0
+; NOFP16-NEXT:    mov r0, sp
+; NOFP16-NEXT:    bl v8f16_result
+; NOFP16-NEXT:    ldm sp, {r0, r1, r2, r3}
+; NOFP16-NEXT:    stm r4, {r0, r1, r2, r3}
+; NOFP16-NEXT:    sub sp, r11, #8
+; NOFP16-NEXT:    pop {r4, r10, r11, pc}
+  %val = call <8 x half> @v8f16_result() #0
+  store <8 x half> %val, ptr %ptr
+  ret void
+}
+
+define half @call_split_type_used_outside_block_v8f16() #0 {
+; NOFP16-LABEL: call_split_type_used_outside_block_v8f16:
+; NOFP16:       @ %bb.0: @ %bb0
+; NOFP16-NEXT:    push {r4, r10, r11, lr}
+; NOFP16-NEXT:    add r11, sp, #8
+; NOFP16-NEXT:    sub sp, sp, #16
+; NOFP16-NEXT:    bfc sp, #0, #4
+; NOFP16-NEXT:    mov r4, sp
+; NOFP16-NEXT:    mov r0, r4
+; NOFP16-NEXT:    bl v8f16_result
+; NOFP16-NEXT:    vld1.32 {d16[0]}, [r4:32]
+; NOFP16-NEXT:    vmov.u16 r0, d16[0]
+; NOFP16-NEXT:    sub sp, r11, #8
+; NOFP16-NEXT:    pop {r4, r10, r11, pc}
+bb0:
+  %split.ret.type = call <8 x half> @v8f16_result() #0
+  br label %bb1
+
+bb1:
+  %extract = extractelement <8 x half> %split.ret.type, i32 0
+  ret half %extract
+}
+
+declare float @llvm.experimental.constrained.fpext.f32.f16(half, metadata) #0
+declare <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half>, metadata) #0
+declare <3 x float> @llvm.experimental.constrained.fpext.v3f32.v3f16(<3 x half>, metadata) #0
+declare <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half>, metadata) #0
+
+declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata) #0
+declare <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float>, metadata, metadata) #0
+declare <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float>, metadata, metadata) #0
+declare <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float>, metadata, metadata) #0
+
+attributes #0 = { strictfp }
diff --git a/llvm/test/CodeGen/DirectX/Metadata/loop-md-errs.ll b/llvm/test/CodeGen/DirectX/Metadata/loop-md-errs.ll
new file mode 100644
index 0000000000000..fbe4653b45dea
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/Metadata/loop-md-errs.ll
@@ -0,0 +1,113 @@
+; RUN: split-file %s %t
+; RUN: not opt -S --dxil-translate-metadata %t/args.ll 2>&1 | FileCheck %t/args.ll
+; RUN: not opt -S --dxil-translate-metadata %t/bad-count.ll 2>&1 | FileCheck %t/bad-count.ll
+; RUN: not opt -S --dxil-translate-metadata %t/invalid-disable.ll 2>&1 | FileCheck %t/invalid-disable.ll
+; RUN: not opt -S --dxil-translate-metadata %t/invalid-full.ll 2>&1 | FileCheck %t/invalid-full.ll
+
+; Test that loop metadata is validated as with the DXIL validator
+
+;--- args.ll
+
+; CHECK: Invalid "llvm.loop" metadata: Provided conflicting hints
+
+target triple = "dxilv1.0-unknown-shadermodel6.0-library"
+
+define void @example_loop(i32 %n) {
+entry:
+  br label %loop.header
+
+loop.header:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop.body ]
+  %cmp = icmp slt i32 %i, %n
+  br i1 %cmp, label %loop.body, label %exit
+
+loop.body:
+  %i.next = add nsw i32 %i, 1
+  br label %loop.header, !llvm.loop !1
+
+exit:
+  ret void
+}
+
+!1 = !{!1, !2, !3} ; conflicting args
+!2 = !{!"llvm.loop.unroll.full"}
+!3 = !{!"llvm.loop.unroll.disable"}
+
+;--- bad-count.ll
+
+; CHECK: "llvm.loop.unroll.count" must have 2 operands and the second must be a constant integer
+
+target triple = "dxilv1.0-unknown-shadermodel6.0-library"
+
+define void @example_loop(i32 %n) {
+entry:
+  br label %loop.header
+
+loop.header:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop.body ]
+  %cmp = icmp slt i32 %i, %n
+  br i1 %cmp, label %loop.body, label %exit
+
+loop.body:
+  %i.next = add nsw i32 %i, 1
+  br label %loop.header, !llvm.loop !1
+
+exit:
+  ret void
+}
+
+!1 = !{!1, !2}
+!2 = !{!"llvm.loop.unroll.count", !"not an int"} ; invalid count parameters
+
+;--- invalid-disable.ll
+
+; CHECK: Invalid "llvm.loop" metadata: "llvm.loop.unroll.disable" and "llvm.loop.unroll.full" must be provided as a single operand
+
+target triple = "dxilv1.0-unknown-shadermodel6.0-library"
+
+define void @example_loop(i32 %n) {
+entry:
+  br label %loop.header
+
+loop.header:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop.body ]
+  %cmp = icmp slt i32 %i, %n
+  br i1 %cmp, label %loop.body, label %exit
+
+loop.body:
+  %i.next = add nsw i32 %i, 1
+  br label %loop.header, !llvm.loop !1
+
+exit:
+  ret void
+}
+
+!1 = !{!1, !2}
+!2 = !{!"llvm.loop.unroll.disable", i32 0} ; invalid second operand
+
+
+;--- invalid-full.ll
+
+; CHECK: Invalid "llvm.loop" metadata: "llvm.loop.unroll.disable" and "llvm.loop.unroll.full" must be provided as a single operand
+
+target triple = "dxilv1.0-unknown-shadermodel6.0-library"
+
+define void @example_loop(i32 %n) {
+entry:
+  br label %loop.header
+
+loop.header:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop.body ]
+  %cmp = icmp slt i32 %i, %n
+  br i1 %cmp, label %loop.body, label %exit
+
+loop.body:
+  %i.next = add nsw i32 %i, 1
+  br label %loop.header, !llvm.loop !1
+
+exit:
+  ret void
+}
+
+!1 = !{!1, !2}
+!2 = !{!"llvm.loop.unroll.full", i32 0} ; invalid second operand
diff --git a/llvm/test/CodeGen/DirectX/Metadata/loop-md-stripped.ll b/llvm/test/CodeGen/DirectX/Metadata/loop-md-stripped.ll
new file mode 100644
index 0000000000000..09d8aec2ff0e5
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/Metadata/loop-md-stripped.ll
@@ -0,0 +1,58 @@
+; RUN: split-file %s %t
+; RUN: opt -S --dxil-translate-metadata %t/not-distinct.ll 2>&1 | FileCheck %t/not-distinct.ll
+; RUN: opt -S --dxil-translate-metadata %t/not-md.ll 2>&1 | FileCheck %t/not-md.ll
+
+; Test that DXIL incompatible loop metadata is stripped
+
+;--- not-distinct.ll
+
+; Ensure it is stripped because it is not provided a distinct loop parent
+; CHECK-NOT: {!"llvm.loop.unroll.disable"}
+
+target triple = "dxilv1.0-unknown-shadermodel6.0-library"
+
+define void @example_loop(i32 %n) {
+entry:
+  br label %loop.header
+
+loop.header:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop.body ]
+  %cmp = icmp slt i32 %i, %n
+  br i1 %cmp, label %loop.body, label %exit
+
+loop.body:
+  %i.next = add nsw i32 %i, 1
+  br label %loop.header, !llvm.loop !1
+
+exit:
+  ret void
+}
+
+!1 = !{!"llvm.loop.unroll.disable"} ; first node must be a distinct self-reference
+
+
+;--- not-md.ll
+
+target triple = "dxilv1.0-unknown-shadermodel6.0-library"
+
+define void @example_loop(i32 %n) {
+entry:
+  br label %loop.header
+
+loop.header:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop.body ]
+  %cmp = icmp slt i32 %i, %n
+  br i1 %cmp, label %loop.body, label %exit
+
+loop.body:
+  %i.next = add nsw i32 %i, 1
+  ; CHECK: br label %loop.header, !llvm.loop ![[#LOOP_MD:]]
+  br label %loop.header, !llvm.loop !1
+
+exit:
+  ret void
+}
+
+; CHECK: ![[#LOOP_MD:]] = distinct !{![[#LOOP_MD]]}
+
+!1 = !{!1, i32 0} ; second operand is not a metadata node
diff --git a/llvm/test/CodeGen/DirectX/Metadata/loop-md-valid.ll b/llvm/test/CodeGen/DirectX/Metadata/loop-md-valid.ll
new file mode 100644
index 0000000000000..a189c0e3f8aaa
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/Metadata/loop-md-valid.ll
@@ -0,0 +1,95 @@
+; RUN: split-file %s %t
+; RUN: opt -S --dxil-translate-metadata %t/count.ll | FileCheck %t/count.ll
+; RUN: opt -S --dxil-translate-metadata %t/disable.ll | FileCheck %t/disable.ll
+; RUN: opt -S --dxil-translate-metadata %t/full.ll | FileCheck %t/full.ll
+
+;--- count.ll
+
+; Test that we collapse a self-referential chain and allow a unroll.count hint
+
+target triple = "dxilv1.0-unknown-shadermodel6.0-library"
+
+define void @example_loop(i32 %n) {
+entry:
+  br label %loop.header
+
+loop.header:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop.body ]
+  %cmp = icmp slt i32 %i, %n
+  br i1 %cmp, label %loop.body, label %exit
+
+loop.body:
+  %i.next = add nsw i32 %i, 1
+  ; CHECK: br label %loop.header, !llvm.loop ![[#LOOP_MD:]]
+  br label %loop.header, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+; CHECK: ![[#LOOP_MD]] = distinct !{![[#LOOP_MD]], ![[#COUNT:]]}
+; CHECK: ![[#COUNT]] = !{!"llvm.loop.unroll.count", i6 4}
+
+!0 = !{!0, !1}
+!1 = !{!1, !2}
+!2 = !{!"llvm.loop.unroll.count", i6 4}
+
+;--- disable.ll
+
+; Test that we allow a disable hint
+
+target triple = "dxilv1.0-unknown-shadermodel6.0-library"
+
+define void @example_loop(i32 %n) {
+entry:
+  br label %loop.header
+
+loop.header:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop.body ]
+  %cmp = icmp slt i32 %i, %n
+  br i1 %cmp, label %loop.body, label %exit
+
+loop.body:
+  %i.next = add nsw i32 %i, 1
+  ; CHECK: br label %loop.header, !llvm.loop ![[#LOOP_MD:]]
+  br label %loop.header, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+; CHECK: ![[#LOOP_MD]] = distinct !{![[#LOOP_MD]], ![[#DISABLE:]]}
+; CHECK: ![[#DISABLE]] = !{!"llvm.loop.unroll.disable"}
+
+!0 = !{!0, !1}
+!1 = !{!"llvm.loop.unroll.disable"}
+
+;--- full.ll
+
+; Test that we allow a full hint
+
+target triple = "dxilv1.0-unknown-shadermodel6.0-library"
+
+define void @example_loop(i32 %n) {
+entry:
+  br label %loop.header
+
+loop.header:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop.body ]
+  %cmp = icmp slt i32 %i, %n
+  br i1 %cmp, label %loop.body, label %exit
+
+loop.body:
+  %i.next = add nsw i32 %i, 1
+  ; CHECK: br label %loop.header, !llvm.loop ![[#LOOP_MD:]]
+  br label %loop.header, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+; CHECK: ![[#LOOP_MD]] = distinct !{![[#LOOP_MD]], ![[#FULL:]]}
+; CHECK: ![[#FULL]] = !{!"llvm.loop.unroll.full"}
+
+!0 = !{!0, !1}
+!1 = !{!"llvm.loop.unroll.full"}
diff --git a/llvm/test/CodeGen/DirectX/Metadata/multiple-entries-cs-error.ll b/llvm/test/CodeGen/DirectX/Metadata/multiple-entries-cs-error.ll
index 9697d4389a888..5740ee11401f2 100644
--- a/llvm/test/CodeGen/DirectX/Metadata/multiple-entries-cs-error.ll
+++ b/llvm/test/CodeGen/DirectX/Metadata/multiple-entries-cs-error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S  -S -dxil-translate-metadata %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-translate-metadata %s 2>&1 | FileCheck %s
 target triple = "dxil-pc-shadermodel6.8-compute"
 
 ; CHECK: Non-library shader: One and only one entry expected
diff --git a/llvm/test/CodeGen/DirectX/ShaderFlags/wave-ops.ll b/llvm/test/CodeGen/DirectX/ShaderFlags/wave-ops.ll
index 7a876f67615cd..3544017062e8e 100644
--- a/llvm/test/CodeGen/DirectX/ShaderFlags/wave-ops.ll
+++ b/llvm/test/CodeGen/DirectX/ShaderFlags/wave-ops.ll
@@ -76,6 +76,20 @@ entry:
   ret i32 %ret
 }
 
+define noundef i32 @wave_reduce_min(i32 noundef %x) {
+entry:
+  ; CHECK: Function wave_reduce_min : [[WAVE_FLAG]]
+  %ret = call i32 @llvm.dx.wave.reduce.min.i32(i32 %x)
+  ret i32 %ret
+}
+
+define noundef i32 @wave_reduce_umin(i32 noundef %x) {
+entry:
+  ; CHECK: Function wave_reduce_umin : [[WAVE_FLAG]]
+  %ret = call i32 @llvm.dx.wave.reduce.umin.i32(i32 %x)
+  ret i32 %ret
+}
+
 define void @wave_active_countbits(i1 %expr) {
 entry:
   ; CHECK: Function wave_active_countbits : [[WAVE_FLAG]]
diff --git a/llvm/test/CodeGen/DirectX/WaveActiveMin.ll b/llvm/test/CodeGen/DirectX/WaveActiveMin.ll
new file mode 100644
index 0000000000000..24fde48fadfeb
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/WaveActiveMin.ll
@@ -0,0 +1,143 @@
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library < %s | FileCheck %s
+
+; Test that for scalar values, WaveActiveMin maps down to the DirectX op
+
+define noundef half @wave_active_min_half(half noundef %expr) {
+entry:
+; CHECK: call half @dx.op.waveActiveOp.f16(i32 119, half %expr, i8 2, i8 0){{$}}
+  %ret = call half @llvm.dx.wave.reduce.min.f16(half %expr)
+  ret half %ret
+}
+
+define noundef float @wave_active_min_float(float noundef %expr) {
+entry:
+; CHECK: call float @dx.op.waveActiveOp.f32(i32 119, float %expr, i8 2, i8 0){{$}}
+  %ret = call float @llvm.dx.wave.reduce.min.f32(float %expr)
+  ret float %ret
+}
+
+define noundef double @wave_active_min_double(double noundef %expr) {
+entry:
+; CHECK: call double @dx.op.waveActiveOp.f64(i32 119, double %expr, i8 2, i8 0){{$}}
+  %ret = call double @llvm.dx.wave.reduce.min.f64(double %expr)
+  ret double %ret
+}
+
+define noundef i16 @wave_active_min_i16(i16 noundef %expr) {
+entry:
+; CHECK: call i16 @dx.op.waveActiveOp.i16(i32 119, i16 %expr, i8 2, i8 0){{$}}
+  %ret = call i16 @llvm.dx.wave.reduce.min.i16(i16 %expr)
+  ret i16 %ret
+}
+
+define noundef i32 @wave_active_min_i32(i32 noundef %expr) {
+entry:
+; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr, i8 2, i8 0){{$}}
+  %ret = call i32 @llvm.dx.wave.reduce.min.i32(i32 %expr)
+  ret i32 %ret
+}
+
+define noundef i64 @wave_active_min_i64(i64 noundef %expr) {
+entry:
+; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr, i8 2, i8 0){{$}}
+  %ret = call i64 @llvm.dx.wave.reduce.min.i64(i64 %expr)
+  ret i64 %ret
+}
+
+define noundef i16 @wave_active_umin_i16(i16 noundef %expr) {
+entry:
+; CHECK: call i16 @dx.op.waveActiveOp.i16(i32 119, i16 %expr, i8 2, i8 1){{$}}
+  %ret = call i16 @llvm.dx.wave.reduce.umin.i16(i16 %expr)
+  ret i16 %ret
+}
+
+define noundef i32 @wave_active_umin_i32(i32 noundef %expr) {
+entry:
+; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr, i8 2, i8 1){{$}}
+  %ret = call i32 @llvm.dx.wave.reduce.umin.i32(i32 %expr)
+  ret i32 %ret
+}
+
+define noundef i64 @wave_active_umin_i64(i64 noundef %expr) {
+entry:
+; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr, i8 2, i8 1){{$}}
+  %ret = call i64 @llvm.dx.wave.reduce.umin.i64(i64 %expr)
+  ret i64 %ret
+}
+
+declare half @llvm.dx.wave.reduce.min.f16(half)
+declare float @llvm.dx.wave.reduce.min.f32(float)
+declare double @llvm.dx.wave.reduce.min.f64(double)
+
+declare i16 @llvm.dx.wave.reduce.min.i16(i16)
+declare i32 @llvm.dx.wave.reduce.min.i32(i32)
+declare i64 @llvm.dx.wave.reduce.min.i64(i64)
+
+declare i16 @llvm.dx.wave.reduce.umin.i16(i16)
+declare i32 @llvm.dx.wave.reduce.umin.i32(i32)
+declare i64 @llvm.dx.wave.reduce.umin.i64(i64)
+
+; Test that for vector values, WaveActiveMin scalarizes and maps down to the
+; DirectX op
+
+define noundef <2 x half> @wave_active_min_v2half(<2 x half> noundef %expr) {
+entry:
+; CHECK: call half @dx.op.waveActiveOp.f16(i32 119, half %expr.i0, i8 2, i8 0){{$}}
+; CHECK: call half @dx.op.waveActiveOp.f16(i32 119, half %expr.i1, i8 2, i8 0){{$}}
+  %ret = call <2 x half> @llvm.dx.wave.reduce.min.v2f16(<2 x half> %expr)
+  ret <2 x half> %ret
+}
+
+define noundef <3 x i32> @wave_active_min_v3i32(<3 x i32> noundef %expr) {
+entry:
+; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i0, i8 2, i8 0){{$}}
+; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i1, i8 2, i8 0){{$}}
+; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i2, i8 2, i8 0){{$}}
+  %ret = call <3 x i32> @llvm.dx.wave.reduce.min.v3i32(<3 x i32> %expr)
+  ret <3 x i32> %ret
+}
+
+define noundef <4 x double> @wave_active_min_v4f64(<4 x double> noundef %expr) {
+entry:
+; CHECK: call double @dx.op.waveActiveOp.f64(i32 119, double %expr.i0, i8 2, i8 0){{$}}
+; CHECK: call double @dx.op.waveActiveOp.f64(i32 119, double %expr.i1, i8 2, i8 0){{$}}
+; CHECK: call double @dx.op.waveActiveOp.f64(i32 119, double %expr.i2, i8 2, i8 0){{$}}
+; CHECK: call double @dx.op.waveActiveOp.f64(i32 119, double %expr.i3, i8 2, i8 0){{$}}
+  %ret = call <4 x double> @llvm.dx.wave.reduce.min.v4f64(<4 x double> %expr)
+  ret <4 x double> %ret
+}
+
+declare <2 x half> @llvm.dx.wave.reduce.min.v2f16(<2 x half>)
+declare <3 x i32> @llvm.dx.wave.reduce.min.v3i32(<3 x i32>)
+declare <4 x double> @llvm.dx.wave.reduce.min.v4f64(<4 x double>)
+
+define noundef <2 x i16> @wave_active_umin_v2i16(<2 x i16> noundef %expr) {
+entry:
+; CHECK: call i16 @dx.op.waveActiveOp.i16(i32 119, i16 %expr.i0, i8 2, i8 1){{$}}
+; CHECK: call i16 @dx.op.waveActiveOp.i16(i32 119, i16 %expr.i1, i8 2, i8 1){{$}}
+  %ret = call <2 x i16> @llvm.dx.wave.reduce.umin.v2f16(<2 x i16> %expr)
+  ret <2 x i16> %ret
+}
+
+define noundef <3 x i32> @wave_active_umin_v3i32(<3 x i32> noundef %expr) {
+entry:
+; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i0, i8 2, i8 1){{$}}
+; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i1, i8 2, i8 1){{$}}
+; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i2, i8 2, i8 1){{$}}
+  %ret = call <3 x i32> @llvm.dx.wave.reduce.umin.v3i32(<3 x i32> %expr)
+  ret <3 x i32> %ret
+}
+
+define noundef <4 x i64> @wave_active_umin_v4f64(<4 x i64> noundef %expr) {
+entry:
+; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr.i0, i8 2, i8 1){{$}}
+; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr.i1, i8 2, i8 1){{$}}
+; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr.i2, i8 2, i8 1){{$}}
+; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr.i3, i8 2, i8 1){{$}}
+  %ret = call <4 x i64> @llvm.dx.wave.reduce.umin.v4f64(<4 x i64> %expr)
+  ret <4 x i64> %ret
+}
+
+declare <2 x i16> @llvm.dx.wave.reduce.umin.v2f16(<2 x i16>)
+declare <3 x i32> @llvm.dx.wave.reduce.umin.v3i32(<3 x i32>)
+declare <4 x i64> @llvm.dx.wave.reduce.umin.v4f64(<4 x i64>)
diff --git a/llvm/test/CodeGen/DirectX/metadata-stripping.ll b/llvm/test/CodeGen/DirectX/metadata-stripping.ll
index 531ab6c334d24..53716ff29f292 100644
--- a/llvm/test/CodeGen/DirectX/metadata-stripping.ll
+++ b/llvm/test/CodeGen/DirectX/metadata-stripping.ll
@@ -14,7 +14,7 @@ entry:
 
   %cmp.i = icmp ult i32 1, 2
   ; Ensure that the !llvm.loop metadata node gets dropped.
-  ; CHECK: br i1 %cmp.i, label %_Z4mainDv3_j.exit, label %_Z4mainDv3_j.exit{{$}}
+  ; CHECK: br i1 %cmp.i, label %_Z4mainDv3_j.exit, label %_Z4mainDv3_j.exit, !llvm.loop [[LOOPMD:![0-9]+]]
   br i1 %cmp.i, label %_Z4mainDv3_j.exit, label %_Z4mainDv3_j.exit, !llvm.loop !0
 
 _Z4mainDv3_j.exit:                                ; preds = %for.body.i, %entry
@@ -25,7 +25,8 @@ _Z4mainDv3_j.exit:                                ; preds = %for.body.i, %entry
 ; No more metadata should be necessary, the rest (the current 0 and 1)
 ; should be removed.
 ; CHECK-NOT: !{!"llvm.loop.mustprogress"}
-; CHECK: [[RANGEMD]] = !{i32 1, i32 5}
+; CHECK-DAG: [[RANGEMD]] = !{i32 1, i32 5}
+; CHECK-DAG: [[LOOPMD]] = distinct !{[[LOOPMD]]}
 ; CHECK-NOT: !{!"llvm.loop.mustprogress"}
 !0 = distinct !{!0, !1}
 !1 = !{!"llvm.loop.mustprogress"}
diff --git a/llvm/test/CodeGen/DirectX/strip-module-md.ll b/llvm/test/CodeGen/DirectX/strip-module-md.ll
new file mode 100644
index 0000000000000..4d8b9ec935f6b
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/strip-module-md.ll
@@ -0,0 +1,75 @@
+; RUN: opt -S -dxil-translate-metadata < %s | FileCheck %s
+
+; Ensures that only metadata explictly specified on the allow list, or debug
+; related, metadata is emitted
+
+target triple = "dxil-unknown-shadermodel6.0-compute"
+
+; CHECK-NOT: !dx.rootsignatures
+; CHECK-NOT: !llvm.errno.tbaa
+
+; CHECK-DAG: !llvm.dbg.cu
+
+; CHECK-DAG: !llvm.module.flags = !{![[#DWARF_VER:]], ![[#DEBUG_VER:]]}
+; CHECK-DAG: !llvm.ident = !{![[#IDENT:]]}
+
+; CHECK-DAG: !dx.shaderModel
+; CHECK-DAG: !dx.version
+; CHECK-DAG: !dx.entryPoints
+; CHECK-DAG: !dx.valver
+; CHECK-DAG: !dx.resources
+
+; CHECK-NOT: !dx.rootsignatures
+; CHECK-NOT: !llvm.errno.tbaa
+
+; Check allowed llvm metadata structure to ensure it is still DXIL compatible
+; If this fails, please ensure that the updated form is DXIL compatible before
+; updating the test.
+
+; CHECK-DAG: ![[#IDENT]] = !{!"clang 22.0.0"}
+; CHECK-DAG: ![[#DWARF_VER]] = !{i32 2, !"Dwarf Version", i32 2}
+; CHECK-DAG: ![[#DEBUG_VER]] = !{i32 2, !"Debug Info Version", i32 3}
+
+; CHECK-NOT: !dx.rootsignatures
+; CHECK-NOT: !llvm.errno.tbaa
+
+@BufA.str = private unnamed_addr constant [5 x i8] c"BufA\00", align 1
+
+define void @main () #0 {
+entry:
+  %typed0 = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0)
+              @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v4f32_1_0_0(
+                  i32 3, i32 5, i32 1, i32 0, ptr @BufA.str)
+  ret void
+}
+
+attributes #0 = { noinline nounwind "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
+
+; Incompatible
+!dx.rootsignatures = !{!2}
+!llvm.errno.tbaa = !{!5}
+
+; Compatible
+!llvm.dbg.cu = !{!8}
+!llvm.module.flags = !{!11, !12}
+!llvm.ident = !{!13}
+!dx.valver = !{!14}
+
+!2 = !{ ptr @main, !3, i32 2 }
+!3 = !{ !4 }
+!4 = !{ !"RootFlags", i32 1 }
+
+!5 = !{!6, !6, i64 0}
+!6 = !{!"omnipotent char", !7}
+!7 = !{!"Simple C/C++ TBAA"}
+
+!8 = distinct !DICompileUnit(language: DW_LANG_C99, file: !9, producer: "Some Compiler", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !10, splitDebugInlining: false, nameTableKind: None)
+!9 = !DIFile(filename: "hlsl.hlsl", directory: "/some-path")
+!10 = !{}
+
+!11 = !{i32 2, !"Dwarf Version", i32 2}
+!12 = !{i32 2, !"Debug Info Version", i32 3}
+
+!13 = !{!"clang 22.0.0"}
+
+!14 = !{i32 1, i32 1}
diff --git a/llvm/test/CodeGen/Hexagon/inst_masked_store_bug1.ll b/llvm/test/CodeGen/Hexagon/inst_masked_store_bug1.ll
new file mode 100644
index 0000000000000..fcf124699e8e7
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/inst_masked_store_bug1.ll
@@ -0,0 +1,94 @@
+;; REQUIRES: asserts
+;; RUN: llc --mtriple=hexagon -mattr=+hvxv79,+hvx-length128b %s -o - | FileCheck %s
+;; Sanity check for lowering masked scatter without assertion errors.
+
+define void @outer_product(ptr  %aptr, ptr  %bptr, ptr %cptr, i32 %T, i32 %W) {
+entry:
+  %W.ripple.bcast.splatinsert = insertelement <8 x i32> poison, i32 %W, i64 0
+  %W.ripple.bcast.splat = shufflevector <8 x i32> %W.ripple.bcast.splatinsert, <8 x i32> poison, <8 x i32> zeroinitializer
+  %div1194 = lshr i32 %T, 3
+  %cmp84.not = icmp ult i32 %T, 8
+  br i1 %cmp84.not, label %for.end49, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  %div10195 = lshr i32 %W, 3
+  %cmp1782.not = icmp ult i32 %W, 8
+  %arrayidx27.ripple.LS.dim.slope = mul <8 x i32> %W.ripple.bcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %arrayidx27.ripple.LS.dim.slope.ripple.bcast = shufflevector <8 x i32> %arrayidx27.ripple.LS.dim.slope, <8 x i32> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+  %arrayidx27.ripple.LS.slope = add <64 x i32> %arrayidx27.ripple.LS.dim.slope.ripple.bcast, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %invariant.gep196 = getelementptr i8, ptr %cptr, <64 x i32> %arrayidx27.ripple.LS.slope
+  br label %for.body
+
+for.body:                                         ; preds = %for.end, %for.body.preheader
+  %ripple.par.iv.085 = phi i32 [ %add48, %for.end ], [ 0, %for.body.preheader ]
+  %mul2 = shl i32 %ripple.par.iv.085, 3
+  br i1 %cmp1782.not, label %for.end, label %for.body18.lr.ph
+
+for.body18.lr.ph:                                 ; preds = %for.body
+  %arrayidx = getelementptr inbounds nuw i8, ptr %aptr, i32 %mul2
+  %mul25 = mul i32 %mul2, %W
+  %gep197 = getelementptr i8, <64 x ptr> %invariant.gep196, i32 %mul25
+  br label %for.body18
+
+for.body18:                                       ; preds = %for.body18, %for.body18.lr.ph
+  %ripple.par.iv15.083 = phi i32 [ 0, %for.body18.lr.ph ], [ %add28, %for.body18 ]
+  %mul19 = shl i32 %ripple.par.iv15.083, 3
+  %.ripple.LS.instance184 = load <8 x i8>, ptr %arrayidx, align 1
+  %.ripple.LS.instance184.ripple.bcast = shufflevector <8 x i8> %.ripple.LS.instance184, <8 x i8> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+  %arrayidx21 = getelementptr inbounds nuw i8, ptr %bptr, i32 %mul19
+  %.ripple.LS.instance = load <8 x i8>, ptr %arrayidx21, align 1
+  %.ripple.LS.instance.ripple.bcast = shufflevector <8 x i8> %.ripple.LS.instance, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %mul23.ripple.LS.instance = mul <64 x i8> %.ripple.LS.instance.ripple.bcast, %.ripple.LS.instance184.ripple.bcast
+  %gep = getelementptr i8, <64 x ptr> %gep197, i32 %mul19
+  tail call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> %mul23.ripple.LS.instance, <64 x ptr> %gep, i32 1, <64 x i1> splat (i1 true))
+  %add28 = add nuw i32 %ripple.par.iv15.083, 1
+  %cmp17 = icmp ult i32 %add28, %div10195
+  br i1 %cmp17, label %for.body18, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body18
+  %0 = shl i32 %add28, 3
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %for.body
+  %ripple.par.iv15.0.lcssa = phi i32 [ 0, %for.body ], [ %0, %for.end.loopexit ]
+  %add30.ripple.bcast.splatinsert = insertelement <8 x i32> poison, i32 %ripple.par.iv15.0.lcssa, i64 0
+  %add30.ripple.bcast.splat = shufflevector <8 x i32> %add30.ripple.bcast.splatinsert, <8 x i32> poison, <8 x i32> zeroinitializer
+  %add30.ripple.LS.instance = or disjoint <8 x i32> %add30.ripple.bcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %cmp32.ripple.LS.instance = icmp ne i32 %ripple.par.iv15.0.lcssa, %W
+  %cmp32.ripple.LS.instance.ripple.bcast.splatinsert = insertelement <8 x i1> poison, i1 %cmp32.ripple.LS.instance, i64 0
+  %cmp32.ripple.LS.instance.ripple.bcast.splat = shufflevector <8 x i1> %cmp32.ripple.LS.instance.ripple.bcast.splatinsert, <8 x i1> poison, <8 x i32> zeroinitializer
+  %cmp33.ripple.vectorized = icmp ult <8 x i32> %add30.ripple.LS.instance, %W.ripple.bcast.splat
+  %or.cond.ripple.LS.instance = select <8 x i1> %cmp32.ripple.LS.instance.ripple.bcast.splat, <8 x i1> %cmp33.ripple.vectorized, <8 x i1> zeroinitializer
+  %or.cond.ripple.LS.instance.ripple.bcast = shufflevector <8 x i1> %or.cond.ripple.LS.instance, <8 x i1> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %or.cond.ripple.LS.instance.ripple.reducelog2.shuffle = shufflevector <8 x i1> %or.cond.ripple.LS.instance, <8 x i1> <i1 poison, i1 poison, i1 poison, i1 poison, i1 poison, i1 poison, i1 poison, i1 false>, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 15>
+  %or.cond.ripple.LS.instance.ripple.reducelog2.operator = or <8 x i1> %or.cond.ripple.LS.instance, %or.cond.ripple.LS.instance.ripple.reducelog2.shuffle
+  %or.cond.ripple.LS.instance.ripple.reducelog2.shuffle189 = shufflevector <8 x i1> %or.cond.ripple.LS.instance.ripple.reducelog2.operator, <8 x i1> <i1 poison, i1 poison, i1 poison, i1 poison, i1 poison, i1 poison, i1 false, i1 false>, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 14, i32 15>
+  %or.cond.ripple.LS.instance.ripple.reducelog2.operator190 = or <8 x i1> %or.cond.ripple.LS.instance.ripple.reducelog2.operator, %or.cond.ripple.LS.instance.ripple.reducelog2.shuffle189
+  %or.cond.ripple.LS.instance.ripple.reducelog2.shuffle191 = shufflevector <8 x i1> %or.cond.ripple.LS.instance.ripple.reducelog2.operator190, <8 x i1> poison, <8 x i32> <i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %or.cond.ripple.LS.instance.ripple.reducelog2.operator192 = or <8 x i1> %or.cond.ripple.LS.instance.ripple.reducelog2.operator190, %or.cond.ripple.LS.instance.ripple.reducelog2.shuffle191
+  %ripple.red.extract.ripple.bcast.splat = shufflevector <8 x i1> %or.cond.ripple.LS.instance.ripple.reducelog2.operator192, <8 x i1> poison, <8 x i32> zeroinitializer
+  %arrayidx34.ripple.branch.clone = getelementptr inbounds nuw i8, ptr %aptr, i32 %mul2
+  %.ripple.LS.instance188.ripple.branch.clone.ripple.masked.load = tail call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %arrayidx34.ripple.branch.clone, i32 1, <8 x i1> %ripple.red.extract.ripple.bcast.splat, <8 x i8> poison)
+  %.ripple.LS.instance188.ripple.bcast.ripple.branch.clone = shufflevector <8 x i8> %.ripple.LS.instance188.ripple.branch.clone.ripple.masked.load, <8 x i8> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+  %arrayidx36.ripple.branch.clone = getelementptr inbounds nuw i8, ptr %bptr, i32 %ripple.par.iv15.0.lcssa
+  %.ripple.LS.instance187.ripple.branch.clone.ripple.masked.load = tail call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %arrayidx36.ripple.branch.clone, i32 1, <8 x i1> %or.cond.ripple.LS.instance, <8 x i8> poison)
+  %.ripple.LS.instance187.ripple.bcast.ripple.branch.clone = shufflevector <8 x i8> %.ripple.LS.instance187.ripple.branch.clone.ripple.masked.load, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %mul38.ripple.LS.instance.ripple.branch.clone = mul <64 x i8> %.ripple.LS.instance187.ripple.bcast.ripple.branch.clone, %.ripple.LS.instance188.ripple.bcast.ripple.branch.clone
+  %mul40.ripple.branch.clone = mul i32 %mul2, %W
+  %1 = getelementptr i8, ptr %cptr, i32 %mul40.ripple.branch.clone
+  %arrayidx42.ripple.branch.clone = getelementptr i8, ptr %1, i32 %ripple.par.iv15.0.lcssa
+  %arrayidx42.ripple.LS.instance.ripple.branch.clone = getelementptr i8, ptr %arrayidx42.ripple.branch.clone, <64 x i32> %arrayidx27.ripple.LS.slope
+  tail call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> %mul38.ripple.LS.instance.ripple.branch.clone, <64 x ptr> %arrayidx42.ripple.LS.instance.ripple.branch.clone, i32 1, <64 x i1> %or.cond.ripple.LS.instance.ripple.bcast)
+  %add48 = add nuw i32 %ripple.par.iv.085, 1
+  %cmp = icmp ult i32 %add48, %div1194
+  br i1 %cmp, label %for.body, label %for.end49
+
+for.end49:                                        ; preds = %for.end, %entry
+  ret void
+}
+
+;; CHECK: outer_product
+;; CHECK: {{r[0-9]+}} = lsr({{r[0-9]+}},#3)
+;; CHECK: {{q[0-9]+}} = vand({{v[0-9]+}},{{r[0-9]+}})
+;; CHECK: {{v[0-9]+}} = vmux(q0,{{v[0-9]+}},{{v[0-9]+}})
+;; CHECK: vmem{{.*}} = {{v[0-9]+}}
diff --git a/llvm/test/CodeGen/Hexagon/isel-fclass.ll b/llvm/test/CodeGen/Hexagon/isel-fclass.ll
new file mode 100644
index 0000000000000..96b02106fa807
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/isel-fclass.ll
@@ -0,0 +1,86 @@
+; Tests lowering of sfclass/dfclass compares.
+; Sub-optimal code
+;         {
+;                p0 = sfclass(r0,#16)
+;                r0 = sfadd(r0,r0)
+;        }
+;        {
+;                r2 = p0
+;        }
+;        {
+;                if (p0.new) r0 = ##1065353216
+;                p0 = cmp.eq(r2,#0)
+;                jumpr r31
+;        }
+; With the patterns added, we should be generating
+;        {
+;                p0 = sfclass(r0,#16)
+;                r0 = sfadd(r0,r0)
+;        }
+;        {
+;                if (!p0) r0 = ##1065353216
+;                jumpr r31
+;        }
+
+; RUN: llc -march=hexagon -stop-after=hexagon-isel %s -o - | FileCheck %s
+
+; CHECK: bb.0.entry1
+; CHECK: F2_sfclass
+; CHECK-NOT: C2_cmp
+; CHECK: C2_not
+; CHECK: F2_sfadd
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+define float @test1(float noundef %x) {
+entry1:
+  %0 = tail call i32 @llvm.hexagon.F2.sfclass(float %x, i32 16)
+  %tobool.not = icmp eq i32 %0, 0
+  %add = fadd float %x, %x
+  %spec.select = select i1 %tobool.not, float 1.000000e+00, float %add
+  ret float %spec.select
+}
+
+; CHECK: bb.0.entry2
+; CHECK: F2_sfclass
+; CHECK-NOT: C2_cmp
+; CHECK: F2_sfadd
+define float @test2(float noundef %x) {
+entry2:
+  %0 = tail call i32 @llvm.hexagon.F2.sfclass(float %x, i32 16)
+  %tobool.not = icmp eq i32 %0, 0
+  %add = fadd float %x, %x
+  %spec.select = select i1 %tobool.not, float %add, float 1.000000e+00
+  ret float %spec.select
+}
+
+; CHECK: bb.0.entry3
+; CHECK: F2_dfclass
+; CHECK-NOT: C2_cmp
+; CHECK: C2_not
+; CHECK: F2_dfadd
+define double @test3(double noundef %x) {
+entry3:
+  %0 = tail call i32 @llvm.hexagon.F2.dfclass(double %x, i32 16)
+  %tobool.not = icmp eq i32 %0, 0
+  %add = fadd double %x, %x
+  %spec.select = select i1 %tobool.not, double 1.000000e+00, double %add
+  ret double %spec.select
+}
+
+; CHECK: bb.0.entry4
+; CHECK: F2_dfclass
+; CHECK-NOT: C2_cmp
+; CHECK: F2_dfadd
+define double @test4(double noundef %x) {
+entry4:
+  %0 = tail call i32 @llvm.hexagon.F2.dfclass(double %x, i32 16)
+  %tobool.not = icmp eq i32 %0, 0
+  %add = fadd double %x, %x
+  %spec.select = select i1 %tobool.not, double %add, double 1.000000e+00
+  ret double %spec.select
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare i32 @llvm.hexagon.F2.dfclass(double, i32 immarg)
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare i32 @llvm.hexagon.F2.sfclass(float, i32 immarg)
diff --git a/llvm/test/CodeGen/Hexagon/isel/trunc-vNi1-HVX.ll b/llvm/test/CodeGen/Hexagon/isel/trunc-vNi1-HVX.ll
new file mode 100644
index 0000000000000..1491729a17f30
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/isel/trunc-vNi1-HVX.ll
@@ -0,0 +1,18 @@
+; RUN: llc --mtriple=hexagon -mattr=+hvxv79,+hvx-length128b < %s | FileCheck %s
+
+define void @f5(<64 x i32> %a0, ptr %a1) {
+; CHECK-LABEL: f5:
+; CHECK: [[REG0:(r[0-9]+)]] = ##16843009
+; CHECK-DAG: q[[Q0:[0-9]+]] = vand(v{{[0-9]+}},[[REG0]])
+; CHECK-DAG: q[[Q1:[0-9]+]] = vand(v{{[0-9]+}},[[REG0]])
+; CHECK: v{{[0-9]+}}.b = vpacke(v{{[0-9]+}}.h,v{{[0-9]+}}.h)
+; CHECK: v{{[0-9]+}}.b = vpacke(v{{[0-9]+}}.h,v{{[0-9]+}}.h)
+; CHECK: v[[VROR:[0-9]+]] = vror(v{{[0-9]+}},r{{[0-9]+}})
+; CHECK: v[[VOR:[0-9]+]] = vor(v[[VROR]],v{{[0-9]+}})
+; CHECK: q{{[0-9]+}} = vand(v[[VOR]],r{{[0-9]+}})
+b0:
+  %v0 = trunc <64 x i32> %a0 to <64 x i1>
+  store <64 x i1> %v0, ptr %a1, align 1
+  ret void
+}
+
diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/flog2.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/flog2.ll
index 93fcd421e4bd7..e02a2e7cce9b2 100644
--- a/llvm/test/CodeGen/LoongArch/ir-instruction/flog2.ll
+++ b/llvm/test/CodeGen/LoongArch/ir-instruction/flog2.ll
@@ -12,8 +12,8 @@ define float @flog2_s(float %x) nounwind {
 ;
 ; LA64-LABEL: flog2_s:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    pcaddu18i $t8, %call36(log2f)
-; LA64-NEXT:    jr $t8
+; LA64-NEXT:    flogb.s $fa0, $fa0
+; LA64-NEXT:    ret
   %y = call float @llvm.log2.f32(float %x)
   ret float %y
 }
@@ -25,8 +25,8 @@ define double @flog2_d(double %x) nounwind {
 ;
 ; LA64-LABEL: flog2_d:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    pcaddu18i $t8, %call36(log2)
-; LA64-NEXT:    jr $t8
+; LA64-NEXT:    flogb.d $fa0, $fa0
+; LA64-NEXT:    ret
   %y = call double @llvm.log2.f64(double %x)
   ret double %y
 }
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll b/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll
index ba2118fb94f63..b3155c9313a8a 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll
@@ -106,6 +106,69 @@ define void @ctlz_v4i64(ptr %src, ptr %dst) nounwind {
   ret void
 }
 
+define void @not_ctlz_v32i8(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvld $xr0, $a0, 0
+; CHECK-NEXT:    xvxori.b $xr0, $xr0, 255
+; CHECK-NEXT:    xvclz.b $xr0, $xr0
+; CHECK-NEXT:    xvst $xr0, $a1, 0
+; CHECK-NEXT:    ret
+  %v = load <32 x i8>, ptr %src
+  %neg = xor <32 x i8> %v, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %res = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %neg, i1 false)
+  store <32 x i8> %res, ptr %dst
+  ret void
+}
+
+define void @not_ctlz_v16i16(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvld $xr0, $a0, 0
+; CHECK-NEXT:    xvrepli.b $xr1, -1
+; CHECK-NEXT:    xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvclz.h $xr0, $xr0
+; CHECK-NEXT:    xvst $xr0, $a1, 0
+; CHECK-NEXT:    ret
+  %v = load <16 x i16>, ptr %src
+  %neg = xor <16 x i16> %v, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+  %res = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %neg, i1 false)
+  store <16 x i16> %res, ptr %dst
+  ret void
+}
+
+define void @not_ctlz_v8i32(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvld $xr0, $a0, 0
+; CHECK-NEXT:    xvrepli.b $xr1, -1
+; CHECK-NEXT:    xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvclz.w $xr0, $xr0
+; CHECK-NEXT:    xvst $xr0, $a1, 0
+; CHECK-NEXT:    ret
+  %v = load <8 x i32>, ptr %src
+  %neg = xor <8 x i32> %v, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  %res = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %neg, i1 false)
+  store <8 x i32> %res, ptr %dst
+  ret void
+}
+
+define void @not_ctlz_v4i64(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvld $xr0, $a0, 0
+; CHECK-NEXT:    xvrepli.b $xr1, -1
+; CHECK-NEXT:    xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvclz.d $xr0, $xr0
+; CHECK-NEXT:    xvst $xr0, $a1, 0
+; CHECK-NEXT:    ret
+  %v = load <4 x i64>, ptr %src
+  %neg = xor <4 x i64> %v, <i64 -1, i64 -1, i64 -1, i64 -1>
+  %res = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %neg, i1 false)
+  store <4 x i64> %res, ptr %dst
+  ret void
+}
+
 declare <32 x i8> @llvm.ctpop.v32i8(<32 x i8>)
 declare <16 x i16> @llvm.ctpop.v16i16(<16 x i16>)
 declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>)
diff --git a/llvm/test/CodeGen/LoongArch/lasx/fp-max-min.ll b/llvm/test/CodeGen/LoongArch/lasx/fp-max-min.ll
new file mode 100644
index 0000000000000..8e08e1ee9e094
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/fp-max-min.ll
@@ -0,0 +1,72 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s
+
+define void @minnum_v8f32(ptr %res, ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: minnum_v8f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvfmin.s $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %v0 = load <8 x float>, ptr %x
+  %v1 = load <8 x float>, ptr %y
+  %r = call <8 x float> @llvm.minnum.v8f32(<8 x float> %v0, <8 x float> %v1)
+  store <8 x float> %r, ptr %res
+  ret void
+}
+
+define void @minnum_v4f64(ptr %res, ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: minnum_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvfmin.d $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %v0 = load <4 x double>, ptr %x
+  %v1 = load <4 x double>, ptr %y
+  %r = call <4 x double> @llvm.minnum.v4f64(<4 x double> %v0, <4 x double> %v1)
+  store <4 x double> %r, ptr %res
+  ret void
+}
+
+define void @maxnum_v8f32(ptr %res, ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: maxnum_v8f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvfmax.s $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %v0 = load <8 x float>, ptr %x
+  %v1 = load <8 x float>, ptr %y
+  %r = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %v0, <8 x float> %v1)
+  store <8 x float> %r, ptr %res
+  ret void
+}
+
+define void @maxnum_v4f64(ptr %res, ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: maxnum_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvfmax.d $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %v0 = load <4 x double>, ptr %x
+  %v1 = load <4 x double>, ptr %y
+  %r = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %v0, <4 x double> %v1)
+  store <4 x double> %r, ptr %res
+  ret void
+}
+
+declare <8 x float> @llvm.minnum.v8f32(<8 x float>, <8 x float>)
+declare <4 x double> @llvm.minnum.v4f64(<4 x double>, <4 x double>)
+declare <8 x float> @llvm.maxnum.v8f32(<8 x float>, <8 x float>)
+declare <4 x double> @llvm.maxnum.v4f64(<4 x double>, <4 x double>)
diff --git a/llvm/test/CodeGen/LoongArch/lasx/fp-rounding.ll b/llvm/test/CodeGen/LoongArch/lasx/fp-rounding.ll
new file mode 100644
index 0000000000000..fa5f27edf615e
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/fp-rounding.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s
+
+;; ceilf
+define void @ceil_v8f32(ptr %res, ptr %a0) nounwind {
+; CHECK-LABEL: ceil_v8f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvfrintrp.s $xr0, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %v0 = load <8 x float>, ptr %a0
+  %r = call <8 x float> @llvm.ceil.v8f32(<8 x float> %v0)
+  store <8 x float> %r, ptr %res
+  ret void
+}
+
+;; ceil
+define void @ceil_v4f64(ptr %res, ptr %a0) nounwind {
+; CHECK-LABEL: ceil_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvfrintrp.d $xr0, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %v0 = load <4 x double>, ptr %a0
+  %r = call <4 x double> @llvm.ceil.v4f64(<4 x double> %v0)
+  store <4 x double> %r, ptr %res
+  ret void
+}
+
+;; floorf
+define void @floor_v8f32(ptr %res, ptr %a0) nounwind {
+; CHECK-LABEL: floor_v8f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvfrintrm.s $xr0, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %v0 = load <8 x float>, ptr %a0
+  %r = call <8 x float> @llvm.floor.v8f32(<8 x float> %v0)
+  store <8 x float> %r, ptr %res
+  ret void
+}
+
+;; floor
+define void @floor_v4f64(ptr %res, ptr %a0) nounwind {
+; CHECK-LABEL: floor_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvfrintrm.d $xr0, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %v0 = load <4 x double>, ptr %a0
+  %r = call <4 x double> @llvm.floor.v4f64(<4 x double> %v0)
+  store <4 x double> %r, ptr %res
+  ret void
+}
+
+;; truncf
+define void @trunc_v8f32(ptr %res, ptr %a0) nounwind {
+; CHECK-LABEL: trunc_v8f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvfrintrz.s $xr0, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %v0 = load <8 x float>, ptr %a0
+  %r = call <8 x float> @llvm.trunc.v8f32(<8 x float> %v0)
+  store <8 x float> %r, ptr %res
+  ret void
+}
+
+;; trunc
+define void @trunc_v4f64(ptr %res, ptr %a0) nounwind {
+; CHECK-LABEL: trunc_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvfrintrz.d $xr0, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %v0 = load <4 x double>, ptr %a0
+  %r = call <4 x double> @llvm.trunc.v4f64(<4 x double> %v0)
+  store <4 x double> %r, ptr %res
+  ret void
+}
+
+;; roundevenf
+define void @roundeven_v8f32(ptr %res, ptr %a0) nounwind {
+; CHECK-LABEL: roundeven_v8f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvfrintrne.s $xr0, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %v0 = load <8 x float>, ptr %a0
+  %r = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %v0)
+  store <8 x float> %r, ptr %res
+  ret void
+}
+
+;; roundeven
+define void @roundeven_v4f64(ptr %res, ptr %a0) nounwind {
+; CHECK-LABEL: roundeven_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvfrintrne.d $xr0, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %v0 = load <4 x double>, ptr %a0
+  %r = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %v0)
+  store <4 x double> %r, ptr %res
+  ret void
+}
+
+declare <8 x float> @llvm.ceil.v8f32(<8 x float>)
+declare <4 x double> @llvm.ceil.v4f64(<4 x double>)
+declare <8 x float> @llvm.floor.v8f32(<8 x float>)
+declare <4 x double> @llvm.floor.v4f64(<4 x double>)
+declare <8 x float> @llvm.trunc.v8f32(<8 x float>)
+declare <4 x double> @llvm.trunc.v4f64(<4 x double>)
+declare <8 x float> @llvm.roundeven.v8f32(<8 x float>)
+declare <4 x double> @llvm.roundeven.v4f64(<4 x double>)
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/avg.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/avg.ll
new file mode 100644
index 0000000000000..5c5c19935080b
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/avg.ll
@@ -0,0 +1,321 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA32
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA64
+
+define void @xvavg_b(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: xvavg_b:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvavg.b $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <32 x i8>, ptr %a
+  %vb = load <32 x i8>, ptr %b
+  %add = add <32 x i8> %va, %vb
+  %shr = ashr <32 x i8> %add, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  store <32 x i8> %shr, ptr %res
+  ret void
+}
+
+define void @xvavg_h(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: xvavg_h:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvavg.h $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <16 x i16>, ptr %a
+  %vb = load <16 x i16>, ptr %b
+  %add = add <16 x i16> %va, %vb
+  %shr = ashr <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  store <16 x i16> %shr, ptr %res
+  ret void
+}
+
+define void @xvavg_w(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: xvavg_w:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvavg.w $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <8 x i32>, ptr %a
+  %vb = load <8 x i32>, ptr %b
+  %add = add <8 x i32> %va, %vb
+  %shr = ashr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  store <8 x i32> %shr, ptr %res
+  ret void
+}
+
+define void @xvavg_d(ptr %res, ptr %a, ptr %b) nounwind {
+; LA32-LABEL: xvavg_d:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    xvld $xr0, $a1, 0
+; LA32-NEXT:    xvld $xr1, $a2, 0
+; LA32-NEXT:    xvadd.d $xr0, $xr0, $xr1
+; LA32-NEXT:    xvsrai.d $xr0, $xr0, 1
+; LA32-NEXT:    xvst $xr0, $a0, 0
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: xvavg_d:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    xvld $xr0, $a1, 0
+; LA64-NEXT:    xvld $xr1, $a2, 0
+; LA64-NEXT:    xvavg.d $xr0, $xr0, $xr1
+; LA64-NEXT:    xvst $xr0, $a0, 0
+; LA64-NEXT:    ret
+entry:
+  %va = load <4 x i64>, ptr %a
+  %vb = load <4 x i64>, ptr %b
+  %add = add <4 x i64> %va, %vb
+  %shr = ashr <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1>
+  store <4 x i64> %shr, ptr %res
+  ret void
+}
+
+define void @xvavg_bu(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: xvavg_bu:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvavg.bu $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <32 x i8>, ptr %a
+  %vb = load <32 x i8>, ptr %b
+  %add = add <32 x i8> %va, %vb
+  %shr = lshr <32 x i8> %add, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  store <32 x i8> %shr, ptr %res
+  ret void
+}
+
+define void @xvavg_hu(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: xvavg_hu:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvavg.hu $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <16 x i16>, ptr %a
+  %vb = load <16 x i16>, ptr %b
+  %add = add <16 x i16> %va, %vb
+  %shr = lshr <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  store <16 x i16> %shr, ptr %res
+  ret void
+}
+
+define void @xvavg_wu(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: xvavg_wu:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvavg.wu $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <8 x i32>, ptr %a
+  %vb = load <8 x i32>, ptr %b
+  %add = add <8 x i32> %va, %vb
+  %shr = lshr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  store <8 x i32> %shr, ptr %res
+  ret void
+}
+
+define void @xvavg_du(ptr %res, ptr %a, ptr %b) nounwind {
+; LA32-LABEL: xvavg_du:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    xvld $xr0, $a1, 0
+; LA32-NEXT:    xvld $xr1, $a2, 0
+; LA32-NEXT:    xvadd.d $xr0, $xr0, $xr1
+; LA32-NEXT:    xvsrli.d $xr0, $xr0, 1
+; LA32-NEXT:    xvst $xr0, $a0, 0
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: xvavg_du:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    xvld $xr0, $a1, 0
+; LA64-NEXT:    xvld $xr1, $a2, 0
+; LA64-NEXT:    xvavg.du $xr0, $xr0, $xr1
+; LA64-NEXT:    xvst $xr0, $a0, 0
+; LA64-NEXT:    ret
+entry:
+  %va = load <4 x i64>, ptr %a
+  %vb = load <4 x i64>, ptr %b
+  %add = add <4 x i64> %va, %vb
+  %shr = lshr <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1>
+  store <4 x i64> %shr, ptr %res
+  ret void
+}
+
+define void @xvavgr_b(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: xvavgr_b:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvavgr.b $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <32 x i8>, ptr %a
+  %vb = load <32 x i8>, ptr %b
+  %add = add <32 x i8> %va, %vb
+  %add1 = add <32 x i8> %add, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %shr = ashr <32 x i8> %add1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  store <32 x i8> %shr, ptr %res
+  ret void
+}
+
+define void @xvavgr_h(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: xvavgr_h:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvavgr.h $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <16 x i16>, ptr %a
+  %vb = load <16 x i16>, ptr %b
+  %add = add <16 x i16> %va, %vb
+  %add1 = add <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %shr = ashr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  store <16 x i16> %shr, ptr %res
+  ret void
+}
+
+define void @xvavgr_w(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: xvavgr_w:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvavgr.w $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <8 x i32>, ptr %a
+  %vb = load <8 x i32>, ptr %b
+  %add = add <8 x i32> %va, %vb
+  %add1 = add <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %shr = ashr <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  store <8 x i32> %shr, ptr %res
+  ret void
+}
+
+define void @xvavgr_d(ptr %res, ptr %a, ptr %b) nounwind {
+; LA32-LABEL: xvavgr_d:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    xvld $xr0, $a1, 0
+; LA32-NEXT:    xvld $xr1, $a2, 0
+; LA32-NEXT:    xvadd.d $xr0, $xr0, $xr1
+; LA32-NEXT:    xvaddi.du $xr0, $xr0, 1
+; LA32-NEXT:    xvsrai.d $xr0, $xr0, 1
+; LA32-NEXT:    xvst $xr0, $a0, 0
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: xvavgr_d:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    xvld $xr0, $a1, 0
+; LA64-NEXT:    xvld $xr1, $a2, 0
+; LA64-NEXT:    xvavgr.d $xr0, $xr0, $xr1
+; LA64-NEXT:    xvst $xr0, $a0, 0
+; LA64-NEXT:    ret
+entry:
+  %va = load <4 x i64>, ptr %a
+  %vb = load <4 x i64>, ptr %b
+  %add = add <4 x i64> %va, %vb
+  %add1 = add <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1>
+  %shr = ashr <4 x i64> %add1, <i64 1, i64 1, i64 1, i64 1>
+  store <4 x i64> %shr, ptr %res
+  ret void
+}
+
+define void @xvavgr_bu(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: xvavgr_bu:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvavgr.bu $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <32 x i8>, ptr %a
+  %vb = load <32 x i8>, ptr %b
+  %add = add <32 x i8> %va, %vb
+  %add1 = add <32 x i8> %add, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %shr = lshr <32 x i8> %add1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  store <32 x i8> %shr, ptr %res
+  ret void
+}
+
+define void @xvavgr_hu(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: xvavgr_hu:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvavgr.hu $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <16 x i16>, ptr %a
+  %vb = load <16 x i16>, ptr %b
+  %add = add <16 x i16> %va, %vb
+  %add1 = add <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %shr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  store <16 x i16> %shr, ptr %res
+  ret void
+}
+
+define void @xvavgr_wu(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: xvavgr_wu:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvavgr.wu $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <8 x i32>, ptr %a
+  %vb = load <8 x i32>, ptr %b
+  %add = add <8 x i32> %va, %vb
+  %add1 = add <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %shr = lshr <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  store <8 x i32> %shr, ptr %res
+  ret void
+}
+
+define void @xvavgr_du(ptr %res, ptr %a, ptr %b) nounwind {
+; LA32-LABEL: xvavgr_du:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    xvld $xr0, $a1, 0
+; LA32-NEXT:    xvld $xr1, $a2, 0
+; LA32-NEXT:    xvadd.d $xr0, $xr0, $xr1
+; LA32-NEXT:    xvaddi.du $xr0, $xr0, 1
+; LA32-NEXT:    xvsrli.d $xr0, $xr0, 1
+; LA32-NEXT:    xvst $xr0, $a0, 0
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: xvavgr_du:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    xvld $xr0, $a1, 0
+; LA64-NEXT:    xvld $xr1, $a2, 0
+; LA64-NEXT:    xvavgr.du $xr0, $xr0, $xr1
+; LA64-NEXT:    xvst $xr0, $a0, 0
+; LA64-NEXT:    ret
+entry:
+  %va = load <4 x i64>, ptr %a
+  %vb = load <4 x i64>, ptr %b
+  %add = add <4 x i64> %va, %vb
+  %add1 = add <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1>
+  %shr = lshr <4 x i64> %add1, <i64 1, i64 1, i64 1, i64 1>
+  store <4 x i64> %shr, ptr %res
+  ret void
+}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/avgfloor-ceil.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/avgfloor-ceil.ll
new file mode 100644
index 0000000000000..c82adcb250c64
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/avgfloor-ceil.ll
@@ -0,0 +1,379 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s
+
+define void @xvavg_b(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: xvavg_b:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvand.v $xr2, $xr0, $xr1
+; CHECK-NEXT:    xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvsrai.b $xr0, $xr0, 1
+; CHECK-NEXT:    xvadd.b $xr0, $xr2, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <32 x i8>, ptr %a
+  %vb = load <32 x i8>, ptr %b
+  %ea = sext <32 x i8> %va to <32 x i16>
+  %eb = sext <32 x i8> %vb to <32 x i16>
+  %add = add <32 x i16> %ea, %eb
+  %shr = lshr <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %r = trunc <32 x i16> %shr to <32 x i8>
+  store <32 x i8> %r, ptr %res
+  ret void
+}
+
+define void @xvavg_h(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: xvavg_h:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvand.v $xr2, $xr0, $xr1
+; CHECK-NEXT:    xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvsrai.h $xr0, $xr0, 1
+; CHECK-NEXT:    xvadd.h $xr0, $xr2, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <16 x i16>, ptr %a
+  %vb = load <16 x i16>, ptr %b
+  %ea = sext <16 x i16> %va to <16 x i32>
+  %eb = sext <16 x i16> %vb to <16 x i32>
+  %add = add <16 x i32> %ea, %eb
+  %shr = lshr <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %r = trunc <16 x i32> %shr to <16 x i16>
+  store <16 x i16> %r, ptr %res
+  ret void
+}
+
+define void @xvavg_w(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: xvavg_w:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvand.v $xr2, $xr0, $xr1
+; CHECK-NEXT:    xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvsrai.w $xr0, $xr0, 1
+; CHECK-NEXT:    xvadd.w $xr0, $xr2, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <8 x i32>, ptr %a
+  %vb = load <8 x i32>, ptr %b
+  %ea = sext <8 x i32> %va to <8 x i64>
+  %eb = sext <8 x i32> %vb to <8 x i64>
+  %add = add <8 x i64> %ea, %eb
+  %shr = lshr <8 x i64> %add, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %r = trunc <8 x i64> %shr to <8 x i32>
+  store <8 x i32> %r, ptr %res
+  ret void
+}
+
+define void @xvavg_d(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: xvavg_d:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvand.v $xr2, $xr0, $xr1
+; CHECK-NEXT:    xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvsrai.d $xr0, $xr0, 1
+; CHECK-NEXT:    xvadd.d $xr0, $xr2, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <4 x i64>, ptr %a
+  %vb = load <4 x i64>, ptr %b
+  %ea = sext <4 x i64> %va to <4 x i128>
+  %eb = sext <4 x i64> %vb to <4 x i128>
+  %add = add <4 x i128> %ea, %eb
+  %shr = lshr <4 x i128> %add, <i128 1, i128 1, i128 1, i128 1>
+  %r = trunc <4 x i128> %shr to <4 x i64>
+  store <4 x i64> %r, ptr %res
+  ret void
+}
+
+define void @xvavg_bu(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: xvavg_bu:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvand.v $xr2, $xr0, $xr1
+; CHECK-NEXT:    xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvsrli.b $xr0, $xr0, 1
+; CHECK-NEXT:    xvadd.b $xr0, $xr2, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <32 x i8>, ptr %a
+  %vb = load <32 x i8>, ptr %b
+  %ea = zext <32 x i8> %va to <32 x i16>
+  %eb = zext <32 x i8> %vb to <32 x i16>
+  %add = add <32 x i16> %ea, %eb
+  %shr = lshr <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %r = trunc <32 x i16> %shr to <32 x i8>
+  store <32 x i8> %r, ptr %res
+  ret void
+}
+
+define void @xvavg_hu(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: xvavg_hu:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvand.v $xr2, $xr0, $xr1
+; CHECK-NEXT:    xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvsrli.h $xr0, $xr0, 1
+; CHECK-NEXT:    xvadd.h $xr0, $xr2, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <16 x i16>, ptr %a
+  %vb = load <16 x i16>, ptr %b
+  %ea = zext <16 x i16> %va to <16 x i32>
+  %eb = zext <16 x i16> %vb to <16 x i32>
+  %add = add <16 x i32> %ea, %eb
+  %shr = lshr <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %r = trunc <16 x i32> %shr to <16 x i16>
+  store <16 x i16> %r, ptr %res
+  ret void
+}
+
+define void @xvavg_wu(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: xvavg_wu:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvand.v $xr2, $xr0, $xr1
+; CHECK-NEXT:    xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvsrli.w $xr0, $xr0, 1
+; CHECK-NEXT:    xvadd.w $xr0, $xr2, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <8 x i32>, ptr %a
+  %vb = load <8 x i32>, ptr %b
+  %ea = zext <8 x i32> %va to <8 x i64>
+  %eb = zext <8 x i32> %vb to <8 x i64>
+  %add = add <8 x i64> %ea, %eb
+  %shr = lshr <8 x i64> %add, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %r = trunc <8 x i64> %shr to <8 x i32>
+  store <8 x i32> %r, ptr %res
+  ret void
+}
+
+define void @xvavg_du(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: xvavg_du:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvand.v $xr2, $xr0, $xr1
+; CHECK-NEXT:    xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvsrli.d $xr0, $xr0, 1
+; CHECK-NEXT:    xvadd.d $xr0, $xr2, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <4 x i64>, ptr %a
+  %vb = load <4 x i64>, ptr %b
+  %ea = zext <4 x i64> %va to <4 x i128>
+  %eb = zext <4 x i64> %vb to <4 x i128>
+  %add = add <4 x i128> %ea, %eb
+  %shr = lshr <4 x i128> %add, <i128 1, i128 1, i128 1, i128 1>
+  %r = trunc <4 x i128> %shr to <4 x i64>
+  store <4 x i64> %r, ptr %res
+  ret void
+}
+
+define void @xvavgr_b(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: xvavgr_b:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvor.v $xr2, $xr0, $xr1
+; CHECK-NEXT:    xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvsrai.b $xr0, $xr0, 1
+; CHECK-NEXT:    xvsub.b $xr0, $xr2, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <32 x i8>, ptr %a
+  %vb = load <32 x i8>, ptr %b
+  %ea = sext <32 x i8> %va to <32 x i16>
+  %eb = sext <32 x i8> %vb to <32 x i16>
+  %add = add <32 x i16> %ea, %eb
+  %add1 = add <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %shr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %r = trunc <32 x i16> %shr to <32 x i8>
+  store <32 x i8> %r, ptr %res
+  ret void
+}
+
+define void @xvavgr_h(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: xvavgr_h:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvor.v $xr2, $xr0, $xr1
+; CHECK-NEXT:    xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvsrai.h $xr0, $xr0, 1
+; CHECK-NEXT:    xvsub.h $xr0, $xr2, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <16 x i16>, ptr %a
+  %vb = load <16 x i16>, ptr %b
+  %ea = sext <16 x i16> %va to <16 x i32>
+  %eb = sext <16 x i16> %vb to <16 x i32>
+  %add = add <16 x i32> %ea, %eb
+  %add1 = add <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %shr = lshr <16 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %r = trunc <16 x i32> %shr to <16 x i16>
+  store <16 x i16> %r, ptr %res
+  ret void
+}
+
+define void @xvavgr_w(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: xvavgr_w:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvor.v $xr2, $xr0, $xr1
+; CHECK-NEXT:    xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvsrai.w $xr0, $xr0, 1
+; CHECK-NEXT:    xvsub.w $xr0, $xr2, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <8 x i32>, ptr %a
+  %vb = load <8 x i32>, ptr %b
+  %ea = sext <8 x i32> %va to <8 x i64>
+  %eb = sext <8 x i32> %vb to <8 x i64>
+  %add = add <8 x i64> %ea, %eb
+  %add1 = add <8 x i64> %add, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %shr = lshr <8 x i64> %add1, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %r = trunc <8 x i64> %shr to <8 x i32>
+  store <8 x i32> %r, ptr %res
+  ret void
+}
+
+define void @xvavgr_d(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: xvavgr_d:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvor.v $xr2, $xr0, $xr1
+; CHECK-NEXT:    xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvsrai.d $xr0, $xr0, 1
+; CHECK-NEXT:    xvsub.d $xr0, $xr2, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <4 x i64>, ptr %a
+  %vb = load <4 x i64>, ptr %b
+  %ea = sext <4 x i64> %va to <4 x i128>
+  %eb = sext <4 x i64> %vb to <4 x i128>
+  %add = add <4 x i128> %ea, %eb
+  %add1 = add <4 x i128> %add, <i128 1, i128 1, i128 1, i128 1>
+  %shr = lshr <4 x i128> %add1, <i128 1, i128 1, i128 1, i128 1>
+  %r = trunc <4 x i128> %shr to <4 x i64>
+  store <4 x i64> %r, ptr %res
+  ret void
+}
+
+define void @xvavgr_bu(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: xvavgr_bu:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvor.v $xr2, $xr0, $xr1
+; CHECK-NEXT:    xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvsrli.b $xr0, $xr0, 1
+; CHECK-NEXT:    xvsub.b $xr0, $xr2, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <32 x i8>, ptr %a
+  %vb = load <32 x i8>, ptr %b
+  %ea = zext <32 x i8> %va to <32 x i16>
+  %eb = zext <32 x i8> %vb to <32 x i16>
+  %add = add <32 x i16> %ea, %eb
+  %add1 = add <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %shr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %r = trunc <32 x i16> %shr to <32 x i8>
+  store <32 x i8> %r, ptr %res
+  ret void
+}
+
+define void @xvavgr_hu(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: xvavgr_hu:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvor.v $xr2, $xr0, $xr1
+; CHECK-NEXT:    xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvsrli.h $xr0, $xr0, 1
+; CHECK-NEXT:    xvsub.h $xr0, $xr2, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <16 x i16>, ptr %a
+  %vb = load <16 x i16>, ptr %b
+  %ea = zext <16 x i16> %va to <16 x i32>
+  %eb = zext <16 x i16> %vb to <16 x i32>
+  %add = add <16 x i32> %ea, %eb
+  %add1 = add <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %shr = lshr <16 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %r = trunc <16 x i32> %shr to <16 x i16>
+  store <16 x i16> %r, ptr %res
+  ret void
+}
+
+define void @xvavgr_wu(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: xvavgr_wu:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvor.v $xr2, $xr0, $xr1
+; CHECK-NEXT:    xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvsrli.w $xr0, $xr0, 1
+; CHECK-NEXT:    xvsub.w $xr0, $xr2, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <8 x i32>, ptr %a
+  %vb = load <8 x i32>, ptr %b
+  %ea = zext <8 x i32> %va to <8 x i64>
+  %eb = zext <8 x i32> %vb to <8 x i64>
+  %add = add <8 x i64> %ea, %eb
+  %add1 = add <8 x i64> %add, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %shr = lshr <8 x i64> %add1, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %r = trunc <8 x i64> %shr to <8 x i32>
+  store <8 x i32> %r, ptr %res
+  ret void
+}
+
+define void @xvavgr_du(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: xvavgr_du:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvor.v $xr2, $xr0, $xr1
+; CHECK-NEXT:    xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvsrli.d $xr0, $xr0, 1
+; CHECK-NEXT:    xvsub.d $xr0, $xr2, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <4 x i64>, ptr %a
+  %vb = load <4 x i64>, ptr %b
+  %ea = zext <4 x i64> %va to <4 x i128>
+  %eb = zext <4 x i64> %vb to <4 x i128>
+  %add = add <4 x i128> %ea, %eb
+  %add1 = add <4 x i128> %add, <i128 1, i128 1, i128 1, i128 1>
+  %shr = lshr <4 x i128> %add1, <i128 1, i128 1, i128 1, i128 1>
+  %r = trunc <4 x i128> %shr to <4 x i64>
+  store <4 x i64> %r, ptr %res
+  ret void
+}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/flog2.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/flog2.ll
index 68f2e3ab488e1..6b5f5751e5706 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/flog2.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/flog2.ll
@@ -1,166 +1,17 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s --check-prefix=LA32
-; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s --check-prefix=LA64
+; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s
 
 declare <8 x float> @llvm.log2.v8f32(<8 x float>)
 declare <4 x double> @llvm.log2.v4f64(<4 x double>)
 
 define void @flog2_v8f32(ptr %res, ptr %a) nounwind {
-; LA32-LABEL: flog2_v8f32:
-; LA32:       # %bb.0: # %entry
-; LA32-NEXT:    addi.w $sp, $sp, -128
-; LA32-NEXT:    st.w $ra, $sp, 124 # 4-byte Folded Spill
-; LA32-NEXT:    st.w $fp, $sp, 120 # 4-byte Folded Spill
-; LA32-NEXT:    xvld $xr0, $a1, 0
-; LA32-NEXT:    xvst $xr0, $sp, 80 # 32-byte Folded Spill
-; LA32-NEXT:    move $fp, $a0
-; LA32-NEXT:    xvpickve.w $xr0, $xr0, 5
-; LA32-NEXT:    # kill: def $f0 killed $f0 killed $xr0
-; LA32-NEXT:    bl log2f
-; LA32-NEXT:    # kill: def $f0 killed $f0 def $vr0
-; LA32-NEXT:    vst $vr0, $sp, 48 # 16-byte Folded Spill
-; LA32-NEXT:    xvld $xr0, $sp, 80 # 32-byte Folded Reload
-; LA32-NEXT:    xvpickve.w $xr0, $xr0, 4
-; LA32-NEXT:    # kill: def $f0 killed $f0 killed $xr0
-; LA32-NEXT:    bl log2f
-; LA32-NEXT:    # kill: def $f0 killed $f0 def $xr0
-; LA32-NEXT:    vld $vr1, $sp, 48 # 16-byte Folded Reload
-; LA32-NEXT:    vextrins.w $vr0, $vr1, 16
-; LA32-NEXT:    xvst $xr0, $sp, 48 # 32-byte Folded Spill
-; LA32-NEXT:    xvld $xr0, $sp, 80 # 32-byte Folded Reload
-; LA32-NEXT:    xvpickve.w $xr0, $xr0, 6
-; LA32-NEXT:    # kill: def $f0 killed $f0 killed $xr0
-; LA32-NEXT:    bl log2f
-; LA32-NEXT:    # kill: def $f0 killed $f0 def $vr0
-; LA32-NEXT:    xvld $xr1, $sp, 48 # 32-byte Folded Reload
-; LA32-NEXT:    vextrins.w $vr1, $vr0, 32
-; LA32-NEXT:    xvst $xr1, $sp, 48 # 32-byte Folded Spill
-; LA32-NEXT:    xvld $xr0, $sp, 80 # 32-byte Folded Reload
-; LA32-NEXT:    xvpickve.w $xr0, $xr0, 7
-; LA32-NEXT:    # kill: def $f0 killed $f0 killed $xr0
-; LA32-NEXT:    bl log2f
-; LA32-NEXT:    # kill: def $f0 killed $f0 def $vr0
-; LA32-NEXT:    xvld $xr1, $sp, 48 # 32-byte Folded Reload
-; LA32-NEXT:    vextrins.w $vr1, $vr0, 48
-; LA32-NEXT:    xvst $xr1, $sp, 48 # 32-byte Folded Spill
-; LA32-NEXT:    xvld $xr0, $sp, 80 # 32-byte Folded Reload
-; LA32-NEXT:    xvpickve.w $xr0, $xr0, 1
-; LA32-NEXT:    # kill: def $f0 killed $f0 killed $xr0
-; LA32-NEXT:    bl log2f
-; LA32-NEXT:    # kill: def $f0 killed $f0 def $vr0
-; LA32-NEXT:    vst $vr0, $sp, 16 # 16-byte Folded Spill
-; LA32-NEXT:    xvld $xr0, $sp, 80 # 32-byte Folded Reload
-; LA32-NEXT:    xvpickve.w $xr0, $xr0, 0
-; LA32-NEXT:    # kill: def $f0 killed $f0 killed $xr0
-; LA32-NEXT:    bl log2f
-; LA32-NEXT:    # kill: def $f0 killed $f0 def $xr0
-; LA32-NEXT:    vld $vr1, $sp, 16 # 16-byte Folded Reload
-; LA32-NEXT:    vextrins.w $vr0, $vr1, 16
-; LA32-NEXT:    xvst $xr0, $sp, 16 # 32-byte Folded Spill
-; LA32-NEXT:    xvld $xr0, $sp, 80 # 32-byte Folded Reload
-; LA32-NEXT:    xvpickve.w $xr0, $xr0, 2
-; LA32-NEXT:    # kill: def $f0 killed $f0 killed $xr0
-; LA32-NEXT:    bl log2f
-; LA32-NEXT:    # kill: def $f0 killed $f0 def $vr0
-; LA32-NEXT:    xvld $xr1, $sp, 16 # 32-byte Folded Reload
-; LA32-NEXT:    vextrins.w $vr1, $vr0, 32
-; LA32-NEXT:    xvst $xr1, $sp, 16 # 32-byte Folded Spill
-; LA32-NEXT:    xvld $xr0, $sp, 80 # 32-byte Folded Reload
-; LA32-NEXT:    xvpickve.w $xr0, $xr0, 3
-; LA32-NEXT:    # kill: def $f0 killed $f0 killed $xr0
-; LA32-NEXT:    bl log2f
-; LA32-NEXT:    # kill: def $f0 killed $f0 def $vr0
-; LA32-NEXT:    xvld $xr1, $sp, 16 # 32-byte Folded Reload
-; LA32-NEXT:    vextrins.w $vr1, $vr0, 48
-; LA32-NEXT:    xvld $xr0, $sp, 48 # 32-byte Folded Reload
-; LA32-NEXT:    xvpermi.q $xr1, $xr0, 2
-; LA32-NEXT:    xvst $xr1, $fp, 0
-; LA32-NEXT:    ld.w $fp, $sp, 120 # 4-byte Folded Reload
-; LA32-NEXT:    ld.w $ra, $sp, 124 # 4-byte Folded Reload
-; LA32-NEXT:    addi.w $sp, $sp, 128
-; LA32-NEXT:    ret
-;
-; LA64-LABEL: flog2_v8f32:
-; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    addi.d $sp, $sp, -128
-; LA64-NEXT:    st.d $ra, $sp, 120 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $fp, $sp, 112 # 8-byte Folded Spill
-; LA64-NEXT:    xvld $xr0, $a1, 0
-; LA64-NEXT:    xvst $xr0, $sp, 80 # 32-byte Folded Spill
-; LA64-NEXT:    move $fp, $a0
-; LA64-NEXT:    xvpickve.w $xr0, $xr0, 5
-; LA64-NEXT:    # kill: def $f0 killed $f0 killed $xr0
-; LA64-NEXT:    pcaddu18i $ra, %call36(log2f)
-; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    # kill: def $f0 killed $f0 def $vr0
-; LA64-NEXT:    vst $vr0, $sp, 48 # 16-byte Folded Spill
-; LA64-NEXT:    xvld $xr0, $sp, 80 # 32-byte Folded Reload
-; LA64-NEXT:    xvpickve.w $xr0, $xr0, 4
-; LA64-NEXT:    # kill: def $f0 killed $f0 killed $xr0
-; LA64-NEXT:    pcaddu18i $ra, %call36(log2f)
-; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    # kill: def $f0 killed $f0 def $xr0
-; LA64-NEXT:    vld $vr1, $sp, 48 # 16-byte Folded Reload
-; LA64-NEXT:    vextrins.w $vr0, $vr1, 16
-; LA64-NEXT:    xvst $xr0, $sp, 48 # 32-byte Folded Spill
-; LA64-NEXT:    xvld $xr0, $sp, 80 # 32-byte Folded Reload
-; LA64-NEXT:    xvpickve.w $xr0, $xr0, 6
-; LA64-NEXT:    # kill: def $f0 killed $f0 killed $xr0
-; LA64-NEXT:    pcaddu18i $ra, %call36(log2f)
-; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    # kill: def $f0 killed $f0 def $vr0
-; LA64-NEXT:    xvld $xr1, $sp, 48 # 32-byte Folded Reload
-; LA64-NEXT:    vextrins.w $vr1, $vr0, 32
-; LA64-NEXT:    xvst $xr1, $sp, 48 # 32-byte Folded Spill
-; LA64-NEXT:    xvld $xr0, $sp, 80 # 32-byte Folded Reload
-; LA64-NEXT:    xvpickve.w $xr0, $xr0, 7
-; LA64-NEXT:    # kill: def $f0 killed $f0 killed $xr0
-; LA64-NEXT:    pcaddu18i $ra, %call36(log2f)
-; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    # kill: def $f0 killed $f0 def $vr0
-; LA64-NEXT:    xvld $xr1, $sp, 48 # 32-byte Folded Reload
-; LA64-NEXT:    vextrins.w $vr1, $vr0, 48
-; LA64-NEXT:    xvst $xr1, $sp, 48 # 32-byte Folded Spill
-; LA64-NEXT:    xvld $xr0, $sp, 80 # 32-byte Folded Reload
-; LA64-NEXT:    xvpickve.w $xr0, $xr0, 1
-; LA64-NEXT:    # kill: def $f0 killed $f0 killed $xr0
-; LA64-NEXT:    pcaddu18i $ra, %call36(log2f)
-; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    # kill: def $f0 killed $f0 def $vr0
-; LA64-NEXT:    vst $vr0, $sp, 16 # 16-byte Folded Spill
-; LA64-NEXT:    xvld $xr0, $sp, 80 # 32-byte Folded Reload
-; LA64-NEXT:    xvpickve.w $xr0, $xr0, 0
-; LA64-NEXT:    # kill: def $f0 killed $f0 killed $xr0
-; LA64-NEXT:    pcaddu18i $ra, %call36(log2f)
-; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    # kill: def $f0 killed $f0 def $xr0
-; LA64-NEXT:    vld $vr1, $sp, 16 # 16-byte Folded Reload
-; LA64-NEXT:    vextrins.w $vr0, $vr1, 16
-; LA64-NEXT:    xvst $xr0, $sp, 16 # 32-byte Folded Spill
-; LA64-NEXT:    xvld $xr0, $sp, 80 # 32-byte Folded Reload
-; LA64-NEXT:    xvpickve.w $xr0, $xr0, 2
-; LA64-NEXT:    # kill: def $f0 killed $f0 killed $xr0
-; LA64-NEXT:    pcaddu18i $ra, %call36(log2f)
-; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    # kill: def $f0 killed $f0 def $vr0
-; LA64-NEXT:    xvld $xr1, $sp, 16 # 32-byte Folded Reload
-; LA64-NEXT:    vextrins.w $vr1, $vr0, 32
-; LA64-NEXT:    xvst $xr1, $sp, 16 # 32-byte Folded Spill
-; LA64-NEXT:    xvld $xr0, $sp, 80 # 32-byte Folded Reload
-; LA64-NEXT:    xvpickve.w $xr0, $xr0, 3
-; LA64-NEXT:    # kill: def $f0 killed $f0 killed $xr0
-; LA64-NEXT:    pcaddu18i $ra, %call36(log2f)
-; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    # kill: def $f0 killed $f0 def $vr0
-; LA64-NEXT:    xvld $xr1, $sp, 16 # 32-byte Folded Reload
-; LA64-NEXT:    vextrins.w $vr1, $vr0, 48
-; LA64-NEXT:    xvld $xr0, $sp, 48 # 32-byte Folded Reload
-; LA64-NEXT:    xvpermi.q $xr1, $xr0, 2
-; LA64-NEXT:    xvst $xr1, $fp, 0
-; LA64-NEXT:    ld.d $fp, $sp, 112 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $ra, $sp, 120 # 8-byte Folded Reload
-; LA64-NEXT:    addi.d $sp, $sp, 128
-; LA64-NEXT:    ret
+; CHECK-LABEL: flog2_v8f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvflogb.s $xr0, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
 entry:
   %v = load <8 x float>, ptr %a
   %r = call <8 x float> @llvm.log2.v8f32(<8 x float> %v)
@@ -169,93 +20,12 @@ entry:
 }
 
 define void @flog2_v4f64(ptr %res, ptr %a) nounwind {
-; LA32-LABEL: flog2_v4f64:
-; LA32:       # %bb.0: # %entry
-; LA32-NEXT:    addi.w $sp, $sp, -112
-; LA32-NEXT:    st.w $ra, $sp, 108 # 4-byte Folded Spill
-; LA32-NEXT:    st.w $fp, $sp, 104 # 4-byte Folded Spill
-; LA32-NEXT:    xvld $xr0, $a1, 0
-; LA32-NEXT:    xvst $xr0, $sp, 64 # 32-byte Folded Spill
-; LA32-NEXT:    move $fp, $a0
-; LA32-NEXT:    xvpickve.d $xr0, $xr0, 3
-; LA32-NEXT:    # kill: def $f0_64 killed $f0_64 killed $xr0
-; LA32-NEXT:    bl log2
-; LA32-NEXT:    # kill: def $f0_64 killed $f0_64 def $vr0
-; LA32-NEXT:    vst $vr0, $sp, 32 # 16-byte Folded Spill
-; LA32-NEXT:    xvld $xr0, $sp, 64 # 32-byte Folded Reload
-; LA32-NEXT:    xvpickve.d $xr0, $xr0, 2
-; LA32-NEXT:    # kill: def $f0_64 killed $f0_64 killed $xr0
-; LA32-NEXT:    bl log2
-; LA32-NEXT:    # kill: def $f0_64 killed $f0_64 def $xr0
-; LA32-NEXT:    vld $vr1, $sp, 32 # 16-byte Folded Reload
-; LA32-NEXT:    vextrins.d $vr0, $vr1, 16
-; LA32-NEXT:    xvst $xr0, $sp, 32 # 32-byte Folded Spill
-; LA32-NEXT:    xvld $xr0, $sp, 64 # 32-byte Folded Reload
-; LA32-NEXT:    xvpickve.d $xr0, $xr0, 1
-; LA32-NEXT:    # kill: def $f0_64 killed $f0_64 killed $xr0
-; LA32-NEXT:    bl log2
-; LA32-NEXT:    # kill: def $f0_64 killed $f0_64 def $vr0
-; LA32-NEXT:    vst $vr0, $sp, 16 # 16-byte Folded Spill
-; LA32-NEXT:    xvld $xr0, $sp, 64 # 32-byte Folded Reload
-; LA32-NEXT:    xvpickve.d $xr0, $xr0, 0
-; LA32-NEXT:    # kill: def $f0_64 killed $f0_64 killed $xr0
-; LA32-NEXT:    bl log2
-; LA32-NEXT:    # kill: def $f0_64 killed $f0_64 def $xr0
-; LA32-NEXT:    vld $vr1, $sp, 16 # 16-byte Folded Reload
-; LA32-NEXT:    vextrins.d $vr0, $vr1, 16
-; LA32-NEXT:    xvld $xr1, $sp, 32 # 32-byte Folded Reload
-; LA32-NEXT:    xvpermi.q $xr0, $xr1, 2
-; LA32-NEXT:    xvst $xr0, $fp, 0
-; LA32-NEXT:    ld.w $fp, $sp, 104 # 4-byte Folded Reload
-; LA32-NEXT:    ld.w $ra, $sp, 108 # 4-byte Folded Reload
-; LA32-NEXT:    addi.w $sp, $sp, 112
-; LA32-NEXT:    ret
-;
-; LA64-LABEL: flog2_v4f64:
-; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    addi.d $sp, $sp, -112
-; LA64-NEXT:    st.d $ra, $sp, 104 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $fp, $sp, 96 # 8-byte Folded Spill
-; LA64-NEXT:    xvld $xr0, $a1, 0
-; LA64-NEXT:    xvst $xr0, $sp, 64 # 32-byte Folded Spill
-; LA64-NEXT:    move $fp, $a0
-; LA64-NEXT:    xvpickve.d $xr0, $xr0, 3
-; LA64-NEXT:    # kill: def $f0_64 killed $f0_64 killed $xr0
-; LA64-NEXT:    pcaddu18i $ra, %call36(log2)
-; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    # kill: def $f0_64 killed $f0_64 def $vr0
-; LA64-NEXT:    vst $vr0, $sp, 32 # 16-byte Folded Spill
-; LA64-NEXT:    xvld $xr0, $sp, 64 # 32-byte Folded Reload
-; LA64-NEXT:    xvpickve.d $xr0, $xr0, 2
-; LA64-NEXT:    # kill: def $f0_64 killed $f0_64 killed $xr0
-; LA64-NEXT:    pcaddu18i $ra, %call36(log2)
-; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    # kill: def $f0_64 killed $f0_64 def $xr0
-; LA64-NEXT:    vld $vr1, $sp, 32 # 16-byte Folded Reload
-; LA64-NEXT:    vextrins.d $vr0, $vr1, 16
-; LA64-NEXT:    xvst $xr0, $sp, 32 # 32-byte Folded Spill
-; LA64-NEXT:    xvld $xr0, $sp, 64 # 32-byte Folded Reload
-; LA64-NEXT:    xvpickve.d $xr0, $xr0, 1
-; LA64-NEXT:    # kill: def $f0_64 killed $f0_64 killed $xr0
-; LA64-NEXT:    pcaddu18i $ra, %call36(log2)
-; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    # kill: def $f0_64 killed $f0_64 def $vr0
-; LA64-NEXT:    vst $vr0, $sp, 16 # 16-byte Folded Spill
-; LA64-NEXT:    xvld $xr0, $sp, 64 # 32-byte Folded Reload
-; LA64-NEXT:    xvpickve.d $xr0, $xr0, 0
-; LA64-NEXT:    # kill: def $f0_64 killed $f0_64 killed $xr0
-; LA64-NEXT:    pcaddu18i $ra, %call36(log2)
-; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    # kill: def $f0_64 killed $f0_64 def $xr0
-; LA64-NEXT:    vld $vr1, $sp, 16 # 16-byte Folded Reload
-; LA64-NEXT:    vextrins.d $vr0, $vr1, 16
-; LA64-NEXT:    xvld $xr1, $sp, 32 # 32-byte Folded Reload
-; LA64-NEXT:    xvpermi.q $xr0, $xr1, 2
-; LA64-NEXT:    xvst $xr0, $fp, 0
-; LA64-NEXT:    ld.d $fp, $sp, 96 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $ra, $sp, 104 # 8-byte Folded Reload
-; LA64-NEXT:    addi.d $sp, $sp, 112
-; LA64-NEXT:    ret
+; CHECK-LABEL: flog2_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvflogb.d $xr0, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
 entry:
   %v = load <4 x double>, ptr %a
   %r = call <4 x double> @llvm.log2.v4f64(<4 x double> %v)
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll b/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll
index a9a38e8f75f9c..6ac7d51de253b 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll
@@ -106,6 +106,69 @@ define void @ctlz_v2i64(ptr %src, ptr %dst) nounwind {
   ret void
 }
 
+define void @not_ctlz_v16i8(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vld $vr0, $a0, 0
+; CHECK-NEXT:    vxori.b $vr0, $vr0, 255
+; CHECK-NEXT:    vclz.b $vr0, $vr0
+; CHECK-NEXT:    vst $vr0, $a1, 0
+; CHECK-NEXT:    ret
+  %v = load <16 x i8>, ptr %src
+  %neg = xor <16 x i8> %v, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %res = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %neg, i1 false)
+  store <16 x i8> %res, ptr %dst
+  ret void
+}
+
+define void @not_ctlz_v8i16(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vld $vr0, $a0, 0
+; CHECK-NEXT:    vrepli.b $vr1, -1
+; CHECK-NEXT:    vxor.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vclz.h $vr0, $vr0
+; CHECK-NEXT:    vst $vr0, $a1, 0
+; CHECK-NEXT:    ret
+  %v = load <8 x i16>, ptr %src
+  %neg = xor <8 x i16> %v, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+  %res = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %neg, i1 false)
+  store <8 x i16> %res, ptr %dst
+  ret void
+}
+
+define void @not_ctlz_v4i32(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vld $vr0, $a0, 0
+; CHECK-NEXT:    vrepli.b $vr1, -1
+; CHECK-NEXT:    vxor.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vclz.w $vr0, $vr0
+; CHECK-NEXT:    vst $vr0, $a1, 0
+; CHECK-NEXT:    ret
+  %v = load <4 x i32>, ptr %src
+  %neg = xor <4 x i32> %v, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %neg, i1 false)
+  store <4 x i32> %res, ptr %dst
+  ret void
+}
+
+define void @not_ctlz_v2i64(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vld $vr0, $a0, 0
+; CHECK-NEXT:    vrepli.b $vr1, -1
+; CHECK-NEXT:    vxor.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vclz.d $vr0, $vr0
+; CHECK-NEXT:    vst $vr0, $a1, 0
+; CHECK-NEXT:    ret
+  %v = load <2 x i64>, ptr %src
+  %neg = xor <2 x i64> %v, <i64 -1, i64 -1>
+  %res = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %neg, i1 false)
+  store <2 x i64> %res, ptr %dst
+  ret void
+}
+
 declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>)
 declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>)
 declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>)
diff --git a/llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll b/llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll
new file mode 100644
index 0000000000000..c17309230ee72
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll
@@ -0,0 +1,72 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s
+
+define void @minnum_v4f32(ptr %res, ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: minnum_v4f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vfmin.s $vr0, $vr0, $vr1
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %v0 = load <4 x float>, ptr %x
+  %v1 = load <4 x float>, ptr %y
+  %r = call <4 x float> @llvm.minnum.v4f32(<4 x float> %v0, <4 x float> %v1)
+  store <4 x float> %r, ptr %res
+  ret void
+}
+
+define void @minnum_v2f64(ptr %res, ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: minnum_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vfmin.d $vr0, $vr0, $vr1
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %v0 = load <2 x double>, ptr %x
+  %v1 = load <2 x double>, ptr %y
+  %r = call <2 x double> @llvm.minnum.v2f64(<2 x double> %v0, <2 x double> %v1)
+  store <2 x double> %r, ptr %res
+  ret void
+}
+
+define void @maxnum_v4f32(ptr %res, ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: maxnum_v4f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vfmax.s $vr0, $vr0, $vr1
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %v0 = load <4 x float>, ptr %x
+  %v1 = load <4 x float>, ptr %y
+  %r = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %v0, <4 x float> %v1)
+  store <4 x float> %r, ptr %res
+  ret void
+}
+
+define void @maxnum_v2f64(ptr %res, ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: maxnum_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vfmax.d $vr0, $vr0, $vr1
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %v0 = load <2 x double>, ptr %x
+  %v1 = load <2 x double>, ptr %y
+  %r = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %v0, <2 x double> %v1)
+  store <2 x double> %r, ptr %res
+  ret void
+}
+
+declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>)
+declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>)
diff --git a/llvm/test/CodeGen/LoongArch/lsx/fp-rounding.ll b/llvm/test/CodeGen/LoongArch/lsx/fp-rounding.ll
new file mode 100644
index 0000000000000..cb01ac0358ab3
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/fp-rounding.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s
+
+;; ceilf
+define void @ceil_v4f32(ptr %res, ptr %a0) nounwind {
+; CHECK-LABEL: ceil_v4f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vfrintrp.s $vr0, $vr0
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %v0 = load <4 x float>, ptr %a0
+  %r = call <4 x float> @llvm.ceil.v4f32(<4 x float> %v0)
+  store <4 x float> %r, ptr %res
+  ret void
+}
+
+;; ceil
+define void @ceil_v2f64(ptr %res, ptr %a0) nounwind {
+; CHECK-LABEL: ceil_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vfrintrp.d $vr0, $vr0
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %v0 = load <2 x double>, ptr %a0
+  %r = call <2 x double> @llvm.ceil.v2f64(<2 x double> %v0)
+  store <2 x double> %r, ptr %res
+  ret void
+}
+
+;; floorf
+define void @floor_v4f32(ptr %res, ptr %a0) nounwind {
+; CHECK-LABEL: floor_v4f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vfrintrm.s $vr0, $vr0
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %v0 = load <4 x float>, ptr %a0
+  %r = call <4 x float> @llvm.floor.v4f32(<4 x float> %v0)
+  store <4 x float> %r, ptr %res
+  ret void
+}
+
+;; floor
+define void @floor_v2f64(ptr %res, ptr %a0) nounwind {
+; CHECK-LABEL: floor_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vfrintrm.d $vr0, $vr0
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %v0 = load <2 x double>, ptr %a0
+  %r = call <2 x double> @llvm.floor.v2f64(<2 x double> %v0)
+  store <2 x double> %r, ptr %res
+  ret void
+}
+
+;; truncf
+define void @trunc_v4f32(ptr %res, ptr %a0) nounwind {
+; CHECK-LABEL: trunc_v4f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vfrintrz.s $vr0, $vr0
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %v0 = load <4 x float>, ptr %a0
+  %r = call <4 x float> @llvm.trunc.v4f32(<4 x float> %v0)
+  store <4 x float> %r, ptr %res
+  ret void
+}
+
+;; trunc
+define void @trunc_v2f64(ptr %res, ptr %a0) nounwind {
+; CHECK-LABEL: trunc_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vfrintrz.d $vr0, $vr0
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %v0 = load <2 x double>, ptr %a0
+  %r = call <2 x double> @llvm.trunc.v2f64(<2 x double> %v0)
+  store <2 x double> %r, ptr %res
+  ret void
+}
+
+;; roundevenf
+define void @roundeven_v4f32(ptr %res, ptr %a0) nounwind {
+; CHECK-LABEL: roundeven_v4f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vfrintrne.s $vr0, $vr0
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %v0 = load <4 x float>, ptr %a0
+  %r = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %v0)
+  store <4 x float> %r, ptr %res
+  ret void
+}
+
+;; roundeven
+define void @roundeven_v2f64(ptr %res, ptr %a0) nounwind {
+; CHECK-LABEL: roundeven_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vfrintrne.d $vr0, $vr0
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %v0 = load <2 x double>, ptr %a0
+  %r = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %v0)
+  store <2 x double> %r, ptr %res
+  ret void
+}
+
+declare <4 x float> @llvm.ceil.v4f32(<4 x float>)
+declare <2 x double> @llvm.ceil.v2f64(<2 x double>)
+declare <4 x float> @llvm.floor.v4f32(<4 x float>)
+declare <2 x double> @llvm.floor.v2f64(<2 x double>)
+declare <4 x float> @llvm.trunc.v4f32(<4 x float>)
+declare <2 x double> @llvm.trunc.v2f64(<2 x double>)
+declare <4 x float> @llvm.roundeven.v4f32(<4 x float>)
+declare <2 x double> @llvm.roundeven.v2f64(<2 x double>)
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/avg.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/avg.ll
new file mode 100644
index 0000000000000..334af22edee59
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/avg.ll
@@ -0,0 +1,321 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s --check-prefixes=CHECK,LA32
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s --check-prefixes=CHECK,LA64
+
+define void @vavg_b(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: vavg_b:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vavg.b $vr0, $vr0, $vr1
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <16 x i8>, ptr %a
+  %vb = load <16 x i8>, ptr %b
+  %add = add <16 x i8> %va, %vb
+  %shr = ashr <16 x i8> %add, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  store <16 x i8> %shr, ptr %res
+  ret void
+}
+
+define void @vavg_h(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: vavg_h:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vavg.h $vr0, $vr0, $vr1
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <8 x i16>, ptr %a
+  %vb = load <8 x i16>, ptr %b
+  %add = add <8 x i16> %va, %vb
+  %shr = ashr <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  store <8 x i16> %shr, ptr %res
+  ret void
+}
+
+define void @vavg_w(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: vavg_w:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vavg.w $vr0, $vr0, $vr1
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <4 x i32>, ptr %a
+  %vb = load <4 x i32>, ptr %b
+  %add = add <4 x i32> %va, %vb
+  %shr = ashr <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
+  store <4 x i32> %shr, ptr %res
+  ret void
+}
+
+define void @vavg_d(ptr %res, ptr %a, ptr %b) nounwind {
+; LA32-LABEL: vavg_d:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    vld $vr0, $a1, 0
+; LA32-NEXT:    vld $vr1, $a2, 0
+; LA32-NEXT:    vadd.d $vr0, $vr0, $vr1
+; LA32-NEXT:    vsrai.d $vr0, $vr0, 1
+; LA32-NEXT:    vst $vr0, $a0, 0
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: vavg_d:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    vld $vr0, $a1, 0
+; LA64-NEXT:    vld $vr1, $a2, 0
+; LA64-NEXT:    vavg.d $vr0, $vr0, $vr1
+; LA64-NEXT:    vst $vr0, $a0, 0
+; LA64-NEXT:    ret
+entry:
+  %va = load <2 x i64>, ptr %a
+  %vb = load <2 x i64>, ptr %b
+  %add = add <2 x i64> %va, %vb
+  %shr = ashr <2 x i64> %add, <i64 1, i64 1>
+  store <2 x i64> %shr, ptr %res
+  ret void
+}
+
+define void @vavg_bu(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: vavg_bu:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vavg.bu $vr0, $vr0, $vr1
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <16 x i8>, ptr %a
+  %vb = load <16 x i8>, ptr %b
+  %add = add <16 x i8> %va, %vb
+  %shr = lshr <16 x i8> %add, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  store <16 x i8> %shr, ptr %res
+  ret void
+}
+
+define void @vavg_hu(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: vavg_hu:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vavg.hu $vr0, $vr0, $vr1
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <8 x i16>, ptr %a
+  %vb = load <8 x i16>, ptr %b
+  %add = add <8 x i16> %va, %vb
+  %shr = lshr <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  store <8 x i16> %shr, ptr %res
+  ret void
+}
+
+define void @vavg_wu(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: vavg_wu:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vavg.wu $vr0, $vr0, $vr1
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <4 x i32>, ptr %a
+  %vb = load <4 x i32>, ptr %b
+  %add = add <4 x i32> %va, %vb
+  %shr = lshr <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
+  store <4 x i32> %shr, ptr %res
+  ret void
+}
+
+define void @vavg_du(ptr %res, ptr %a, ptr %b) nounwind {
+; LA32-LABEL: vavg_du:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    vld $vr0, $a1, 0
+; LA32-NEXT:    vld $vr1, $a2, 0
+; LA32-NEXT:    vadd.d $vr0, $vr0, $vr1
+; LA32-NEXT:    vsrli.d $vr0, $vr0, 1
+; LA32-NEXT:    vst $vr0, $a0, 0
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: vavg_du:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    vld $vr0, $a1, 0
+; LA64-NEXT:    vld $vr1, $a2, 0
+; LA64-NEXT:    vavg.du $vr0, $vr0, $vr1
+; LA64-NEXT:    vst $vr0, $a0, 0
+; LA64-NEXT:    ret
+entry:
+  %va = load <2 x i64>, ptr %a
+  %vb = load <2 x i64>, ptr %b
+  %add = add <2 x i64> %va, %vb
+  %shr = lshr <2 x i64> %add, <i64 1, i64 1>
+  store <2 x i64> %shr, ptr %res
+  ret void
+}
+
+define void @vavgr_b(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: vavgr_b:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vavgr.b $vr0, $vr0, $vr1
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <16 x i8>, ptr %a
+  %vb = load <16 x i8>, ptr %b
+  %add = add <16 x i8> %va, %vb
+  %add1 = add <16 x i8> %add, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %shr = ashr <16 x i8> %add1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  store <16 x i8> %shr, ptr %res
+  ret void
+}
+
+define void @vavgr_h(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: vavgr_h:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vavgr.h $vr0, $vr0, $vr1
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <8 x i16>, ptr %a
+  %vb = load <8 x i16>, ptr %b
+  %add = add <8 x i16> %va, %vb
+  %add1 = add <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %shr = ashr <8 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  store <8 x i16> %shr, ptr %res
+  ret void
+}
+
+define void @vavgr_w(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: vavgr_w:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vavgr.w $vr0, $vr0, $vr1
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <4 x i32>, ptr %a
+  %vb = load <4 x i32>, ptr %b
+  %add = add <4 x i32> %va, %vb
+  %add1 = add <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
+  %shr = ashr <4 x i32> %add1, <i32 1, i32 1, i32 1, i32 1>
+  store <4 x i32> %shr, ptr %res
+  ret void
+}
+
+define void @vavgr_d(ptr %res, ptr %a, ptr %b) nounwind {
+; LA32-LABEL: vavgr_d:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    vld $vr0, $a1, 0
+; LA32-NEXT:    vld $vr1, $a2, 0
+; LA32-NEXT:    vadd.d $vr0, $vr0, $vr1
+; LA32-NEXT:    vaddi.du $vr0, $vr0, 1
+; LA32-NEXT:    vsrai.d $vr0, $vr0, 1
+; LA32-NEXT:    vst $vr0, $a0, 0
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: vavgr_d:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    vld $vr0, $a1, 0
+; LA64-NEXT:    vld $vr1, $a2, 0
+; LA64-NEXT:    vavgr.d $vr0, $vr0, $vr1
+; LA64-NEXT:    vst $vr0, $a0, 0
+; LA64-NEXT:    ret
+entry:
+  %va = load <2 x i64>, ptr %a
+  %vb = load <2 x i64>, ptr %b
+  %add = add <2 x i64> %va, %vb
+  %add1 = add <2 x i64> %add, <i64 1, i64 1>
+  %shr = ashr <2 x i64> %add1, <i64 1, i64 1>
+  store <2 x i64> %shr, ptr %res
+  ret void
+}
+
+define void @vavgr_bu(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: vavgr_bu:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vavgr.bu $vr0, $vr0, $vr1
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <16 x i8>, ptr %a
+  %vb = load <16 x i8>, ptr %b
+  %add = add <16 x i8> %va, %vb
+  %add1 = add <16 x i8> %add, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %shr = lshr <16 x i8> %add1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  store <16 x i8> %shr, ptr %res
+  ret void
+}
+
+define void @vavgr_hu(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: vavgr_hu:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vavgr.hu $vr0, $vr0, $vr1
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <8 x i16>, ptr %a
+  %vb = load <8 x i16>, ptr %b
+  %add = add <8 x i16> %va, %vb
+  %add1 = add <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %shr = lshr <8 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  store <8 x i16> %shr, ptr %res
+  ret void
+}
+
+define void @vavgr_wu(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: vavgr_wu:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vavgr.wu $vr0, $vr0, $vr1
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <4 x i32>, ptr %a
+  %vb = load <4 x i32>, ptr %b
+  %add = add <4 x i32> %va, %vb
+  %add1 = add <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
+  %shr = lshr <4 x i32> %add1, <i32 1, i32 1, i32 1, i32 1>
+  store <4 x i32> %shr, ptr %res
+  ret void
+}
+
+define void @vavgr_du(ptr %res, ptr %a, ptr %b) nounwind {
+; LA32-LABEL: vavgr_du:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    vld $vr0, $a1, 0
+; LA32-NEXT:    vld $vr1, $a2, 0
+; LA32-NEXT:    vadd.d $vr0, $vr0, $vr1
+; LA32-NEXT:    vaddi.du $vr0, $vr0, 1
+; LA32-NEXT:    vsrli.d $vr0, $vr0, 1
+; LA32-NEXT:    vst $vr0, $a0, 0
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: vavgr_du:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    vld $vr0, $a1, 0
+; LA64-NEXT:    vld $vr1, $a2, 0
+; LA64-NEXT:    vavgr.du $vr0, $vr0, $vr1
+; LA64-NEXT:    vst $vr0, $a0, 0
+; LA64-NEXT:    ret
+entry:
+  %va = load <2 x i64>, ptr %a
+  %vb = load <2 x i64>, ptr %b
+  %add = add <2 x i64> %va, %vb
+  %add1 = add <2 x i64> %add, <i64 1, i64 1>
+  %shr = lshr <2 x i64> %add1, <i64 1, i64 1>
+  store <2 x i64> %shr, ptr %res
+  ret void
+}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/avgfloor-ceil.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/avgfloor-ceil.ll
new file mode 100644
index 0000000000000..bb4df64a48284
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/avgfloor-ceil.ll
@@ -0,0 +1,379 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s
+
+define void @vavg_b(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: vavg_b:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vand.v $vr2, $vr0, $vr1
+; CHECK-NEXT:    vxor.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vsrai.b $vr0, $vr0, 1
+; CHECK-NEXT:    vadd.b $vr0, $vr2, $vr0
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <16 x i8>, ptr %a
+  %vb = load <16 x i8>, ptr %b
+  %ea = sext <16 x i8> %va to <16 x i16>
+  %eb = sext <16 x i8> %vb to <16 x i16>
+  %add = add <16 x i16> %ea, %eb
+  %shr = lshr <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %r = trunc <16 x i16> %shr to <16 x i8>
+  store <16 x i8> %r, ptr %res
+  ret void
+}
+
+define void @vavg_h(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: vavg_h:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vand.v $vr2, $vr0, $vr1
+; CHECK-NEXT:    vxor.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vsrai.h $vr0, $vr0, 1
+; CHECK-NEXT:    vadd.h $vr0, $vr2, $vr0
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <8 x i16>, ptr %a
+  %vb = load <8 x i16>, ptr %b
+  %ea = sext <8 x i16> %va to <8 x i32>
+  %eb = sext <8 x i16> %vb to <8 x i32>
+  %add = add <8 x i32> %ea, %eb
+  %shr = lshr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %r = trunc <8 x i32> %shr to <8 x i16>
+  store <8 x i16> %r, ptr %res
+  ret void
+}
+
+define void @vavg_w(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: vavg_w:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vand.v $vr2, $vr0, $vr1
+; CHECK-NEXT:    vxor.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vsrai.w $vr0, $vr0, 1
+; CHECK-NEXT:    vadd.w $vr0, $vr2, $vr0
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <4 x i32>, ptr %a
+  %vb = load <4 x i32>, ptr %b
+  %ea = sext <4 x i32> %va to <4 x i64>
+  %eb = sext <4 x i32> %vb to <4 x i64>
+  %add = add <4 x i64> %ea, %eb
+  %shr = lshr <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1>
+  %r = trunc <4 x i64> %shr to <4 x i32>
+  store <4 x i32> %r, ptr %res
+  ret void
+}
+
+define void @vavg_d(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: vavg_d:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vand.v $vr2, $vr0, $vr1
+; CHECK-NEXT:    vxor.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vsrai.d $vr0, $vr0, 1
+; CHECK-NEXT:    vadd.d $vr0, $vr2, $vr0
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <2 x i64>, ptr %a
+  %vb = load <2 x i64>, ptr %b
+  %ea = sext <2 x i64> %va to <2 x i128>
+  %eb = sext <2 x i64> %vb to <2 x i128>
+  %add = add <2 x i128> %ea, %eb
+  %shr = lshr <2 x i128> %add, <i128 1, i128 1>
+  %r = trunc <2 x i128> %shr to <2 x i64>
+  store <2 x i64> %r, ptr %res
+  ret void
+}
+
+define void @vavg_bu(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: vavg_bu:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vand.v $vr2, $vr0, $vr1
+; CHECK-NEXT:    vxor.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vsrli.b $vr0, $vr0, 1
+; CHECK-NEXT:    vadd.b $vr0, $vr2, $vr0
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <16 x i8>, ptr %a
+  %vb = load <16 x i8>, ptr %b
+  %ea = zext <16 x i8> %va to <16 x i16>
+  %eb = zext <16 x i8> %vb to <16 x i16>
+  %add = add <16 x i16> %ea, %eb
+  %shr = lshr <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %r = trunc <16 x i16> %shr to <16 x i8>
+  store <16 x i8> %r, ptr %res
+  ret void
+}
+
+define void @vavg_hu(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: vavg_hu:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vand.v $vr2, $vr0, $vr1
+; CHECK-NEXT:    vxor.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vsrli.h $vr0, $vr0, 1
+; CHECK-NEXT:    vadd.h $vr0, $vr2, $vr0
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <8 x i16>, ptr %a
+  %vb = load <8 x i16>, ptr %b
+  %ea = zext <8 x i16> %va to <8 x i32>
+  %eb = zext <8 x i16> %vb to <8 x i32>
+  %add = add <8 x i32> %ea, %eb
+  %shr = lshr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %r = trunc <8 x i32> %shr to <8 x i16>
+  store <8 x i16> %r, ptr %res
+  ret void
+}
+
+define void @vavg_wu(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: vavg_wu:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vand.v $vr2, $vr0, $vr1
+; CHECK-NEXT:    vxor.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vsrli.w $vr0, $vr0, 1
+; CHECK-NEXT:    vadd.w $vr0, $vr2, $vr0
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <4 x i32>, ptr %a
+  %vb = load <4 x i32>, ptr %b
+  %ea = zext <4 x i32> %va to <4 x i64>
+  %eb = zext <4 x i32> %vb to <4 x i64>
+  %add = add <4 x i64> %ea, %eb
+  %shr = lshr <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1>
+  %r = trunc <4 x i64> %shr to <4 x i32>
+  store <4 x i32> %r, ptr %res
+  ret void
+}
+
+define void @vavg_du(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: vavg_du:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vand.v $vr2, $vr0, $vr1
+; CHECK-NEXT:    vxor.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vsrli.d $vr0, $vr0, 1
+; CHECK-NEXT:    vadd.d $vr0, $vr2, $vr0
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <2 x i64>, ptr %a
+  %vb = load <2 x i64>, ptr %b
+  %ea = zext <2 x i64> %va to <2 x i128>
+  %eb = zext <2 x i64> %vb to <2 x i128>
+  %add = add <2 x i128> %ea, %eb
+  %shr = lshr <2 x i128> %add, <i128 1, i128 1>
+  %r = trunc <2 x i128> %shr to <2 x i64>
+  store <2 x i64> %r, ptr %res
+  ret void
+}
+
+define void @vavgr_b(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: vavgr_b:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vor.v $vr2, $vr0, $vr1
+; CHECK-NEXT:    vxor.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vsrai.b $vr0, $vr0, 1
+; CHECK-NEXT:    vsub.b $vr0, $vr2, $vr0
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <16 x i8>, ptr %a
+  %vb = load <16 x i8>, ptr %b
+  %ea = sext <16 x i8> %va to <16 x i16>
+  %eb = sext <16 x i8> %vb to <16 x i16>
+  %add = add <16 x i16> %ea, %eb
+  %add1 = add <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %shr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %r = trunc <16 x i16> %shr to <16 x i8>
+  store <16 x i8> %r, ptr %res
+  ret void
+}
+
+define void @vavgr_h(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: vavgr_h:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vor.v $vr2, $vr0, $vr1
+; CHECK-NEXT:    vxor.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vsrai.h $vr0, $vr0, 1
+; CHECK-NEXT:    vsub.h $vr0, $vr2, $vr0
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <8 x i16>, ptr %a
+  %vb = load <8 x i16>, ptr %b
+  %ea = sext <8 x i16> %va to <8 x i32>
+  %eb = sext <8 x i16> %vb to <8 x i32>
+  %add = add <8 x i32> %ea, %eb
+  %add1 = add <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %shr = lshr <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %r = trunc <8 x i32> %shr to <8 x i16>
+  store <8 x i16> %r, ptr %res
+  ret void
+}
+
+define void @vavgr_w(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: vavgr_w:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vor.v $vr2, $vr0, $vr1
+; CHECK-NEXT:    vxor.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vsrai.w $vr0, $vr0, 1
+; CHECK-NEXT:    vsub.w $vr0, $vr2, $vr0
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <4 x i32>, ptr %a
+  %vb = load <4 x i32>, ptr %b
+  %ea = sext <4 x i32> %va to <4 x i64>
+  %eb = sext <4 x i32> %vb to <4 x i64>
+  %add = add <4 x i64> %ea, %eb
+  %add1 = add <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1>
+  %shr = lshr <4 x i64> %add1, <i64 1, i64 1, i64 1, i64 1>
+  %r = trunc <4 x i64> %shr to <4 x i32>
+  store <4 x i32> %r, ptr %res
+  ret void
+}
+
+define void @vavgr_d(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: vavgr_d:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vor.v $vr2, $vr0, $vr1
+; CHECK-NEXT:    vxor.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vsrai.d $vr0, $vr0, 1
+; CHECK-NEXT:    vsub.d $vr0, $vr2, $vr0
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <2 x i64>, ptr %a
+  %vb = load <2 x i64>, ptr %b
+  %ea = sext <2 x i64> %va to <2 x i128>
+  %eb = sext <2 x i64> %vb to <2 x i128>
+  %add = add <2 x i128> %ea, %eb
+  %add1 = add <2 x i128> %add, <i128 1, i128 1>
+  %shr = lshr <2 x i128> %add1, <i128 1, i128 1>
+  %r = trunc <2 x i128> %shr to <2 x i64>
+  store <2 x i64> %r, ptr %res
+  ret void
+}
+
+define void @vavgr_bu(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: vavgr_bu:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vor.v $vr2, $vr0, $vr1
+; CHECK-NEXT:    vxor.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vsrli.b $vr0, $vr0, 1
+; CHECK-NEXT:    vsub.b $vr0, $vr2, $vr0
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <16 x i8>, ptr %a
+  %vb = load <16 x i8>, ptr %b
+  %ea = zext <16 x i8> %va to <16 x i16>
+  %eb = zext <16 x i8> %vb to <16 x i16>
+  %add = add <16 x i16> %ea, %eb
+  %add1 = add <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %shr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %r = trunc <16 x i16> %shr to <16 x i8>
+  store <16 x i8> %r, ptr %res
+  ret void
+}
+
+define void @vavgr_hu(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: vavgr_hu:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vor.v $vr2, $vr0, $vr1
+; CHECK-NEXT:    vxor.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vsrli.h $vr0, $vr0, 1
+; CHECK-NEXT:    vsub.h $vr0, $vr2, $vr0
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <8 x i16>, ptr %a
+  %vb = load <8 x i16>, ptr %b
+  %ea = zext <8 x i16> %va to <8 x i32>
+  %eb = zext <8 x i16> %vb to <8 x i32>
+  %add = add <8 x i32> %ea, %eb
+  %add1 = add <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %shr = lshr <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %r = trunc <8 x i32> %shr to <8 x i16>
+  store <8 x i16> %r, ptr %res
+  ret void
+}
+
+define void @vavgr_wu(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: vavgr_wu:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vor.v $vr2, $vr0, $vr1
+; CHECK-NEXT:    vxor.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vsrli.w $vr0, $vr0, 1
+; CHECK-NEXT:    vsub.w $vr0, $vr2, $vr0
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <4 x i32>, ptr %a
+  %vb = load <4 x i32>, ptr %b
+  %ea = zext <4 x i32> %va to <4 x i64>
+  %eb = zext <4 x i32> %vb to <4 x i64>
+  %add = add <4 x i64> %ea, %eb
+  %add1 = add <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1>
+  %shr = lshr <4 x i64> %add1, <i64 1, i64 1, i64 1, i64 1>
+  %r = trunc <4 x i64> %shr to <4 x i32>
+  store <4 x i32> %r, ptr %res
+  ret void
+}
+
+define void @vavgr_du(ptr %res, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: vavgr_du:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vld $vr1, $a2, 0
+; CHECK-NEXT:    vor.v $vr2, $vr0, $vr1
+; CHECK-NEXT:    vxor.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vsrli.d $vr0, $vr0, 1
+; CHECK-NEXT:    vsub.d $vr0, $vr2, $vr0
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <2 x i64>, ptr %a
+  %vb = load <2 x i64>, ptr %b
+  %ea = zext <2 x i64> %va to <2 x i128>
+  %eb = zext <2 x i64> %vb to <2 x i128>
+  %add = add <2 x i128> %ea, %eb
+  %add1 = add <2 x i128> %add, <i128 1, i128 1>
+  %shr = lshr <2 x i128> %add1, <i128 1, i128 1>
+  %r = trunc <2 x i128> %shr to <2 x i64>
+  store <2 x i64> %r, ptr %res
+  ret void
+}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/flog2.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/flog2.ll
index e5e75ec617b51..87cc7c6dbc708 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/flog2.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/flog2.ll
@@ -1,98 +1,17 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s --check-prefix=LA32
-; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s --check-prefix=LA64
+; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s
 
 declare <4 x float> @llvm.log2.v4f32(<4 x float>)
 declare <2 x double> @llvm.log2.v2f64(<2 x double>)
 
 define void @flog2_v4f32(ptr %res, ptr %a) nounwind {
-; LA32-LABEL: flog2_v4f32:
-; LA32:       # %bb.0: # %entry
-; LA32-NEXT:    addi.w $sp, $sp, -48
-; LA32-NEXT:    st.w $ra, $sp, 44 # 4-byte Folded Spill
-; LA32-NEXT:    st.w $fp, $sp, 40 # 4-byte Folded Spill
-; LA32-NEXT:    vld $vr0, $a1, 0
-; LA32-NEXT:    vst $vr0, $sp, 16 # 16-byte Folded Spill
-; LA32-NEXT:    move $fp, $a0
-; LA32-NEXT:    vreplvei.w $vr0, $vr0, 1
-; LA32-NEXT:    # kill: def $f0 killed $f0 killed $vr0
-; LA32-NEXT:    bl log2f
-; LA32-NEXT:    # kill: def $f0 killed $f0 def $vr0
-; LA32-NEXT:    vst $vr0, $sp, 0 # 16-byte Folded Spill
-; LA32-NEXT:    vld $vr0, $sp, 16 # 16-byte Folded Reload
-; LA32-NEXT:    vreplvei.w $vr0, $vr0, 0
-; LA32-NEXT:    # kill: def $f0 killed $f0 killed $vr0
-; LA32-NEXT:    bl log2f
-; LA32-NEXT:    # kill: def $f0 killed $f0 def $vr0
-; LA32-NEXT:    vld $vr1, $sp, 0 # 16-byte Folded Reload
-; LA32-NEXT:    vextrins.w $vr0, $vr1, 16
-; LA32-NEXT:    vst $vr0, $sp, 0 # 16-byte Folded Spill
-; LA32-NEXT:    vld $vr0, $sp, 16 # 16-byte Folded Reload
-; LA32-NEXT:    vreplvei.w $vr0, $vr0, 2
-; LA32-NEXT:    # kill: def $f0 killed $f0 killed $vr0
-; LA32-NEXT:    bl log2f
-; LA32-NEXT:    # kill: def $f0 killed $f0 def $vr0
-; LA32-NEXT:    vld $vr1, $sp, 0 # 16-byte Folded Reload
-; LA32-NEXT:    vextrins.w $vr1, $vr0, 32
-; LA32-NEXT:    vst $vr1, $sp, 0 # 16-byte Folded Spill
-; LA32-NEXT:    vld $vr0, $sp, 16 # 16-byte Folded Reload
-; LA32-NEXT:    vreplvei.w $vr0, $vr0, 3
-; LA32-NEXT:    # kill: def $f0 killed $f0 killed $vr0
-; LA32-NEXT:    bl log2f
-; LA32-NEXT:    # kill: def $f0 killed $f0 def $vr0
-; LA32-NEXT:    vld $vr1, $sp, 0 # 16-byte Folded Reload
-; LA32-NEXT:    vextrins.w $vr1, $vr0, 48
-; LA32-NEXT:    vst $vr1, $fp, 0
-; LA32-NEXT:    ld.w $fp, $sp, 40 # 4-byte Folded Reload
-; LA32-NEXT:    ld.w $ra, $sp, 44 # 4-byte Folded Reload
-; LA32-NEXT:    addi.w $sp, $sp, 48
-; LA32-NEXT:    ret
-;
-; LA64-LABEL: flog2_v4f32:
-; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    addi.d $sp, $sp, -48
-; LA64-NEXT:    st.d $ra, $sp, 40 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $fp, $sp, 32 # 8-byte Folded Spill
-; LA64-NEXT:    vld $vr0, $a1, 0
-; LA64-NEXT:    vst $vr0, $sp, 16 # 16-byte Folded Spill
-; LA64-NEXT:    move $fp, $a0
-; LA64-NEXT:    vreplvei.w $vr0, $vr0, 1
-; LA64-NEXT:    # kill: def $f0 killed $f0 killed $vr0
-; LA64-NEXT:    pcaddu18i $ra, %call36(log2f)
-; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    # kill: def $f0 killed $f0 def $vr0
-; LA64-NEXT:    vst $vr0, $sp, 0 # 16-byte Folded Spill
-; LA64-NEXT:    vld $vr0, $sp, 16 # 16-byte Folded Reload
-; LA64-NEXT:    vreplvei.w $vr0, $vr0, 0
-; LA64-NEXT:    # kill: def $f0 killed $f0 killed $vr0
-; LA64-NEXT:    pcaddu18i $ra, %call36(log2f)
-; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    # kill: def $f0 killed $f0 def $vr0
-; LA64-NEXT:    vld $vr1, $sp, 0 # 16-byte Folded Reload
-; LA64-NEXT:    vextrins.w $vr0, $vr1, 16
-; LA64-NEXT:    vst $vr0, $sp, 0 # 16-byte Folded Spill
-; LA64-NEXT:    vld $vr0, $sp, 16 # 16-byte Folded Reload
-; LA64-NEXT:    vreplvei.w $vr0, $vr0, 2
-; LA64-NEXT:    # kill: def $f0 killed $f0 killed $vr0
-; LA64-NEXT:    pcaddu18i $ra, %call36(log2f)
-; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    # kill: def $f0 killed $f0 def $vr0
-; LA64-NEXT:    vld $vr1, $sp, 0 # 16-byte Folded Reload
-; LA64-NEXT:    vextrins.w $vr1, $vr0, 32
-; LA64-NEXT:    vst $vr1, $sp, 0 # 16-byte Folded Spill
-; LA64-NEXT:    vld $vr0, $sp, 16 # 16-byte Folded Reload
-; LA64-NEXT:    vreplvei.w $vr0, $vr0, 3
-; LA64-NEXT:    # kill: def $f0 killed $f0 killed $vr0
-; LA64-NEXT:    pcaddu18i $ra, %call36(log2f)
-; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    # kill: def $f0 killed $f0 def $vr0
-; LA64-NEXT:    vld $vr1, $sp, 0 # 16-byte Folded Reload
-; LA64-NEXT:    vextrins.w $vr1, $vr0, 48
-; LA64-NEXT:    vst $vr1, $fp, 0
-; LA64-NEXT:    ld.d $fp, $sp, 32 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $ra, $sp, 40 # 8-byte Folded Reload
-; LA64-NEXT:    addi.d $sp, $sp, 48
-; LA64-NEXT:    ret
+; CHECK-LABEL: flog2_v4f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vflogb.s $vr0, $vr0
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
 entry:
   %v = load <4 x float>, ptr %a
   %r = call <4 x float> @llvm.log2.v4f32(<4 x float> %v)
@@ -101,59 +20,12 @@ entry:
 }
 
 define void @flog2_v2f64(ptr %res, ptr %a) nounwind {
-; LA32-LABEL: flog2_v2f64:
-; LA32:       # %bb.0: # %entry
-; LA32-NEXT:    addi.w $sp, $sp, -48
-; LA32-NEXT:    st.w $ra, $sp, 44 # 4-byte Folded Spill
-; LA32-NEXT:    st.w $fp, $sp, 40 # 4-byte Folded Spill
-; LA32-NEXT:    vld $vr0, $a1, 0
-; LA32-NEXT:    vst $vr0, $sp, 0 # 16-byte Folded Spill
-; LA32-NEXT:    move $fp, $a0
-; LA32-NEXT:    vreplvei.d $vr0, $vr0, 1
-; LA32-NEXT:    # kill: def $f0_64 killed $f0_64 killed $vr0
-; LA32-NEXT:    bl log2
-; LA32-NEXT:    # kill: def $f0_64 killed $f0_64 def $vr0
-; LA32-NEXT:    vst $vr0, $sp, 16 # 16-byte Folded Spill
-; LA32-NEXT:    vld $vr0, $sp, 0 # 16-byte Folded Reload
-; LA32-NEXT:    vreplvei.d $vr0, $vr0, 0
-; LA32-NEXT:    # kill: def $f0_64 killed $f0_64 killed $vr0
-; LA32-NEXT:    bl log2
-; LA32-NEXT:    # kill: def $f0_64 killed $f0_64 def $vr0
-; LA32-NEXT:    vld $vr1, $sp, 16 # 16-byte Folded Reload
-; LA32-NEXT:    vextrins.d $vr0, $vr1, 16
-; LA32-NEXT:    vst $vr0, $fp, 0
-; LA32-NEXT:    ld.w $fp, $sp, 40 # 4-byte Folded Reload
-; LA32-NEXT:    ld.w $ra, $sp, 44 # 4-byte Folded Reload
-; LA32-NEXT:    addi.w $sp, $sp, 48
-; LA32-NEXT:    ret
-;
-; LA64-LABEL: flog2_v2f64:
-; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    addi.d $sp, $sp, -48
-; LA64-NEXT:    st.d $ra, $sp, 40 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $fp, $sp, 32 # 8-byte Folded Spill
-; LA64-NEXT:    vld $vr0, $a1, 0
-; LA64-NEXT:    vst $vr0, $sp, 0 # 16-byte Folded Spill
-; LA64-NEXT:    move $fp, $a0
-; LA64-NEXT:    vreplvei.d $vr0, $vr0, 1
-; LA64-NEXT:    # kill: def $f0_64 killed $f0_64 killed $vr0
-; LA64-NEXT:    pcaddu18i $ra, %call36(log2)
-; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    # kill: def $f0_64 killed $f0_64 def $vr0
-; LA64-NEXT:    vst $vr0, $sp, 16 # 16-byte Folded Spill
-; LA64-NEXT:    vld $vr0, $sp, 0 # 16-byte Folded Reload
-; LA64-NEXT:    vreplvei.d $vr0, $vr0, 0
-; LA64-NEXT:    # kill: def $f0_64 killed $f0_64 killed $vr0
-; LA64-NEXT:    pcaddu18i $ra, %call36(log2)
-; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    # kill: def $f0_64 killed $f0_64 def $vr0
-; LA64-NEXT:    vld $vr1, $sp, 16 # 16-byte Folded Reload
-; LA64-NEXT:    vextrins.d $vr0, $vr1, 16
-; LA64-NEXT:    vst $vr0, $fp, 0
-; LA64-NEXT:    ld.d $fp, $sp, 32 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $ra, $sp, 40 # 8-byte Folded Reload
-; LA64-NEXT:    addi.d $sp, $sp, 48
-; LA64-NEXT:    ret
+; CHECK-LABEL: flog2_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vflogb.d $vr0, $vr0
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
 entry:
   %v = load <2 x double>, ptr %a
   %r = call <2 x double> @llvm.log2.v2f64(<2 x double> %v)
diff --git a/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll b/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll
new file mode 100644
index 0000000000000..9a806a12f7de6
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll
@@ -0,0 +1,758 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx --verify-machineinstrs < %s \
+; RUN:   | FileCheck --check-prefix=LA32 %s
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx --verify-machineinstrs < %s \
+; RUN:   | FileCheck --check-prefix=LA64 %s
+
+%struct.S = type { i64, i64, i8 }
+%struct.F = type { float, double, float }
+%struct.V = type { <4 x i32>, <4 x i32>, <16 x i16> }
+
+define void @sink_fold_i64(i64 %k, i64 %n, ptr %a) nounwind {
+; LA32-LABEL: sink_fold_i64:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    addi.w $sp, $sp, -48
+; LA32-NEXT:    st.w $ra, $sp, 44 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $fp, $sp, 40 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s0, $sp, 36 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s1, $sp, 32 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s2, $sp, 28 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s3, $sp, 24 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s4, $sp, 20 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s5, $sp, 16 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s6, $sp, 12 # 4-byte Folded Spill
+; LA32-NEXT:    move $s0, $a3
+; LA32-NEXT:    move $s1, $a2
+; LA32-NEXT:    slli.w $a1, $a0, 4
+; LA32-NEXT:    alsl.w $a0, $a0, $a1, 3
+; LA32-NEXT:    add.w $a0, $a4, $a0
+; LA32-NEXT:    sltui $a1, $a3, 1
+; LA32-NEXT:    slti $a2, $a3, 0
+; LA32-NEXT:    masknez $a2, $a2, $a1
+; LA32-NEXT:    sltui $a3, $s1, 1
+; LA32-NEXT:    maskeqz $a1, $a3, $a1
+; LA32-NEXT:    or $a1, $a1, $a2
+; LA32-NEXT:    addi.w $s2, $a0, 8
+; LA32-NEXT:    bnez $a1, .LBB0_3
+; LA32-NEXT:  # %bb.1: # %for.body.preheader
+; LA32-NEXT:    move $fp, $a4
+; LA32-NEXT:    move $s4, $zero
+; LA32-NEXT:    move $s5, $zero
+; LA32-NEXT:    move $s3, $zero
+; LA32-NEXT:    move $s6, $zero
+; LA32-NEXT:    .p2align 4, , 16
+; LA32-NEXT:  .LBB0_2: # %for.body
+; LA32-NEXT:    # =>This Inner Loop Header: Depth=1
+; LA32-NEXT:    move $a0, $fp
+; LA32-NEXT:    bl f
+; LA32-NEXT:    ld.w $a0, $s2, 4
+; LA32-NEXT:    ld.w $a1, $s2, 0
+; LA32-NEXT:    add.w $a0, $a0, $s6
+; LA32-NEXT:    add.w $s3, $a1, $s3
+; LA32-NEXT:    sltu $a1, $s3, $a1
+; LA32-NEXT:    addi.w $s4, $s4, 1
+; LA32-NEXT:    sltui $a2, $s4, 1
+; LA32-NEXT:    add.w $s5, $s5, $a2
+; LA32-NEXT:    xor $a2, $s4, $s1
+; LA32-NEXT:    xor $a3, $s5, $s0
+; LA32-NEXT:    or $a2, $a2, $a3
+; LA32-NEXT:    add.w $s6, $a0, $a1
+; LA32-NEXT:    bnez $a2, .LBB0_2
+; LA32-NEXT:    b .LBB0_4
+; LA32-NEXT:  .LBB0_3:
+; LA32-NEXT:    move $s3, $zero
+; LA32-NEXT:    move $s6, $zero
+; LA32-NEXT:  .LBB0_4: # %for.cond.cleanup
+; LA32-NEXT:    st.w $s3, $s2, 0
+; LA32-NEXT:    st.w $s6, $s2, 4
+; LA32-NEXT:    ld.w $s6, $sp, 12 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s5, $sp, 16 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s4, $sp, 20 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s3, $sp, 24 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s2, $sp, 28 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s1, $sp, 32 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s0, $sp, 36 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $fp, $sp, 40 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $ra, $sp, 44 # 4-byte Folded Reload
+; LA32-NEXT:    addi.w $sp, $sp, 48
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: sink_fold_i64:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    addi.d $sp, $sp, -48
+; LA64-NEXT:    st.d $ra, $sp, 40 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $fp, $sp, 32 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s0, $sp, 24 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s1, $sp, 16 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s2, $sp, 8 # 8-byte Folded Spill
+; LA64-NEXT:    move $s0, $a1
+; LA64-NEXT:    slli.d $a1, $a0, 4
+; LA64-NEXT:    alsl.d $a0, $a0, $a1, 3
+; LA64-NEXT:    add.d $a0, $a2, $a0
+; LA64-NEXT:    addi.d $s1, $a0, 8
+; LA64-NEXT:    blez $s0, .LBB0_3
+; LA64-NEXT:  # %bb.1: # %for.body.preheader
+; LA64-NEXT:    move $fp, $a2
+; LA64-NEXT:    move $s2, $zero
+; LA64-NEXT:    .p2align 4, , 16
+; LA64-NEXT:  .LBB0_2: # %for.body
+; LA64-NEXT:    # =>This Inner Loop Header: Depth=1
+; LA64-NEXT:    move $a0, $fp
+; LA64-NEXT:    pcaddu18i $ra, %call36(f)
+; LA64-NEXT:    jirl $ra, $ra, 0
+; LA64-NEXT:    ld.d $a0, $s1, 0
+; LA64-NEXT:    addi.d $s0, $s0, -1
+; LA64-NEXT:    add.d $s2, $a0, $s2
+; LA64-NEXT:    bnez $s0, .LBB0_2
+; LA64-NEXT:    b .LBB0_4
+; LA64-NEXT:  .LBB0_3:
+; LA64-NEXT:    move $s2, $zero
+; LA64-NEXT:  .LBB0_4: # %for.cond.cleanup
+; LA64-NEXT:    st.d $s2, $s1, 0
+; LA64-NEXT:    ld.d $s2, $sp, 8 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $s1, $sp, 16 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $s0, $sp, 24 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 48
+; LA64-NEXT:    ret
+entry:
+  %y = getelementptr inbounds %struct.S, ptr %a, i64 %k, i32 1
+  %cmp4 = icmp sgt i64 %n, 0
+  br i1 %cmp4, label %for.body, label %for.cond.cleanup
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.06 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %s.05 = phi i64 [ 0, %entry ], [ %add, %for.body ]
+  call void @f(ptr %a)
+  %0 = load i64, ptr %y
+  %add = add nsw i64 %0, %s.05
+  %inc = add nuw nsw i64 %i.06, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %s.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
+  store i64 %s.0.lcssa, ptr %y
+  ret void
+}
+
+define void @sink_fold_f32(i64 %k, i64 %n, ptr %a) nounwind {
+; LA32-LABEL: sink_fold_f32:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    addi.w $sp, $sp, -48
+; LA32-NEXT:    st.w $ra, $sp, 44 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $fp, $sp, 40 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s0, $sp, 36 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s1, $sp, 32 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s2, $sp, 28 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s3, $sp, 24 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s4, $sp, 20 # 4-byte Folded Spill
+; LA32-NEXT:    fst.d $fs0, $sp, 8 # 8-byte Folded Spill
+; LA32-NEXT:    move $s0, $a3
+; LA32-NEXT:    move $s1, $a2
+; LA32-NEXT:    slli.w $a1, $a0, 4
+; LA32-NEXT:    alsl.w $a0, $a0, $a1, 3
+; LA32-NEXT:    add.w $a0, $a4, $a0
+; LA32-NEXT:    sltui $a1, $a3, 1
+; LA32-NEXT:    slti $a2, $a3, 0
+; LA32-NEXT:    masknez $a2, $a2, $a1
+; LA32-NEXT:    sltui $a3, $s1, 1
+; LA32-NEXT:    maskeqz $a1, $a3, $a1
+; LA32-NEXT:    or $a1, $a1, $a2
+; LA32-NEXT:    addi.w $s2, $a0, 16
+; LA32-NEXT:    bnez $a1, .LBB1_3
+; LA32-NEXT:  # %bb.1: # %for.body.preheader
+; LA32-NEXT:    move $fp, $a4
+; LA32-NEXT:    move $s3, $zero
+; LA32-NEXT:    move $s4, $zero
+; LA32-NEXT:    movgr2fr.w $fs0, $zero
+; LA32-NEXT:    .p2align 4, , 16
+; LA32-NEXT:  .LBB1_2: # %for.body
+; LA32-NEXT:    # =>This Inner Loop Header: Depth=1
+; LA32-NEXT:    move $a0, $fp
+; LA32-NEXT:    bl f
+; LA32-NEXT:    fld.s $fa0, $s2, 0
+; LA32-NEXT:    addi.w $s3, $s3, 1
+; LA32-NEXT:    sltui $a0, $s3, 1
+; LA32-NEXT:    add.w $s4, $s4, $a0
+; LA32-NEXT:    xor $a0, $s3, $s1
+; LA32-NEXT:    xor $a1, $s4, $s0
+; LA32-NEXT:    or $a0, $a0, $a1
+; LA32-NEXT:    fadd.s $fs0, $fa0, $fs0
+; LA32-NEXT:    bnez $a0, .LBB1_2
+; LA32-NEXT:    b .LBB1_4
+; LA32-NEXT:  .LBB1_3:
+; LA32-NEXT:    movgr2fr.w $fs0, $zero
+; LA32-NEXT:  .LBB1_4: # %for.cond.cleanup
+; LA32-NEXT:    fst.s $fs0, $s2, 0
+; LA32-NEXT:    fld.d $fs0, $sp, 8 # 8-byte Folded Reload
+; LA32-NEXT:    ld.w $s4, $sp, 20 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s3, $sp, 24 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s2, $sp, 28 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s1, $sp, 32 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s0, $sp, 36 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $fp, $sp, 40 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $ra, $sp, 44 # 4-byte Folded Reload
+; LA32-NEXT:    addi.w $sp, $sp, 48
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: sink_fold_f32:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    addi.d $sp, $sp, -48
+; LA64-NEXT:    st.d $ra, $sp, 40 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $fp, $sp, 32 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s0, $sp, 24 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s1, $sp, 16 # 8-byte Folded Spill
+; LA64-NEXT:    fst.d $fs0, $sp, 8 # 8-byte Folded Spill
+; LA64-NEXT:    move $s0, $a1
+; LA64-NEXT:    slli.d $a1, $a0, 4
+; LA64-NEXT:    alsl.d $a0, $a0, $a1, 3
+; LA64-NEXT:    add.d $a0, $a2, $a0
+; LA64-NEXT:    addi.d $s1, $a0, 16
+; LA64-NEXT:    blez $s0, .LBB1_3
+; LA64-NEXT:  # %bb.1: # %for.body.preheader
+; LA64-NEXT:    move $fp, $a2
+; LA64-NEXT:    movgr2fr.w $fs0, $zero
+; LA64-NEXT:    .p2align 4, , 16
+; LA64-NEXT:  .LBB1_2: # %for.body
+; LA64-NEXT:    # =>This Inner Loop Header: Depth=1
+; LA64-NEXT:    move $a0, $fp
+; LA64-NEXT:    pcaddu18i $ra, %call36(f)
+; LA64-NEXT:    jirl $ra, $ra, 0
+; LA64-NEXT:    fld.s $fa0, $s1, 0
+; LA64-NEXT:    addi.d $s0, $s0, -1
+; LA64-NEXT:    fadd.s $fs0, $fa0, $fs0
+; LA64-NEXT:    bnez $s0, .LBB1_2
+; LA64-NEXT:    b .LBB1_4
+; LA64-NEXT:  .LBB1_3:
+; LA64-NEXT:    movgr2fr.w $fs0, $zero
+; LA64-NEXT:  .LBB1_4: # %for.cond.cleanup
+; LA64-NEXT:    fst.s $fs0, $s1, 0
+; LA64-NEXT:    fld.d $fs0, $sp, 8 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $s1, $sp, 16 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $s0, $sp, 24 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 48
+; LA64-NEXT:    ret
+entry:
+  %y = getelementptr inbounds %struct.F, ptr %a, i64 %k, i32 2
+  %cmp4 = icmp sgt i64 %n, 0
+  br i1 %cmp4, label %for.body, label %for.cond.cleanup
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.06 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %s.05 = phi float [ 0.0, %entry ], [ %add, %for.body ]
+  call void @f(ptr %a)
+  %0 = load float, ptr %y
+  %add = fadd float %0, %s.05
+  %inc = add nuw nsw i64 %i.06, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %s.0.lcssa = phi float [ 0.0, %entry ], [ %add, %for.body ]
+  store float %s.0.lcssa, ptr %y
+  ret void
+}
+
+define void @sink_fold_v4i32(i64 %k, i64 %n, ptr %a) nounwind {
+; LA32-LABEL: sink_fold_v4i32:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    addi.w $sp, $sp, -48
+; LA32-NEXT:    st.w $ra, $sp, 44 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $fp, $sp, 40 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s0, $sp, 36 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s1, $sp, 32 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s2, $sp, 28 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s3, $sp, 24 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s4, $sp, 20 # 4-byte Folded Spill
+; LA32-NEXT:    move $s0, $a3
+; LA32-NEXT:    move $s1, $a2
+; LA32-NEXT:    slli.w $a0, $a0, 6
+; LA32-NEXT:    add.w $a0, $a4, $a0
+; LA32-NEXT:    sltui $a1, $a3, 1
+; LA32-NEXT:    slti $a2, $a3, 0
+; LA32-NEXT:    masknez $a2, $a2, $a1
+; LA32-NEXT:    sltui $a3, $s1, 1
+; LA32-NEXT:    maskeqz $a1, $a3, $a1
+; LA32-NEXT:    or $a1, $a1, $a2
+; LA32-NEXT:    addi.w $s2, $a0, 16
+; LA32-NEXT:    bnez $a1, .LBB2_3
+; LA32-NEXT:  # %bb.1: # %for.body.preheader
+; LA32-NEXT:    move $fp, $a4
+; LA32-NEXT:    move $s3, $zero
+; LA32-NEXT:    move $s4, $zero
+; LA32-NEXT:    vrepli.b $vr0, 0
+; LA32-NEXT:    .p2align 4, , 16
+; LA32-NEXT:  .LBB2_2: # %for.body
+; LA32-NEXT:    # =>This Inner Loop Header: Depth=1
+; LA32-NEXT:    vst $vr0, $sp, 0 # 16-byte Folded Spill
+; LA32-NEXT:    move $a0, $fp
+; LA32-NEXT:    bl f
+; LA32-NEXT:    vld $vr0, $s2, 0
+; LA32-NEXT:    addi.w $s3, $s3, 1
+; LA32-NEXT:    sltui $a0, $s3, 1
+; LA32-NEXT:    add.w $s4, $s4, $a0
+; LA32-NEXT:    xor $a0, $s3, $s1
+; LA32-NEXT:    xor $a1, $s4, $s0
+; LA32-NEXT:    or $a0, $a0, $a1
+; LA32-NEXT:    vld $vr1, $sp, 0 # 16-byte Folded Reload
+; LA32-NEXT:    vadd.w $vr1, $vr0, $vr1
+; LA32-NEXT:    vst $vr1, $sp, 0 # 16-byte Folded Spill
+; LA32-NEXT:    vld $vr0, $sp, 0 # 16-byte Folded Reload
+; LA32-NEXT:    bnez $a0, .LBB2_2
+; LA32-NEXT:    b .LBB2_4
+; LA32-NEXT:  .LBB2_3:
+; LA32-NEXT:    vrepli.b $vr0, 0
+; LA32-NEXT:  .LBB2_4: # %for.cond.cleanup
+; LA32-NEXT:    vst $vr0, $s2, 0
+; LA32-NEXT:    ld.w $s4, $sp, 20 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s3, $sp, 24 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s2, $sp, 28 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s1, $sp, 32 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s0, $sp, 36 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $fp, $sp, 40 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $ra, $sp, 44 # 4-byte Folded Reload
+; LA32-NEXT:    addi.w $sp, $sp, 48
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: sink_fold_v4i32:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    addi.d $sp, $sp, -48
+; LA64-NEXT:    st.d $ra, $sp, 40 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $fp, $sp, 32 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s0, $sp, 24 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s1, $sp, 16 # 8-byte Folded Spill
+; LA64-NEXT:    slli.d $a0, $a0, 6
+; LA64-NEXT:    add.d $a0, $a2, $a0
+; LA64-NEXT:    addi.d $s1, $a0, 16
+; LA64-NEXT:    blez $a1, .LBB2_3
+; LA64-NEXT:  # %bb.1: # %for.body.preheader
+; LA64-NEXT:    move $fp, $a2
+; LA64-NEXT:    move $s0, $a1
+; LA64-NEXT:    vrepli.b $vr0, 0
+; LA64-NEXT:    .p2align 4, , 16
+; LA64-NEXT:  .LBB2_2: # %for.body
+; LA64-NEXT:    # =>This Inner Loop Header: Depth=1
+; LA64-NEXT:    vst $vr0, $sp, 0 # 16-byte Folded Spill
+; LA64-NEXT:    move $a0, $fp
+; LA64-NEXT:    pcaddu18i $ra, %call36(f)
+; LA64-NEXT:    jirl $ra, $ra, 0
+; LA64-NEXT:    vld $vr0, $s1, 0
+; LA64-NEXT:    addi.d $s0, $s0, -1
+; LA64-NEXT:    vld $vr1, $sp, 0 # 16-byte Folded Reload
+; LA64-NEXT:    vadd.w $vr1, $vr0, $vr1
+; LA64-NEXT:    vst $vr1, $sp, 0 # 16-byte Folded Spill
+; LA64-NEXT:    vld $vr0, $sp, 0 # 16-byte Folded Reload
+; LA64-NEXT:    bnez $s0, .LBB2_2
+; LA64-NEXT:    b .LBB2_4
+; LA64-NEXT:  .LBB2_3:
+; LA64-NEXT:    vrepli.b $vr0, 0
+; LA64-NEXT:  .LBB2_4: # %for.cond.cleanup
+; LA64-NEXT:    vst $vr0, $s1, 0
+; LA64-NEXT:    ld.d $s1, $sp, 16 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $s0, $sp, 24 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 48
+; LA64-NEXT:    ret
+entry:
+  %y = getelementptr inbounds %struct.V, ptr %a, i64 %k, i32 1
+  %cmp = icmp sgt i64 %n, 0
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %sum.0 = phi <4 x i32> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+  call void @f(ptr %a)
+  %v = load <4 x i32>, ptr %y
+  %addv = add <4 x i32> %v, %sum.0
+  %inc = add nuw nsw i64 %i.0, 1
+  %exitcond = icmp eq i64 %inc, %n
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %sum.lcssa = phi <4 x i32> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+  store <4 x i32> %sum.lcssa, ptr %y
+  ret void
+}
+
+define void @sink_fold_v16i16(i64 %k, i64 %n, ptr %a) nounwind {
+; LA32-LABEL: sink_fold_v16i16:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    addi.w $sp, $sp, -80
+; LA32-NEXT:    st.w $ra, $sp, 76 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $fp, $sp, 72 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s0, $sp, 68 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s1, $sp, 64 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s2, $sp, 60 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s3, $sp, 56 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s4, $sp, 52 # 4-byte Folded Spill
+; LA32-NEXT:    move $s0, $a3
+; LA32-NEXT:    move $s1, $a2
+; LA32-NEXT:    slli.w $a0, $a0, 6
+; LA32-NEXT:    add.w $a0, $a4, $a0
+; LA32-NEXT:    sltui $a1, $a3, 1
+; LA32-NEXT:    slti $a2, $a3, 0
+; LA32-NEXT:    masknez $a2, $a2, $a1
+; LA32-NEXT:    sltui $a3, $s1, 1
+; LA32-NEXT:    maskeqz $a1, $a3, $a1
+; LA32-NEXT:    or $a1, $a1, $a2
+; LA32-NEXT:    addi.w $s2, $a0, 32
+; LA32-NEXT:    bnez $a1, .LBB3_3
+; LA32-NEXT:  # %bb.1: # %for.body.preheader
+; LA32-NEXT:    move $fp, $a4
+; LA32-NEXT:    move $s3, $zero
+; LA32-NEXT:    move $s4, $zero
+; LA32-NEXT:    xvrepli.b $xr0, 0
+; LA32-NEXT:    .p2align 4, , 16
+; LA32-NEXT:  .LBB3_2: # %for.body
+; LA32-NEXT:    # =>This Inner Loop Header: Depth=1
+; LA32-NEXT:    xvst $xr0, $sp, 16 # 32-byte Folded Spill
+; LA32-NEXT:    move $a0, $fp
+; LA32-NEXT:    bl f
+; LA32-NEXT:    xvld $xr0, $s2, 0
+; LA32-NEXT:    addi.w $s3, $s3, 1
+; LA32-NEXT:    sltui $a0, $s3, 1
+; LA32-NEXT:    add.w $s4, $s4, $a0
+; LA32-NEXT:    xor $a0, $s3, $s1
+; LA32-NEXT:    xor $a1, $s4, $s0
+; LA32-NEXT:    or $a0, $a0, $a1
+; LA32-NEXT:    xvld $xr1, $sp, 16 # 32-byte Folded Reload
+; LA32-NEXT:    xvadd.h $xr1, $xr0, $xr1
+; LA32-NEXT:    xvst $xr1, $sp, 16 # 32-byte Folded Spill
+; LA32-NEXT:    xvld $xr0, $sp, 16 # 32-byte Folded Reload
+; LA32-NEXT:    bnez $a0, .LBB3_2
+; LA32-NEXT:    b .LBB3_4
+; LA32-NEXT:  .LBB3_3:
+; LA32-NEXT:    xvrepli.b $xr0, 0
+; LA32-NEXT:  .LBB3_4: # %for.cond.cleanup
+; LA32-NEXT:    xvst $xr0, $s2, 0
+; LA32-NEXT:    ld.w $s4, $sp, 52 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s3, $sp, 56 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s2, $sp, 60 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s1, $sp, 64 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s0, $sp, 68 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $fp, $sp, 72 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $ra, $sp, 76 # 4-byte Folded Reload
+; LA32-NEXT:    addi.w $sp, $sp, 80
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: sink_fold_v16i16:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    addi.d $sp, $sp, -80
+; LA64-NEXT:    st.d $ra, $sp, 72 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $fp, $sp, 64 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s0, $sp, 56 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s1, $sp, 48 # 8-byte Folded Spill
+; LA64-NEXT:    slli.d $a0, $a0, 6
+; LA64-NEXT:    add.d $a0, $a2, $a0
+; LA64-NEXT:    addi.d $s1, $a0, 32
+; LA64-NEXT:    blez $a1, .LBB3_3
+; LA64-NEXT:  # %bb.1: # %for.body.preheader
+; LA64-NEXT:    move $fp, $a2
+; LA64-NEXT:    move $s0, $a1
+; LA64-NEXT:    xvrepli.b $xr0, 0
+; LA64-NEXT:    .p2align 4, , 16
+; LA64-NEXT:  .LBB3_2: # %for.body
+; LA64-NEXT:    # =>This Inner Loop Header: Depth=1
+; LA64-NEXT:    xvst $xr0, $sp, 16 # 32-byte Folded Spill
+; LA64-NEXT:    move $a0, $fp
+; LA64-NEXT:    pcaddu18i $ra, %call36(f)
+; LA64-NEXT:    jirl $ra, $ra, 0
+; LA64-NEXT:    xvld $xr0, $s1, 0
+; LA64-NEXT:    addi.d $s0, $s0, -1
+; LA64-NEXT:    xvld $xr1, $sp, 16 # 32-byte Folded Reload
+; LA64-NEXT:    xvadd.h $xr1, $xr0, $xr1
+; LA64-NEXT:    xvst $xr1, $sp, 16 # 32-byte Folded Spill
+; LA64-NEXT:    xvld $xr0, $sp, 16 # 32-byte Folded Reload
+; LA64-NEXT:    bnez $s0, .LBB3_2
+; LA64-NEXT:    b .LBB3_4
+; LA64-NEXT:  .LBB3_3:
+; LA64-NEXT:    xvrepli.b $xr0, 0
+; LA64-NEXT:  .LBB3_4: # %for.cond.cleanup
+; LA64-NEXT:    xvst $xr0, $s1, 0
+; LA64-NEXT:    ld.d $s1, $sp, 48 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $s0, $sp, 56 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $fp, $sp, 64 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $ra, $sp, 72 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 80
+; LA64-NEXT:    ret
+entry:
+  %y = getelementptr inbounds %struct.V, ptr %a, i64 %k, i32 2
+  %cmp = icmp sgt i64 %n, 0
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %sum.0 = phi <16 x i16> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+  call void @f(ptr %a)
+  %v = load <16 x i16>, ptr %y
+  %addv = add <16 x i16> %v, %sum.0
+  %inc = add nuw nsw i64 %i.0, 1
+  %exitcond = icmp eq i64 %inc, %n
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %sum.lcssa = phi <16 x i16> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+  store <16 x i16> %sum.lcssa, ptr %y
+  ret void
+}
+
+define void @sink_fold_extracti8(i64 %k, i64 %n, ptr %a) nounwind {
+; LA32-LABEL: sink_fold_extracti8:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    addi.w $sp, $sp, -48
+; LA32-NEXT:    st.w $ra, $sp, 44 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $fp, $sp, 40 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s0, $sp, 36 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s1, $sp, 32 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s2, $sp, 28 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s3, $sp, 24 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s4, $sp, 20 # 4-byte Folded Spill
+; LA32-NEXT:    move $s0, $a3
+; LA32-NEXT:    move $s1, $a2
+; LA32-NEXT:    slli.w $a1, $a0, 4
+; LA32-NEXT:    alsl.w $a0, $a0, $a1, 3
+; LA32-NEXT:    add.w $a0, $a4, $a0
+; LA32-NEXT:    sltui $a1, $a3, 1
+; LA32-NEXT:    slti $a2, $a3, 0
+; LA32-NEXT:    masknez $a2, $a2, $a1
+; LA32-NEXT:    sltui $a3, $s1, 1
+; LA32-NEXT:    maskeqz $a1, $a3, $a1
+; LA32-NEXT:    or $a1, $a1, $a2
+; LA32-NEXT:    addi.w $s2, $a0, 16
+; LA32-NEXT:    bnez $a1, .LBB4_3
+; LA32-NEXT:  # %bb.1: # %for.body.preheader
+; LA32-NEXT:    move $fp, $a4
+; LA32-NEXT:    move $s3, $zero
+; LA32-NEXT:    move $s4, $zero
+; LA32-NEXT:    vrepli.b $vr0, 0
+; LA32-NEXT:    .p2align 4, , 16
+; LA32-NEXT:  .LBB4_2: # %for.body
+; LA32-NEXT:    # =>This Inner Loop Header: Depth=1
+; LA32-NEXT:    vst $vr0, $sp, 0 # 16-byte Folded Spill
+; LA32-NEXT:    move $a0, $fp
+; LA32-NEXT:    bl f
+; LA32-NEXT:    vldrepl.b $vr0, $s2, 0
+; LA32-NEXT:    addi.w $s3, $s3, 1
+; LA32-NEXT:    sltui $a0, $s3, 1
+; LA32-NEXT:    add.w $s4, $s4, $a0
+; LA32-NEXT:    xor $a0, $s3, $s1
+; LA32-NEXT:    xor $a1, $s4, $s0
+; LA32-NEXT:    or $a0, $a0, $a1
+; LA32-NEXT:    vld $vr1, $sp, 0 # 16-byte Folded Reload
+; LA32-NEXT:    vadd.b $vr1, $vr0, $vr1
+; LA32-NEXT:    vst $vr1, $sp, 0 # 16-byte Folded Spill
+; LA32-NEXT:    vld $vr0, $sp, 0 # 16-byte Folded Reload
+; LA32-NEXT:    bnez $a0, .LBB4_2
+; LA32-NEXT:    b .LBB4_4
+; LA32-NEXT:  .LBB4_3:
+; LA32-NEXT:    vrepli.b $vr0, 0
+; LA32-NEXT:  .LBB4_4: # %for.cond.cleanup
+; LA32-NEXT:    vstelm.b $vr0, $s2, 0, 1
+; LA32-NEXT:    ld.w $s4, $sp, 20 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s3, $sp, 24 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s2, $sp, 28 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s1, $sp, 32 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s0, $sp, 36 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $fp, $sp, 40 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $ra, $sp, 44 # 4-byte Folded Reload
+; LA32-NEXT:    addi.w $sp, $sp, 48
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: sink_fold_extracti8:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    addi.d $sp, $sp, -48
+; LA64-NEXT:    st.d $ra, $sp, 40 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $fp, $sp, 32 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s0, $sp, 24 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s1, $sp, 16 # 8-byte Folded Spill
+; LA64-NEXT:    move $s0, $a1
+; LA64-NEXT:    slli.d $a1, $a0, 4
+; LA64-NEXT:    alsl.d $a0, $a0, $a1, 3
+; LA64-NEXT:    add.d $a0, $a2, $a0
+; LA64-NEXT:    addi.d $s1, $a0, 16
+; LA64-NEXT:    blez $s0, .LBB4_3
+; LA64-NEXT:  # %bb.1: # %for.body.preheader
+; LA64-NEXT:    move $fp, $a2
+; LA64-NEXT:    vrepli.b $vr0, 0
+; LA64-NEXT:    .p2align 4, , 16
+; LA64-NEXT:  .LBB4_2: # %for.body
+; LA64-NEXT:    # =>This Inner Loop Header: Depth=1
+; LA64-NEXT:    vst $vr0, $sp, 0 # 16-byte Folded Spill
+; LA64-NEXT:    move $a0, $fp
+; LA64-NEXT:    pcaddu18i $ra, %call36(f)
+; LA64-NEXT:    jirl $ra, $ra, 0
+; LA64-NEXT:    vldrepl.b $vr0, $s1, 0
+; LA64-NEXT:    addi.d $s0, $s0, -1
+; LA64-NEXT:    vld $vr1, $sp, 0 # 16-byte Folded Reload
+; LA64-NEXT:    vadd.b $vr1, $vr0, $vr1
+; LA64-NEXT:    vst $vr1, $sp, 0 # 16-byte Folded Spill
+; LA64-NEXT:    vld $vr0, $sp, 0 # 16-byte Folded Reload
+; LA64-NEXT:    bnez $s0, .LBB4_2
+; LA64-NEXT:    b .LBB4_4
+; LA64-NEXT:  .LBB4_3:
+; LA64-NEXT:    vrepli.b $vr0, 0
+; LA64-NEXT:  .LBB4_4: # %for.cond.cleanup
+; LA64-NEXT:    vstelm.b $vr0, $s1, 0, 1
+; LA64-NEXT:    ld.d $s1, $sp, 16 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $s0, $sp, 24 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 48
+; LA64-NEXT:    ret
+entry:
+  %y = getelementptr inbounds %struct.S, ptr %a, i64 %k, i32 2
+  %cmp = icmp sgt i64 %n, 0
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %sum.0 = phi <16 x i8> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+  call void @f(ptr %a)
+  %e = load i8, ptr %y
+  %ins0 = insertelement <16 x i8> poison, i8 %e, i32 0
+  %v = shufflevector <16 x i8> %ins0, <16 x i8> poison, <16 x i32> zeroinitializer
+  %addv = add <16 x i8> %v, %sum.0
+  %inc = add nuw nsw i64 %i.0, 1
+  %exitcond = icmp eq i64 %inc, %n
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %sum.lcssa = phi <16 x i8> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+  %res = extractelement <16 x i8> %sum.lcssa, i32 1
+  store i8 %res, ptr %y
+  ret void
+}
+
+define void @sink_fold_extractf64(i64 %k, i64 %n, ptr %a) nounwind {
+; LA32-LABEL: sink_fold_extractf64:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    addi.w $sp, $sp, -80
+; LA32-NEXT:    st.w $ra, $sp, 76 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $fp, $sp, 72 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s0, $sp, 68 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s1, $sp, 64 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s2, $sp, 60 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s3, $sp, 56 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s4, $sp, 52 # 4-byte Folded Spill
+; LA32-NEXT:    move $s0, $a3
+; LA32-NEXT:    move $s1, $a2
+; LA32-NEXT:    slli.w $a1, $a0, 4
+; LA32-NEXT:    alsl.w $a0, $a0, $a1, 3
+; LA32-NEXT:    add.w $a0, $a4, $a0
+; LA32-NEXT:    sltui $a1, $a3, 1
+; LA32-NEXT:    slti $a2, $a3, 0
+; LA32-NEXT:    masknez $a2, $a2, $a1
+; LA32-NEXT:    sltui $a3, $s1, 1
+; LA32-NEXT:    maskeqz $a1, $a3, $a1
+; LA32-NEXT:    or $a1, $a1, $a2
+; LA32-NEXT:    addi.w $s2, $a0, 8
+; LA32-NEXT:    bnez $a1, .LBB5_3
+; LA32-NEXT:  # %bb.1: # %for.body.preheader
+; LA32-NEXT:    move $fp, $a4
+; LA32-NEXT:    move $s3, $zero
+; LA32-NEXT:    move $s4, $zero
+; LA32-NEXT:    xvrepli.b $xr0, 0
+; LA32-NEXT:    .p2align 4, , 16
+; LA32-NEXT:  .LBB5_2: # %for.body
+; LA32-NEXT:    # =>This Inner Loop Header: Depth=1
+; LA32-NEXT:    xvst $xr0, $sp, 16 # 32-byte Folded Spill
+; LA32-NEXT:    move $a0, $fp
+; LA32-NEXT:    bl f
+; LA32-NEXT:    xvldrepl.d $xr0, $s2, 0
+; LA32-NEXT:    addi.w $s3, $s3, 1
+; LA32-NEXT:    sltui $a0, $s3, 1
+; LA32-NEXT:    add.w $s4, $s4, $a0
+; LA32-NEXT:    xor $a0, $s3, $s1
+; LA32-NEXT:    xor $a1, $s4, $s0
+; LA32-NEXT:    or $a0, $a0, $a1
+; LA32-NEXT:    xvld $xr1, $sp, 16 # 32-byte Folded Reload
+; LA32-NEXT:    xvfadd.d $xr1, $xr0, $xr1
+; LA32-NEXT:    xvst $xr1, $sp, 16 # 32-byte Folded Spill
+; LA32-NEXT:    xvld $xr0, $sp, 16 # 32-byte Folded Reload
+; LA32-NEXT:    bnez $a0, .LBB5_2
+; LA32-NEXT:    b .LBB5_4
+; LA32-NEXT:  .LBB5_3:
+; LA32-NEXT:    xvrepli.b $xr0, 0
+; LA32-NEXT:  .LBB5_4: # %for.cond.cleanup
+; LA32-NEXT:    xvstelm.d $xr0, $s2, 0, 1
+; LA32-NEXT:    ld.w $s4, $sp, 52 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s3, $sp, 56 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s2, $sp, 60 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s1, $sp, 64 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s0, $sp, 68 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $fp, $sp, 72 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $ra, $sp, 76 # 4-byte Folded Reload
+; LA32-NEXT:    addi.w $sp, $sp, 80
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: sink_fold_extractf64:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    addi.d $sp, $sp, -80
+; LA64-NEXT:    st.d $ra, $sp, 72 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $fp, $sp, 64 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s0, $sp, 56 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s1, $sp, 48 # 8-byte Folded Spill
+; LA64-NEXT:    move $s0, $a1
+; LA64-NEXT:    slli.d $a1, $a0, 4
+; LA64-NEXT:    alsl.d $a0, $a0, $a1, 3
+; LA64-NEXT:    add.d $a0, $a2, $a0
+; LA64-NEXT:    addi.d $s1, $a0, 8
+; LA64-NEXT:    blez $s0, .LBB5_3
+; LA64-NEXT:  # %bb.1: # %for.body.preheader
+; LA64-NEXT:    move $fp, $a2
+; LA64-NEXT:    xvrepli.b $xr0, 0
+; LA64-NEXT:    .p2align 4, , 16
+; LA64-NEXT:  .LBB5_2: # %for.body
+; LA64-NEXT:    # =>This Inner Loop Header: Depth=1
+; LA64-NEXT:    xvst $xr0, $sp, 16 # 32-byte Folded Spill
+; LA64-NEXT:    move $a0, $fp
+; LA64-NEXT:    pcaddu18i $ra, %call36(f)
+; LA64-NEXT:    jirl $ra, $ra, 0
+; LA64-NEXT:    xvldrepl.d $xr0, $s1, 0
+; LA64-NEXT:    addi.d $s0, $s0, -1
+; LA64-NEXT:    xvld $xr1, $sp, 16 # 32-byte Folded Reload
+; LA64-NEXT:    xvfadd.d $xr1, $xr0, $xr1
+; LA64-NEXT:    xvst $xr1, $sp, 16 # 32-byte Folded Spill
+; LA64-NEXT:    xvld $xr0, $sp, 16 # 32-byte Folded Reload
+; LA64-NEXT:    bnez $s0, .LBB5_2
+; LA64-NEXT:    b .LBB5_4
+; LA64-NEXT:  .LBB5_3:
+; LA64-NEXT:    xvrepli.b $xr0, 0
+; LA64-NEXT:  .LBB5_4: # %for.cond.cleanup
+; LA64-NEXT:    xvstelm.d $xr0, $s1, 0, 1
+; LA64-NEXT:    ld.d $s1, $sp, 48 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $s0, $sp, 56 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $fp, $sp, 64 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $ra, $sp, 72 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 80
+; LA64-NEXT:    ret
+entry:
+  %y = getelementptr inbounds %struct.F, ptr %a, i64 %k, i32 1
+  %cmp = icmp sgt i64 %n, 0
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %sum.0 = phi <4 x double> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+  call void @f(ptr %a)
+  %e = load double, ptr %y
+  %ins0 = insertelement <4 x double> poison, double %e, i32 0
+  %v = shufflevector <4 x double> %ins0, <4 x double> poison, <4 x i32> zeroinitializer
+  %addv = fadd <4 x double> %v, %sum.0
+  %inc = add nuw nsw i64 %i.0, 1
+  %exitcond = icmp eq i64 %inc, %n
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %sum.lcssa = phi <4 x double> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+  %res = extractelement <4 x double> %sum.lcssa, i32 1
+  store double %res, ptr %y
+  ret void
+}
+
+declare void @f(ptr)
diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt
index d3c0da9862245..000c67efb1de7 100644
--- a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt
+++ b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt
@@ -1439,11 +1439,8 @@ Key: PSUBWrm:  [ 0.00  0.00 ]
 Key: PSUBWrr:  [ 0.00  0.00 ]
 Key: PSWAPDrm:  [ 0.00  0.00 ]
 Key: PSWAPDrr:  [ 0.00  0.00 ]
-Key: PT:  [ 0.00  0.00 ]
 Key: PTCMMIMFP:  [ 0.00  0.00 ]
 Key: PTCMMRLFP:  [ 0.00  0.00 ]
-Key: PTCONJTCMMIMFP:  [ 0.00  0.00 ]
-Key: PTCONJTFP:  [ 0.00  0.00 ]
 Key: PTCVTROWD:  [ 0.00  0.00 ]
 Key: PTCVTROWPS:  [ 0.00  0.00 ]
 Key: PTDPBF:  [ 0.00  0.00 ]
@@ -1471,20 +1468,11 @@ Key: PTILEMOVROWrre:  [ 0.00  0.00 ]
 Key: PTILEMOVROWrreV:  [ 0.00  0.00 ]
 Key: PTILEMOVROWrri:  [ 0.00  0.00 ]
 Key: PTILEMOVROWrriV:  [ 0.00  0.00 ]
-Key: PTILEPAIRLOAD:  [ 0.00  0.00 ]
-Key: PTILEPAIRSTORE:  [ 0.00  0.00 ]
 Key: PTILESTORED:  [ 0.00  0.00 ]
 Key: PTILESTOREDV:  [ 0.00  0.00 ]
 Key: PTILEZERO:  [ 0.00  0.00 ]
 Key: PTILEZEROV:  [ 0.00  0.00 ]
 Key: PTMMULTF:  [ 0.00  0.00 ]
-Key: PTTCMMIMFP:  [ 0.00  0.00 ]
-Key: PTTCMMRLFP:  [ 0.00  0.00 ]
-Key: PTTDPBF:  [ 0.00  0.00 ]
-Key: PTTDPFP:  [ 0.00  0.00 ]
-Key: PTTMMULTF:  [ 0.00  0.00 ]
-Key: PTTRANSPOSED:  [ 0.00  0.00 ]
-Key: PTTRANSPOSEDV:  [ 0.00  0.00 ]
 Key: PTWRITE:  [ 0.00  0.00 ]
 Key: PTWRITEm:  [ 0.00  0.00 ]
 Key: PTWRITEr:  [ 0.00  0.00 ]
@@ -1717,8 +1705,6 @@ Key: TAILJMPm:  [ 0.00  0.00 ]
 Key: TAILJMPr:  [ 0.00  0.00 ]
 Key: TCMMIMFP:  [ 0.00  0.00 ]
 Key: TCMMRLFP:  [ 0.00  0.00 ]
-Key: TCONJTCMMIMFP:  [ 0.00  0.00 ]
-Key: TCONJTFP:  [ 0.00  0.00 ]
 Key: TCRETURN_HIPE:  [ 0.00  0.00 ]
 Key: TCRETURN_WIN:  [ 0.00  0.00 ]
 Key: TCRETURN_WINmi:  [ 0.00  0.00 ]
@@ -1764,12 +1750,6 @@ Key: TPAUSE:  [ 0.00  0.00 ]
 Key: TRAP:  [ 0.00  0.00 ]
 Key: TST_F:  [ 0.00  0.00 ]
 Key: TST_Fp:  [ 0.00  0.00 ]
-Key: TTCMMIMFP:  [ 0.00  0.00 ]
-Key: TTCMMRLFP:  [ 0.00  0.00 ]
-Key: TTDPBF:  [ 0.00  0.00 ]
-Key: TTDPFP:  [ 0.00  0.00 ]
-Key: TTMMULTF:  [ 0.00  0.00 ]
-Key: TTRANSPOSED:  [ 0.00  0.00 ]
 Key: TZCNT:  [ 0.00  0.00 ]
 Key: TZMSK:  [ 0.00  0.00 ]
 Key: UBSAN_UD:  [ 0.00  0.00 ]
@@ -7034,7 +7014,6 @@ Key: PhyReg_VR256:  [ 0.00  0.00 ]
 Key: PhyReg_VR512:  [ 0.00  0.00 ]
 Key: PhyReg_VR512_0_15:  [ 0.00  0.00 ]
 Key: PhyReg_TILE:  [ 0.00  0.00 ]
-Key: PhyReg_TILEPAIR:  [ 0.00  0.00 ]
 Key: VirtReg_GR8:  [ 0.00  0.00 ]
 Key: VirtReg_GRH8:  [ 0.00  0.00 ]
 Key: VirtReg_GR8_NOREX2:  [ 0.00  0.00 ]
@@ -7170,4 +7149,3 @@ Key: VirtReg_VR256:  [ 0.00  0.00 ]
 Key: VirtReg_VR512:  [ 0.00  0.00 ]
 Key: VirtReg_VR512_0_15:  [ 0.00  0.00 ]
 Key: VirtReg_TILE:  [ 0.00  0.00 ]
-Key: VirtReg_TILEPAIR:  [ 0.00  0.00 ]
diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt
index c6e5508248b9b..bb72886f73bfd 100644
--- a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt
+++ b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt
@@ -1439,11 +1439,8 @@ Key: PSUBWrm:  [ 0.00  0.00 ]
 Key: PSUBWrr:  [ 0.00  0.00 ]
 Key: PSWAPDrm:  [ 0.00  0.00 ]
 Key: PSWAPDrr:  [ 0.00  0.00 ]
-Key: PT:  [ 0.00  0.00 ]
 Key: PTCMMIMFP:  [ 0.00  0.00 ]
 Key: PTCMMRLFP:  [ 0.00  0.00 ]
-Key: PTCONJTCMMIMFP:  [ 0.00  0.00 ]
-Key: PTCONJTFP:  [ 0.00  0.00 ]
 Key: PTCVTROWD:  [ 0.00  0.00 ]
 Key: PTCVTROWPS:  [ 0.00  0.00 ]
 Key: PTDPBF:  [ 0.00  0.00 ]
@@ -1471,20 +1468,11 @@ Key: PTILEMOVROWrre:  [ 0.00  0.00 ]
 Key: PTILEMOVROWrreV:  [ 0.00  0.00 ]
 Key: PTILEMOVROWrri:  [ 0.00  0.00 ]
 Key: PTILEMOVROWrriV:  [ 0.00  0.00 ]
-Key: PTILEPAIRLOAD:  [ 0.00  0.00 ]
-Key: PTILEPAIRSTORE:  [ 0.00  0.00 ]
 Key: PTILESTORED:  [ 0.00  0.00 ]
 Key: PTILESTOREDV:  [ 0.00  0.00 ]
 Key: PTILEZERO:  [ 0.00  0.00 ]
 Key: PTILEZEROV:  [ 0.00  0.00 ]
 Key: PTMMULTF:  [ 0.00  0.00 ]
-Key: PTTCMMIMFP:  [ 0.00  0.00 ]
-Key: PTTCMMRLFP:  [ 0.00  0.00 ]
-Key: PTTDPBF:  [ 0.00  0.00 ]
-Key: PTTDPFP:  [ 0.00  0.00 ]
-Key: PTTMMULTF:  [ 0.00  0.00 ]
-Key: PTTRANSPOSED:  [ 0.00  0.00 ]
-Key: PTTRANSPOSEDV:  [ 0.00  0.00 ]
 Key: PTWRITE:  [ 0.00  0.00 ]
 Key: PTWRITEm:  [ 0.00  0.00 ]
 Key: PTWRITEr:  [ 0.00  0.00 ]
@@ -1717,8 +1705,6 @@ Key: TAILJMPm:  [ 0.00  0.00 ]
 Key: TAILJMPr:  [ 0.00  0.00 ]
 Key: TCMMIMFP:  [ 0.00  0.00 ]
 Key: TCMMRLFP:  [ 0.00  0.00 ]
-Key: TCONJTCMMIMFP:  [ 0.00  0.00 ]
-Key: TCONJTFP:  [ 0.00  0.00 ]
 Key: TCRETURN_HIPE:  [ 0.00  0.00 ]
 Key: TCRETURN_WIN:  [ 0.00  0.00 ]
 Key: TCRETURN_WINmi:  [ 0.00  0.00 ]
@@ -1764,12 +1750,6 @@ Key: TPAUSE:  [ 0.00  0.00 ]
 Key: TRAP:  [ 0.00  0.00 ]
 Key: TST_F:  [ 0.00  0.00 ]
 Key: TST_Fp:  [ 0.00  0.00 ]
-Key: TTCMMIMFP:  [ 0.00  0.00 ]
-Key: TTCMMRLFP:  [ 0.00  0.00 ]
-Key: TTDPBF:  [ 0.00  0.00 ]
-Key: TTDPFP:  [ 0.00  0.00 ]
-Key: TTMMULTF:  [ 0.00  0.00 ]
-Key: TTRANSPOSED:  [ 0.00  0.00 ]
 Key: TZCNT:  [ 0.00  0.00 ]
 Key: TZMSK:  [ 0.00  0.00 ]
 Key: UBSAN_UD:  [ 0.00  0.00 ]
@@ -7034,7 +7014,6 @@ Key: PhyReg_VR256:  [ 0.00  0.00 ]
 Key: PhyReg_VR512:  [ 0.00  0.00 ]
 Key: PhyReg_VR512_0_15:  [ 0.00  0.00 ]
 Key: PhyReg_TILE:  [ 0.00  0.00 ]
-Key: PhyReg_TILEPAIR:  [ 0.00  0.00 ]
 Key: VirtReg_GR8:  [ 0.00  0.00 ]
 Key: VirtReg_GRH8:  [ 0.00  0.00 ]
 Key: VirtReg_GR8_NOREX2:  [ 0.00  0.00 ]
@@ -7170,4 +7149,3 @@ Key: VirtReg_VR256:  [ 0.00  0.00 ]
 Key: VirtReg_VR512:  [ 0.00  0.00 ]
 Key: VirtReg_VR512_0_15:  [ 0.00  0.00 ]
 Key: VirtReg_TILE:  [ 0.00  0.00 ]
-Key: VirtReg_TILEPAIR:  [ 0.00  0.00 ]
diff --git a/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll b/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll
index bd8d882cda39b..9dd402d13b8e0 100644
--- a/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll
+++ b/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll
@@ -26,7 +26,7 @@
 ; Also, the first eviction problem is significantly less than 300 instructions. Check
 ; that there is a zero value.
 ; Note: we're regex-ing some of the opcodes to avoid test flakyness.
-; CHECK: instructions: 20,{{([0-9]{4})}},1{{([0-9]{3})}},2{{([0-9]{3})}},{{.*}},0,
+; CHECK: instructions: 20,{{([0-9]{4})}},{{([0-9]{4})}},{{([0-9]{4})}},{{.*}},0,
 ; Only the candidate virtreg and the 10th LR are included in this problem. Make
 ; sure the other LRs have values of zero. There are 2700 0s followed by some 1s.
 ; There's a limit to how many repetitions can be matched.
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll
index b5c43fd259a75..d653895efa340 100644
--- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll
@@ -1,8 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s
 ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %}
 ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
 
 target triple = "nvptx64-nvidia-cuda"
 
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll
index 57342dc9a49c5..5de1ac887b76c 100644
--- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll
@@ -1,8 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s
 ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %}
 ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
 
 target triple = "nvptx64-nvidia-cuda"
 
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll
index 6296d5af8ab18..2f5c1ef4670da 100644
--- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll
@@ -1,8 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s
 ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %}
 ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
 
 target triple = "nvptx64-nvidia-cuda"
 
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll
index e5ae3875a0ede..a2b2c2f27fa5e 100644
--- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll
@@ -1,8 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s
 ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %}
 ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
 
 target triple = "nvptx64-nvidia-cuda"
 
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll
index 7d04adaa774c3..e4c48ddddea18 100644
--- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll
@@ -1,8 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s
 ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %}
 ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
 
 target triple = "nvptx64-nvidia-cuda"
 
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll
index b0fe77c1a83be..727bb3b3aa8fd 100644
--- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll
@@ -1,8 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK-PTX64 %s
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s
 ; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %}
 ; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| %ptxas-verify -arch=sm_90 %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
 
 target triple = "nvptx64-nvidia-cuda"
 
@@ -29,10 +33,10 @@ define void @cp_async_bulk_tensor_g2s_tile_1d(ptr addrspace(7) %d, ptr addrspace
 ; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_1d_param_1];
 ; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_tile_1d_param_2];
 ; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_1d_param_3];
-; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1}], [%rd2];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4];
 ; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_1d_param_5];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1}], [%rd2];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1}], [%rd2], %rd4;
-; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1}], [%rd2], %rs1;
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1}], [%rd2], %rs1, %rd4;
 ; CHECK-PTX64-NEXT:    ret;
@@ -48,10 +52,10 @@ define void @cp_async_bulk_tensor_g2s_tile_1d(ptr addrspace(7) %d, ptr addrspace
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_1d_param_1];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_1d_param_2];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_1d_param_3];
-; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3}], [%r2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_1d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3}], [%r2];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3}], [%r2], %rd2;
-; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3}], [%r2], %rs1;
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3}], [%r2], %rs1, %rd2;
 ; CHECK-PTX-SHARED32-NEXT:    ret;
@@ -79,10 +83,10 @@ define void @cp_async_bulk_tensor_g2s_tile_2d(ptr addrspace(7) %d, ptr addrspace
 ; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_tile_2d_param_2];
 ; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_2d_param_3];
 ; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_2d_param_4];
-; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2}], [%rd2];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5];
 ; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_2d_param_6];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2}], [%rd2];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rd4;
-; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1;
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1, %rd4;
 ; CHECK-PTX64-NEXT:    ret;
@@ -99,10 +103,10 @@ define void @cp_async_bulk_tensor_g2s_tile_2d(ptr addrspace(7) %d, ptr addrspace
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_2d_param_2];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_2d_param_3];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_2d_param_4];
-; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4}], [%r2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_2d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4}], [%r2];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4}], [%r2], %rd2;
-; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1;
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1, %rd2;
 ; CHECK-PTX-SHARED32-NEXT:    ret;
@@ -131,10 +135,10 @@ define void @cp_async_bulk_tensor_g2s_tile_3d(ptr addrspace(7) %d, ptr addrspace
 ; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_3d_param_3];
 ; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_3d_param_4];
 ; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_3d_param_5];
-; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6];
 ; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_3d_param_7];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rd4;
-; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1;
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1, %rd4;
 ; CHECK-PTX64-NEXT:    ret;
@@ -152,10 +156,10 @@ define void @cp_async_bulk_tensor_g2s_tile_3d(ptr addrspace(7) %d, ptr addrspace
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_3d_param_3];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_3d_param_4];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_3d_param_5];
-; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_3d_param_7];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rd2;
-; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1;
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1, %rd2;
 ; CHECK-PTX-SHARED32-NEXT:    ret;
@@ -185,10 +189,10 @@ define void @cp_async_bulk_tensor_g2s_tile_4d(ptr addrspace(7) %d, ptr addrspace
 ; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_4d_param_4];
 ; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_4d_param_5];
 ; CHECK-PTX64-NEXT:    ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_4d_param_6];
-; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7];
 ; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_4d_param_8];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rd4;
-; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1;
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1, %rd4;
 ; CHECK-PTX64-NEXT:    ret;
@@ -207,10 +211,10 @@ define void @cp_async_bulk_tensor_g2s_tile_4d(ptr addrspace(7) %d, ptr addrspace
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_4d_param_4];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_4d_param_5];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_tile_4d_param_6];
-; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_4d_param_8];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rd2;
-; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1;
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1, %rd2;
 ; CHECK-PTX-SHARED32-NEXT:    ret;
@@ -241,10 +245,10 @@ define void @cp_async_bulk_tensor_g2s_tile_5d(ptr addrspace(7) %d, ptr addrspace
 ; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_5d_param_5];
 ; CHECK-PTX64-NEXT:    ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_5d_param_6];
 ; CHECK-PTX64-NEXT:    ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_5d_param_7];
-; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8];
 ; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_5d_param_9];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4;
-; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1;
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1, %rd4;
 ; CHECK-PTX64-NEXT:    ret;
@@ -264,10 +268,10 @@ define void @cp_async_bulk_tensor_g2s_tile_5d(ptr addrspace(7) %d, ptr addrspace
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_5d_param_5];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_tile_5d_param_6];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_tile_5d_param_7];
-; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_5d_param_9];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2;
-; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1;
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1, %rd2;
 ; CHECK-PTX-SHARED32-NEXT:    ret;
@@ -297,10 +301,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_3d(ptr addrspace(7) %d, ptr addrspa
 ; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2col_3d_param_4];
 ; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2col_3d_param_5];
 ; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_3d_param_6];
-; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1};
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7];
 ; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2col_3d_param_8];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1};
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rd4;
-; CHECK-PTX64-NEXT:    ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2;
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2, %rd4;
 ; CHECK-PTX64-NEXT:    ret;
@@ -319,10 +323,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_3d(ptr addrspace(7) %d, ptr addrspa
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2col_3d_param_4];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2col_3d_param_5];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_3d_param_6];
-; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1};
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_3d_param_8];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1};
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rd2;
-; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2;
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2, %rd2;
 ; CHECK-PTX-SHARED32-NEXT:    ret;
@@ -354,10 +358,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_4d(ptr addrspace(7) %d, ptr addrspa
 ; CHECK-PTX64-NEXT:    ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2col_4d_param_6];
 ; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_4d_param_7];
 ; CHECK-PTX64-NEXT:    ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_4d_param_8];
-; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2};
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9];
 ; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2col_4d_param_10];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2};
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4;
-; CHECK-PTX64-NEXT:    ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3;
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4;
 ; CHECK-PTX64-NEXT:    ret;
@@ -378,10 +382,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_4d(ptr addrspace(7) %d, ptr addrspa
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2col_4d_param_6];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_4d_param_7];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_4d_param_8];
-; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2};
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_4d_param_10];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2};
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2;
-; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3;
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2;
 ; CHECK-PTX-SHARED32-NEXT:    ret;
@@ -415,10 +419,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_5d(ptr addrspace(7) %d, ptr addrspa
 ; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_5d_param_8];
 ; CHECK-PTX64-NEXT:    ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_5d_param_9];
 ; CHECK-PTX64-NEXT:    ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_5d_param_10];
-; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3};
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11];
 ; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2col_5d_param_12];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3};
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rd4;
-; CHECK-PTX64-NEXT:    ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4;
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4, %rd4;
 ; CHECK-PTX64-NEXT:    ret;
@@ -441,10 +445,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_5d(ptr addrspace(7) %d, ptr addrspa
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_5d_param_8];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_5d_param_9];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_5d_param_10];
-; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3};
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_5d_param_12];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3};
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rd2;
-; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4;
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4, %rd2;
 ; CHECK-PTX-SHARED32-NEXT:    ret;
diff --git a/llvm/test/CodeGen/NVPTX/f16-ex2.ll b/llvm/test/CodeGen/NVPTX/f16-ex2.ll
index ee79f9d6d056f..af3fe67269205 100644
--- a/llvm/test/CodeGen/NVPTX/f16-ex2.ll
+++ b/llvm/test/CodeGen/NVPTX/f16-ex2.ll
@@ -1,12 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mcpu=sm_75 -mattr=+ptx70 | FileCheck --check-prefixes=CHECK-FP16 %s
-; RUN: %if ptxas-sm_75 && ptxas-isa-7.0 %{ llc < %s -mcpu=sm_75 -mattr=+ptx70 | %ptxas-verify -arch=sm_75 %}
+; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK-FP16 %s
+; RUN: %if ptxas-sm_90 && ptxas-isa-7.8 %{ llc < %s -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}
 target triple = "nvptx64-nvidia-cuda"
 
 declare half @llvm.nvvm.ex2.approx.f16(half)
-declare <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half>)
+declare <2 x half> @llvm.nvvm.ex2.approx.v2f16(<2 x half>)
+declare bfloat @llvm.nvvm.ex2.approx.ftz.bf16(bfloat)
+declare <2 x bfloat> @llvm.nvvm.ex2.approx.ftz.v2bf16(<2 x bfloat>)
 
-; CHECK-LABEL: ex2_half
 define half @ex2_half(half %0) {
 ; CHECK-FP16-LABEL: ex2_half(
 ; CHECK-FP16:       {
@@ -21,7 +22,6 @@ define half @ex2_half(half %0) {
   ret half %res
 }
 
-; CHECK-LABEL: ex2_2xhalf
 define <2 x half> @ex2_2xhalf(<2 x half> %0) {
 ; CHECK-FP16-LABEL: ex2_2xhalf(
 ; CHECK-FP16:       {
@@ -32,6 +32,34 @@ define <2 x half> @ex2_2xhalf(<2 x half> %0) {
 ; CHECK-FP16-NEXT:    ex2.approx.f16x2 %r2, %r1;
 ; CHECK-FP16-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-FP16-NEXT:    ret;
-  %res = call <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half> %0)
+  %res = call <2 x half> @llvm.nvvm.ex2.approx.v2f16(<2 x half> %0)
   ret <2 x half> %res
 }
+
+define bfloat @ex2_bfloat(bfloat %0) {
+; CHECK-FP16-LABEL: ex2_bfloat(
+; CHECK-FP16:       {
+; CHECK-FP16-NEXT:    .reg .b16 %rs<3>;
+; CHECK-FP16-EMPTY:
+; CHECK-FP16-NEXT:  // %bb.0:
+; CHECK-FP16-NEXT:    ld.param.b16 %rs1, [ex2_bfloat_param_0];
+; CHECK-FP16-NEXT:    ex2.approx.ftz.bf16 %rs2, %rs1;
+; CHECK-FP16-NEXT:    st.param.b16 [func_retval0], %rs2;
+; CHECK-FP16-NEXT:    ret;
+  %res = call bfloat @llvm.nvvm.ex2.approx.ftz.bf16(bfloat %0)
+  ret bfloat %res
+}
+
+define <2 x bfloat> @ex2_2xbfloat(<2 x bfloat> %0) {
+; CHECK-FP16-LABEL: ex2_2xbfloat(
+; CHECK-FP16:       {
+; CHECK-FP16-NEXT:    .reg .b32 %r<3>;
+; CHECK-FP16-EMPTY:
+; CHECK-FP16-NEXT:  // %bb.0:
+; CHECK-FP16-NEXT:    ld.param.b32 %r1, [ex2_2xbfloat_param_0];
+; CHECK-FP16-NEXT:    ex2.approx.ftz.bf16x2 %r2, %r1;
+; CHECK-FP16-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-FP16-NEXT:    ret;
+  %res = call <2 x bfloat> @llvm.nvvm.ex2.approx.ftz.v2bf16(<2 x bfloat> %0)
+  ret <2 x bfloat> %res
+}
diff --git a/llvm/test/CodeGen/NVPTX/f32-ex2.ll b/llvm/test/CodeGen/NVPTX/f32-ex2.ll
index 796d80d3c2c39..97b9d35be371e 100644
--- a/llvm/test/CodeGen/NVPTX/f32-ex2.ll
+++ b/llvm/test/CodeGen/NVPTX/f32-ex2.ll
@@ -3,7 +3,8 @@
 ; RUN: %if ptxas-sm_50 && ptxas-isa-3.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_50 -mattr=+ptx32 | %ptxas-verify -arch=sm_50 %}
 target triple = "nvptx-nvidia-cuda"
 
-declare float @llvm.nvvm.ex2.approx.f(float)
+declare float @llvm.nvvm.ex2.approx.f32(float)
+declare float @llvm.nvvm.ex2.approx.ftz.f32(float)
 
 ; CHECK-LABEL: ex2_float
 define float @ex2_float(float %0) {
@@ -16,7 +17,7 @@ define float @ex2_float(float %0) {
 ; CHECK-NEXT:    ex2.approx.f32 %r2, %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
-  %res = call float @llvm.nvvm.ex2.approx.f(float %0)
+  %res = call float @llvm.nvvm.ex2.approx.f32(float %0)
   ret float %res
 }
 
@@ -31,6 +32,6 @@ define float @ex2_float_ftz(float %0) {
 ; CHECK-NEXT:    ex2.approx.ftz.f32 %r2, %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
-  %res = call float @llvm.nvvm.ex2.approx.ftz.f(float %0)
+  %res = call float @llvm.nvvm.ex2.approx.ftz.f32(float %0)
   ret float %res
 }
diff --git a/llvm/test/CodeGen/NVPTX/insertelt-dynamic.ll b/llvm/test/CodeGen/NVPTX/insertelt-dynamic.ll
new file mode 100644
index 0000000000000..f2ccf3ed65c02
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/insertelt-dynamic.ll
@@ -0,0 +1,738 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mcpu=sm_20 | %ptxas-verify %}
+target triple = "nvptx64-nvidia-cuda"
+
+; Test dynamic insertelt at the beginning of a chain
+define <4 x i32> @dynamic_at_beginning(i32 %idx) {
+; CHECK-LABEL: dynamic_at_beginning(
+; CHECK:       {
+; CHECK-NEXT:    .local .align 4 .b8 __local_depot0[16];
+; CHECK-NEXT:    .reg .b64 %SP;
+; CHECK-NEXT:    .reg .b64 %SPL;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .b64 %rd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.b64 %SPL, __local_depot0;
+; CHECK-NEXT:    cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT:    ld.param.b32 %rd1, [dynamic_at_beginning_param_0];
+; CHECK-NEXT:    and.b64 %rd2, %rd1, 3;
+; CHECK-NEXT:    shl.b64 %rd3, %rd2, 2;
+; CHECK-NEXT:    add.u64 %rd4, %SP, 0;
+; CHECK-NEXT:    add.s64 %rd5, %rd4, %rd3;
+; CHECK-NEXT:    st.b32 [%rd5], 10;
+; CHECK-NEXT:    ld.b32 %r1, [%SP+12];
+; CHECK-NEXT:    ld.b32 %r2, [%SP];
+; CHECK-NEXT:    st.param.v4.b32 [func_retval0], {%r2, 20, 30, %r1};
+; CHECK-NEXT:    ret;
+  %v0 = insertelement <4 x i32> poison, i32 10, i32 %idx
+  %v1 = insertelement <4 x i32> %v0, i32 20, i32 1
+  %v2 = insertelement <4 x i32> %v1, i32 30, i32 2
+  ret <4 x i32> %v2
+}
+
+; Test dynamic insertelt at the end of a chain
+define <4 x i32> @dynamic_at_end(i32 %idx) {
+; CHECK-LABEL: dynamic_at_end(
+; CHECK:       {
+; CHECK-NEXT:    .local .align 4 .b8 __local_depot1[16];
+; CHECK-NEXT:    .reg .b64 %SP;
+; CHECK-NEXT:    .reg .b64 %SPL;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.b64 %SPL, __local_depot1;
+; CHECK-NEXT:    cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT:    ld.param.b32 %rd1, [dynamic_at_end_param_0];
+; CHECK-NEXT:    and.b64 %rd2, %rd1, 3;
+; CHECK-NEXT:    shl.b64 %rd3, %rd2, 2;
+; CHECK-NEXT:    add.u64 %rd4, %SP, 0;
+; CHECK-NEXT:    add.s64 %rd5, %rd4, %rd3;
+; CHECK-NEXT:    st.b32 [%SP+4], 20;
+; CHECK-NEXT:    st.b32 [%SP], 10;
+; CHECK-NEXT:    st.b32 [%rd5], 30;
+; CHECK-NEXT:    ld.b32 %r1, [%SP+12];
+; CHECK-NEXT:    ld.b32 %r2, [%SP+8];
+; CHECK-NEXT:    ld.b32 %r3, [%SP+4];
+; CHECK-NEXT:    ld.b32 %r4, [%SP];
+; CHECK-NEXT:    st.param.v4.b32 [func_retval0], {%r4, %r3, %r2, %r1};
+; CHECK-NEXT:    ret;
+  %v0 = insertelement <4 x i32> poison, i32 10, i32 0
+  %v1 = insertelement <4 x i32> %v0, i32 20, i32 1
+  %v2 = insertelement <4 x i32> %v1, i32 30, i32 %idx
+  ret <4 x i32> %v2
+}
+
+; Test dynamic insertelt in the middle of a chain
+define <4 x i32> @dynamic_in_middle(i32 %idx) {
+; CHECK-LABEL: dynamic_in_middle(
+; CHECK:       {
+; CHECK-NEXT:    .local .align 4 .b8 __local_depot2[16];
+; CHECK-NEXT:    .reg .b64 %SP;
+; CHECK-NEXT:    .reg .b64 %SPL;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-NEXT:    .reg .b64 %rd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.b64 %SPL, __local_depot2;
+; CHECK-NEXT:    cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT:    ld.param.b32 %rd1, [dynamic_in_middle_param_0];
+; CHECK-NEXT:    and.b64 %rd2, %rd1, 3;
+; CHECK-NEXT:    shl.b64 %rd3, %rd2, 2;
+; CHECK-NEXT:    add.u64 %rd4, %SP, 0;
+; CHECK-NEXT:    add.s64 %rd5, %rd4, %rd3;
+; CHECK-NEXT:    st.b32 [%SP], 10;
+; CHECK-NEXT:    st.b32 [%rd5], 20;
+; CHECK-NEXT:    ld.b32 %r1, [%SP+12];
+; CHECK-NEXT:    ld.b32 %r2, [%SP+4];
+; CHECK-NEXT:    ld.b32 %r3, [%SP];
+; CHECK-NEXT:    st.param.v4.b32 [func_retval0], {%r3, %r2, 30, %r1};
+; CHECK-NEXT:    ret;
+  %v0 = insertelement <4 x i32> poison, i32 10, i32 0
+  %v1 = insertelement <4 x i32> %v0, i32 20, i32 %idx
+  %v2 = insertelement <4 x i32> %v1, i32 30, i32 2
+  ret <4 x i32> %v2
+}
+
+; Test repeated dynamic insertelt with the same index
+define <4 x i32> @repeated_same_index(i32 %idx) {
+; CHECK-LABEL: repeated_same_index(
+; CHECK:       {
+; CHECK-NEXT:    .local .align 4 .b8 __local_depot3[16];
+; CHECK-NEXT:    .reg .b64 %SP;
+; CHECK-NEXT:    .reg .b64 %SPL;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.b64 %SPL, __local_depot3;
+; CHECK-NEXT:    cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT:    ld.param.b32 %rd1, [repeated_same_index_param_0];
+; CHECK-NEXT:    and.b64 %rd2, %rd1, 3;
+; CHECK-NEXT:    shl.b64 %rd3, %rd2, 2;
+; CHECK-NEXT:    add.u64 %rd4, %SP, 0;
+; CHECK-NEXT:    add.s64 %rd5, %rd4, %rd3;
+; CHECK-NEXT:    st.b32 [%rd5], 20;
+; CHECK-NEXT:    ld.b32 %r1, [%SP+12];
+; CHECK-NEXT:    ld.b32 %r2, [%SP+8];
+; CHECK-NEXT:    ld.b32 %r3, [%SP+4];
+; CHECK-NEXT:    ld.b32 %r4, [%SP];
+; CHECK-NEXT:    st.param.v4.b32 [func_retval0], {%r4, %r3, %r2, %r1};
+; CHECK-NEXT:    ret;
+  %v0 = insertelement <4 x i32> poison, i32 10, i32 %idx
+  %v1 = insertelement <4 x i32> %v0, i32 20, i32 %idx
+  ret <4 x i32> %v1
+}
+
+; Test multiple dynamic insertelts
+define <4 x i32> @multiple_dynamic(i32 %idx0, i32 %idx1) {
+; CHECK-LABEL: multiple_dynamic(
+; CHECK:       {
+; CHECK-NEXT:    .local .align 4 .b8 __local_depot4[16];
+; CHECK-NEXT:    .reg .b64 %SP;
+; CHECK-NEXT:    .reg .b64 %SPL;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.b64 %SPL, __local_depot4;
+; CHECK-NEXT:    cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT:    ld.param.b32 %rd1, [multiple_dynamic_param_0];
+; CHECK-NEXT:    and.b64 %rd2, %rd1, 3;
+; CHECK-NEXT:    shl.b64 %rd3, %rd2, 2;
+; CHECK-NEXT:    add.u64 %rd4, %SP, 0;
+; CHECK-NEXT:    add.s64 %rd5, %rd4, %rd3;
+; CHECK-NEXT:    st.b32 [%rd5], 10;
+; CHECK-NEXT:    ld.param.b32 %rd6, [multiple_dynamic_param_1];
+; CHECK-NEXT:    and.b64 %rd7, %rd6, 3;
+; CHECK-NEXT:    shl.b64 %rd8, %rd7, 2;
+; CHECK-NEXT:    add.s64 %rd9, %rd4, %rd8;
+; CHECK-NEXT:    st.b32 [%rd9], 20;
+; CHECK-NEXT:    ld.b32 %r1, [%SP+12];
+; CHECK-NEXT:    ld.b32 %r2, [%SP+8];
+; CHECK-NEXT:    ld.b32 %r3, [%SP+4];
+; CHECK-NEXT:    ld.b32 %r4, [%SP];
+; CHECK-NEXT:    st.param.v4.b32 [func_retval0], {%r4, %r3, %r2, %r1};
+; CHECK-NEXT:    ret;
+  %v0 = insertelement <4 x i32> poison, i32 10, i32 %idx0
+  %v1 = insertelement <4 x i32> %v0, i32 20, i32 %idx1
+  ret <4 x i32> %v1
+}
+
+; Test chain with all dynamic insertelts
+define <4 x i32> @all_dynamic(i32 %idx0, i32 %idx1, i32 %idx2, i32 %idx3) {
+; CHECK-LABEL: all_dynamic(
+; CHECK:       {
+; CHECK-NEXT:    .local .align 4 .b8 __local_depot5[16];
+; CHECK-NEXT:    .reg .b64 %SP;
+; CHECK-NEXT:    .reg .b64 %SPL;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<18>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.b64 %SPL, __local_depot5;
+; CHECK-NEXT:    cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT:    ld.param.b32 %rd1, [all_dynamic_param_0];
+; CHECK-NEXT:    and.b64 %rd2, %rd1, 3;
+; CHECK-NEXT:    shl.b64 %rd3, %rd2, 2;
+; CHECK-NEXT:    add.u64 %rd4, %SP, 0;
+; CHECK-NEXT:    add.s64 %rd5, %rd4, %rd3;
+; CHECK-NEXT:    ld.param.b32 %rd6, [all_dynamic_param_1];
+; CHECK-NEXT:    and.b64 %rd7, %rd6, 3;
+; CHECK-NEXT:    shl.b64 %rd8, %rd7, 2;
+; CHECK-NEXT:    add.s64 %rd9, %rd4, %rd8;
+; CHECK-NEXT:    ld.param.b32 %rd10, [all_dynamic_param_2];
+; CHECK-NEXT:    and.b64 %rd11, %rd10, 3;
+; CHECK-NEXT:    shl.b64 %rd12, %rd11, 2;
+; CHECK-NEXT:    add.s64 %rd13, %rd4, %rd12;
+; CHECK-NEXT:    st.b32 [%rd5], 10;
+; CHECK-NEXT:    st.b32 [%rd9], 20;
+; CHECK-NEXT:    st.b32 [%rd13], 30;
+; CHECK-NEXT:    ld.param.b32 %rd14, [all_dynamic_param_3];
+; CHECK-NEXT:    and.b64 %rd15, %rd14, 3;
+; CHECK-NEXT:    shl.b64 %rd16, %rd15, 2;
+; CHECK-NEXT:    add.s64 %rd17, %rd4, %rd16;
+; CHECK-NEXT:    st.b32 [%rd17], 40;
+; CHECK-NEXT:    ld.b32 %r1, [%SP+12];
+; CHECK-NEXT:    ld.b32 %r2, [%SP+8];
+; CHECK-NEXT:    ld.b32 %r3, [%SP+4];
+; CHECK-NEXT:    ld.b32 %r4, [%SP];
+; CHECK-NEXT:    st.param.v4.b32 [func_retval0], {%r4, %r3, %r2, %r1};
+; CHECK-NEXT:    ret;
+  %v0 = insertelement <4 x i32> poison, i32 10, i32 %idx0
+  %v1 = insertelement <4 x i32> %v0, i32 20, i32 %idx1
+  %v2 = insertelement <4 x i32> %v1, i32 30, i32 %idx2
+  %v3 = insertelement <4 x i32> %v2, i32 40, i32 %idx3
+  ret <4 x i32> %v3
+}
+
+; Test mixed constant and dynamic insertelts with high ratio of dynamic ones.
+; Should lower all insertelts to stores.
+define <4 x i32> @mix_dynamic_constant(i32 %idx0, i32 %idx1) {
+; CHECK-LABEL: mix_dynamic_constant(
+; CHECK:       {
+; CHECK-NEXT:    .local .align 4 .b8 __local_depot6[16];
+; CHECK-NEXT:    .reg .b64 %SP;
+; CHECK-NEXT:    .reg .b64 %SPL;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.b64 %SPL, __local_depot6;
+; CHECK-NEXT:    cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT:    ld.param.b32 %rd1, [mix_dynamic_constant_param_0];
+; CHECK-NEXT:    and.b64 %rd2, %rd1, 3;
+; CHECK-NEXT:    shl.b64 %rd3, %rd2, 2;
+; CHECK-NEXT:    add.u64 %rd4, %SP, 0;
+; CHECK-NEXT:    add.s64 %rd5, %rd4, %rd3;
+; CHECK-NEXT:    st.b32 [%rd5], 10;
+; CHECK-NEXT:    ld.param.b32 %rd6, [mix_dynamic_constant_param_1];
+; CHECK-NEXT:    and.b64 %rd7, %rd6, 3;
+; CHECK-NEXT:    shl.b64 %rd8, %rd7, 2;
+; CHECK-NEXT:    add.s64 %rd9, %rd4, %rd8;
+; CHECK-NEXT:    st.b32 [%SP+4], 20;
+; CHECK-NEXT:    st.b32 [%rd9], 30;
+; CHECK-NEXT:    ld.b32 %r1, [%SP+12];
+; CHECK-NEXT:    ld.b32 %r2, [%SP+8];
+; CHECK-NEXT:    ld.b32 %r3, [%SP+4];
+; CHECK-NEXT:    ld.b32 %r4, [%SP];
+; CHECK-NEXT:    st.param.v4.b32 [func_retval0], {%r4, %r3, %r2, %r1};
+; CHECK-NEXT:    ret;
+  %v0 = insertelement <4 x i32> poison, i32 10, i32 %idx0
+  %v1 = insertelement <4 x i32> %v0, i32 20, i32 1
+  %v2 = insertelement <4 x i32> %v1, i32 30, i32 %idx1
+  ret <4 x i32> %v2
+}
+
+; Test two separate chains that don't interfere
+define void @two_separate_chains(i32 %idx0, i32 %idx1, ptr %out0, ptr %out1) {
+; CHECK-LABEL: two_separate_chains(
+; CHECK:       {
+; CHECK-NEXT:    .local .align 4 .b8 __local_depot7[32];
+; CHECK-NEXT:    .reg .b64 %SP;
+; CHECK-NEXT:    .reg .b64 %SPL;
+; CHECK-NEXT:    .reg .b32 %r<7>;
+; CHECK-NEXT:    .reg .b64 %rd<13>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.b64 %SPL, __local_depot7;
+; CHECK-NEXT:    cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT:    ld.param.b32 %rd1, [two_separate_chains_param_0];
+; CHECK-NEXT:    and.b64 %rd2, %rd1, 3;
+; CHECK-NEXT:    shl.b64 %rd3, %rd2, 2;
+; CHECK-NEXT:    add.u64 %rd4, %SP, 16;
+; CHECK-NEXT:    add.s64 %rd5, %rd4, %rd3;
+; CHECK-NEXT:    st.b32 [%rd5], 10;
+; CHECK-NEXT:    ld.param.b32 %rd6, [two_separate_chains_param_1];
+; CHECK-NEXT:    and.b64 %rd7, %rd6, 3;
+; CHECK-NEXT:    shl.b64 %rd8, %rd7, 2;
+; CHECK-NEXT:    add.u64 %rd9, %SP, 0;
+; CHECK-NEXT:    add.s64 %rd10, %rd9, %rd8;
+; CHECK-NEXT:    ld.b32 %r1, [%SP+28];
+; CHECK-NEXT:    ld.b32 %r2, [%SP+24];
+; CHECK-NEXT:    ld.b32 %r3, [%SP+16];
+; CHECK-NEXT:    ld.param.b64 %rd11, [two_separate_chains_param_2];
+; CHECK-NEXT:    st.b32 [%rd10], 30;
+; CHECK-NEXT:    ld.param.b64 %rd12, [two_separate_chains_param_3];
+; CHECK-NEXT:    ld.b32 %r4, [%SP+12];
+; CHECK-NEXT:    ld.b32 %r5, [%SP+4];
+; CHECK-NEXT:    ld.b32 %r6, [%SP];
+; CHECK-NEXT:    st.v4.b32 [%rd11], {%r3, 20, %r2, %r1};
+; CHECK-NEXT:    st.v4.b32 [%rd12], {%r6, %r5, 40, %r4};
+; CHECK-NEXT:    ret;
+  ; Chain 1
+  %v0 = insertelement <4 x i32> poison, i32 10, i32 %idx0
+  %v1 = insertelement <4 x i32> %v0, i32 20, i32 1
+
+  ; Chain 2
+  %w0 = insertelement <4 x i32> poison, i32 30, i32 %idx1
+  %w1 = insertelement <4 x i32> %w0, i32 40, i32 2
+
+  store <4 x i32> %v1, ptr %out0
+  store <4 x i32> %w1, ptr %out1
+  ret void
+}
+
+; Test overlapping chains (chain 2 starts from middle of chain 1)
+define void @overlapping_chains(i32 %idx0, i32 %idx1, ptr %out0, ptr %out1) {
+; CHECK-LABEL: overlapping_chains(
+; CHECK:       {
+; CHECK-NEXT:    .local .align 4 .b8 __local_depot8[32];
+; CHECK-NEXT:    .reg .b64 %SP;
+; CHECK-NEXT:    .reg .b64 %SPL;
+; CHECK-NEXT:    .reg .b32 %r<7>;
+; CHECK-NEXT:    .reg .b64 %rd<14>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.b64 %SPL, __local_depot8;
+; CHECK-NEXT:    cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT:    ld.param.b32 %rd1, [overlapping_chains_param_0];
+; CHECK-NEXT:    and.b64 %rd2, %rd1, 3;
+; CHECK-NEXT:    shl.b64 %rd3, %rd2, 2;
+; CHECK-NEXT:    add.u64 %rd4, %SP, 16;
+; CHECK-NEXT:    add.s64 %rd5, %rd4, %rd3;
+; CHECK-NEXT:    st.b32 [%rd5], 10;
+; CHECK-NEXT:    add.u64 %rd6, %SP, 0;
+; CHECK-NEXT:    add.s64 %rd7, %rd6, %rd3;
+; CHECK-NEXT:    ld.b32 %r1, [%SP+28];
+; CHECK-NEXT:    ld.b32 %r2, [%SP+16];
+; CHECK-NEXT:    ld.param.b64 %rd8, [overlapping_chains_param_2];
+; CHECK-NEXT:    st.b32 [%rd7], 10;
+; CHECK-NEXT:    ld.param.b32 %rd9, [overlapping_chains_param_1];
+; CHECK-NEXT:    and.b64 %rd10, %rd9, 3;
+; CHECK-NEXT:    shl.b64 %rd11, %rd10, 2;
+; CHECK-NEXT:    add.s64 %rd12, %rd6, %rd11;
+; CHECK-NEXT:    st.b32 [%SP+4], 20;
+; CHECK-NEXT:    st.b32 [%rd12], 30;
+; CHECK-NEXT:    ld.param.b64 %rd13, [overlapping_chains_param_3];
+; CHECK-NEXT:    ld.b32 %r3, [%SP+12];
+; CHECK-NEXT:    ld.b32 %r4, [%SP+8];
+; CHECK-NEXT:    ld.b32 %r5, [%SP+4];
+; CHECK-NEXT:    ld.b32 %r6, [%SP];
+; CHECK-NEXT:    st.v4.b32 [%rd8], {%r2, 20, 40, %r1};
+; CHECK-NEXT:    st.v4.b32 [%rd13], {%r6, %r5, %r4, %r3};
+; CHECK-NEXT:    ret;
+  %v0 = insertelement <4 x i32> poison, i32 10, i32 %idx0
+  %v1 = insertelement <4 x i32> %v0, i32 20, i32 1
+
+  ; Chain 2 starts from v1
+  %w0 = insertelement <4 x i32> %v1, i32 30, i32 %idx1
+
+  ; Continue chain 1
+  %v2 = insertelement <4 x i32> %v1, i32 40, i32 2
+
+  store <4 x i32> %v2, ptr %out0
+  store <4 x i32> %w0, ptr %out1
+  ret void
+}
+
+; Test with i1 elements (1-bit, non-byte-aligned)
+define <8 x i1> @dynamic_i1(i32 %idx) {
+; CHECK-LABEL: dynamic_i1(
+; CHECK:       {
+; CHECK-NEXT:    .local .align 8 .b8 __local_depot9[8];
+; CHECK-NEXT:    .reg .b64 %SP;
+; CHECK-NEXT:    .reg .b64 %SPL;
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.b64 %SPL, __local_depot9;
+; CHECK-NEXT:    cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT:    ld.param.b32 %rd1, [dynamic_i1_param_0];
+; CHECK-NEXT:    and.b64 %rd2, %rd1, 7;
+; CHECK-NEXT:    add.u64 %rd3, %SP, 0;
+; CHECK-NEXT:    or.b64 %rd4, %rd3, %rd2;
+; CHECK-NEXT:    st.v2.b32 [%SP], {%r1, %r2};
+; CHECK-NEXT:    st.b8 [%rd4], 1;
+; CHECK-NEXT:    ld.b32 %r3, [%SP];
+; CHECK-NEXT:    prmt.b32 %r4, %r3, 0, 0x7773U;
+; CHECK-NEXT:    ld.b32 %r5, [%SP+4];
+; CHECK-NEXT:    prmt.b32 %r6, %r5, 0, 0x7771U;
+; CHECK-NEXT:    prmt.b32 %r7, %r5, 0, 0x7772U;
+; CHECK-NEXT:    prmt.b32 %r8, %r5, 0, 0x7773U;
+; CHECK-NEXT:    st.param.b8 [func_retval0+4], %r5;
+; CHECK-NEXT:    st.param.b8 [func_retval0], %r3;
+; CHECK-NEXT:    st.param.b8 [func_retval0+7], %r8;
+; CHECK-NEXT:    st.param.b8 [func_retval0+6], %r7;
+; CHECK-NEXT:    st.param.b8 [func_retval0+5], %r6;
+; CHECK-NEXT:    st.param.b8 [func_retval0+3], %r4;
+; CHECK-NEXT:    st.param.b8 [func_retval0+2], 1;
+; CHECK-NEXT:    st.param.b8 [func_retval0+1], 0;
+; CHECK-NEXT:    ret;
+  %v0 = insertelement <8 x i1> poison, i1 1, i32 %idx
+  %v1 = insertelement <8 x i1> %v0, i1 0, i32 1
+  %v2 = insertelement <8 x i1> %v1, i1 1, i32 2
+  ret <8 x i1> %v2
+}
+
+; Test with i2 elements (2-bit, non-byte-aligned)
+define <8 x i2> @dynamic_i2(i32 %idx) {
+; CHECK-LABEL: dynamic_i2(
+; CHECK:       {
+; CHECK-NEXT:    .local .align 8 .b8 __local_depot10[16];
+; CHECK-NEXT:    .reg .b64 %SP;
+; CHECK-NEXT:    .reg .b64 %SPL;
+; CHECK-NEXT:    .reg .b16 %rs<24>;
+; CHECK-NEXT:    .reg .b32 %r<10>;
+; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.b64 %SPL, __local_depot10;
+; CHECK-NEXT:    cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT:    ld.param.b32 %rd1, [dynamic_i2_param_0];
+; CHECK-NEXT:    and.b64 %rd2, %rd1, 7;
+; CHECK-NEXT:    add.u64 %rd3, %SP, 0;
+; CHECK-NEXT:    or.b64 %rd4, %rd3, %rd2;
+; CHECK-NEXT:    st.v2.b32 [%SP], {%r1, %r2};
+; CHECK-NEXT:    st.b8 [%rd4], 1;
+; CHECK-NEXT:    ld.b32 %r3, [%SP+4];
+; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT:    and.b16 %rs2, %rs1, 3;
+; CHECK-NEXT:    prmt.b32 %r4, %r3, 0, 0x7771U;
+; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
+; CHECK-NEXT:    and.b16 %rs4, %rs3, 3;
+; CHECK-NEXT:    shl.b16 %rs5, %rs4, 2;
+; CHECK-NEXT:    or.b16 %rs6, %rs2, %rs5;
+; CHECK-NEXT:    prmt.b32 %r5, %r3, 0, 0x7772U;
+; CHECK-NEXT:    cvt.u16.u32 %rs7, %r5;
+; CHECK-NEXT:    and.b16 %rs8, %rs7, 3;
+; CHECK-NEXT:    shl.b16 %rs9, %rs8, 4;
+; CHECK-NEXT:    or.b16 %rs10, %rs6, %rs9;
+; CHECK-NEXT:    prmt.b32 %r6, %r3, 0, 0x7773U;
+; CHECK-NEXT:    cvt.u16.u32 %rs11, %r6;
+; CHECK-NEXT:    shl.b16 %rs12, %rs11, 6;
+; CHECK-NEXT:    or.b16 %rs13, %rs10, %rs12;
+; CHECK-NEXT:    st.b8 [%SP+8], %rs13;
+; CHECK-NEXT:    ld.b32 %r7, [%SP];
+; CHECK-NEXT:    prmt.b32 %r8, %r7, 0, 0x7773U;
+; CHECK-NEXT:    cvt.u16.u32 %rs14, %r8;
+; CHECK-NEXT:    shl.b16 %rs15, %rs14, 6;
+; CHECK-NEXT:    and.b16 %rs16, %rs15, 192;
+; CHECK-NEXT:    ld.s8 %rs17, [%SP+8];
+; CHECK-NEXT:    shl.b16 %rs18, %rs17, 8;
+; CHECK-NEXT:    or.b16 %rs19, %rs16, %rs18;
+; CHECK-NEXT:    prmt.b32 %r9, %r7, 0, 0x7770U;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %r9;
+; CHECK-NEXT:    st.param.b16 [func_retval0+8], %rs17;
+; CHECK-NEXT:    shr.s16 %rs20, %rs18, 14;
+; CHECK-NEXT:    st.param.b16 [func_retval0+14], %rs20;
+; CHECK-NEXT:    shr.s16 %rs21, %rs18, 12;
+; CHECK-NEXT:    st.param.b16 [func_retval0+12], %rs21;
+; CHECK-NEXT:    shr.s16 %rs22, %rs18, 10;
+; CHECK-NEXT:    st.param.b16 [func_retval0+10], %rs22;
+; CHECK-NEXT:    shr.s16 %rs23, %rs19, 6;
+; CHECK-NEXT:    st.param.b16 [func_retval0+6], %rs23;
+; CHECK-NEXT:    st.param.b16 [func_retval0+4], 3;
+; CHECK-NEXT:    st.param.b16 [func_retval0+2], 2;
+; CHECK-NEXT:    ret;
+  %v0 = insertelement <8 x i2> poison, i2 1, i32 %idx
+  %v1 = insertelement <8 x i2> %v0, i2 2, i32 1
+  %v2 = insertelement <8 x i2> %v1, i2 3, i32 2
+  ret <8 x i2> %v2
+}
+
+; Test with i3 elements (3-bit, non-byte-aligned)
+define <8 x i3> @dynamic_i3(i32 %idx) {
+; CHECK-LABEL: dynamic_i3(
+; CHECK:       {
+; CHECK-NEXT:    .local .align 8 .b8 __local_depot11[8];
+; CHECK-NEXT:    .reg .b64 %SP;
+; CHECK-NEXT:    .reg .b64 %SPL;
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<15>;
+; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.b64 %SPL, __local_depot11;
+; CHECK-NEXT:    cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT:    ld.param.b32 %rd1, [dynamic_i3_param_0];
+; CHECK-NEXT:    and.b64 %rd2, %rd1, 7;
+; CHECK-NEXT:    add.u64 %rd3, %SP, 0;
+; CHECK-NEXT:    or.b64 %rd4, %rd3, %rd2;
+; CHECK-NEXT:    st.v2.b32 [%SP], {%r1, %r2};
+; CHECK-NEXT:    st.b8 [%rd4], 1;
+; CHECK-NEXT:    ld.b32 %r3, [%SP];
+; CHECK-NEXT:    ld.b32 %r4, [%SP+4];
+; CHECK-NEXT:    prmt.b32 %r5, %r4, 0, 0x7773U;
+; CHECK-NEXT:    prmt.b32 %r6, %r4, 0, 0x7772U;
+; CHECK-NEXT:    prmt.b32 %r7, %r6, %r5, 0x5410U;
+; CHECK-NEXT:    st.param.b32 [func_retval0+12], %r7;
+; CHECK-NEXT:    prmt.b32 %r8, %r4, 0, 0x7771U;
+; CHECK-NEXT:    prmt.b32 %r9, %r4, 0, 0x7770U;
+; CHECK-NEXT:    prmt.b32 %r10, %r9, %r8, 0x5410U;
+; CHECK-NEXT:    st.param.b32 [func_retval0+8], %r10;
+; CHECK-NEXT:    prmt.b32 %r11, %r3, 0, 0x7773U;
+; CHECK-NEXT:    cvt.u16.u32 %rs1, %r11;
+; CHECK-NEXT:    mov.b16 %rs2, 3;
+; CHECK-NEXT:    mov.b32 %r12, {%rs2, %rs1};
+; CHECK-NEXT:    st.param.b32 [func_retval0+4], %r12;
+; CHECK-NEXT:    prmt.b32 %r13, %r3, 0, 0x7770U;
+; CHECK-NEXT:    cvt.u16.u32 %rs3, %r13;
+; CHECK-NEXT:    mov.b16 %rs4, 2;
+; CHECK-NEXT:    mov.b32 %r14, {%rs3, %rs4};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r14;
+; CHECK-NEXT:    ret;
+  %v0 = insertelement <8 x i3> poison, i3 1, i32 %idx
+  %v1 = insertelement <8 x i3> %v0, i3 2, i32 1
+  %v2 = insertelement <8 x i3> %v1, i3 3, i32 2
+  ret <8 x i3> %v2
+}
+
+; Test with i4 elements (4-bit, non-byte-aligned)
+define <8 x i4> @dynamic_i4(i32 %idx) {
+; CHECK-LABEL: dynamic_i4(
+; CHECK:       {
+; CHECK-NEXT:    .local .align 8 .b8 __local_depot12[16];
+; CHECK-NEXT:    .reg .b64 %SP;
+; CHECK-NEXT:    .reg .b64 %SPL;
+; CHECK-NEXT:    .reg .b16 %rs<30>;
+; CHECK-NEXT:    .reg .b32 %r<22>;
+; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.b64 %SPL, __local_depot12;
+; CHECK-NEXT:    cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT:    ld.param.b32 %rd1, [dynamic_i4_param_0];
+; CHECK-NEXT:    and.b64 %rd2, %rd1, 7;
+; CHECK-NEXT:    add.u64 %rd3, %SP, 0;
+; CHECK-NEXT:    or.b64 %rd4, %rd3, %rd2;
+; CHECK-NEXT:    st.v2.b32 [%SP], {%r1, %r2};
+; CHECK-NEXT:    st.b8 [%rd4], 1;
+; CHECK-NEXT:    ld.b32 %r3, [%SP];
+; CHECK-NEXT:    prmt.b32 %r4, %r3, 0, 0x7770U;
+; CHECK-NEXT:    cvt.u16.u32 %rs1, %r4;
+; CHECK-NEXT:    and.b16 %rs2, %rs1, 15;
+; CHECK-NEXT:    prmt.b32 %r5, %r3, 0, 0x7771U;
+; CHECK-NEXT:    cvt.u16.u32 %rs3, %r5;
+; CHECK-NEXT:    and.b16 %rs4, %rs3, 15;
+; CHECK-NEXT:    shl.b16 %rs5, %rs4, 4;
+; CHECK-NEXT:    or.b16 %rs6, %rs2, %rs5;
+; CHECK-NEXT:    prmt.b32 %r6, %r3, 0, 0x7772U;
+; CHECK-NEXT:    cvt.u16.u32 %rs7, %r6;
+; CHECK-NEXT:    and.b16 %rs8, %rs7, 15;
+; CHECK-NEXT:    shl.b16 %rs9, %rs8, 8;
+; CHECK-NEXT:    or.b16 %rs10, %rs6, %rs9;
+; CHECK-NEXT:    prmt.b32 %r7, %r3, 0, 0x7773U;
+; CHECK-NEXT:    cvt.u16.u32 %rs11, %r7;
+; CHECK-NEXT:    shl.b16 %rs12, %rs11, 12;
+; CHECK-NEXT:    or.b16 %rs13, %rs10, %rs12;
+; CHECK-NEXT:    cvt.u32.u16 %r8, %rs13;
+; CHECK-NEXT:    ld.b32 %r9, [%SP+4];
+; CHECK-NEXT:    prmt.b32 %r10, %r9, 0, 0x7770U;
+; CHECK-NEXT:    cvt.u16.u32 %rs14, %r10;
+; CHECK-NEXT:    and.b16 %rs15, %rs14, 15;
+; CHECK-NEXT:    prmt.b32 %r11, %r9, 0, 0x7771U;
+; CHECK-NEXT:    cvt.u16.u32 %rs16, %r11;
+; CHECK-NEXT:    and.b16 %rs17, %rs16, 15;
+; CHECK-NEXT:    shl.b16 %rs18, %rs17, 4;
+; CHECK-NEXT:    or.b16 %rs19, %rs15, %rs18;
+; CHECK-NEXT:    prmt.b32 %r12, %r9, 0, 0x7772U;
+; CHECK-NEXT:    cvt.u16.u32 %rs20, %r12;
+; CHECK-NEXT:    and.b16 %rs21, %rs20, 15;
+; CHECK-NEXT:    shl.b16 %rs22, %rs21, 8;
+; CHECK-NEXT:    or.b16 %rs23, %rs19, %rs22;
+; CHECK-NEXT:    prmt.b32 %r13, %r9, 0, 0x7773U;
+; CHECK-NEXT:    cvt.u16.u32 %rs24, %r13;
+; CHECK-NEXT:    shl.b16 %rs25, %rs24, 12;
+; CHECK-NEXT:    or.b16 %rs26, %rs23, %rs25;
+; CHECK-NEXT:    cvt.u32.u16 %r14, %rs26;
+; CHECK-NEXT:    shl.b32 %r15, %r14, 16;
+; CHECK-NEXT:    or.b32 %r16, %r8, %r15;
+; CHECK-NEXT:    mov.b32 %r17, {%rs20, %rs24};
+; CHECK-NEXT:    st.param.b32 [func_retval0+12], %r17;
+; CHECK-NEXT:    mov.b32 %r18, {%rs14, %rs16};
+; CHECK-NEXT:    st.param.b32 [func_retval0+8], %r18;
+; CHECK-NEXT:    mov.b16 %rs27, 2;
+; CHECK-NEXT:    mov.b32 %r19, {%rs1, %rs27};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r19;
+; CHECK-NEXT:    shr.u32 %r20, %r16, 12;
+; CHECK-NEXT:    cvt.u16.u32 %rs28, %r20;
+; CHECK-NEXT:    mov.b16 %rs29, 3;
+; CHECK-NEXT:    mov.b32 %r21, {%rs29, %rs28};
+; CHECK-NEXT:    st.param.b32 [func_retval0+4], %r21;
+; CHECK-NEXT:    ret;
+  %v0 = insertelement <8 x i4> poison, i4 1, i32 %idx
+  %v1 = insertelement <8 x i4> %v0, i4 2, i32 1
+  %v2 = insertelement <8 x i4> %v1, i4 3, i32 2
+  ret <8 x i4> %v2
+}
+
+; Test with i5 elements (5-bit, non-byte-aligned)
+define <8 x i5> @dynamic_i5(i32 %idx) {
+; CHECK-LABEL: dynamic_i5(
+; CHECK:       {
+; CHECK-NEXT:    .local .align 8 .b8 __local_depot13[8];
+; CHECK-NEXT:    .reg .b64 %SP;
+; CHECK-NEXT:    .reg .b64 %SPL;
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<15>;
+; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.b64 %SPL, __local_depot13;
+; CHECK-NEXT:    cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT:    ld.param.b32 %rd1, [dynamic_i5_param_0];
+; CHECK-NEXT:    and.b64 %rd2, %rd1, 7;
+; CHECK-NEXT:    add.u64 %rd3, %SP, 0;
+; CHECK-NEXT:    or.b64 %rd4, %rd3, %rd2;
+; CHECK-NEXT:    st.v2.b32 [%SP], {%r1, %r2};
+; CHECK-NEXT:    st.b8 [%rd4], 1;
+; CHECK-NEXT:    ld.b32 %r3, [%SP];
+; CHECK-NEXT:    ld.b32 %r4, [%SP+4];
+; CHECK-NEXT:    prmt.b32 %r5, %r4, 0, 0x7773U;
+; CHECK-NEXT:    prmt.b32 %r6, %r4, 0, 0x7772U;
+; CHECK-NEXT:    prmt.b32 %r7, %r6, %r5, 0x5410U;
+; CHECK-NEXT:    prmt.b32 %r8, %r4, 0, 0x7771U;
+; CHECK-NEXT:    prmt.b32 %r9, %r4, 0, 0x7770U;
+; CHECK-NEXT:    prmt.b32 %r10, %r9, %r8, 0x5410U;
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0+8], {%r10, %r7};
+; CHECK-NEXT:    prmt.b32 %r11, %r3, 0, 0x7773U;
+; CHECK-NEXT:    cvt.u16.u32 %rs1, %r11;
+; CHECK-NEXT:    mov.b16 %rs2, 3;
+; CHECK-NEXT:    mov.b32 %r12, {%rs2, %rs1};
+; CHECK-NEXT:    prmt.b32 %r13, %r3, 0, 0x7770U;
+; CHECK-NEXT:    cvt.u16.u32 %rs3, %r13;
+; CHECK-NEXT:    mov.b16 %rs4, 2;
+; CHECK-NEXT:    mov.b32 %r14, {%rs3, %rs4};
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r14, %r12};
+; CHECK-NEXT:    ret;
+  %v0 = insertelement <8 x i5> poison, i5 1, i32 %idx
+  %v1 = insertelement <8 x i5> %v0, i5 2, i32 1
+  %v2 = insertelement <8 x i5> %v1, i5 3, i32 2
+  ret <8 x i5> %v2
+}
+
+; Test with i7 elements (7-bit, non-byte-aligned)
+define <8 x i7> @dynamic_i7(i32 %idx) {
+; CHECK-LABEL: dynamic_i7(
+; CHECK:       {
+; CHECK-NEXT:    .local .align 8 .b8 __local_depot14[8];
+; CHECK-NEXT:    .reg .b64 %SP;
+; CHECK-NEXT:    .reg .b64 %SPL;
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<15>;
+; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.b64 %SPL, __local_depot14;
+; CHECK-NEXT:    cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT:    ld.param.b32 %rd1, [dynamic_i7_param_0];
+; CHECK-NEXT:    and.b64 %rd2, %rd1, 7;
+; CHECK-NEXT:    add.u64 %rd3, %SP, 0;
+; CHECK-NEXT:    or.b64 %rd4, %rd3, %rd2;
+; CHECK-NEXT:    st.v2.b32 [%SP], {%r1, %r2};
+; CHECK-NEXT:    st.b8 [%rd4], 1;
+; CHECK-NEXT:    ld.b32 %r3, [%SP];
+; CHECK-NEXT:    ld.b32 %r4, [%SP+4];
+; CHECK-NEXT:    prmt.b32 %r5, %r4, 0, 0x7773U;
+; CHECK-NEXT:    prmt.b32 %r6, %r4, 0, 0x7772U;
+; CHECK-NEXT:    prmt.b32 %r7, %r6, %r5, 0x5410U;
+; CHECK-NEXT:    prmt.b32 %r8, %r4, 0, 0x7771U;
+; CHECK-NEXT:    prmt.b32 %r9, %r4, 0, 0x7770U;
+; CHECK-NEXT:    prmt.b32 %r10, %r9, %r8, 0x5410U;
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0+8], {%r10, %r7};
+; CHECK-NEXT:    prmt.b32 %r11, %r3, 0, 0x7773U;
+; CHECK-NEXT:    cvt.u16.u32 %rs1, %r11;
+; CHECK-NEXT:    mov.b16 %rs2, 3;
+; CHECK-NEXT:    mov.b32 %r12, {%rs2, %rs1};
+; CHECK-NEXT:    prmt.b32 %r13, %r3, 0, 0x7770U;
+; CHECK-NEXT:    cvt.u16.u32 %rs3, %r13;
+; CHECK-NEXT:    mov.b16 %rs4, 2;
+; CHECK-NEXT:    mov.b32 %r14, {%rs3, %rs4};
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r14, %r12};
+; CHECK-NEXT:    ret;
+  %v0 = insertelement <8 x i7> poison, i7 1, i32 %idx
+  %v1 = insertelement <8 x i7> %v0, i7 2, i32 1
+  %v2 = insertelement <8 x i7> %v1, i7 3, i32 2
+  ret <8 x i7> %v2
+}
+
+; Test with i6 elements (6-bit, non-byte-aligned)
+define <8 x i6> @dynamic_i6(i32 %idx) {
+; CHECK-LABEL: dynamic_i6(
+; CHECK:       {
+; CHECK-NEXT:    .local .align 8 .b8 __local_depot15[8];
+; CHECK-NEXT:    .reg .b64 %SP;
+; CHECK-NEXT:    .reg .b64 %SPL;
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<15>;
+; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.b64 %SPL, __local_depot15;
+; CHECK-NEXT:    cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT:    ld.param.b32 %rd1, [dynamic_i6_param_0];
+; CHECK-NEXT:    and.b64 %rd2, %rd1, 7;
+; CHECK-NEXT:    add.u64 %rd3, %SP, 0;
+; CHECK-NEXT:    or.b64 %rd4, %rd3, %rd2;
+; CHECK-NEXT:    st.v2.b32 [%SP], {%r1, %r2};
+; CHECK-NEXT:    st.b8 [%rd4], 1;
+; CHECK-NEXT:    ld.b32 %r3, [%SP];
+; CHECK-NEXT:    ld.b32 %r4, [%SP+4];
+; CHECK-NEXT:    prmt.b32 %r5, %r4, 0, 0x7773U;
+; CHECK-NEXT:    prmt.b32 %r6, %r4, 0, 0x7772U;
+; CHECK-NEXT:    prmt.b32 %r7, %r6, %r5, 0x5410U;
+; CHECK-NEXT:    prmt.b32 %r8, %r4, 0, 0x7771U;
+; CHECK-NEXT:    prmt.b32 %r9, %r4, 0, 0x7770U;
+; CHECK-NEXT:    prmt.b32 %r10, %r9, %r8, 0x5410U;
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0+8], {%r10, %r7};
+; CHECK-NEXT:    prmt.b32 %r11, %r3, 0, 0x7773U;
+; CHECK-NEXT:    cvt.u16.u32 %rs1, %r11;
+; CHECK-NEXT:    mov.b16 %rs2, 3;
+; CHECK-NEXT:    mov.b32 %r12, {%rs2, %rs1};
+; CHECK-NEXT:    prmt.b32 %r13, %r3, 0, 0x7770U;
+; CHECK-NEXT:    cvt.u16.u32 %rs3, %r13;
+; CHECK-NEXT:    mov.b16 %rs4, 2;
+; CHECK-NEXT:    mov.b32 %r14, {%rs3, %rs4};
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r14, %r12};
+; CHECK-NEXT:    ret;
+  %v0 = insertelement <8 x i6> poison, i6 1, i32 %idx
+  %v1 = insertelement <8 x i6> %v0, i6 2, i32 1
+  %v2 = insertelement <8 x i6> %v1, i6 3, i32 2
+  ret <8 x i6> %v2
+}
+
+; Test with multiple dynamic insertions on i3 elements
+define <4 x i3> @multiple_dynamic_i3(i32 %idx0, i32 %idx1) {
+; CHECK-LABEL: multiple_dynamic_i3(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [multiple_dynamic_i3_param_0];
+; CHECK-NEXT:    shl.b32 %r2, %r1, 3;
+; CHECK-NEXT:    bfi.b32 %r3, 1, %r4, %r2, 8;
+; CHECK-NEXT:    ld.param.b32 %r5, [multiple_dynamic_i3_param_1];
+; CHECK-NEXT:    shl.b32 %r6, %r5, 3;
+; CHECK-NEXT:    bfi.b32 %r7, 2, %r3, %r6, 8;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %r7;
+; CHECK-NEXT:    shr.u32 %r8, %r7, 16;
+; CHECK-NEXT:    st.param.b16 [func_retval0+2], %r8;
+; CHECK-NEXT:    ret;
+  %v0 = insertelement <4 x i3> poison, i3 1, i32 %idx0
+  %v1 = insertelement <4 x i3> %v0, i3 2, i32 %idx1
+  ret <4 x i3> %v1
+}
diff --git a/llvm/test/CodeGen/PowerPC/bittest.ll b/llvm/test/CodeGen/PowerPC/bittest.ll
new file mode 100644
index 0000000000000..cba56e3d5798f
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/bittest.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -verify-machineinstrs < %s -O3 -mcpu=ppc -mtriple powerpc-ibm-aix \
+; RUN:     -ppc-asm-full-reg-names | FileCheck %s
+
+define i32 @foo(i32 noundef signext %x) {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    stwu r1, -64(r1)
+; CHECK-NEXT:    stw r0, 72(r1)
+; CHECK-NEXT:    cmpwi r3, 8
+; CHECK-NEXT:    stw r31, 60(r1) # 4-byte Folded Spill
+; CHECK-NEXT:    mr r31, r3
+; CHECK-NEXT:    li r3, 0
+; CHECK-NEXT:    ble cr0, L..BB0_4
+; CHECK-NEXT:  # %bb.1: # %entry
+; CHECK-NEXT:    cmpwi r31, 11
+; CHECK-NEXT:    bge cr0, L..BB0_7
+; CHECK-NEXT:  # %bb.2: # %entry
+; CHECK-NEXT:    cmplwi r31, 9
+; CHECK-NEXT:    beq cr0, L..BB0_9
+; CHECK-NEXT:  # %bb.3: # %entry
+; CHECK-NEXT:    cmplwi r31, 10
+; CHECK-NEXT:    beq cr0, L..BB0_11
+; CHECK-NEXT:    b L..BB0_13
+; CHECK-NEXT:  L..BB0_4: # %entry
+; CHECK-NEXT:    cmplwi r31, 4
+; CHECK-NEXT:    beq cr0, L..BB0_12
+; CHECK-NEXT:  # %bb.5: # %entry
+; CHECK-NEXT:    cmplwi r31, 7
+; CHECK-NEXT:    beq cr0, L..BB0_11
+; CHECK-NEXT:  # %bb.6: # %entry
+; CHECK-NEXT:    cmplwi r31, 8
+; CHECK-NEXT:    beq cr0, L..BB0_10
+; CHECK-NEXT:    b L..BB0_13
+; CHECK-NEXT:  L..BB0_7: # %entry
+; CHECK-NEXT:    beq cr0, L..BB0_10
+; CHECK-NEXT:  # %bb.8: # %entry
+; CHECK-NEXT:    cmplwi r31, 12
+; CHECK-NEXT:    bne cr0, L..BB0_13
+; CHECK-NEXT:  L..BB0_9: # %sw.bb2
+; CHECK-NEXT:    mr r3, r31
+; CHECK-NEXT:    bl .foo3[PR]
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    mr r3, r31
+; CHECK-NEXT:    b L..BB0_13
+; CHECK-NEXT:  L..BB0_10: # %sw.bb1
+; CHECK-NEXT:    mr r3, r31
+; CHECK-NEXT:    bl .foo2[PR]
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    mr r3, r31
+; CHECK-NEXT:    b L..BB0_13
+; CHECK-NEXT:  L..BB0_11: # %sw.bb
+; CHECK-NEXT:    mr r3, r31
+; CHECK-NEXT:    bl .foo1[PR]
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    mr r3, r31
+; CHECK-NEXT:    b L..BB0_13
+; CHECK-NEXT:  L..BB0_12: # %sw.bb3
+; CHECK-NEXT:    li r3, 4
+; CHECK-NEXT:    bl .foo4[PR]
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    li r3, 4
+; CHECK-NEXT:  L..BB0_13: # %return
+; CHECK-NEXT:    lwz r31, 60(r1) # 4-byte Folded Reload
+; CHECK-NEXT:    addi r1, r1, 64
+; CHECK-NEXT:    lwz r0, 8(r1)
+; CHECK-NEXT:    mtlr r0
+; CHECK-NEXT:    blr
+entry:
+  switch i32 %x, label %return [
+  i32 7, label %sw.bb
+  i32 10, label %sw.bb
+  i32 8, label %sw.bb1
+  i32 11, label %sw.bb1
+  i32 9, label %sw.bb2
+  i32 12, label %sw.bb2
+  i32 4, label %sw.bb3
+  ]
+
+sw.bb:                                            ; preds = %entry, %entry
+  tail call void @foo1(i32 noundef signext %x)
+  br label %return
+
+sw.bb1:                                           ; preds = %entry, %entry
+  tail call void @foo2(i32 noundef signext %x)
+  br label %return
+
+sw.bb2:                                           ; preds = %entry, %entry
+  tail call void @foo3(i32 noundef signext %x)
+  br label %return
+
+sw.bb3:                                           ; preds = %entry
+  tail call void @foo4(i32 noundef signext 4)
+  br label %return
+
+return:                                           ; preds = %sw.bb, %sw.bb1, %sw.bb2, %sw.bb3, %entry
+  %retval.0 = phi i32 [ 0, %entry ], [ 4, %sw.bb3 ], [ %x, %sw.bb2 ], [ %x, %sw.bb1 ], [ %x, %sw.bb ]
+  ret i32 %retval.0
+}
+
+define i32 @goo(i32 noundef signext %x) {
+; CHECK-LABEL: goo:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    stwu r1, -64(r1)
+; CHECK-NEXT:    stw r0, 72(r1)
+; CHECK-NEXT:    cmplwi r3, 12
+; CHECK-NEXT:    stw r31, 60(r1) # 4-byte Folded Spill
+; CHECK-NEXT:    mr r31, r3
+; CHECK-NEXT:    bgt cr0, L..BB1_7
+; CHECK-NEXT:  # %bb.1: # %entry
+; CHECK-NEXT:    li r3, 1
+; CHECK-NEXT:    slw r3, r3, r31
+; CHECK-NEXT:    andi. r4, r3, 5632
+; CHECK-NEXT:    bne cr0, L..BB1_4
+; CHECK-NEXT:  # %bb.2: # %entry
+; CHECK-NEXT:    andi. r3, r3, 2304
+; CHECK-NEXT:    beq cr0, L..BB1_5
+; CHECK-NEXT:  # %bb.3: # %sw.bb1
+; CHECK-NEXT:    mr r3, r31
+; CHECK-NEXT:    bl .foo2[PR]
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    b L..BB1_9
+; CHECK-NEXT:  L..BB1_4: # %sw.bb2
+; CHECK-NEXT:    mr r3, r31
+; CHECK-NEXT:    bl .foo3[PR]
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    b L..BB1_9
+; CHECK-NEXT:  L..BB1_5: # %entry
+; CHECK-NEXT:    cmplwi r31, 7
+; CHECK-NEXT:    bne cr0, L..BB1_7
+; CHECK-NEXT:  # %bb.6: # %sw.bb
+; CHECK-NEXT:    li r3, 7
+; CHECK-NEXT:    li r31, 7
+; CHECK-NEXT:    bl .foo1[PR]
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    b L..BB1_9
+; CHECK-NEXT:  L..BB1_7: # %entry
+; CHECK-NEXT:    cmplwi r31, 4
+; CHECK-NEXT:    li r31, 0
+; CHECK-NEXT:    bne cr0, L..BB1_9
+; CHECK-NEXT:  # %bb.8: # %sw.bb3
+; CHECK-NEXT:    li r3, 4
+; CHECK-NEXT:    li r31, 4
+; CHECK-NEXT:    bl .foo4[PR]
+; CHECK-NEXT:    nop
+; CHECK-NEXT:  L..BB1_9: # %return
+; CHECK-NEXT:    mr r3, r31
+; CHECK-NEXT:    lwz r31, 60(r1) # 4-byte Folded Reload
+; CHECK-NEXT:    addi r1, r1, 64
+; CHECK-NEXT:    lwz r0, 8(r1)
+; CHECK-NEXT:    mtlr r0
+; CHECK-NEXT:    blr
+entry:
+  switch i32 %x, label %return [
+  i32 7, label %sw.bb
+  i32 8, label %sw.bb1
+  i32 11, label %sw.bb1
+  i32 9, label %sw.bb2
+  i32 10, label %sw.bb2
+  i32 12, label %sw.bb2
+  i32 4, label %sw.bb3
+  ]
+
+sw.bb:                                            ; preds = %entry
+  tail call void @foo1(i32 noundef signext 7)
+  br label %return
+
+sw.bb1:                                           ; preds = %entry, %entry
+  tail call void @foo2(i32 noundef signext %x)
+  br label %return
+
+sw.bb2:                                           ; preds = %entry, %entry, %entry
+  tail call void @foo3(i32 noundef signext %x)
+  br label %return
+
+sw.bb3:                                           ; preds = %entry
+  tail call void @foo4(i32 noundef signext 4)
+  br label %return
+
+return:                                           ; preds = %sw.bb, %sw.bb1, %sw.bb2, %sw.bb3, %entry
+  %retval.0 = phi i32 [ 0, %entry ], [ 4, %sw.bb3 ], [ %x, %sw.bb2 ], [ %x, %sw.bb1 ], [ 7, %sw.bb ]
+  ret i32 %retval.0
+}
+
+declare void @foo1(i32 noundef signext)
+
+declare void @foo2(i32 noundef signext)
+
+declare void @foo3(i32 noundef signext)
+
+declare void @foo4(i32 noundef signext)
diff --git a/llvm/test/CodeGen/PowerPC/combine-sext-and-shl-after-isel.ll b/llvm/test/CodeGen/PowerPC/combine-sext-and-shl-after-isel.ll
index 00a77f92c0413..530169ff09486 100644
--- a/llvm/test/CodeGen/PowerPC/combine-sext-and-shl-after-isel.ll
+++ b/llvm/test/CodeGen/PowerPC/combine-sext-and-shl-after-isel.ll
@@ -212,37 +212,33 @@ define hidden void @testCaller(i1 %incond) local_unnamed_addr align 2 nounwind {
 ; CHECK-NEXT:    std r30, 48(r1) # 8-byte Folded Spill
 ; CHECK-NEXT:    andi. r3, r3, 1
 ; CHECK-NEXT:    li r3, -1
+; CHECK-NEXT:    li r4, 0
 ; CHECK-NEXT:    li r30, 0
 ; CHECK-NEXT:    crmove 4*cr2+lt, gt
 ; CHECK-NEXT:    std r29, 40(r1) # 8-byte Folded Spill
 ; CHECK-NEXT:    b .LBB3_2
-; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB3_1: # %if.end116
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    bl callee
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    mr r3, r29
-; CHECK-NEXT:  .LBB3_2: # %cond.end.i.i
-; CHECK-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-NEXT:    # Child Loop BB3_3 Depth 2
-; CHECK-NEXT:    lwz r29, 0(r3)
-; CHECK-NEXT:    li r5, 0
-; CHECK-NEXT:    extsw r4, r29
-; CHECK-NEXT:    .p2align 5
-; CHECK-NEXT:  .LBB3_3: # %while.body5.i
-; CHECK-NEXT:    # Parent Loop BB3_2 Depth=1
-; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    addi r5, r5, -1
-; CHECK-NEXT:    cmpwi r5, 0
-; CHECK-NEXT:    bgt cr0, .LBB3_3
-; CHECK-NEXT:  # %bb.4: # %while.cond12.preheader.i
+; CHECK-NEXT:    li r4, 0
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  .LBB3_2: # %while.body5.i
 ; CHECK-NEXT:    #
+; CHECK-NEXT:    addi r4, r4, -1
+; CHECK-NEXT:    cmpwi r4, 0
+; CHECK-NEXT:    bgt cr0, .LBB3_2
+; CHECK-NEXT:  # %bb.3: # %while.cond12.preheader.i
+; CHECK-NEXT:    #
+; CHECK-NEXT:    lwz r29, 0(r3)
 ; CHECK-NEXT:    bc 12, 4*cr2+lt, .LBB3_1
-; CHECK-NEXT:  # %bb.5: # %for.cond99.preheader
+; CHECK-NEXT:  # %bb.4: # %for.cond99.preheader
 ; CHECK-NEXT:    #
+; CHECK-NEXT:    extsw r4, r29
 ; CHECK-NEXT:    ld r5, 0(r3)
-; CHECK-NEXT:    sldi r4, r4, 2
 ; CHECK-NEXT:    stw r3, 0(r3)
+; CHECK-NEXT:    sldi r4, r4, 2
 ; CHECK-NEXT:    stwx r30, r5, r4
 ; CHECK-NEXT:    b .LBB3_1
 ;
@@ -256,37 +252,33 @@ define hidden void @testCaller(i1 %incond) local_unnamed_addr align 2 nounwind {
 ; CHECK-BE-NEXT:    std r30, 64(r1) # 8-byte Folded Spill
 ; CHECK-BE-NEXT:    andi. r3, r3, 1
 ; CHECK-BE-NEXT:    li r3, -1
+; CHECK-BE-NEXT:    li r4, 0
 ; CHECK-BE-NEXT:    li r30, 0
 ; CHECK-BE-NEXT:    crmove 4*cr2+lt, gt
 ; CHECK-BE-NEXT:    std r29, 56(r1) # 8-byte Folded Spill
 ; CHECK-BE-NEXT:    b .LBB3_2
-; CHECK-BE-NEXT:    .p2align 4
 ; CHECK-BE-NEXT:  .LBB3_1: # %if.end116
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    bl callee
 ; CHECK-BE-NEXT:    nop
 ; CHECK-BE-NEXT:    mr r3, r29
-; CHECK-BE-NEXT:  .LBB3_2: # %cond.end.i.i
-; CHECK-BE-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-BE-NEXT:    # Child Loop BB3_3 Depth 2
-; CHECK-BE-NEXT:    lwz r29, 0(r3)
-; CHECK-BE-NEXT:    li r5, 0
-; CHECK-BE-NEXT:    extsw r4, r29
-; CHECK-BE-NEXT:    .p2align 5
-; CHECK-BE-NEXT:  .LBB3_3: # %while.body5.i
-; CHECK-BE-NEXT:    # Parent Loop BB3_2 Depth=1
-; CHECK-BE-NEXT:    # => This Inner Loop Header: Depth=2
-; CHECK-BE-NEXT:    addi r5, r5, -1
-; CHECK-BE-NEXT:    cmpwi r5, 0
-; CHECK-BE-NEXT:    bgt cr0, .LBB3_3
-; CHECK-BE-NEXT:  # %bb.4: # %while.cond12.preheader.i
+; CHECK-BE-NEXT:    li r4, 0
+; CHECK-BE-NEXT:    .p2align 4
+; CHECK-BE-NEXT:  .LBB3_2: # %while.body5.i
+; CHECK-BE-NEXT:    #
+; CHECK-BE-NEXT:    addi r4, r4, -1
+; CHECK-BE-NEXT:    cmpwi r4, 0
+; CHECK-BE-NEXT:    bgt cr0, .LBB3_2
+; CHECK-BE-NEXT:  # %bb.3: # %while.cond12.preheader.i
 ; CHECK-BE-NEXT:    #
+; CHECK-BE-NEXT:    lwz r29, 0(r3)
 ; CHECK-BE-NEXT:    bc 12, 4*cr2+lt, .LBB3_1
-; CHECK-BE-NEXT:  # %bb.5: # %for.cond99.preheader
+; CHECK-BE-NEXT:  # %bb.4: # %for.cond99.preheader
 ; CHECK-BE-NEXT:    #
+; CHECK-BE-NEXT:    extsw r4, r29
 ; CHECK-BE-NEXT:    ld r5, 0(r3)
-; CHECK-BE-NEXT:    sldi r4, r4, 2
 ; CHECK-BE-NEXT:    stw r3, 0(r3)
+; CHECK-BE-NEXT:    sldi r4, r4, 2
 ; CHECK-BE-NEXT:    stwx r30, r5, r4
 ; CHECK-BE-NEXT:    b .LBB3_1
 ;
@@ -300,32 +292,28 @@ define hidden void @testCaller(i1 %incond) local_unnamed_addr align 2 nounwind {
 ; CHECK-P9-NEXT:    std r0, 80(r1)
 ; CHECK-P9-NEXT:    std r30, 48(r1) # 8-byte Folded Spill
 ; CHECK-P9-NEXT:    li r3, -1
+; CHECK-P9-NEXT:    li r4, 0
 ; CHECK-P9-NEXT:    li r30, 0
 ; CHECK-P9-NEXT:    std r29, 40(r1) # 8-byte Folded Spill
 ; CHECK-P9-NEXT:    crmove 4*cr2+lt, gt
 ; CHECK-P9-NEXT:    b .LBB3_2
-; CHECK-P9-NEXT:    .p2align 4
 ; CHECK-P9-NEXT:  .LBB3_1: # %if.end116
 ; CHECK-P9-NEXT:    #
 ; CHECK-P9-NEXT:    bl callee
 ; CHECK-P9-NEXT:    nop
 ; CHECK-P9-NEXT:    mr r3, r29
-; CHECK-P9-NEXT:  .LBB3_2: # %cond.end.i.i
-; CHECK-P9-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-P9-NEXT:    # Child Loop BB3_3 Depth 2
-; CHECK-P9-NEXT:    lwz r29, 0(r3)
 ; CHECK-P9-NEXT:    li r4, 0
-; CHECK-P9-NEXT:    .p2align 5
-; CHECK-P9-NEXT:  .LBB3_3: # %while.body5.i
-; CHECK-P9-NEXT:    # Parent Loop BB3_2 Depth=1
-; CHECK-P9-NEXT:    # => This Inner Loop Header: Depth=2
+; CHECK-P9-NEXT:    .p2align 4
+; CHECK-P9-NEXT:  .LBB3_2: # %while.body5.i
+; CHECK-P9-NEXT:    #
 ; CHECK-P9-NEXT:    addi r4, r4, -1
 ; CHECK-P9-NEXT:    cmpwi r4, 0
-; CHECK-P9-NEXT:    bgt cr0, .LBB3_3
-; CHECK-P9-NEXT:  # %bb.4: # %while.cond12.preheader.i
+; CHECK-P9-NEXT:    bgt cr0, .LBB3_2
+; CHECK-P9-NEXT:  # %bb.3: # %while.cond12.preheader.i
 ; CHECK-P9-NEXT:    #
+; CHECK-P9-NEXT:    lwz r29, 0(r3)
 ; CHECK-P9-NEXT:    bc 12, 4*cr2+lt, .LBB3_1
-; CHECK-P9-NEXT:  # %bb.5: # %for.cond99.preheader
+; CHECK-P9-NEXT:  # %bb.4: # %for.cond99.preheader
 ; CHECK-P9-NEXT:    #
 ; CHECK-P9-NEXT:    ld r4, 0(r3)
 ; CHECK-P9-NEXT:    extswsli r5, r29, 2
@@ -343,32 +331,28 @@ define hidden void @testCaller(i1 %incond) local_unnamed_addr align 2 nounwind {
 ; CHECK-P9-BE-NEXT:    std r0, 96(r1)
 ; CHECK-P9-BE-NEXT:    std r30, 64(r1) # 8-byte Folded Spill
 ; CHECK-P9-BE-NEXT:    li r3, -1
+; CHECK-P9-BE-NEXT:    li r4, 0
 ; CHECK-P9-BE-NEXT:    li r30, 0
 ; CHECK-P9-BE-NEXT:    std r29, 56(r1) # 8-byte Folded Spill
 ; CHECK-P9-BE-NEXT:    crmove 4*cr2+lt, gt
 ; CHECK-P9-BE-NEXT:    b .LBB3_2
-; CHECK-P9-BE-NEXT:    .p2align 4
 ; CHECK-P9-BE-NEXT:  .LBB3_1: # %if.end116
 ; CHECK-P9-BE-NEXT:    #
 ; CHECK-P9-BE-NEXT:    bl callee
 ; CHECK-P9-BE-NEXT:    nop
 ; CHECK-P9-BE-NEXT:    mr r3, r29
-; CHECK-P9-BE-NEXT:  .LBB3_2: # %cond.end.i.i
-; CHECK-P9-BE-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-P9-BE-NEXT:    # Child Loop BB3_3 Depth 2
-; CHECK-P9-BE-NEXT:    lwz r29, 0(r3)
 ; CHECK-P9-BE-NEXT:    li r4, 0
-; CHECK-P9-BE-NEXT:    .p2align 5
-; CHECK-P9-BE-NEXT:  .LBB3_3: # %while.body5.i
-; CHECK-P9-BE-NEXT:    # Parent Loop BB3_2 Depth=1
-; CHECK-P9-BE-NEXT:    # => This Inner Loop Header: Depth=2
+; CHECK-P9-BE-NEXT:    .p2align 4
+; CHECK-P9-BE-NEXT:  .LBB3_2: # %while.body5.i
+; CHECK-P9-BE-NEXT:    #
 ; CHECK-P9-BE-NEXT:    addi r4, r4, -1
 ; CHECK-P9-BE-NEXT:    cmpwi r4, 0
-; CHECK-P9-BE-NEXT:    bgt cr0, .LBB3_3
-; CHECK-P9-BE-NEXT:  # %bb.4: # %while.cond12.preheader.i
+; CHECK-P9-BE-NEXT:    bgt cr0, .LBB3_2
+; CHECK-P9-BE-NEXT:  # %bb.3: # %while.cond12.preheader.i
 ; CHECK-P9-BE-NEXT:    #
+; CHECK-P9-BE-NEXT:    lwz r29, 0(r3)
 ; CHECK-P9-BE-NEXT:    bc 12, 4*cr2+lt, .LBB3_1
-; CHECK-P9-BE-NEXT:  # %bb.5: # %for.cond99.preheader
+; CHECK-P9-BE-NEXT:  # %bb.4: # %for.cond99.preheader
 ; CHECK-P9-BE-NEXT:    #
 ; CHECK-P9-BE-NEXT:    ld r4, 0(r3)
 ; CHECK-P9-BE-NEXT:    extswsli r5, r29, 2
diff --git a/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll b/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll
index 291a9c1f978da..b006c78604648 100644
--- a/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll
+++ b/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll
@@ -242,17 +242,14 @@ define <2 x i64> @testDoubleword(<2 x i64> %a, i64 %b, i64 %idx) {
 ; AIX-P8-32-LABEL: testDoubleword:
 ; AIX-P8-32:       # %bb.0: # %entry
 ; AIX-P8-32-NEXT:    add r6, r6, r6
-; AIX-P8-32-NEXT:    addi r5, r1, -32
+; AIX-P8-32-NEXT:    addi r5, r1, -16
 ; AIX-P8-32-NEXT:    rlwinm r7, r6, 2, 28, 29
-; AIX-P8-32-NEXT:    stxvw4x v2, 0, r5
+; AIX-P8-32-NEXT:    stxvd2x v2, 0, r5
 ; AIX-P8-32-NEXT:    stwx r3, r5, r7
-; AIX-P8-32-NEXT:    addi r3, r1, -16
-; AIX-P8-32-NEXT:    lxvw4x vs0, 0, r5
-; AIX-P8-32-NEXT:    addi r5, r6, 1
-; AIX-P8-32-NEXT:    rlwinm r5, r5, 2, 28, 29
-; AIX-P8-32-NEXT:    stxvw4x vs0, 0, r3
-; AIX-P8-32-NEXT:    stwx r4, r3, r5
-; AIX-P8-32-NEXT:    lxvw4x v2, 0, r3
+; AIX-P8-32-NEXT:    addi r3, r6, 1
+; AIX-P8-32-NEXT:    rlwinm r3, r3, 2, 28, 29
+; AIX-P8-32-NEXT:    stwx r4, r5, r3
+; AIX-P8-32-NEXT:    lxvd2x v2, 0, r5
 ; AIX-P8-32-NEXT:    blr
 entry:
   %vecins = insertelement <2 x i64> %a, i64 %b, i64 %idx
@@ -426,17 +423,14 @@ define <4 x float> @testFloat2(<4 x float> %a, ptr %b, i32 zeroext %idx1, i32 ze
 ; AIX-P8-LABEL: testFloat2:
 ; AIX-P8:       # %bb.0: # %entry
 ; AIX-P8-NEXT:    lwz r6, 0(r3)
-; AIX-P8-NEXT:    rlwinm r4, r4, 2, 28, 29
-; AIX-P8-NEXT:    addi r7, r1, -32
+; AIX-P8-NEXT:    lwz r3, 1(r3)
+; AIX-P8-NEXT:    addi r7, r1, -16
 ; AIX-P8-NEXT:    stxvw4x v2, 0, r7
 ; AIX-P8-NEXT:    rlwinm r5, r5, 2, 28, 29
+; AIX-P8-NEXT:    rlwinm r4, r4, 2, 28, 29
 ; AIX-P8-NEXT:    stwx r6, r7, r4
-; AIX-P8-NEXT:    addi r4, r1, -16
-; AIX-P8-NEXT:    lxvw4x vs0, 0, r7
-; AIX-P8-NEXT:    lwz r3, 1(r3)
-; AIX-P8-NEXT:    stxvw4x vs0, 0, r4
-; AIX-P8-NEXT:    stwx r3, r4, r5
-; AIX-P8-NEXT:    lxvw4x v2, 0, r4
+; AIX-P8-NEXT:    stwx r3, r7, r5
+; AIX-P8-NEXT:    lxvw4x v2, 0, r7
 ; AIX-P8-NEXT:    blr
 entry:
   %add.ptr1 = getelementptr inbounds i8, ptr %b, i64 1
@@ -493,38 +487,32 @@ define <4 x float> @testFloat3(<4 x float> %a, ptr %b, i32 zeroext %idx1, i32 ze
 ;
 ; AIX-P8-64-LABEL: testFloat3:
 ; AIX-P8-64:       # %bb.0: # %entry
+; AIX-P8-64-NEXT:    li r7, 1
 ; AIX-P8-64-NEXT:    lis r6, 1
-; AIX-P8-64-NEXT:    rlwinm r4, r4, 2, 28, 29
-; AIX-P8-64-NEXT:    addi r7, r1, -32
 ; AIX-P8-64-NEXT:    rlwinm r5, r5, 2, 28, 29
+; AIX-P8-64-NEXT:    rlwinm r4, r4, 2, 28, 29
+; AIX-P8-64-NEXT:    rldic r7, r7, 36, 27
 ; AIX-P8-64-NEXT:    lwzx r6, r3, r6
+; AIX-P8-64-NEXT:    lwzx r3, r3, r7
+; AIX-P8-64-NEXT:    addi r7, r1, -16
 ; AIX-P8-64-NEXT:    stxvw4x v2, 0, r7
 ; AIX-P8-64-NEXT:    stwx r6, r7, r4
-; AIX-P8-64-NEXT:    li r4, 1
-; AIX-P8-64-NEXT:    lxvw4x vs0, 0, r7
-; AIX-P8-64-NEXT:    rldic r4, r4, 36, 27
-; AIX-P8-64-NEXT:    lwzx r3, r3, r4
-; AIX-P8-64-NEXT:    addi r4, r1, -16
-; AIX-P8-64-NEXT:    stxvw4x vs0, 0, r4
-; AIX-P8-64-NEXT:    stwx r3, r4, r5
-; AIX-P8-64-NEXT:    lxvw4x v2, 0, r4
+; AIX-P8-64-NEXT:    stwx r3, r7, r5
+; AIX-P8-64-NEXT:    lxvw4x v2, 0, r7
 ; AIX-P8-64-NEXT:    blr
 ;
 ; AIX-P8-32-LABEL: testFloat3:
 ; AIX-P8-32:       # %bb.0: # %entry
 ; AIX-P8-32-NEXT:    lis r6, 1
-; AIX-P8-32-NEXT:    rlwinm r4, r4, 2, 28, 29
-; AIX-P8-32-NEXT:    addi r7, r1, -32
 ; AIX-P8-32-NEXT:    rlwinm r5, r5, 2, 28, 29
+; AIX-P8-32-NEXT:    rlwinm r4, r4, 2, 28, 29
+; AIX-P8-32-NEXT:    addi r7, r1, -16
 ; AIX-P8-32-NEXT:    lwzx r6, r3, r6
+; AIX-P8-32-NEXT:    lwz r3, 0(r3)
 ; AIX-P8-32-NEXT:    stxvw4x v2, 0, r7
 ; AIX-P8-32-NEXT:    stwx r6, r7, r4
-; AIX-P8-32-NEXT:    addi r4, r1, -16
-; AIX-P8-32-NEXT:    lxvw4x vs0, 0, r7
-; AIX-P8-32-NEXT:    lwz r3, 0(r3)
-; AIX-P8-32-NEXT:    stxvw4x vs0, 0, r4
-; AIX-P8-32-NEXT:    stwx r3, r4, r5
-; AIX-P8-32-NEXT:    lxvw4x v2, 0, r4
+; AIX-P8-32-NEXT:    stwx r3, r7, r5
+; AIX-P8-32-NEXT:    lxvw4x v2, 0, r7
 ; AIX-P8-32-NEXT:    blr
 entry:
   %add.ptr = getelementptr inbounds i8, ptr %b, i64 65536
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vse.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vse.ll
new file mode 100644
index 0000000000000..785d9fc6a7970
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vse.ll
@@ -0,0 +1,1575 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \
+; RUN:   -global-isel -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \
+; RUN:   -global-isel -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare void @llvm.riscv.vse.nxv1i64(
+  <vscale x 1 x i64>,
+  ptr,
+  iXLen);
+
+define void @intrinsic_vse_v_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vse_v_nxv1i64_nxv1i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vse64.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.nxv1i64(
+    <vscale x 1 x i64> %0,
+    ptr %1,
+    iXLen %2)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.mask.nxv1i64(
+  <vscale x 1 x i64>,
+  ptr,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vse_mask_v_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_mask_v_nxv1i64_nxv1i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vse64.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv1i64(
+    <vscale x 1 x i64> %0,
+    ptr %1,
+    <vscale x 1 x i1> %2,
+    iXLen %3)
+
+  ret void
+}
+
+define void @intrinsic_vse_allonesmask_v_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_allonesmask_v_nxv1i64_nxv1i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vse64.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv1i64(
+    <vscale x 1 x i64> %0,
+    ptr %1,
+    <vscale x 1 x i1> splat (i1 true),
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.nxv2i64(
+  <vscale x 2 x i64>,
+  ptr,
+  iXLen);
+
+define void @intrinsic_vse_v_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vse_v_nxv2i64_nxv2i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vse64.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.nxv2i64(
+    <vscale x 2 x i64> %0,
+    ptr %1,
+    iXLen %2)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.mask.nxv2i64(
+  <vscale x 2 x i64>,
+  ptr,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vse_mask_v_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_mask_v_nxv2i64_nxv2i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vse64.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv2i64(
+    <vscale x 2 x i64> %0,
+    ptr %1,
+    <vscale x 2 x i1> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.nxv4i64(
+  <vscale x 4 x i64>,
+  ptr,
+  iXLen);
+
+define void @intrinsic_vse_v_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vse_v_nxv4i64_nxv4i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vse64.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.nxv4i64(
+    <vscale x 4 x i64> %0,
+    ptr %1,
+    iXLen %2)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.mask.nxv4i64(
+  <vscale x 4 x i64>,
+  ptr,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vse_mask_v_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_mask_v_nxv4i64_nxv4i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vse64.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv4i64(
+    <vscale x 4 x i64> %0,
+    ptr %1,
+    <vscale x 4 x i1> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.nxv8i64(
+  <vscale x 8 x i64>,
+  ptr,
+  iXLen);
+
+define void @intrinsic_vse_v_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vse_v_nxv8i64_nxv8i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vse64.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.nxv8i64(
+    <vscale x 8 x i64> %0,
+    ptr %1,
+    iXLen %2)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.mask.nxv8i64(
+  <vscale x 8 x i64>,
+  ptr,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vse_mask_v_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_mask_v_nxv8i64_nxv8i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vse64.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv8i64(
+    <vscale x 8 x i64> %0,
+    ptr %1,
+    <vscale x 8 x i1> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.nxv1f64(
+  <vscale x 1 x double>,
+  ptr,
+  iXLen);
+
+define void @intrinsic_vse_v_nxv1f64_nxv1f64(<vscale x 1 x double> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vse_v_nxv1f64_nxv1f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vse64.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.nxv1f64(
+    <vscale x 1 x double> %0,
+    ptr %1,
+    iXLen %2)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.mask.nxv1f64(
+  <vscale x 1 x double>,
+  ptr,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vse_mask_v_nxv1f64_nxv1f64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_mask_v_nxv1f64_nxv1f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vse64.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv1f64(
+    <vscale x 1 x double> %0,
+    ptr %1,
+    <vscale x 1 x i1> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.nxv2f64(
+  <vscale x 2 x double>,
+  ptr,
+  iXLen);
+
+define void @intrinsic_vse_v_nxv2f64_nxv2f64(<vscale x 2 x double> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vse_v_nxv2f64_nxv2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vse64.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.nxv2f64(
+    <vscale x 2 x double> %0,
+    ptr %1,
+    iXLen %2)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.mask.nxv2f64(
+  <vscale x 2 x double>,
+  ptr,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vse_mask_v_nxv2f64_nxv2f64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_mask_v_nxv2f64_nxv2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vse64.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv2f64(
+    <vscale x 2 x double> %0,
+    ptr %1,
+    <vscale x 2 x i1> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.nxv4f64(
+  <vscale x 4 x double>,
+  ptr,
+  iXLen);
+
+define void @intrinsic_vse_v_nxv4f64_nxv4f64(<vscale x 4 x double> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vse_v_nxv4f64_nxv4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vse64.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.nxv4f64(
+    <vscale x 4 x double> %0,
+    ptr %1,
+    iXLen %2)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.mask.nxv4f64(
+  <vscale x 4 x double>,
+  ptr,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vse_mask_v_nxv4f64_nxv4f64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_mask_v_nxv4f64_nxv4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vse64.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv4f64(
+    <vscale x 4 x double> %0,
+    ptr %1,
+    <vscale x 4 x i1> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.nxv8f64(
+  <vscale x 8 x double>,
+  ptr,
+  iXLen);
+
+define void @intrinsic_vse_v_nxv8f64_nxv8f64(<vscale x 8 x double> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vse_v_nxv8f64_nxv8f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vse64.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.nxv8f64(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    iXLen %2)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.mask.nxv8f64(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vse_mask_v_nxv8f64_nxv8f64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_mask_v_nxv8f64_nxv8f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vse64.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv8f64(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    <vscale x 8 x i1> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.nxv1i32(
+  <vscale x 1 x i32>,
+  ptr,
+  iXLen);
+
+define void @intrinsic_vse_v_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vse_v_nxv1i32_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vse32.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.nxv1i32(
+    <vscale x 1 x i32> %0,
+    ptr %1,
+    iXLen %2)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.mask.nxv1i32(
+  <vscale x 1 x i32>,
+  ptr,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vse_mask_v_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_mask_v_nxv1i32_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vse32.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv1i32(
+    <vscale x 1 x i32> %0,
+    ptr %1,
+    <vscale x 1 x i1> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.nxv2i32(
+  <vscale x 2 x i32>,
+  ptr,
+  iXLen);
+
+define void @intrinsic_vse_v_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vse_v_nxv2i32_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vse32.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.nxv2i32(
+    <vscale x 2 x i32> %0,
+    ptr %1,
+    iXLen %2)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.mask.nxv2i32(
+  <vscale x 2 x i32>,
+  ptr,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vse_mask_v_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_mask_v_nxv2i32_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vse32.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv2i32(
+    <vscale x 2 x i32> %0,
+    ptr %1,
+    <vscale x 2 x i1> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.nxv4i32(
+  <vscale x 4 x i32>,
+  ptr,
+  iXLen);
+
+define void @intrinsic_vse_v_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vse_v_nxv4i32_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vse32.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.nxv4i32(
+    <vscale x 4 x i32> %0,
+    ptr %1,
+    iXLen %2)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.mask.nxv4i32(
+  <vscale x 4 x i32>,
+  ptr,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vse_mask_v_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_mask_v_nxv4i32_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vse32.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv4i32(
+    <vscale x 4 x i32> %0,
+    ptr %1,
+    <vscale x 4 x i1> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.nxv8i32(
+  <vscale x 8 x i32>,
+  ptr,
+  iXLen);
+
+define void @intrinsic_vse_v_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vse_v_nxv8i32_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vse32.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.nxv8i32(
+    <vscale x 8 x i32> %0,
+    ptr %1,
+    iXLen %2)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.mask.nxv8i32(
+  <vscale x 8 x i32>,
+  ptr,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vse_mask_v_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_mask_v_nxv8i32_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vse32.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv8i32(
+    <vscale x 8 x i32> %0,
+    ptr %1,
+    <vscale x 8 x i1> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.nxv16i32(
+  <vscale x 16 x i32>,
+  ptr,
+  iXLen);
+
+define void @intrinsic_vse_v_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vse_v_nxv16i32_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vse32.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.nxv16i32(
+    <vscale x 16 x i32> %0,
+    ptr %1,
+    iXLen %2)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.mask.nxv16i32(
+  <vscale x 16 x i32>,
+  ptr,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vse_mask_v_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_mask_v_nxv16i32_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vse32.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv16i32(
+    <vscale x 16 x i32> %0,
+    ptr %1,
+    <vscale x 16 x i1> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.nxv1f32(
+  <vscale x 1 x float>,
+  ptr,
+  iXLen);
+
+define void @intrinsic_vse_v_nxv1f32_nxv1f32(<vscale x 1 x float> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vse_v_nxv1f32_nxv1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vse32.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.nxv1f32(
+    <vscale x 1 x float> %0,
+    ptr %1,
+    iXLen %2)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.mask.nxv1f32(
+  <vscale x 1 x float>,
+  ptr,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vse_mask_v_nxv1f32_nxv1f32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_mask_v_nxv1f32_nxv1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vse32.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv1f32(
+    <vscale x 1 x float> %0,
+    ptr %1,
+    <vscale x 1 x i1> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.nxv2f32(
+  <vscale x 2 x float>,
+  ptr,
+  iXLen);
+
+define void @intrinsic_vse_v_nxv2f32_nxv2f32(<vscale x 2 x float> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vse_v_nxv2f32_nxv2f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vse32.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.nxv2f32(
+    <vscale x 2 x float> %0,
+    ptr %1,
+    iXLen %2)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.mask.nxv2f32(
+  <vscale x 2 x float>,
+  ptr,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vse_mask_v_nxv2f32_nxv2f32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_mask_v_nxv2f32_nxv2f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vse32.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv2f32(
+    <vscale x 2 x float> %0,
+    ptr %1,
+    <vscale x 2 x i1> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.nxv4f32(
+  <vscale x 4 x float>,
+  ptr,
+  iXLen);
+
+define void @intrinsic_vse_v_nxv4f32_nxv4f32(<vscale x 4 x float> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vse_v_nxv4f32_nxv4f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vse32.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.nxv4f32(
+    <vscale x 4 x float> %0,
+    ptr %1,
+    iXLen %2)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.mask.nxv4f32(
+  <vscale x 4 x float>,
+  ptr,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vse_mask_v_nxv4f32_nxv4f32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_mask_v_nxv4f32_nxv4f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vse32.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv4f32(
+    <vscale x 4 x float> %0,
+    ptr %1,
+    <vscale x 4 x i1> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.nxv8f32(
+  <vscale x 8 x float>,
+  ptr,
+  iXLen);
+
+define void @intrinsic_vse_v_nxv8f32_nxv8f32(<vscale x 8 x float> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vse_v_nxv8f32_nxv8f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vse32.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.nxv8f32(
+    <vscale x 8 x float> %0,
+    ptr %1,
+    iXLen %2)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.mask.nxv8f32(
+  <vscale x 8 x float>,
+  ptr,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vse_mask_v_nxv8f32_nxv8f32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_mask_v_nxv8f32_nxv8f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vse32.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv8f32(
+    <vscale x 8 x float> %0,
+    ptr %1,
+    <vscale x 8 x i1> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.nxv16f32(
+  <vscale x 16 x float>,
+  ptr,
+  iXLen);
+
+define void @intrinsic_vse_v_nxv16f32_nxv16f32(<vscale x 16 x float> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vse_v_nxv16f32_nxv16f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vse32.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.nxv16f32(
+    <vscale x 16 x float> %0,
+    ptr %1,
+    iXLen %2)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.mask.nxv16f32(
+  <vscale x 16 x float>,
+  ptr,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vse_mask_v_nxv16f32_nxv16f32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_mask_v_nxv16f32_nxv16f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vse32.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv16f32(
+    <vscale x 16 x float> %0,
+    ptr %1,
+    <vscale x 16 x i1> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.nxv1i16(
+  <vscale x 1 x i16>,
+  ptr,
+  iXLen);
+
+define void @intrinsic_vse_v_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vse_v_nxv1i16_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vse16.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.nxv1i16(
+    <vscale x 1 x i16> %0,
+    ptr %1,
+    iXLen %2)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.mask.nxv1i16(
+  <vscale x 1 x i16>,
+  ptr,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vse_mask_v_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_mask_v_nxv1i16_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vse16.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv1i16(
+    <vscale x 1 x i16> %0,
+    ptr %1,
+    <vscale x 1 x i1> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.nxv2i16(
+  <vscale x 2 x i16>,
+  ptr,
+  iXLen);
+
+define void @intrinsic_vse_v_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vse_v_nxv2i16_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vse16.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.nxv2i16(
+    <vscale x 2 x i16> %0,
+    ptr %1,
+    iXLen %2)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.mask.nxv2i16(
+  <vscale x 2 x i16>,
+  ptr,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vse_mask_v_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_mask_v_nxv2i16_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vse16.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv2i16(
+    <vscale x 2 x i16> %0,
+    ptr %1,
+    <vscale x 2 x i1> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.nxv4i16(
+  <vscale x 4 x i16>,
+  ptr,
+  iXLen);
+
+define void @intrinsic_vse_v_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vse_v_nxv4i16_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vse16.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.nxv4i16(
+    <vscale x 4 x i16> %0,
+    ptr %1,
+    iXLen %2)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.mask.nxv4i16(
+  <vscale x 4 x i16>,
+  ptr,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vse_mask_v_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_mask_v_nxv4i16_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vse16.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv4i16(
+    <vscale x 4 x i16> %0,
+    ptr %1,
+    <vscale x 4 x i1> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.nxv8i16(
+  <vscale x 8 x i16>,
+  ptr,
+  iXLen);
+
+define void @intrinsic_vse_v_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vse_v_nxv8i16_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vse16.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.nxv8i16(
+    <vscale x 8 x i16> %0,
+    ptr %1,
+    iXLen %2)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.mask.nxv8i16(
+  <vscale x 8 x i16>,
+  ptr,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vse_mask_v_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_mask_v_nxv8i16_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vse16.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv8i16(
+    <vscale x 8 x i16> %0,
+    ptr %1,
+    <vscale x 8 x i1> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.nxv16i16(
+  <vscale x 16 x i16>,
+  ptr,
+  iXLen);
+
+define void @intrinsic_vse_v_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vse_v_nxv16i16_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vse16.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.nxv16i16(
+    <vscale x 16 x i16> %0,
+    ptr %1,
+    iXLen %2)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.mask.nxv16i16(
+  <vscale x 16 x i16>,
+  ptr,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vse_mask_v_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_mask_v_nxv16i16_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vse16.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv16i16(
+    <vscale x 16 x i16> %0,
+    ptr %1,
+    <vscale x 16 x i1> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.nxv32i16(
+  <vscale x 32 x i16>,
+  ptr,
+  iXLen);
+
+define void @intrinsic_vse_v_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vse_v_nxv32i16_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vse16.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.nxv32i16(
+    <vscale x 32 x i16> %0,
+    ptr %1,
+    iXLen %2)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.mask.nxv32i16(
+  <vscale x 32 x i16>,
+  ptr,
+  <vscale x 32 x i1>,
+  iXLen);
+
+define void @intrinsic_vse_mask_v_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_mask_v_nxv32i16_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vse16.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv32i16(
+    <vscale x 32 x i16> %0,
+    ptr %1,
+    <vscale x 32 x i1> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.nxv1f16(
+  <vscale x 1 x half>,
+  ptr,
+  iXLen);
+
+define void @intrinsic_vse_v_nxv1f16_nxv1f16(<vscale x 1 x half> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vse_v_nxv1f16_nxv1f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vse16.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.nxv1f16(
+    <vscale x 1 x half> %0,
+    ptr %1,
+    iXLen %2)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.mask.nxv1f16(
+  <vscale x 1 x half>,
+  ptr,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vse_mask_v_nxv1f16_nxv1f16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_mask_v_nxv1f16_nxv1f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vse16.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv1f16(
+    <vscale x 1 x half> %0,
+    ptr %1,
+    <vscale x 1 x i1> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.nxv2f16(
+  <vscale x 2 x half>,
+  ptr,
+  iXLen);
+
+define void @intrinsic_vse_v_nxv2f16_nxv2f16(<vscale x 2 x half> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vse_v_nxv2f16_nxv2f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vse16.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.nxv2f16(
+    <vscale x 2 x half> %0,
+    ptr %1,
+    iXLen %2)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.mask.nxv2f16(
+  <vscale x 2 x half>,
+  ptr,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vse_mask_v_nxv2f16_nxv2f16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_mask_v_nxv2f16_nxv2f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vse16.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv2f16(
+    <vscale x 2 x half> %0,
+    ptr %1,
+    <vscale x 2 x i1> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.nxv4f16(
+  <vscale x 4 x half>,
+  ptr,
+  iXLen);
+
+define void @intrinsic_vse_v_nxv4f16_nxv4f16(<vscale x 4 x half> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vse_v_nxv4f16_nxv4f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vse16.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.nxv4f16(
+    <vscale x 4 x half> %0,
+    ptr %1,
+    iXLen %2)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.mask.nxv4f16(
+  <vscale x 4 x half>,
+  ptr,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vse_mask_v_nxv4f16_nxv4f16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_mask_v_nxv4f16_nxv4f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vse16.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv4f16(
+    <vscale x 4 x half> %0,
+    ptr %1,
+    <vscale x 4 x i1> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.nxv8f16(
+  <vscale x 8 x half>,
+  ptr,
+  iXLen);
+
+define void @intrinsic_vse_v_nxv8f16_nxv8f16(<vscale x 8 x half> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vse_v_nxv8f16_nxv8f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vse16.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.nxv8f16(
+    <vscale x 8 x half> %0,
+    ptr %1,
+    iXLen %2)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.mask.nxv8f16(
+  <vscale x 8 x half>,
+  ptr,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vse_mask_v_nxv8f16_nxv8f16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_mask_v_nxv8f16_nxv8f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vse16.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv8f16(
+    <vscale x 8 x half> %0,
+    ptr %1,
+    <vscale x 8 x i1> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.nxv16f16(
+  <vscale x 16 x half>,
+  ptr,
+  iXLen);
+
+define void @intrinsic_vse_v_nxv16f16_nxv16f16(<vscale x 16 x half> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vse_v_nxv16f16_nxv16f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vse16.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.nxv16f16(
+    <vscale x 16 x half> %0,
+    ptr %1,
+    iXLen %2)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.mask.nxv16f16(
+  <vscale x 16 x half>,
+  ptr,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vse_mask_v_nxv16f16_nxv16f16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_mask_v_nxv16f16_nxv16f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vse16.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv16f16(
+    <vscale x 16 x half> %0,
+    ptr %1,
+    <vscale x 16 x i1> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.nxv32f16(
+  <vscale x 32 x half>,
+  ptr,
+  iXLen);
+
+define void @intrinsic_vse_v_nxv32f16_nxv32f16(<vscale x 32 x half> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vse_v_nxv32f16_nxv32f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vse16.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.nxv32f16(
+    <vscale x 32 x half> %0,
+    ptr %1,
+    iXLen %2)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.mask.nxv32f16(
+  <vscale x 32 x half>,
+  ptr,
+  <vscale x 32 x i1>,
+  iXLen);
+
+define void @intrinsic_vse_mask_v_nxv32f16_nxv32f16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_mask_v_nxv32f16_nxv32f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vse16.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv32f16(
+    <vscale x 32 x half> %0,
+    ptr %1,
+    <vscale x 32 x i1> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.nxv1i8(
+  <vscale x 1 x i8>,
+  ptr,
+  iXLen);
+
+define void @intrinsic_vse_v_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vse_v_nxv1i8_nxv1i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vse8.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.nxv1i8(
+    <vscale x 1 x i8> %0,
+    ptr %1,
+    iXLen %2)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.mask.nxv1i8(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vse_mask_v_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_mask_v_nxv1i8_nxv1i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vse8.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv1i8(
+    <vscale x 1 x i8> %0,
+    ptr %1,
+    <vscale x 1 x i1> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.nxv2i8(
+  <vscale x 2 x i8>,
+  ptr,
+  iXLen);
+
+define void @intrinsic_vse_v_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vse_v_nxv2i8_nxv2i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vse8.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.nxv2i8(
+    <vscale x 2 x i8> %0,
+    ptr %1,
+    iXLen %2)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.mask.nxv2i8(
+  <vscale x 2 x i8>,
+  ptr,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vse_mask_v_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_mask_v_nxv2i8_nxv2i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vse8.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv2i8(
+    <vscale x 2 x i8> %0,
+    ptr %1,
+    <vscale x 2 x i1> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.nxv4i8(
+  <vscale x 4 x i8>,
+  ptr,
+  iXLen);
+
+define void @intrinsic_vse_v_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vse_v_nxv4i8_nxv4i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vse8.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.nxv4i8(
+    <vscale x 4 x i8> %0,
+    ptr %1,
+    iXLen %2)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.mask.nxv4i8(
+  <vscale x 4 x i8>,
+  ptr,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vse_mask_v_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_mask_v_nxv4i8_nxv4i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vse8.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv4i8(
+    <vscale x 4 x i8> %0,
+    ptr %1,
+    <vscale x 4 x i1> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.nxv8i8(
+  <vscale x 8 x i8>,
+  ptr,
+  iXLen);
+
+define void @intrinsic_vse_v_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vse_v_nxv8i8_nxv8i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vse8.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.nxv8i8(
+    <vscale x 8 x i8> %0,
+    ptr %1,
+    iXLen %2)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.mask.nxv8i8(
+  <vscale x 8 x i8>,
+  ptr,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vse_mask_v_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_mask_v_nxv8i8_nxv8i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vse8.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv8i8(
+    <vscale x 8 x i8> %0,
+    ptr %1,
+    <vscale x 8 x i1> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.nxv16i8(
+  <vscale x 16 x i8>,
+  ptr,
+  iXLen);
+
+define void @intrinsic_vse_v_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vse_v_nxv16i8_nxv16i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    vse8.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.nxv16i8(
+    <vscale x 16 x i8> %0,
+    ptr %1,
+    iXLen %2)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.mask.nxv16i8(
+  <vscale x 16 x i8>,
+  ptr,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vse_mask_v_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_mask_v_nxv16i8_nxv16i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    vse8.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv16i8(
+    <vscale x 16 x i8> %0,
+    ptr %1,
+    <vscale x 16 x i1> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.nxv32i8(
+  <vscale x 32 x i8>,
+  ptr,
+  iXLen);
+
+define void @intrinsic_vse_v_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vse_v_nxv32i8_nxv32i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT:    vse8.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.nxv32i8(
+    <vscale x 32 x i8> %0,
+    ptr %1,
+    iXLen %2)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.mask.nxv32i8(
+  <vscale x 32 x i8>,
+  ptr,
+  <vscale x 32 x i1>,
+  iXLen);
+
+define void @intrinsic_vse_mask_v_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_mask_v_nxv32i8_nxv32i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT:    vse8.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv32i8(
+    <vscale x 32 x i8> %0,
+    ptr %1,
+    <vscale x 32 x i1> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.nxv64i8(
+  <vscale x 64 x i8>,
+  ptr,
+  iXLen);
+
+define void @intrinsic_vse_v_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vse_v_nxv64i8_nxv64i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT:    vse8.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.nxv64i8(
+    <vscale x 64 x i8> %0,
+    ptr %1,
+    iXLen %2)
+
+  ret void
+}
+
+declare void @llvm.riscv.vse.mask.nxv64i8(
+  <vscale x 64 x i8>,
+  ptr,
+  <vscale x 64 x i1>,
+  iXLen);
+
+define void @intrinsic_vse_mask_v_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vse_mask_v_nxv64i8_nxv64i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT:    vse8.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vse.mask.nxv64i8(
+    <vscale x 64 x i8> %0,
+    ptr %1,
+    <vscale x 64 x i1> %2,
+    iXLen %3)
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsm.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsm.ll
new file mode 100644
index 0000000000000..5237536c07740
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsm.ll
@@ -0,0 +1,139 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN:   -global-isel -verify-machineinstrs | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN:   -global-isel -verify-machineinstrs | FileCheck %s
+
+declare void @llvm.riscv.vsm.nxv1i1(<vscale x 1 x i1>, ptr, iXLen);
+
+define void @intrinsic_vsm_v_nxv1i1(<vscale x 1 x i1> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vsm_v_nxv1i1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsm.v v0, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsm.nxv1i1(<vscale x 1 x i1> %0, ptr %1, iXLen %2)
+  ret void
+}
+
+declare void @llvm.riscv.vsm.nxv2i1(<vscale x 2 x i1>, ptr, iXLen);
+
+define void @intrinsic_vsm_v_nxv2i1(<vscale x 2 x i1> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vsm_v_nxv2i1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vsm.v v0, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsm.nxv2i1(<vscale x 2 x i1> %0, ptr %1, iXLen %2)
+  ret void
+}
+
+declare void @llvm.riscv.vsm.nxv4i1(<vscale x 4 x i1>, ptr, iXLen);
+
+define void @intrinsic_vsm_v_nxv4i1(<vscale x 4 x i1> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vsm_v_nxv4i1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsm.v v0, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsm.nxv4i1(<vscale x 4 x i1> %0, ptr %1, iXLen %2)
+  ret void
+}
+
+declare void @llvm.riscv.vsm.nxv8i1(<vscale x 8 x i1>, ptr, iXLen);
+
+define void @intrinsic_vsm_v_nxv8i1(<vscale x 8 x i1> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vsm_v_nxv8i1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsm.v v0, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsm.nxv8i1(<vscale x 8 x i1> %0, ptr %1, iXLen %2)
+  ret void
+}
+
+declare void @llvm.riscv.vsm.nxv16i1(<vscale x 16 x i1>, ptr, iXLen);
+
+define void @intrinsic_vsm_v_nxv16i1(<vscale x 16 x i1> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vsm_v_nxv16i1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    vsm.v v0, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsm.nxv16i1(<vscale x 16 x i1> %0, ptr %1, iXLen %2)
+  ret void
+}
+
+declare void @llvm.riscv.vsm.nxv32i1(<vscale x 32 x i1>, ptr, iXLen);
+
+define void @intrinsic_vsm_v_nxv32i1(<vscale x 32 x i1> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vsm_v_nxv32i1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT:    vsm.v v0, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsm.nxv32i1(<vscale x 32 x i1> %0, ptr %1, iXLen %2)
+  ret void
+}
+
+declare void @llvm.riscv.vsm.nxv64i1(<vscale x 64 x i1>, ptr, iXLen);
+
+define void @intrinsic_vsm_v_nxv64i1(<vscale x 64 x i1> %0, ptr %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vsm_v_nxv64i1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT:    vsm.v v0, (a0)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsm.nxv64i1(<vscale x 64 x i1> %0, ptr %1, iXLen %2)
+  ret void
+}
+
+declare <vscale x 1 x i1> @llvm.riscv.vmseq.nxv1i16(
+  <vscale x 1 x i16>,
+  <vscale x 1 x i16>,
+  iXLen);
+
+; Make sure we can use the vsetvli from the producing instruction.
+define void @test_vsetvli_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, ptr %2, iXLen %3) nounwind {
+; CHECK-LABEL: test_vsetvli_i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vmseq.vv v8, v8, v9
+; CHECK-NEXT:    vsm.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i1> @llvm.riscv.vmseq.nxv1i16(
+    <vscale x 1 x i16> %0,
+    <vscale x 1 x i16> %1,
+    iXLen %3)
+  call void @llvm.riscv.vsm.nxv1i1(<vscale x 1 x i1> %a, ptr %2, iXLen %3)
+  ret void
+}
+
+declare <vscale x 1 x i1> @llvm.riscv.vmseq.nxv1i32(
+  <vscale x 1 x i32>,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define void @test_vsetvli_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, ptr %2, iXLen %3) nounwind {
+; CHECK-LABEL: test_vsetvli_i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vmseq.vv v8, v8, v9
+; CHECK-NEXT:    vsm.v v8, (a0)
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i1> @llvm.riscv.vmseq.nxv1i32(
+    <vscale x 1 x i32> %0,
+    <vscale x 1 x i32> %1,
+    iXLen %3)
+  call void @llvm.riscv.vsm.nxv1i1(<vscale x 1 x i1> %a, ptr %2, iXLen %3)
+  ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsse.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsse.ll
new file mode 100644
index 0000000000000..b7609ff5fd1cd
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsse.ll
@@ -0,0 +1,1724 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \
+; RUN:   -global-isel -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \
+; RUN:   -global-isel -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare void @llvm.riscv.vsse.nxv1i64(
+  <vscale x 1 x i64>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv1i64_nxv1i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; CHECK-NEXT:    vsse64.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv1i64(
+    <vscale x 1 x i64> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv1i64(
+  <vscale x 1 x i64>,
+  ptr,
+  iXLen,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv1i64_nxv1i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; CHECK-NEXT:    vsse64.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv1i64(
+    <vscale x 1 x i64> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+define void @intrinsic_vsse_allonesmask_v_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_allonesmask_v_nxv1i64_nxv1i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; CHECK-NEXT:    vsse64.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv1i64(
+    <vscale x 1 x i64> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 1 x i1> splat (i1 true),
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv2i64(
+  <vscale x 2 x i64>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv2i64_nxv2i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; CHECK-NEXT:    vsse64.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv2i64(
+    <vscale x 2 x i64> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv2i64(
+  <vscale x 2 x i64>,
+  ptr,
+  iXLen,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv2i64_nxv2i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; CHECK-NEXT:    vsse64.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv2i64(
+    <vscale x 2 x i64> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv4i64(
+  <vscale x 4 x i64>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv4i64_nxv4i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; CHECK-NEXT:    vsse64.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv4i64(
+    <vscale x 4 x i64> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv4i64(
+  <vscale x 4 x i64>,
+  ptr,
+  iXLen,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv4i64_nxv4i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; CHECK-NEXT:    vsse64.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv4i64(
+    <vscale x 4 x i64> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv8i64(
+  <vscale x 8 x i64>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv8i64_nxv8i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT:    vsse64.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv8i64(
+    <vscale x 8 x i64> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv8i64(
+  <vscale x 8 x i64>,
+  ptr,
+  iXLen,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv8i64_nxv8i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT:    vsse64.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv8i64(
+    <vscale x 8 x i64> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv1f64(
+  <vscale x 1 x double>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv1f64_nxv1f64(<vscale x 1 x double> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv1f64_nxv1f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; CHECK-NEXT:    vsse64.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv1f64(
+    <vscale x 1 x double> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv1f64(
+  <vscale x 1 x double>,
+  ptr,
+  iXLen,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv1f64_nxv1f64(<vscale x 1 x double> %0, ptr %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv1f64_nxv1f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; CHECK-NEXT:    vsse64.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv1f64(
+    <vscale x 1 x double> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv2f64(
+  <vscale x 2 x double>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv2f64_nxv2f64(<vscale x 2 x double> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv2f64_nxv2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; CHECK-NEXT:    vsse64.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv2f64(
+    <vscale x 2 x double> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv2f64(
+  <vscale x 2 x double>,
+  ptr,
+  iXLen,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv2f64_nxv2f64(<vscale x 2 x double> %0, ptr %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv2f64_nxv2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; CHECK-NEXT:    vsse64.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv2f64(
+    <vscale x 2 x double> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv4f64(
+  <vscale x 4 x double>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv4f64_nxv4f64(<vscale x 4 x double> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv4f64_nxv4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; CHECK-NEXT:    vsse64.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv4f64(
+    <vscale x 4 x double> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv4f64(
+  <vscale x 4 x double>,
+  ptr,
+  iXLen,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv4f64_nxv4f64(<vscale x 4 x double> %0, ptr %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv4f64_nxv4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; CHECK-NEXT:    vsse64.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv4f64(
+    <vscale x 4 x double> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv8f64(
+  <vscale x 8 x double>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv8f64_nxv8f64(<vscale x 8 x double> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv8f64_nxv8f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT:    vsse64.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv8f64(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv8f64(
+  <vscale x 8 x double>,
+  ptr,
+  iXLen,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv8f64_nxv8f64(<vscale x 8 x double> %0, ptr %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv8f64_nxv8f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT:    vsse64.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv8f64(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv1i32(
+  <vscale x 1 x i32>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv1i32_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e32, mf2, ta, ma
+; CHECK-NEXT:    vsse32.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv1i32(
+    <vscale x 1 x i32> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv1i32(
+  <vscale x 1 x i32>,
+  ptr,
+  iXLen,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv1i32_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e32, mf2, ta, ma
+; CHECK-NEXT:    vsse32.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv1i32(
+    <vscale x 1 x i32> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv2i32(
+  <vscale x 2 x i32>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv2i32_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT:    vsse32.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv2i32(
+    <vscale x 2 x i32> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv2i32(
+  <vscale x 2 x i32>,
+  ptr,
+  iXLen,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv2i32_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT:    vsse32.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv2i32(
+    <vscale x 2 x i32> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv4i32(
+  <vscale x 4 x i32>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv4i32_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e32, m2, ta, ma
+; CHECK-NEXT:    vsse32.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv4i32(
+    <vscale x 4 x i32> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv4i32(
+  <vscale x 4 x i32>,
+  ptr,
+  iXLen,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv4i32_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e32, m2, ta, ma
+; CHECK-NEXT:    vsse32.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv4i32(
+    <vscale x 4 x i32> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv8i32(
+  <vscale x 8 x i32>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv8i32_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
+; CHECK-NEXT:    vsse32.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv8i32(
+    <vscale x 8 x i32> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv8i32(
+  <vscale x 8 x i32>,
+  ptr,
+  iXLen,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv8i32_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
+; CHECK-NEXT:    vsse32.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv8i32(
+    <vscale x 8 x i32> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv16i32(
+  <vscale x 16 x i32>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv16i32_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vsse32.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv16i32(
+    <vscale x 16 x i32> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv16i32(
+  <vscale x 16 x i32>,
+  ptr,
+  iXLen,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, iXLen %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv16i32_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vsse32.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv16i32(
+    <vscale x 16 x i32> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv1f32(
+  <vscale x 1 x float>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv1f32_nxv1f32(<vscale x 1 x float> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv1f32_nxv1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e32, mf2, ta, ma
+; CHECK-NEXT:    vsse32.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv1f32(
+    <vscale x 1 x float> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv1f32(
+  <vscale x 1 x float>,
+  ptr,
+  iXLen,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv1f32_nxv1f32(<vscale x 1 x float> %0, ptr %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv1f32_nxv1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e32, mf2, ta, ma
+; CHECK-NEXT:    vsse32.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv1f32(
+    <vscale x 1 x float> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv2f32(
+  <vscale x 2 x float>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv2f32_nxv2f32(<vscale x 2 x float> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv2f32_nxv2f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT:    vsse32.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv2f32(
+    <vscale x 2 x float> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv2f32(
+  <vscale x 2 x float>,
+  ptr,
+  iXLen,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv2f32_nxv2f32(<vscale x 2 x float> %0, ptr %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv2f32_nxv2f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT:    vsse32.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv2f32(
+    <vscale x 2 x float> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv4f32(
+  <vscale x 4 x float>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv4f32_nxv4f32(<vscale x 4 x float> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv4f32_nxv4f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e32, m2, ta, ma
+; CHECK-NEXT:    vsse32.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv4f32(
+    <vscale x 4 x float> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv4f32(
+  <vscale x 4 x float>,
+  ptr,
+  iXLen,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv4f32_nxv4f32(<vscale x 4 x float> %0, ptr %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv4f32_nxv4f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e32, m2, ta, ma
+; CHECK-NEXT:    vsse32.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv4f32(
+    <vscale x 4 x float> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv8f32(
+  <vscale x 8 x float>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv8f32_nxv8f32(<vscale x 8 x float> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv8f32_nxv8f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
+; CHECK-NEXT:    vsse32.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv8f32(
+    <vscale x 8 x float> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv8f32(
+  <vscale x 8 x float>,
+  ptr,
+  iXLen,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv8f32_nxv8f32(<vscale x 8 x float> %0, ptr %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv8f32_nxv8f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
+; CHECK-NEXT:    vsse32.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv8f32(
+    <vscale x 8 x float> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv16f32(
+  <vscale x 16 x float>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv16f32_nxv16f32(<vscale x 16 x float> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv16f32_nxv16f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vsse32.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv16f32(
+    <vscale x 16 x float> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv16f32(
+  <vscale x 16 x float>,
+  ptr,
+  iXLen,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv16f32_nxv16f32(<vscale x 16 x float> %0, ptr %1, iXLen %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv16f32_nxv16f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vsse32.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv16f32(
+    <vscale x 16 x float> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv1i16(
+  <vscale x 1 x i16>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv1i16_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, mf4, ta, ma
+; CHECK-NEXT:    vsse16.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv1i16(
+    <vscale x 1 x i16> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv1i16(
+  <vscale x 1 x i16>,
+  ptr,
+  iXLen,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv1i16_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, mf4, ta, ma
+; CHECK-NEXT:    vsse16.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv1i16(
+    <vscale x 1 x i16> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv2i16(
+  <vscale x 2 x i16>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv2i16_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, mf2, ta, ma
+; CHECK-NEXT:    vsse16.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv2i16(
+    <vscale x 2 x i16> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv2i16(
+  <vscale x 2 x i16>,
+  ptr,
+  iXLen,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv2i16_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, mf2, ta, ma
+; CHECK-NEXT:    vsse16.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv2i16(
+    <vscale x 2 x i16> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv4i16(
+  <vscale x 4 x i16>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv4i16_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vsse16.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv4i16(
+    <vscale x 4 x i16> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv4i16(
+  <vscale x 4 x i16>,
+  ptr,
+  iXLen,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv4i16_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vsse16.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv4i16(
+    <vscale x 4 x i16> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv8i16(
+  <vscale x 8 x i16>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv8i16_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
+; CHECK-NEXT:    vsse16.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv8i16(
+    <vscale x 8 x i16> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv8i16(
+  <vscale x 8 x i16>,
+  ptr,
+  iXLen,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv8i16_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
+; CHECK-NEXT:    vsse16.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv8i16(
+    <vscale x 8 x i16> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv16i16(
+  <vscale x 16 x i16>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv16i16_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    vsse16.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv16i16(
+    <vscale x 16 x i16> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv16i16(
+  <vscale x 16 x i16>,
+  ptr,
+  iXLen,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, iXLen %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv16i16_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    vsse16.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv16i16(
+    <vscale x 16 x i16> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv32i16(
+  <vscale x 32 x i16>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv32i16_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
+; CHECK-NEXT:    vsse16.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv32i16(
+    <vscale x 32 x i16> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv32i16(
+  <vscale x 32 x i16>,
+  ptr,
+  iXLen,
+  <vscale x 32 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, iXLen %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv32i16_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
+; CHECK-NEXT:    vsse16.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv32i16(
+    <vscale x 32 x i16> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 32 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv1f16(
+  <vscale x 1 x half>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv1f16_nxv1f16(<vscale x 1 x half> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv1f16_nxv1f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, mf4, ta, ma
+; CHECK-NEXT:    vsse16.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv1f16(
+    <vscale x 1 x half> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv1f16(
+  <vscale x 1 x half>,
+  ptr,
+  iXLen,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv1f16_nxv1f16(<vscale x 1 x half> %0, ptr %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv1f16_nxv1f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, mf4, ta, ma
+; CHECK-NEXT:    vsse16.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv1f16(
+    <vscale x 1 x half> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv2f16(
+  <vscale x 2 x half>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv2f16_nxv2f16(<vscale x 2 x half> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv2f16_nxv2f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, mf2, ta, ma
+; CHECK-NEXT:    vsse16.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv2f16(
+    <vscale x 2 x half> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv2f16(
+  <vscale x 2 x half>,
+  ptr,
+  iXLen,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv2f16_nxv2f16(<vscale x 2 x half> %0, ptr %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv2f16_nxv2f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, mf2, ta, ma
+; CHECK-NEXT:    vsse16.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv2f16(
+    <vscale x 2 x half> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv4f16(
+  <vscale x 4 x half>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv4f16_nxv4f16(<vscale x 4 x half> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv4f16_nxv4f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vsse16.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv4f16(
+    <vscale x 4 x half> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv4f16(
+  <vscale x 4 x half>,
+  ptr,
+  iXLen,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv4f16_nxv4f16(<vscale x 4 x half> %0, ptr %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv4f16_nxv4f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vsse16.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv4f16(
+    <vscale x 4 x half> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv8f16(
+  <vscale x 8 x half>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv8f16_nxv8f16(<vscale x 8 x half> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv8f16_nxv8f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
+; CHECK-NEXT:    vsse16.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv8f16(
+    <vscale x 8 x half> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv8f16(
+  <vscale x 8 x half>,
+  ptr,
+  iXLen,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv8f16_nxv8f16(<vscale x 8 x half> %0, ptr %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv8f16_nxv8f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
+; CHECK-NEXT:    vsse16.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv8f16(
+    <vscale x 8 x half> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv16f16(
+  <vscale x 16 x half>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv16f16_nxv16f16(<vscale x 16 x half> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv16f16_nxv16f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    vsse16.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv16f16(
+    <vscale x 16 x half> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv16f16(
+  <vscale x 16 x half>,
+  ptr,
+  iXLen,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv16f16_nxv16f16(<vscale x 16 x half> %0, ptr %1, iXLen %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv16f16_nxv16f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    vsse16.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv16f16(
+    <vscale x 16 x half> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv32f16(
+  <vscale x 32 x half>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv32f16_nxv32f16(<vscale x 32 x half> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv32f16_nxv32f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
+; CHECK-NEXT:    vsse16.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv32f16(
+    <vscale x 32 x half> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv32f16(
+  <vscale x 32 x half>,
+  ptr,
+  iXLen,
+  <vscale x 32 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv32f16_nxv32f16(<vscale x 32 x half> %0, ptr %1, iXLen %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv32f16_nxv32f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
+; CHECK-NEXT:    vsse16.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv32f16(
+    <vscale x 32 x half> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 32 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv1i8(
+  <vscale x 1 x i8>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv1i8_nxv1i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e8, mf8, ta, ma
+; CHECK-NEXT:    vsse8.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv1i8(
+    <vscale x 1 x i8> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv1i8(
+  <vscale x 1 x i8>,
+  ptr,
+  iXLen,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv1i8_nxv1i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e8, mf8, ta, ma
+; CHECK-NEXT:    vsse8.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv1i8(
+    <vscale x 1 x i8> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv2i8(
+  <vscale x 2 x i8>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv2i8_nxv2i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e8, mf4, ta, ma
+; CHECK-NEXT:    vsse8.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv2i8(
+    <vscale x 2 x i8> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv2i8(
+  <vscale x 2 x i8>,
+  ptr,
+  iXLen,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv2i8_nxv2i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e8, mf4, ta, ma
+; CHECK-NEXT:    vsse8.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv2i8(
+    <vscale x 2 x i8> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv4i8(
+  <vscale x 4 x i8>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv4i8_nxv4i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e8, mf2, ta, ma
+; CHECK-NEXT:    vsse8.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv4i8(
+    <vscale x 4 x i8> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv4i8(
+  <vscale x 4 x i8>,
+  ptr,
+  iXLen,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv4i8_nxv4i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e8, mf2, ta, ma
+; CHECK-NEXT:    vsse8.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv4i8(
+    <vscale x 4 x i8> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv8i8(
+  <vscale x 8 x i8>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv8i8_nxv8i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-NEXT:    vsse8.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv8i8(
+    <vscale x 8 x i8> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv8i8(
+  <vscale x 8 x i8>,
+  ptr,
+  iXLen,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv8i8_nxv8i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-NEXT:    vsse8.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv8i8(
+    <vscale x 8 x i8> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv16i8(
+  <vscale x 16 x i8>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv16i8_nxv16i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
+; CHECK-NEXT:    vsse8.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv16i8(
+    <vscale x 16 x i8> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv16i8(
+  <vscale x 16 x i8>,
+  ptr,
+  iXLen,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, iXLen %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv16i8_nxv16i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
+; CHECK-NEXT:    vsse8.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv16i8(
+    <vscale x 16 x i8> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv32i8(
+  <vscale x 32 x i8>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv32i8_nxv32i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
+; CHECK-NEXT:    vsse8.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv32i8(
+    <vscale x 32 x i8> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv32i8(
+  <vscale x 32 x i8>,
+  ptr,
+  iXLen,
+  <vscale x 32 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, iXLen %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv32i8_nxv32i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
+; CHECK-NEXT:    vsse8.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv32i8(
+    <vscale x 32 x i8> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 32 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.nxv64i8(
+  <vscale x 64 x i8>,
+  ptr,
+  iXLen,
+  iXLen);
+
+define void @intrinsic_vsse_v_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, iXLen %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsse_v_nxv64i8_nxv64i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-NEXT:    vsse8.v v8, (a0), a1
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.nxv64i8(
+    <vscale x 64 x i8> %0,
+    ptr %1,
+    iXLen %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsse.mask.nxv64i8(
+  <vscale x 64 x i8>,
+  ptr,
+  iXLen,
+  <vscale x 64 x i1>,
+  iXLen);
+
+define void @intrinsic_vsse_mask_v_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, iXLen %2, <vscale x 64 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsse_mask_v_nxv64i8_nxv64i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-NEXT:    vsse8.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsse.mask.nxv64i8(
+    <vscale x 64 x i8> %0,
+    ptr %1,
+    iXLen %2,
+    <vscale x 64 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/atomic-rmw-minmax.ll b/llvm/test/CodeGen/RISCV/atomic-rmw-minmax.ll
new file mode 100644
index 0000000000000..b43555c6637c4
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/atomic-rmw-minmax.ll
@@ -0,0 +1,642 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+b,+zalrsc -verify-machineinstrs < %s \
+; RUN:   | FileCheck -check-prefixes=RV32IB-COMMON,RV32IB-ZALRSC %s
+; RUN: llc -mtriple=riscv32 -mattr=+b,+zalrsc,+permissive-zalrsc -verify-machineinstrs < %s \
+; RUN:   | FileCheck -check-prefixes=RV32IB-COMMON,RV32IB-ZALRSC-PERM %s
+; RUN: llc -mtriple=riscv32 -mattr=+b,+zalrsc,+permissive-zalrsc,+a -verify-machineinstrs < %s \
+; RUN:   | FileCheck -check-prefixes=RV32IB-COMMON,RV32IAB %s
+;
+; RUN: llc -mtriple=riscv64 -mattr=+b,+zalrsc -verify-machineinstrs < %s \
+; RUN:   | FileCheck -check-prefixes=RV64IB-ZALRSC %s
+; RUN: llc -mtriple=riscv64 -mattr=+b,+zalrsc,+permissive-zalrsc -verify-machineinstrs < %s \
+; RUN:   | FileCheck -check-prefixes=RV64IB-ZALRSC-PERM %s
+; RUN: llc -mtriple=riscv64 -mattr=+b,+zalrsc,+permissive-zalrsc,+a -verify-machineinstrs < %s \
+; RUN:   | FileCheck -check-prefixes=RV64IAB %s
+
+define i32 @atomicrmw_max_i32_seq_cst(ptr %a, i32 %b) nounwind {
+; RV32IB-ZALRSC-LABEL: atomicrmw_max_i32_seq_cst:
+; RV32IB-ZALRSC:       # %bb.0:
+; RV32IB-ZALRSC-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
+; RV32IB-ZALRSC-NEXT:    lr.w.aqrl a2, (a0)
+; RV32IB-ZALRSC-NEXT:    mv a3, a2
+; RV32IB-ZALRSC-NEXT:    bge a3, a1, .LBB0_3
+; RV32IB-ZALRSC-NEXT:  # %bb.2: # in Loop: Header=BB0_1 Depth=1
+; RV32IB-ZALRSC-NEXT:    mv a3, a1
+; RV32IB-ZALRSC-NEXT:  .LBB0_3: # in Loop: Header=BB0_1 Depth=1
+; RV32IB-ZALRSC-NEXT:    sc.w.rl a3, a3, (a0)
+; RV32IB-ZALRSC-NEXT:    bnez a3, .LBB0_1
+; RV32IB-ZALRSC-NEXT:  # %bb.4:
+; RV32IB-ZALRSC-NEXT:    mv a0, a2
+; RV32IB-ZALRSC-NEXT:    ret
+;
+; RV32IB-ZALRSC-PERM-LABEL: atomicrmw_max_i32_seq_cst:
+; RV32IB-ZALRSC-PERM:       # %bb.0:
+; RV32IB-ZALRSC-PERM-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
+; RV32IB-ZALRSC-PERM-NEXT:    lr.w.aqrl a2, (a0)
+; RV32IB-ZALRSC-PERM-NEXT:    max a3, a2, a1
+; RV32IB-ZALRSC-PERM-NEXT:    sc.w.rl a3, a3, (a0)
+; RV32IB-ZALRSC-PERM-NEXT:    bnez a3, .LBB0_1
+; RV32IB-ZALRSC-PERM-NEXT:  # %bb.2:
+; RV32IB-ZALRSC-PERM-NEXT:    mv a0, a2
+; RV32IB-ZALRSC-PERM-NEXT:    ret
+;
+; RV32IAB-LABEL: atomicrmw_max_i32_seq_cst:
+; RV32IAB:       # %bb.0:
+; RV32IAB-NEXT:    amomax.w.aqrl a0, a1, (a0)
+; RV32IAB-NEXT:    ret
+;
+; RV64IB-ZALRSC-LABEL: atomicrmw_max_i32_seq_cst:
+; RV64IB-ZALRSC:       # %bb.0:
+; RV64IB-ZALRSC-NEXT:    sext.w a2, a1
+; RV64IB-ZALRSC-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
+; RV64IB-ZALRSC-NEXT:    lr.w.aqrl a1, (a0)
+; RV64IB-ZALRSC-NEXT:    mv a3, a1
+; RV64IB-ZALRSC-NEXT:    bge a3, a2, .LBB0_3
+; RV64IB-ZALRSC-NEXT:  # %bb.2: # in Loop: Header=BB0_1 Depth=1
+; RV64IB-ZALRSC-NEXT:    mv a3, a2
+; RV64IB-ZALRSC-NEXT:  .LBB0_3: # in Loop: Header=BB0_1 Depth=1
+; RV64IB-ZALRSC-NEXT:    sc.w.rl a3, a3, (a0)
+; RV64IB-ZALRSC-NEXT:    bnez a3, .LBB0_1
+; RV64IB-ZALRSC-NEXT:  # %bb.4:
+; RV64IB-ZALRSC-NEXT:    mv a0, a1
+; RV64IB-ZALRSC-NEXT:    ret
+;
+; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_max_i32_seq_cst:
+; RV64IB-ZALRSC-PERM:       # %bb.0:
+; RV64IB-ZALRSC-PERM-NEXT:    sext.w a2, a1
+; RV64IB-ZALRSC-PERM-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
+; RV64IB-ZALRSC-PERM-NEXT:    lr.w.aqrl a1, (a0)
+; RV64IB-ZALRSC-PERM-NEXT:    max a3, a1, a2
+; RV64IB-ZALRSC-PERM-NEXT:    sc.w.rl a3, a3, (a0)
+; RV64IB-ZALRSC-PERM-NEXT:    bnez a3, .LBB0_1
+; RV64IB-ZALRSC-PERM-NEXT:  # %bb.2:
+; RV64IB-ZALRSC-PERM-NEXT:    mv a0, a1
+; RV64IB-ZALRSC-PERM-NEXT:    ret
+;
+; RV64IAB-LABEL: atomicrmw_max_i32_seq_cst:
+; RV64IAB:       # %bb.0:
+; RV64IAB-NEXT:    amomax.w.aqrl a0, a1, (a0)
+; RV64IAB-NEXT:    ret
+  %1 = atomicrmw max ptr %a, i32 %b seq_cst
+  ret i32 %1
+}
+
+define i32 @atomicrmw_min_i32_seq_cst(ptr %a, i32 %b) nounwind {
+; RV32IB-ZALRSC-LABEL: atomicrmw_min_i32_seq_cst:
+; RV32IB-ZALRSC:       # %bb.0:
+; RV32IB-ZALRSC-NEXT:  .LBB1_1: # =>This Inner Loop Header: Depth=1
+; RV32IB-ZALRSC-NEXT:    lr.w.aqrl a2, (a0)
+; RV32IB-ZALRSC-NEXT:    mv a3, a2
+; RV32IB-ZALRSC-NEXT:    bge a1, a3, .LBB1_3
+; RV32IB-ZALRSC-NEXT:  # %bb.2: # in Loop: Header=BB1_1 Depth=1
+; RV32IB-ZALRSC-NEXT:    mv a3, a1
+; RV32IB-ZALRSC-NEXT:  .LBB1_3: # in Loop: Header=BB1_1 Depth=1
+; RV32IB-ZALRSC-NEXT:    sc.w.rl a3, a3, (a0)
+; RV32IB-ZALRSC-NEXT:    bnez a3, .LBB1_1
+; RV32IB-ZALRSC-NEXT:  # %bb.4:
+; RV32IB-ZALRSC-NEXT:    mv a0, a2
+; RV32IB-ZALRSC-NEXT:    ret
+;
+; RV32IB-ZALRSC-PERM-LABEL: atomicrmw_min_i32_seq_cst:
+; RV32IB-ZALRSC-PERM:       # %bb.0:
+; RV32IB-ZALRSC-PERM-NEXT:  .LBB1_1: # =>This Inner Loop Header: Depth=1
+; RV32IB-ZALRSC-PERM-NEXT:    lr.w.aqrl a2, (a0)
+; RV32IB-ZALRSC-PERM-NEXT:    min a3, a2, a1
+; RV32IB-ZALRSC-PERM-NEXT:    sc.w.rl a3, a3, (a0)
+; RV32IB-ZALRSC-PERM-NEXT:    bnez a3, .LBB1_1
+; RV32IB-ZALRSC-PERM-NEXT:  # %bb.2:
+; RV32IB-ZALRSC-PERM-NEXT:    mv a0, a2
+; RV32IB-ZALRSC-PERM-NEXT:    ret
+;
+; RV32IAB-LABEL: atomicrmw_min_i32_seq_cst:
+; RV32IAB:       # %bb.0:
+; RV32IAB-NEXT:    amomin.w.aqrl a0, a1, (a0)
+; RV32IAB-NEXT:    ret
+;
+; RV64IB-ZALRSC-LABEL: atomicrmw_min_i32_seq_cst:
+; RV64IB-ZALRSC:       # %bb.0:
+; RV64IB-ZALRSC-NEXT:    sext.w a2, a1
+; RV64IB-ZALRSC-NEXT:  .LBB1_1: # =>This Inner Loop Header: Depth=1
+; RV64IB-ZALRSC-NEXT:    lr.w.aqrl a1, (a0)
+; RV64IB-ZALRSC-NEXT:    mv a3, a1
+; RV64IB-ZALRSC-NEXT:    bge a2, a3, .LBB1_3
+; RV64IB-ZALRSC-NEXT:  # %bb.2: # in Loop: Header=BB1_1 Depth=1
+; RV64IB-ZALRSC-NEXT:    mv a3, a2
+; RV64IB-ZALRSC-NEXT:  .LBB1_3: # in Loop: Header=BB1_1 Depth=1
+; RV64IB-ZALRSC-NEXT:    sc.w.rl a3, a3, (a0)
+; RV64IB-ZALRSC-NEXT:    bnez a3, .LBB1_1
+; RV64IB-ZALRSC-NEXT:  # %bb.4:
+; RV64IB-ZALRSC-NEXT:    mv a0, a1
+; RV64IB-ZALRSC-NEXT:    ret
+;
+; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_min_i32_seq_cst:
+; RV64IB-ZALRSC-PERM:       # %bb.0:
+; RV64IB-ZALRSC-PERM-NEXT:    sext.w a2, a1
+; RV64IB-ZALRSC-PERM-NEXT:  .LBB1_1: # =>This Inner Loop Header: Depth=1
+; RV64IB-ZALRSC-PERM-NEXT:    lr.w.aqrl a1, (a0)
+; RV64IB-ZALRSC-PERM-NEXT:    min a3, a1, a2
+; RV64IB-ZALRSC-PERM-NEXT:    sc.w.rl a3, a3, (a0)
+; RV64IB-ZALRSC-PERM-NEXT:    bnez a3, .LBB1_1
+; RV64IB-ZALRSC-PERM-NEXT:  # %bb.2:
+; RV64IB-ZALRSC-PERM-NEXT:    mv a0, a1
+; RV64IB-ZALRSC-PERM-NEXT:    ret
+;
+; RV64IAB-LABEL: atomicrmw_min_i32_seq_cst:
+; RV64IAB:       # %bb.0:
+; RV64IAB-NEXT:    amomin.w.aqrl a0, a1, (a0)
+; RV64IAB-NEXT:    ret
+  %1 = atomicrmw min ptr %a, i32 %b seq_cst
+  ret i32 %1
+}
+
+define i32 @atomicrmw_umax_i32_seq_cst(ptr %a, i32 %b) nounwind {
+; RV32IB-ZALRSC-LABEL: atomicrmw_umax_i32_seq_cst:
+; RV32IB-ZALRSC:       # %bb.0:
+; RV32IB-ZALRSC-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
+; RV32IB-ZALRSC-NEXT:    lr.w.aqrl a2, (a0)
+; RV32IB-ZALRSC-NEXT:    mv a3, a2
+; RV32IB-ZALRSC-NEXT:    bgeu a3, a1, .LBB2_3
+; RV32IB-ZALRSC-NEXT:  # %bb.2: # in Loop: Header=BB2_1 Depth=1
+; RV32IB-ZALRSC-NEXT:    mv a3, a1
+; RV32IB-ZALRSC-NEXT:  .LBB2_3: # in Loop: Header=BB2_1 Depth=1
+; RV32IB-ZALRSC-NEXT:    sc.w.rl a3, a3, (a0)
+; RV32IB-ZALRSC-NEXT:    bnez a3, .LBB2_1
+; RV32IB-ZALRSC-NEXT:  # %bb.4:
+; RV32IB-ZALRSC-NEXT:    mv a0, a2
+; RV32IB-ZALRSC-NEXT:    ret
+;
+; RV32IB-ZALRSC-PERM-LABEL: atomicrmw_umax_i32_seq_cst:
+; RV32IB-ZALRSC-PERM:       # %bb.0:
+; RV32IB-ZALRSC-PERM-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
+; RV32IB-ZALRSC-PERM-NEXT:    lr.w.aqrl a2, (a0)
+; RV32IB-ZALRSC-PERM-NEXT:    maxu a3, a2, a1
+; RV32IB-ZALRSC-PERM-NEXT:    sc.w.rl a3, a3, (a0)
+; RV32IB-ZALRSC-PERM-NEXT:    bnez a3, .LBB2_1
+; RV32IB-ZALRSC-PERM-NEXT:  # %bb.2:
+; RV32IB-ZALRSC-PERM-NEXT:    mv a0, a2
+; RV32IB-ZALRSC-PERM-NEXT:    ret
+;
+; RV32IAB-LABEL: atomicrmw_umax_i32_seq_cst:
+; RV32IAB:       # %bb.0:
+; RV32IAB-NEXT:    amomaxu.w.aqrl a0, a1, (a0)
+; RV32IAB-NEXT:    ret
+;
+; RV64IB-ZALRSC-LABEL: atomicrmw_umax_i32_seq_cst:
+; RV64IB-ZALRSC:       # %bb.0:
+; RV64IB-ZALRSC-NEXT:    sext.w a2, a1
+; RV64IB-ZALRSC-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
+; RV64IB-ZALRSC-NEXT:    lr.w.aqrl a1, (a0)
+; RV64IB-ZALRSC-NEXT:    mv a3, a1
+; RV64IB-ZALRSC-NEXT:    bgeu a3, a2, .LBB2_3
+; RV64IB-ZALRSC-NEXT:  # %bb.2: # in Loop: Header=BB2_1 Depth=1
+; RV64IB-ZALRSC-NEXT:    mv a3, a2
+; RV64IB-ZALRSC-NEXT:  .LBB2_3: # in Loop: Header=BB2_1 Depth=1
+; RV64IB-ZALRSC-NEXT:    sc.w.rl a3, a3, (a0)
+; RV64IB-ZALRSC-NEXT:    bnez a3, .LBB2_1
+; RV64IB-ZALRSC-NEXT:  # %bb.4:
+; RV64IB-ZALRSC-NEXT:    mv a0, a1
+; RV64IB-ZALRSC-NEXT:    ret
+;
+; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_umax_i32_seq_cst:
+; RV64IB-ZALRSC-PERM:       # %bb.0:
+; RV64IB-ZALRSC-PERM-NEXT:    sext.w a2, a1
+; RV64IB-ZALRSC-PERM-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
+; RV64IB-ZALRSC-PERM-NEXT:    lr.w.aqrl a1, (a0)
+; RV64IB-ZALRSC-PERM-NEXT:    maxu a3, a1, a2
+; RV64IB-ZALRSC-PERM-NEXT:    sc.w.rl a3, a3, (a0)
+; RV64IB-ZALRSC-PERM-NEXT:    bnez a3, .LBB2_1
+; RV64IB-ZALRSC-PERM-NEXT:  # %bb.2:
+; RV64IB-ZALRSC-PERM-NEXT:    mv a0, a1
+; RV64IB-ZALRSC-PERM-NEXT:    ret
+;
+; RV64IAB-LABEL: atomicrmw_umax_i32_seq_cst:
+; RV64IAB:       # %bb.0:
+; RV64IAB-NEXT:    amomaxu.w.aqrl a0, a1, (a0)
+; RV64IAB-NEXT:    ret
+  %1 = atomicrmw umax ptr %a, i32 %b seq_cst
+  ret i32 %1
+}
+
+define i32 @atomicrmw_umin_i32_seq_cst(ptr %a, i32 %b) nounwind {
+; RV32IB-ZALRSC-LABEL: atomicrmw_umin_i32_seq_cst:
+; RV32IB-ZALRSC:       # %bb.0:
+; RV32IB-ZALRSC-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
+; RV32IB-ZALRSC-NEXT:    lr.w.aqrl a2, (a0)
+; RV32IB-ZALRSC-NEXT:    mv a3, a2
+; RV32IB-ZALRSC-NEXT:    bgeu a1, a3, .LBB3_3
+; RV32IB-ZALRSC-NEXT:  # %bb.2: # in Loop: Header=BB3_1 Depth=1
+; RV32IB-ZALRSC-NEXT:    mv a3, a1
+; RV32IB-ZALRSC-NEXT:  .LBB3_3: # in Loop: Header=BB3_1 Depth=1
+; RV32IB-ZALRSC-NEXT:    sc.w.rl a3, a3, (a0)
+; RV32IB-ZALRSC-NEXT:    bnez a3, .LBB3_1
+; RV32IB-ZALRSC-NEXT:  # %bb.4:
+; RV32IB-ZALRSC-NEXT:    mv a0, a2
+; RV32IB-ZALRSC-NEXT:    ret
+;
+; RV32IB-ZALRSC-PERM-LABEL: atomicrmw_umin_i32_seq_cst:
+; RV32IB-ZALRSC-PERM:       # %bb.0:
+; RV32IB-ZALRSC-PERM-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
+; RV32IB-ZALRSC-PERM-NEXT:    lr.w.aqrl a2, (a0)
+; RV32IB-ZALRSC-PERM-NEXT:    minu a3, a2, a1
+; RV32IB-ZALRSC-PERM-NEXT:    sc.w.rl a3, a3, (a0)
+; RV32IB-ZALRSC-PERM-NEXT:    bnez a3, .LBB3_1
+; RV32IB-ZALRSC-PERM-NEXT:  # %bb.2:
+; RV32IB-ZALRSC-PERM-NEXT:    mv a0, a2
+; RV32IB-ZALRSC-PERM-NEXT:    ret
+;
+; RV32IAB-LABEL: atomicrmw_umin_i32_seq_cst:
+; RV32IAB:       # %bb.0:
+; RV32IAB-NEXT:    amominu.w.aqrl a0, a1, (a0)
+; RV32IAB-NEXT:    ret
+;
+; RV64IB-ZALRSC-LABEL: atomicrmw_umin_i32_seq_cst:
+; RV64IB-ZALRSC:       # %bb.0:
+; RV64IB-ZALRSC-NEXT:    sext.w a2, a1
+; RV64IB-ZALRSC-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
+; RV64IB-ZALRSC-NEXT:    lr.w.aqrl a1, (a0)
+; RV64IB-ZALRSC-NEXT:    mv a3, a1
+; RV64IB-ZALRSC-NEXT:    bgeu a2, a3, .LBB3_3
+; RV64IB-ZALRSC-NEXT:  # %bb.2: # in Loop: Header=BB3_1 Depth=1
+; RV64IB-ZALRSC-NEXT:    mv a3, a2
+; RV64IB-ZALRSC-NEXT:  .LBB3_3: # in Loop: Header=BB3_1 Depth=1
+; RV64IB-ZALRSC-NEXT:    sc.w.rl a3, a3, (a0)
+; RV64IB-ZALRSC-NEXT:    bnez a3, .LBB3_1
+; RV64IB-ZALRSC-NEXT:  # %bb.4:
+; RV64IB-ZALRSC-NEXT:    mv a0, a1
+; RV64IB-ZALRSC-NEXT:    ret
+;
+; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_umin_i32_seq_cst:
+; RV64IB-ZALRSC-PERM:       # %bb.0:
+; RV64IB-ZALRSC-PERM-NEXT:    sext.w a2, a1
+; RV64IB-ZALRSC-PERM-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
+; RV64IB-ZALRSC-PERM-NEXT:    lr.w.aqrl a1, (a0)
+; RV64IB-ZALRSC-PERM-NEXT:    minu a3, a1, a2
+; RV64IB-ZALRSC-PERM-NEXT:    sc.w.rl a3, a3, (a0)
+; RV64IB-ZALRSC-PERM-NEXT:    bnez a3, .LBB3_1
+; RV64IB-ZALRSC-PERM-NEXT:  # %bb.2:
+; RV64IB-ZALRSC-PERM-NEXT:    mv a0, a1
+; RV64IB-ZALRSC-PERM-NEXT:    ret
+;
+; RV64IAB-LABEL: atomicrmw_umin_i32_seq_cst:
+; RV64IAB:       # %bb.0:
+; RV64IAB-NEXT:    amominu.w.aqrl a0, a1, (a0)
+; RV64IAB-NEXT:    ret
+  %1 = atomicrmw umin ptr %a, i32 %b seq_cst
+  ret i32 %1
+}
+
+define i64 @atomicrmw_max_i64_seq_cst(ptr %a, i64 %b) nounwind {
+; RV32IB-COMMON-LABEL: atomicrmw_max_i64_seq_cst:
+; RV32IB-COMMON:       # %bb.0:
+; RV32IB-COMMON-NEXT:    addi sp, sp, -32
+; RV32IB-COMMON-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32IB-COMMON-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32IB-COMMON-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32IB-COMMON-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32IB-COMMON-NEXT:    mv s0, a2
+; RV32IB-COMMON-NEXT:    mv s1, a0
+; RV32IB-COMMON-NEXT:    lw a4, 0(a0)
+; RV32IB-COMMON-NEXT:    lw a5, 4(a0)
+; RV32IB-COMMON-NEXT:    mv s2, a1
+; RV32IB-COMMON-NEXT:    j .LBB4_2
+; RV32IB-COMMON-NEXT:  .LBB4_1: # %atomicrmw.start
+; RV32IB-COMMON-NEXT:    # in Loop: Header=BB4_2 Depth=1
+; RV32IB-COMMON-NEXT:    sw a4, 8(sp)
+; RV32IB-COMMON-NEXT:    sw a5, 12(sp)
+; RV32IB-COMMON-NEXT:    addi a1, sp, 8
+; RV32IB-COMMON-NEXT:    li a4, 5
+; RV32IB-COMMON-NEXT:    li a5, 5
+; RV32IB-COMMON-NEXT:    mv a0, s1
+; RV32IB-COMMON-NEXT:    call __atomic_compare_exchange_8
+; RV32IB-COMMON-NEXT:    lw a4, 8(sp)
+; RV32IB-COMMON-NEXT:    lw a5, 12(sp)
+; RV32IB-COMMON-NEXT:    bnez a0, .LBB4_7
+; RV32IB-COMMON-NEXT:  .LBB4_2: # %atomicrmw.start
+; RV32IB-COMMON-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV32IB-COMMON-NEXT:    beq a5, s0, .LBB4_4
+; RV32IB-COMMON-NEXT:  # %bb.3: # %atomicrmw.start
+; RV32IB-COMMON-NEXT:    # in Loop: Header=BB4_2 Depth=1
+; RV32IB-COMMON-NEXT:    slt a0, s0, a5
+; RV32IB-COMMON-NEXT:    j .LBB4_5
+; RV32IB-COMMON-NEXT:  .LBB4_4: # in Loop: Header=BB4_2 Depth=1
+; RV32IB-COMMON-NEXT:    sltu a0, s2, a4
+; RV32IB-COMMON-NEXT:  .LBB4_5: # %atomicrmw.start
+; RV32IB-COMMON-NEXT:    # in Loop: Header=BB4_2 Depth=1
+; RV32IB-COMMON-NEXT:    mv a2, a4
+; RV32IB-COMMON-NEXT:    mv a3, a5
+; RV32IB-COMMON-NEXT:    bnez a0, .LBB4_1
+; RV32IB-COMMON-NEXT:  # %bb.6: # %atomicrmw.start
+; RV32IB-COMMON-NEXT:    # in Loop: Header=BB4_2 Depth=1
+; RV32IB-COMMON-NEXT:    mv a2, s2
+; RV32IB-COMMON-NEXT:    mv a3, s0
+; RV32IB-COMMON-NEXT:    j .LBB4_1
+; RV32IB-COMMON-NEXT:  .LBB4_7: # %atomicrmw.end
+; RV32IB-COMMON-NEXT:    mv a0, a4
+; RV32IB-COMMON-NEXT:    mv a1, a5
+; RV32IB-COMMON-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32IB-COMMON-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32IB-COMMON-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32IB-COMMON-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32IB-COMMON-NEXT:    addi sp, sp, 32
+; RV32IB-COMMON-NEXT:    ret
+;
+; RV64IB-ZALRSC-LABEL: atomicrmw_max_i64_seq_cst:
+; RV64IB-ZALRSC:       # %bb.0:
+; RV64IB-ZALRSC-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
+; RV64IB-ZALRSC-NEXT:    lr.d.aqrl a2, (a0)
+; RV64IB-ZALRSC-NEXT:    mv a3, a2
+; RV64IB-ZALRSC-NEXT:    bge a3, a1, .LBB4_3
+; RV64IB-ZALRSC-NEXT:  # %bb.2: # in Loop: Header=BB4_1 Depth=1
+; RV64IB-ZALRSC-NEXT:    mv a3, a1
+; RV64IB-ZALRSC-NEXT:  .LBB4_3: # in Loop: Header=BB4_1 Depth=1
+; RV64IB-ZALRSC-NEXT:    sc.d.rl a3, a3, (a0)
+; RV64IB-ZALRSC-NEXT:    bnez a3, .LBB4_1
+; RV64IB-ZALRSC-NEXT:  # %bb.4:
+; RV64IB-ZALRSC-NEXT:    mv a0, a2
+; RV64IB-ZALRSC-NEXT:    ret
+;
+; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_max_i64_seq_cst:
+; RV64IB-ZALRSC-PERM:       # %bb.0:
+; RV64IB-ZALRSC-PERM-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
+; RV64IB-ZALRSC-PERM-NEXT:    lr.d.aqrl a2, (a0)
+; RV64IB-ZALRSC-PERM-NEXT:    max a3, a2, a1
+; RV64IB-ZALRSC-PERM-NEXT:    sc.d.rl a3, a3, (a0)
+; RV64IB-ZALRSC-PERM-NEXT:    bnez a3, .LBB4_1
+; RV64IB-ZALRSC-PERM-NEXT:  # %bb.2:
+; RV64IB-ZALRSC-PERM-NEXT:    mv a0, a2
+; RV64IB-ZALRSC-PERM-NEXT:    ret
+;
+; RV64IAB-LABEL: atomicrmw_max_i64_seq_cst:
+; RV64IAB:       # %bb.0:
+; RV64IAB-NEXT:    amomax.d.aqrl a0, a1, (a0)
+; RV64IAB-NEXT:    ret
+  %1 = atomicrmw max ptr %a, i64 %b seq_cst
+  ret i64 %1
+}
+
+define i64 @atomicrmw_min_i64_seq_cst(ptr %a, i64 %b) nounwind {
+; RV32IB-COMMON-LABEL: atomicrmw_min_i64_seq_cst:
+; RV32IB-COMMON:       # %bb.0:
+; RV32IB-COMMON-NEXT:    addi sp, sp, -32
+; RV32IB-COMMON-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32IB-COMMON-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32IB-COMMON-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32IB-COMMON-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32IB-COMMON-NEXT:    mv s0, a2
+; RV32IB-COMMON-NEXT:    mv s1, a0
+; RV32IB-COMMON-NEXT:    lw a4, 0(a0)
+; RV32IB-COMMON-NEXT:    lw a5, 4(a0)
+; RV32IB-COMMON-NEXT:    mv s2, a1
+; RV32IB-COMMON-NEXT:    j .LBB5_2
+; RV32IB-COMMON-NEXT:  .LBB5_1: # %atomicrmw.start
+; RV32IB-COMMON-NEXT:    # in Loop: Header=BB5_2 Depth=1
+; RV32IB-COMMON-NEXT:    sw a4, 8(sp)
+; RV32IB-COMMON-NEXT:    sw a5, 12(sp)
+; RV32IB-COMMON-NEXT:    addi a1, sp, 8
+; RV32IB-COMMON-NEXT:    li a4, 5
+; RV32IB-COMMON-NEXT:    li a5, 5
+; RV32IB-COMMON-NEXT:    mv a0, s1
+; RV32IB-COMMON-NEXT:    call __atomic_compare_exchange_8
+; RV32IB-COMMON-NEXT:    lw a4, 8(sp)
+; RV32IB-COMMON-NEXT:    lw a5, 12(sp)
+; RV32IB-COMMON-NEXT:    bnez a0, .LBB5_7
+; RV32IB-COMMON-NEXT:  .LBB5_2: # %atomicrmw.start
+; RV32IB-COMMON-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV32IB-COMMON-NEXT:    beq a5, s0, .LBB5_4
+; RV32IB-COMMON-NEXT:  # %bb.3: # %atomicrmw.start
+; RV32IB-COMMON-NEXT:    # in Loop: Header=BB5_2 Depth=1
+; RV32IB-COMMON-NEXT:    slt a0, a5, s0
+; RV32IB-COMMON-NEXT:    j .LBB5_5
+; RV32IB-COMMON-NEXT:  .LBB5_4: # in Loop: Header=BB5_2 Depth=1
+; RV32IB-COMMON-NEXT:    sltu a0, a4, s2
+; RV32IB-COMMON-NEXT:  .LBB5_5: # %atomicrmw.start
+; RV32IB-COMMON-NEXT:    # in Loop: Header=BB5_2 Depth=1
+; RV32IB-COMMON-NEXT:    mv a2, a4
+; RV32IB-COMMON-NEXT:    mv a3, a5
+; RV32IB-COMMON-NEXT:    bnez a0, .LBB5_1
+; RV32IB-COMMON-NEXT:  # %bb.6: # %atomicrmw.start
+; RV32IB-COMMON-NEXT:    # in Loop: Header=BB5_2 Depth=1
+; RV32IB-COMMON-NEXT:    mv a2, s2
+; RV32IB-COMMON-NEXT:    mv a3, s0
+; RV32IB-COMMON-NEXT:    j .LBB5_1
+; RV32IB-COMMON-NEXT:  .LBB5_7: # %atomicrmw.end
+; RV32IB-COMMON-NEXT:    mv a0, a4
+; RV32IB-COMMON-NEXT:    mv a1, a5
+; RV32IB-COMMON-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32IB-COMMON-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32IB-COMMON-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32IB-COMMON-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32IB-COMMON-NEXT:    addi sp, sp, 32
+; RV32IB-COMMON-NEXT:    ret
+;
+; RV64IB-ZALRSC-LABEL: atomicrmw_min_i64_seq_cst:
+; RV64IB-ZALRSC:       # %bb.0:
+; RV64IB-ZALRSC-NEXT:  .LBB5_1: # =>This Inner Loop Header: Depth=1
+; RV64IB-ZALRSC-NEXT:    lr.d.aqrl a2, (a0)
+; RV64IB-ZALRSC-NEXT:    mv a3, a2
+; RV64IB-ZALRSC-NEXT:    bge a1, a3, .LBB5_3
+; RV64IB-ZALRSC-NEXT:  # %bb.2: # in Loop: Header=BB5_1 Depth=1
+; RV64IB-ZALRSC-NEXT:    mv a3, a1
+; RV64IB-ZALRSC-NEXT:  .LBB5_3: # in Loop: Header=BB5_1 Depth=1
+; RV64IB-ZALRSC-NEXT:    sc.d.rl a3, a3, (a0)
+; RV64IB-ZALRSC-NEXT:    bnez a3, .LBB5_1
+; RV64IB-ZALRSC-NEXT:  # %bb.4:
+; RV64IB-ZALRSC-NEXT:    mv a0, a2
+; RV64IB-ZALRSC-NEXT:    ret
+;
+; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_min_i64_seq_cst:
+; RV64IB-ZALRSC-PERM:       # %bb.0:
+; RV64IB-ZALRSC-PERM-NEXT:  .LBB5_1: # =>This Inner Loop Header: Depth=1
+; RV64IB-ZALRSC-PERM-NEXT:    lr.d.aqrl a2, (a0)
+; RV64IB-ZALRSC-PERM-NEXT:    min a3, a2, a1
+; RV64IB-ZALRSC-PERM-NEXT:    sc.d.rl a3, a3, (a0)
+; RV64IB-ZALRSC-PERM-NEXT:    bnez a3, .LBB5_1
+; RV64IB-ZALRSC-PERM-NEXT:  # %bb.2:
+; RV64IB-ZALRSC-PERM-NEXT:    mv a0, a2
+; RV64IB-ZALRSC-PERM-NEXT:    ret
+;
+; RV64IAB-LABEL: atomicrmw_min_i64_seq_cst:
+; RV64IAB:       # %bb.0:
+; RV64IAB-NEXT:    amomin.d.aqrl a0, a1, (a0)
+; RV64IAB-NEXT:    ret
+  %1 = atomicrmw min ptr %a, i64 %b seq_cst
+  ret i64 %1
+}
+
+define i64 @atomicrmw_umax_i64_seq_cst(ptr %a, i64 %b) nounwind {
+; RV32IB-COMMON-LABEL: atomicrmw_umax_i64_seq_cst:
+; RV32IB-COMMON:       # %bb.0:
+; RV32IB-COMMON-NEXT:    addi sp, sp, -32
+; RV32IB-COMMON-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32IB-COMMON-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32IB-COMMON-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32IB-COMMON-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32IB-COMMON-NEXT:    mv s0, a2
+; RV32IB-COMMON-NEXT:    mv s1, a0
+; RV32IB-COMMON-NEXT:    lw a4, 0(a0)
+; RV32IB-COMMON-NEXT:    lw a5, 4(a0)
+; RV32IB-COMMON-NEXT:    mv s2, a1
+; RV32IB-COMMON-NEXT:    j .LBB6_2
+; RV32IB-COMMON-NEXT:  .LBB6_1: # %atomicrmw.start
+; RV32IB-COMMON-NEXT:    # in Loop: Header=BB6_2 Depth=1
+; RV32IB-COMMON-NEXT:    sw a4, 8(sp)
+; RV32IB-COMMON-NEXT:    sw a5, 12(sp)
+; RV32IB-COMMON-NEXT:    addi a1, sp, 8
+; RV32IB-COMMON-NEXT:    li a4, 5
+; RV32IB-COMMON-NEXT:    li a5, 5
+; RV32IB-COMMON-NEXT:    mv a0, s1
+; RV32IB-COMMON-NEXT:    call __atomic_compare_exchange_8
+; RV32IB-COMMON-NEXT:    lw a4, 8(sp)
+; RV32IB-COMMON-NEXT:    lw a5, 12(sp)
+; RV32IB-COMMON-NEXT:    bnez a0, .LBB6_7
+; RV32IB-COMMON-NEXT:  .LBB6_2: # %atomicrmw.start
+; RV32IB-COMMON-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV32IB-COMMON-NEXT:    beq a5, s0, .LBB6_4
+; RV32IB-COMMON-NEXT:  # %bb.3: # %atomicrmw.start
+; RV32IB-COMMON-NEXT:    # in Loop: Header=BB6_2 Depth=1
+; RV32IB-COMMON-NEXT:    sltu a0, s0, a5
+; RV32IB-COMMON-NEXT:    j .LBB6_5
+; RV32IB-COMMON-NEXT:  .LBB6_4: # in Loop: Header=BB6_2 Depth=1
+; RV32IB-COMMON-NEXT:    sltu a0, s2, a4
+; RV32IB-COMMON-NEXT:  .LBB6_5: # %atomicrmw.start
+; RV32IB-COMMON-NEXT:    # in Loop: Header=BB6_2 Depth=1
+; RV32IB-COMMON-NEXT:    mv a2, a4
+; RV32IB-COMMON-NEXT:    mv a3, a5
+; RV32IB-COMMON-NEXT:    bnez a0, .LBB6_1
+; RV32IB-COMMON-NEXT:  # %bb.6: # %atomicrmw.start
+; RV32IB-COMMON-NEXT:    # in Loop: Header=BB6_2 Depth=1
+; RV32IB-COMMON-NEXT:    mv a2, s2
+; RV32IB-COMMON-NEXT:    mv a3, s0
+; RV32IB-COMMON-NEXT:    j .LBB6_1
+; RV32IB-COMMON-NEXT:  .LBB6_7: # %atomicrmw.end
+; RV32IB-COMMON-NEXT:    mv a0, a4
+; RV32IB-COMMON-NEXT:    mv a1, a5
+; RV32IB-COMMON-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32IB-COMMON-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32IB-COMMON-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32IB-COMMON-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32IB-COMMON-NEXT:    addi sp, sp, 32
+; RV32IB-COMMON-NEXT:    ret
+;
+; RV64IB-ZALRSC-LABEL: atomicrmw_umax_i64_seq_cst:
+; RV64IB-ZALRSC:       # %bb.0:
+; RV64IB-ZALRSC-NEXT:  .LBB6_1: # =>This Inner Loop Header: Depth=1
+; RV64IB-ZALRSC-NEXT:    lr.d.aqrl a2, (a0)
+; RV64IB-ZALRSC-NEXT:    mv a3, a2
+; RV64IB-ZALRSC-NEXT:    bgeu a3, a1, .LBB6_3
+; RV64IB-ZALRSC-NEXT:  # %bb.2: # in Loop: Header=BB6_1 Depth=1
+; RV64IB-ZALRSC-NEXT:    mv a3, a1
+; RV64IB-ZALRSC-NEXT:  .LBB6_3: # in Loop: Header=BB6_1 Depth=1
+; RV64IB-ZALRSC-NEXT:    sc.d.rl a3, a3, (a0)
+; RV64IB-ZALRSC-NEXT:    bnez a3, .LBB6_1
+; RV64IB-ZALRSC-NEXT:  # %bb.4:
+; RV64IB-ZALRSC-NEXT:    mv a0, a2
+; RV64IB-ZALRSC-NEXT:    ret
+;
+; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_umax_i64_seq_cst:
+; RV64IB-ZALRSC-PERM:       # %bb.0:
+; RV64IB-ZALRSC-PERM-NEXT:  .LBB6_1: # =>This Inner Loop Header: Depth=1
+; RV64IB-ZALRSC-PERM-NEXT:    lr.d.aqrl a2, (a0)
+; RV64IB-ZALRSC-PERM-NEXT:    maxu a3, a2, a1
+; RV64IB-ZALRSC-PERM-NEXT:    sc.d.rl a3, a3, (a0)
+; RV64IB-ZALRSC-PERM-NEXT:    bnez a3, .LBB6_1
+; RV64IB-ZALRSC-PERM-NEXT:  # %bb.2:
+; RV64IB-ZALRSC-PERM-NEXT:    mv a0, a2
+; RV64IB-ZALRSC-PERM-NEXT:    ret
+;
+; RV64IAB-LABEL: atomicrmw_umax_i64_seq_cst:
+; RV64IAB:       # %bb.0:
+; RV64IAB-NEXT:    amomaxu.d.aqrl a0, a1, (a0)
+; RV64IAB-NEXT:    ret
+  %1 = atomicrmw umax ptr %a, i64 %b seq_cst
+  ret i64 %1
+}
+
+define i64 @atomicrmw_umin_i64_seq_cst(ptr %a, i64 %b) nounwind {
+; RV32IB-COMMON-LABEL: atomicrmw_umin_i64_seq_cst:
+; RV32IB-COMMON:       # %bb.0:
+; RV32IB-COMMON-NEXT:    addi sp, sp, -32
+; RV32IB-COMMON-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32IB-COMMON-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32IB-COMMON-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32IB-COMMON-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32IB-COMMON-NEXT:    mv s0, a2
+; RV32IB-COMMON-NEXT:    mv s1, a0
+; RV32IB-COMMON-NEXT:    lw a4, 0(a0)
+; RV32IB-COMMON-NEXT:    lw a5, 4(a0)
+; RV32IB-COMMON-NEXT:    mv s2, a1
+; RV32IB-COMMON-NEXT:    j .LBB7_2
+; RV32IB-COMMON-NEXT:  .LBB7_1: # %atomicrmw.start
+; RV32IB-COMMON-NEXT:    # in Loop: Header=BB7_2 Depth=1
+; RV32IB-COMMON-NEXT:    sw a4, 8(sp)
+; RV32IB-COMMON-NEXT:    sw a5, 12(sp)
+; RV32IB-COMMON-NEXT:    addi a1, sp, 8
+; RV32IB-COMMON-NEXT:    li a4, 5
+; RV32IB-COMMON-NEXT:    li a5, 5
+; RV32IB-COMMON-NEXT:    mv a0, s1
+; RV32IB-COMMON-NEXT:    call __atomic_compare_exchange_8
+; RV32IB-COMMON-NEXT:    lw a4, 8(sp)
+; RV32IB-COMMON-NEXT:    lw a5, 12(sp)
+; RV32IB-COMMON-NEXT:    bnez a0, .LBB7_7
+; RV32IB-COMMON-NEXT:  .LBB7_2: # %atomicrmw.start
+; RV32IB-COMMON-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV32IB-COMMON-NEXT:    beq a5, s0, .LBB7_4
+; RV32IB-COMMON-NEXT:  # %bb.3: # %atomicrmw.start
+; RV32IB-COMMON-NEXT:    # in Loop: Header=BB7_2 Depth=1
+; RV32IB-COMMON-NEXT:    sltu a0, a5, s0
+; RV32IB-COMMON-NEXT:    j .LBB7_5
+; RV32IB-COMMON-NEXT:  .LBB7_4: # in Loop: Header=BB7_2 Depth=1
+; RV32IB-COMMON-NEXT:    sltu a0, a4, s2
+; RV32IB-COMMON-NEXT:  .LBB7_5: # %atomicrmw.start
+; RV32IB-COMMON-NEXT:    # in Loop: Header=BB7_2 Depth=1
+; RV32IB-COMMON-NEXT:    mv a2, a4
+; RV32IB-COMMON-NEXT:    mv a3, a5
+; RV32IB-COMMON-NEXT:    bnez a0, .LBB7_1
+; RV32IB-COMMON-NEXT:  # %bb.6: # %atomicrmw.start
+; RV32IB-COMMON-NEXT:    # in Loop: Header=BB7_2 Depth=1
+; RV32IB-COMMON-NEXT:    mv a2, s2
+; RV32IB-COMMON-NEXT:    mv a3, s0
+; RV32IB-COMMON-NEXT:    j .LBB7_1
+; RV32IB-COMMON-NEXT:  .LBB7_7: # %atomicrmw.end
+; RV32IB-COMMON-NEXT:    mv a0, a4
+; RV32IB-COMMON-NEXT:    mv a1, a5
+; RV32IB-COMMON-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32IB-COMMON-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32IB-COMMON-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32IB-COMMON-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32IB-COMMON-NEXT:    addi sp, sp, 32
+; RV32IB-COMMON-NEXT:    ret
+;
+; RV64IB-ZALRSC-LABEL: atomicrmw_umin_i64_seq_cst:
+; RV64IB-ZALRSC:       # %bb.0:
+; RV64IB-ZALRSC-NEXT:  .LBB7_1: # =>This Inner Loop Header: Depth=1
+; RV64IB-ZALRSC-NEXT:    lr.d.aqrl a2, (a0)
+; RV64IB-ZALRSC-NEXT:    mv a3, a2
+; RV64IB-ZALRSC-NEXT:    bgeu a1, a3, .LBB7_3
+; RV64IB-ZALRSC-NEXT:  # %bb.2: # in Loop: Header=BB7_1 Depth=1
+; RV64IB-ZALRSC-NEXT:    mv a3, a1
+; RV64IB-ZALRSC-NEXT:  .LBB7_3: # in Loop: Header=BB7_1 Depth=1
+; RV64IB-ZALRSC-NEXT:    sc.d.rl a3, a3, (a0)
+; RV64IB-ZALRSC-NEXT:    bnez a3, .LBB7_1
+; RV64IB-ZALRSC-NEXT:  # %bb.4:
+; RV64IB-ZALRSC-NEXT:    mv a0, a2
+; RV64IB-ZALRSC-NEXT:    ret
+;
+; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_umin_i64_seq_cst:
+; RV64IB-ZALRSC-PERM:       # %bb.0:
+; RV64IB-ZALRSC-PERM-NEXT:  .LBB7_1: # =>This Inner Loop Header: Depth=1
+; RV64IB-ZALRSC-PERM-NEXT:    lr.d.aqrl a2, (a0)
+; RV64IB-ZALRSC-PERM-NEXT:    minu a3, a2, a1
+; RV64IB-ZALRSC-PERM-NEXT:    sc.d.rl a3, a3, (a0)
+; RV64IB-ZALRSC-PERM-NEXT:    bnez a3, .LBB7_1
+; RV64IB-ZALRSC-PERM-NEXT:  # %bb.2:
+; RV64IB-ZALRSC-PERM-NEXT:    mv a0, a2
+; RV64IB-ZALRSC-PERM-NEXT:    ret
+;
+; RV64IAB-LABEL: atomicrmw_umin_i64_seq_cst:
+; RV64IAB:       # %bb.0:
+; RV64IAB-NEXT:    amominu.d.aqrl a0, a1, (a0)
+; RV64IAB-NEXT:    ret
+  %1 = atomicrmw umin ptr %a, i64 %b seq_cst
+  ret i64 %1
+}
diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll
index 5e5f2b78e8869..988d0490afeb6 100644
--- a/llvm/test/CodeGen/RISCV/features-info.ll
+++ b/llvm/test/CodeGen/RISCV/features-info.ll
@@ -81,6 +81,7 @@
 ; CHECK-NEXT:   optimized-nf7-segment-load-store - vlseg7eN.v and vsseg7eN.v are implemented as a wide memory op and shuffle.
 ; CHECK-NEXT:   optimized-nf8-segment-load-store - vlseg8eN.v and vsseg8eN.v are implemented as a wide memory op and shuffle.
 ; CHECK-NEXT:   optimized-zero-stride-load       - Optimized (perform fewer memory operations)zero-stride vector load.
+; CHECK-NEXT:   permissive-zalrsc                - Implementation permits non-base instructions between LR/SC pairs.
 ; CHECK-NEXT:   predictable-select-expensive     - Prefer likely predicted branches over selects.
 ; CHECK-NEXT:   prefer-vsetvli-over-read-vlenb   - Prefer vsetvli over read vlenb CSR to calculate VLEN.
 ; CHECK-NEXT:   prefer-w-inst                    - Prefer instructions with W suffix.
@@ -135,6 +136,7 @@
 ; CHECK-NEXT:   shgatpa                          - 'Shgatpa' (SvNNx4 mode supported for all modes supported by satp, as well as Bare).
 ; CHECK-NEXT:   shifted-zextw-fusion             - Enable SLLI+SRLI to be fused when computing (shifted) word zero extension.
 ; CHECK-NEXT:   shlcofideleg                     - 'Shlcofideleg' (Delegating LCOFI Interrupts to VS-mode).
+; CHECK-NEXT:   short-forward-branch-i-minmax    - Enable short forward branch optimization for min,max instructions in Zbb.
 ; CHECK-NEXT:   short-forward-branch-opt         - Enable short forward branch optimization.
 ; CHECK-NEXT:   shtvala                          - 'Shtvala' (htval provides all needed values).
 ; CHECK-NEXT:   shvsatpa                         - 'Shvsatpa' (vsatp supports all modes supported by satp).
diff --git a/llvm/test/CodeGen/RISCV/rv64-stackmap.ll b/llvm/test/CodeGen/RISCV/rv64-stackmap.ll
index d07f608bf7893..c50a0fb3ffe91 100644
--- a/llvm/test/CodeGen/RISCV/rv64-stackmap.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-stackmap.ll
@@ -7,11 +7,11 @@
 ; CHECK-NEXT:   .byte   0
 ; CHECK-NEXT:   .half   0
 ; Num Functions
-; CHECK-NEXT:   .word   12
+; CHECK-NEXT:   .word   13
 ; Num LargeConstants
-; CHECK-NEXT:   .word   2
+; CHECK-NEXT:   .word   3
 ; Num Callsites
-; CHECK-NEXT:   .word   16
+; CHECK-NEXT:   .word   17
 
 ; Functions and stack size
 ; CHECK-NEXT:   .quad   constantargs
@@ -38,8 +38,8 @@
 ; CHECK-NEXT:   .quad   liveConstant
 ; CHECK-NEXT:   .quad   0
 ; CHECK-NEXT:   .quad   1
-; CHECK-NEXT:   .quad   spilledValue
-; CHECK-NEXT:   .quad   144
+; CHECK-NEXT:   .quad   liveArgs
+; CHECK-NEXT:   .quad   0
 ; CHECK-NEXT:   .quad   1
 ; CHECK-NEXT:   .quad   directFrameIdx
 ; CHECK-NEXT:   .quad   48
@@ -50,10 +50,14 @@
 ; CHECK-NEXT:   .quad   needsStackRealignment
 ; CHECK-NEXT:   .quad   -1
 ; CHECK-NEXT:   .quad   1
+; CHECK-NEXT:   .quad   floats
+; CHECK-NEXT:   .quad   32
+; CHECK-NEXT:   .quad   1
 
 ; Num LargeConstants
 ; CHECK-NEXT:   .quad   4294967295
 ; CHECK-NEXT:   .quad   4294967296
+; CHECK-NEXT:   .quad   4609434218613702656
 
 ; Constant arguments
 ;
@@ -278,7 +282,7 @@ define void @liveConstant() {
 ;
 ; Verify 28 stack map entries.
 ;
-; CHECK-LABEL:  .word   .L{{.*}}-spilledValue
+; CHECK-LABEL:  .word   .L{{.*}}-liveArgs
 ; CHECK-NEXT:   .half   0
 ; CHECK-NEXT:   .half   28
 ;
@@ -290,9 +294,9 @@ define void @liveConstant() {
 ; CHECK-NEXT:   .half   2
 ; CHECK-NEXT:   .half   0
 ; CHECK-NEXT:   .word
-define void @spilledValue(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27) {
+define void @liveArgs(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i8 %l25, i16 zeroext %l26, i32 signext %l27) {
 entry:
-  call void (i64, i32, ptr, i32, ...) @llvm.experimental.patchpoint.void(i64 11, i32 28, ptr null, i32 5, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27)
+  call void (i64, i32, ptr, i32, ...) @llvm.experimental.patchpoint.void(i64 11, i32 28, ptr null, i32 5, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i8 %l25, i16 %l26, i32 %l27)
   ret void
 }
 
@@ -379,6 +383,104 @@ define void @needsStackRealignment() {
 }
 declare void @escape_values(...)
 
+; CHECK-LABEL:  .word   .L{{.*}}-floats
+; CHECK-NEXT:   .half   0
+; Num Locations
+; CHECK-NEXT:   .half   12
+; Loc 0: constant float as constant integer
+; CHECK-NEXT:   .byte   4
+; CHECK-NEXT:   .byte   0
+; CHECK-NEXT:   .half   8
+; CHECK-NEXT:   .half   0
+; CHECK-NEXT:   .half   0
+; CHECK-NEXT:   .word
+; Loc 1: constant double as large constant integer
+; CHECK-NEXT:   .byte   5
+; CHECK-NEXT:   .byte   0
+; CHECK-NEXT:   .half   8
+; CHECK-NEXT:   .half   0
+; CHECK-NEXT:   .half   0
+; CHECK-NEXT:   .word
+; Loc 2: constant half as constant integer
+; CHECK-NEXT:   .byte   4
+; CHECK-NEXT:   .byte   0
+; CHECK-NEXT:   .half   8
+; CHECK-NEXT:   .half   0
+; CHECK-NEXT:   .half   0
+; CHECK-NEXT:   .word
+; Loc 3: constant bfloat as constant integer
+; CHECK-NEXT:   .byte   4
+; CHECK-NEXT:   .byte   0
+; CHECK-NEXT:   .half   8
+; CHECK-NEXT:   .half   0
+; CHECK-NEXT:   .half   0
+; CHECK-NEXT:   .word
+; Loc 4: float value in X register
+; CHECK-NEXT:   .byte   1
+; CHECK-NEXT:   .byte   0
+; CHECK-NEXT:   .half   8
+; CHECK-NEXT:   .half   10
+; CHECK-NEXT:   .half   0
+; CHECK-NEXT:   .word
+; Loc 5: double value in X register
+; CHECK-NEXT:   .byte   1
+; CHECK-NEXT:   .byte   0
+; CHECK-NEXT:   .half   8
+; CHECK-NEXT:   .half   11
+; CHECK-NEXT:   .half   0
+; CHECK-NEXT:   .word
+; Loc 6: half value in X register
+; CHECK-NEXT:   .byte   1
+; CHECK-NEXT:   .byte   0
+; CHECK-NEXT:   .half   8
+; CHECK-NEXT:   .half   12
+; CHECK-NEXT:   .half   0
+; CHECK-NEXT:   .word
+; Loc 7: bfloat value in X register
+; CHECK-NEXT:   .byte   1
+; CHECK-NEXT:   .byte   0
+; CHECK-NEXT:   .half   8
+; CHECK-NEXT:   .half   13
+; CHECK-NEXT:   .half   0
+; CHECK-NEXT:   .word
+; Loc 8: float on stack
+; CHECK-NEXT:   .byte   2
+; CHECK-NEXT:   .byte   0
+; CHECK-NEXT:   .half   8
+; CHECK-NEXT:   .half   2
+; CHECK-NEXT:   .half   0
+; CHECK-NEXT:   .word
+; Loc 9: double on stack
+; CHECK-NEXT:   .byte   2
+; CHECK-NEXT:   .byte   0
+; CHECK-NEXT:   .half   8
+; CHECK-NEXT:   .half   2
+; CHECK-NEXT:   .half   0
+; CHECK-NEXT:   .word
+; Loc 10: half on stack
+; CHECK-NEXT:   .byte   2
+; CHECK-NEXT:   .byte   0
+; CHECK-NEXT:   .half   8
+; CHECK-NEXT:   .half   2
+; CHECK-NEXT:   .half   0
+; CHECK-NEXT:   .word
+; Loc 11: bfloat on stack
+; CHECK-NEXT:   .byte   2
+; CHECK-NEXT:   .byte   0
+; CHECK-NEXT:   .half   8
+; CHECK-NEXT:   .half   2
+; CHECK-NEXT:   .half   0
+; CHECK-NEXT:   .word
+define void @floats(float %f, double %g, half %h, bfloat %i) {
+  %ff = alloca float
+  %gg = alloca double
+  %hh = alloca half
+  %ii = alloca bfloat
+  call void (i64, i32, ...) @llvm.experimental.stackmap(i64 888, i32 0, float 1.25,
+    double 1.5, half 1.5, bfloat 1.5, float %f, double %g, half %h, bfloat %i, ptr %ff, ptr %gg, ptr %hh, ptr %ii)
+  ret void
+}
+
 declare void @llvm.experimental.stackmap(i64, i32, ...)
 declare void @llvm.experimental.patchpoint.void(i64, i32, ptr, i32, ...)
 declare i64 @llvm.experimental.patchpoint.i64(i64, i32, ptr, i32, ...)
diff --git a/llvm/test/CodeGen/RISCV/rv64p.ll b/llvm/test/CodeGen/RISCV/rv64p.ll
index cb07f945a582a..f937f44f13320 100644
--- a/llvm/test/CodeGen/RISCV/rv64p.ll
+++ b/llvm/test/CodeGen/RISCV/rv64p.ll
@@ -297,8 +297,7 @@ declare i32 @llvm.abs.i32(i32, i1 immarg)
 define i32 @abs_i32(i32 %x) {
 ; CHECK-LABEL: abs_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    sext.w a0, a0
-; CHECK-NEXT:    abs a0, a0
+; CHECK-NEXT:    absw a0, a0
 ; CHECK-NEXT:    ret
   %abs = tail call i32 @llvm.abs.i32(i32 %x, i1 true)
   ret i32 %abs
@@ -307,8 +306,7 @@ define i32 @abs_i32(i32 %x) {
 define signext i32 @abs_i32_sext(i32 signext %x) {
 ; CHECK-LABEL: abs_i32_sext:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    abs a0, a0
-; CHECK-NEXT:    sext.w a0, a0
+; CHECK-NEXT:    absw a0, a0
 ; CHECK-NEXT:    ret
   %abs = tail call i32 @llvm.abs.i32(i32 %x, i1 true)
   ret i32 %abs
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
index 4c35b2506d3e4..7e6f2c76e5881 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
@@ -15265,6 +15265,259 @@ define <4 x i32> @masked_gather_widen_sew_negative_stride(ptr %base) {
   ret <4 x i32> %x
 }
 
+define <7 x i8> @mgather_baseidx_v7i8(ptr %base, <7 x i8> %idxs, <7 x i1> %m, <7 x i8> %passthru) {
+; RV32-LABEL: mgather_baseidx_v7i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    li a1, 127
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.s.x v10, a1
+; RV32-NEXT:    vmand.mm v0, v0, v10
+; RV32-NEXT:    vsext.vf4 v10, v8
+; RV32-NEXT:    vsetvli zero, zero, e8, mf2, ta, mu
+; RV32-NEXT:    vluxei32.v v9, (a0), v10, v0.t
+; RV32-NEXT:    vmv1r.v v8, v9
+; RV32-NEXT:    ret
+;
+; RV64V-LABEL: mgather_baseidx_v7i8:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    li a1, 127
+; RV64V-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV64V-NEXT:    vmv.s.x v10, a1
+; RV64V-NEXT:    vmand.mm v0, v0, v10
+; RV64V-NEXT:    vsext.vf8 v12, v8
+; RV64V-NEXT:    vsetvli zero, zero, e8, mf2, ta, mu
+; RV64V-NEXT:    vluxei64.v v9, (a0), v12, v0.t
+; RV64V-NEXT:    vmv1r.v v8, v9
+; RV64V-NEXT:    ret
+;
+; RV64ZVE32F-LABEL: mgather_baseidx_v7i8:
+; RV64ZVE32F:       # %bb.0:
+; RV64ZVE32F-NEXT:    addi sp, sp, -16
+; RV64ZVE32F-NEXT:    .cfi_def_cfa_offset 16
+; RV64ZVE32F-NEXT:    .cfi_remember_state
+; RV64ZVE32F-NEXT:    li a1, 64
+; RV64ZVE32F-NEXT:    addi a2, sp, 8
+; RV64ZVE32F-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; RV64ZVE32F-NEXT:    vsm.v v0, (a2)
+; RV64ZVE32F-NEXT:    ld a1, 8(sp)
+; RV64ZVE32F-NEXT:    andi a2, a1, 1
+; RV64ZVE32F-NEXT:    beqz a2, .LBB132_2
+; RV64ZVE32F-NEXT:  # %bb.1: # %cond.load
+; RV64ZVE32F-NEXT:    vmv.x.s a2, v8
+; RV64ZVE32F-NEXT:    add a2, a0, a2
+; RV64ZVE32F-NEXT:    lbu a2, 0(a2)
+; RV64ZVE32F-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; RV64ZVE32F-NEXT:    vslidedown.vi v10, v9, 1
+; RV64ZVE32F-NEXT:    vmv.v.x v11, a2
+; RV64ZVE32F-NEXT:    vmv.x.s a2, v10
+; RV64ZVE32F-NEXT:    vslidedown.vi v10, v9, 2
+; RV64ZVE32F-NEXT:    vslide1down.vx v11, v11, a2
+; RV64ZVE32F-NEXT:    vmv.x.s a2, v10
+; RV64ZVE32F-NEXT:    vslidedown.vi v10, v9, 3
+; RV64ZVE32F-NEXT:    vslide1down.vx v11, v11, a2
+; RV64ZVE32F-NEXT:    vmv.x.s a2, v10
+; RV64ZVE32F-NEXT:    vslidedown.vi v10, v9, 4
+; RV64ZVE32F-NEXT:    vslide1down.vx v11, v11, a2
+; RV64ZVE32F-NEXT:    vmv.x.s a2, v10
+; RV64ZVE32F-NEXT:    vslidedown.vi v10, v9, 5
+; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 6
+; RV64ZVE32F-NEXT:    vslide1down.vx v11, v11, a2
+; RV64ZVE32F-NEXT:    vmv.x.s a2, v10
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v11, a2
+; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v10, a2
+; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT:  .LBB132_2: # %else
+; RV64ZVE32F-NEXT:    andi a2, a1, 2
+; RV64ZVE32F-NEXT:    beqz a2, .LBB132_4
+; RV64ZVE32F-NEXT:  # %bb.3: # %cond.load1
+; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 1
+; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
+; RV64ZVE32F-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; RV64ZVE32F-NEXT:    vslidedown.vi v11, v9, 2
+; RV64ZVE32F-NEXT:    vmv.x.s a3, v10
+; RV64ZVE32F-NEXT:    add a3, a0, a3
+; RV64ZVE32F-NEXT:    lbu a3, 0(a3)
+; RV64ZVE32F-NEXT:    vmv.v.x v10, a2
+; RV64ZVE32F-NEXT:    vmv.x.s a2, v11
+; RV64ZVE32F-NEXT:    vslidedown.vi v11, v9, 3
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v10, a3
+; RV64ZVE32F-NEXT:    vmv.x.s a3, v11
+; RV64ZVE32F-NEXT:    vslidedown.vi v11, v9, 4
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v10, a2
+; RV64ZVE32F-NEXT:    vmv.x.s a2, v11
+; RV64ZVE32F-NEXT:    vslidedown.vi v11, v9, 5
+; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 6
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v10, a3
+; RV64ZVE32F-NEXT:    vmv.x.s a3, v11
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v10, a2
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v10, a3
+; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v10, a2
+; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT:  .LBB132_4: # %else2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
+; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
+; RV64ZVE32F-NEXT:    beqz a2, .LBB132_6
+; RV64ZVE32F-NEXT:  # %bb.5: # %cond.load4
+; RV64ZVE32F-NEXT:    vmv.x.s a2, v10
+; RV64ZVE32F-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; RV64ZVE32F-NEXT:    vslidedown.vi v11, v9, 1
+; RV64ZVE32F-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-NEXT:    vslidedown.vi v12, v9, 3
+; RV64ZVE32F-NEXT:    vmv.x.s a4, v11
+; RV64ZVE32F-NEXT:    vmv.v.x v11, a3
+; RV64ZVE32F-NEXT:    vmv.x.s a3, v12
+; RV64ZVE32F-NEXT:    vslidedown.vi v12, v9, 4
+; RV64ZVE32F-NEXT:    add a2, a0, a2
+; RV64ZVE32F-NEXT:    lbu a2, 0(a2)
+; RV64ZVE32F-NEXT:    vslide1down.vx v11, v11, a4
+; RV64ZVE32F-NEXT:    vmv.x.s a4, v12
+; RV64ZVE32F-NEXT:    vslidedown.vi v12, v9, 5
+; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 6
+; RV64ZVE32F-NEXT:    vslide1down.vx v11, v11, a2
+; RV64ZVE32F-NEXT:    vmv.x.s a2, v12
+; RV64ZVE32F-NEXT:    vslide1down.vx v11, v11, a3
+; RV64ZVE32F-NEXT:    vslide1down.vx v11, v11, a4
+; RV64ZVE32F-NEXT:    vslide1down.vx v11, v11, a2
+; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v11, a2
+; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT:  .LBB132_6: # %else5
+; RV64ZVE32F-NEXT:    andi a2, a1, 8
+; RV64ZVE32F-NEXT:    beqz a2, .LBB132_8
+; RV64ZVE32F-NEXT:  # %bb.7: # %cond.load7
+; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT:    vslidedown.vi v10, v10, 1
+; RV64ZVE32F-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; RV64ZVE32F-NEXT:    vslidedown.vi v11, v9, 1
+; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
+; RV64ZVE32F-NEXT:    vmv.x.s a3, v10
+; RV64ZVE32F-NEXT:    vslidedown.vi v10, v9, 2
+; RV64ZVE32F-NEXT:    vmv.x.s a4, v11
+; RV64ZVE32F-NEXT:    vmv.v.x v11, a2
+; RV64ZVE32F-NEXT:    vmv.x.s a2, v10
+; RV64ZVE32F-NEXT:    vslidedown.vi v10, v9, 4
+; RV64ZVE32F-NEXT:    vslide1down.vx v11, v11, a4
+; RV64ZVE32F-NEXT:    vmv.x.s a4, v10
+; RV64ZVE32F-NEXT:    vslidedown.vi v10, v9, 5
+; RV64ZVE32F-NEXT:    add a3, a0, a3
+; RV64ZVE32F-NEXT:    lbu a3, 0(a3)
+; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 6
+; RV64ZVE32F-NEXT:    vslide1down.vx v11, v11, a2
+; RV64ZVE32F-NEXT:    vmv.x.s a2, v10
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v11, a3
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v10, a4
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v10, a2
+; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v10, a2
+; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT:  .LBB132_8: # %else8
+; RV64ZVE32F-NEXT:    andi a2, a1, 16
+; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
+; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 4
+; RV64ZVE32F-NEXT:    bnez a2, .LBB132_13
+; RV64ZVE32F-NEXT:  # %bb.9: # %else11
+; RV64ZVE32F-NEXT:    andi a2, a1, 32
+; RV64ZVE32F-NEXT:    bnez a2, .LBB132_14
+; RV64ZVE32F-NEXT:  .LBB132_10: # %else14
+; RV64ZVE32F-NEXT:    andi a1, a1, 64
+; RV64ZVE32F-NEXT:    beqz a1, .LBB132_12
+; RV64ZVE32F-NEXT:  .LBB132_11: # %cond.load16
+; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; RV64ZVE32F-NEXT:    vslidedown.vi v10, v9, 1
+; RV64ZVE32F-NEXT:    vmv.x.s a1, v9
+; RV64ZVE32F-NEXT:    vmv.x.s a2, v8
+; RV64ZVE32F-NEXT:    vmv.v.x v8, a1
+; RV64ZVE32F-NEXT:    vmv.x.s a1, v10
+; RV64ZVE32F-NEXT:    vslidedown.vi v10, v9, 2
+; RV64ZVE32F-NEXT:    add a0, a0, a2
+; RV64ZVE32F-NEXT:    vmv.x.s a2, v10
+; RV64ZVE32F-NEXT:    vslidedown.vi v10, v9, 3
+; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a1
+; RV64ZVE32F-NEXT:    vmv.x.s a1, v10
+; RV64ZVE32F-NEXT:    vslidedown.vi v10, v9, 4
+; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 5
+; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
+; RV64ZVE32F-NEXT:    vmv.x.s a2, v10
+; RV64ZVE32F-NEXT:    lbu a0, 0(a0)
+; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a1
+; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
+; RV64ZVE32F-NEXT:    vmv.x.s a1, v9
+; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a1
+; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a0
+; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT:  .LBB132_12: # %else17
+; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; RV64ZVE32F-NEXT:    vmv1r.v v8, v9
+; RV64ZVE32F-NEXT:    addi sp, sp, 16
+; RV64ZVE32F-NEXT:    .cfi_def_cfa_offset 0
+; RV64ZVE32F-NEXT:    ret
+; RV64ZVE32F-NEXT:  .LBB132_13: # %cond.load10
+; RV64ZVE32F-NEXT:    .cfi_restore_state
+; RV64ZVE32F-NEXT:    vmv.x.s a2, v8
+; RV64ZVE32F-NEXT:    vslidedown.vi v10, v9, 1
+; RV64ZVE32F-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-NEXT:    vslidedown.vi v11, v9, 2
+; RV64ZVE32F-NEXT:    vmv.x.s a4, v10
+; RV64ZVE32F-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; RV64ZVE32F-NEXT:    vmv.v.x v10, a3
+; RV64ZVE32F-NEXT:    vmv.x.s a3, v11
+; RV64ZVE32F-NEXT:    vslidedown.vi v11, v9, 3
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v10, a4
+; RV64ZVE32F-NEXT:    vmv.x.s a4, v11
+; RV64ZVE32F-NEXT:    vslidedown.vi v11, v9, 5
+; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 6
+; RV64ZVE32F-NEXT:    add a2, a0, a2
+; RV64ZVE32F-NEXT:    lbu a2, 0(a2)
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v10, a3
+; RV64ZVE32F-NEXT:    vmv.x.s a3, v11
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v10, a4
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v10, a2
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v10, a3
+; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v10, a2
+; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT:    andi a2, a1, 32
+; RV64ZVE32F-NEXT:    beqz a2, .LBB132_10
+; RV64ZVE32F-NEXT:  .LBB132_14: # %cond.load13
+; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 1
+; RV64ZVE32F-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; RV64ZVE32F-NEXT:    vslidedown.vi v11, v9, 1
+; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
+; RV64ZVE32F-NEXT:    vmv.x.s a3, v10
+; RV64ZVE32F-NEXT:    vslidedown.vi v10, v9, 2
+; RV64ZVE32F-NEXT:    vmv.x.s a4, v11
+; RV64ZVE32F-NEXT:    vmv.v.x v11, a2
+; RV64ZVE32F-NEXT:    vmv.x.s a2, v10
+; RV64ZVE32F-NEXT:    vslidedown.vi v10, v9, 3
+; RV64ZVE32F-NEXT:    vslide1down.vx v11, v11, a4
+; RV64ZVE32F-NEXT:    vmv.x.s a4, v10
+; RV64ZVE32F-NEXT:    vslidedown.vi v10, v9, 4
+; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 6
+; RV64ZVE32F-NEXT:    add a3, a0, a3
+; RV64ZVE32F-NEXT:    lbu a3, 0(a3)
+; RV64ZVE32F-NEXT:    vslide1down.vx v11, v11, a2
+; RV64ZVE32F-NEXT:    vmv.x.s a2, v10
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v11, a4
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v10, a2
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v10, a3
+; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v10, a2
+; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT:    andi a1, a1, 64
+; RV64ZVE32F-NEXT:    bnez a1, .LBB132_11
+; RV64ZVE32F-NEXT:    j .LBB132_12
+  %ptrs = getelementptr inbounds i8, ptr %base, <7 x i8> %idxs
+  %v = call <7 x i8> @llvm.masked.gather.v7i8.v7p0(<7 x ptr> %ptrs, i32 1, <7 x i1> %m, <7 x i8> %passthru)
+  ret <7 x i8> %v
+}
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; RV32V-ZVFH: {{.*}}
 ; RV32V-ZVFHMIN: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/pr165232.ll b/llvm/test/CodeGen/RISCV/rvv/pr165232.ll
new file mode 100644
index 0000000000000..bef53c6a5ae62
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/pr165232.ll
@@ -0,0 +1,244 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s
+
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+target triple = "riscv64-unknown-linux-gnu"
+
+define i1 @main(ptr %var_117, ptr %arrayinit.element3045, ptr %arrayinit.element3047, ptr %arrayinit.element3049, ptr %arrayinit.element3051, ptr %arrayinit.element3053, ptr %arrayinit.element3055, ptr %arrayinit.element3057, ptr %arrayinit.element3059, ptr %arrayinit.element3061, ptr %arrayinit.element3063, ptr %arrayinit.element3065, ptr %arrayinit.element3067, i64 %var_94_i.07698, target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %1) {
+; CHECK-LABEL: main:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr t0, vlenb
+; CHECK-NEXT:    slli t0, t0, 3
+; CHECK-NEXT:    mv t1, t0
+; CHECK-NEXT:    slli t0, t0, 1
+; CHECK-NEXT:    add t0, t0, t1
+; CHECK-NEXT:    sub sp, sp, t0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-NEXT:    sd a1, 8(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    sd a2, 0(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs4r.v v12, (a1) # vscale x 32-byte Folded Spill
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 2
+; CHECK-NEXT:    add a1, a1, a2
+; CHECK-NEXT:    vs4r.v v16, (a1) # vscale x 32-byte Folded Spill
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 2
+; CHECK-NEXT:    mv a2, a1
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    add a1, a1, a2
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs4r.v v8, (a1) # vscale x 32-byte Folded Spill
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    mv a2, a1
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    add a1, a1, a2
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    ld t0, 56(a1)
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    mv a2, a1
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    add a1, a1, a2
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    ld t1, 48(a1)
+; CHECK-NEXT:    vsetvli t2, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    mv a2, a1
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    add a1, a1, a2
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    ld t2, 40(a1)
+; CHECK-NEXT:    # kill: def $v10 killed $v9 killed $vtype
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    mv a2, a1
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    add a1, a1, a2
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    ld t3, 32(a1)
+; CHECK-NEXT:    vmv.v.i v11, 0
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    mv a2, a1
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    add a1, a1, a2
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    ld t4, 16(a1)
+; CHECK-NEXT:    vmv.v.i v12, 0
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    mv a2, a1
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    add a1, a1, a2
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    ld t5, 24(a1)
+; CHECK-NEXT:    vmv.v.i v13, 0
+; CHECK-NEXT:    vsetvli t6, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmv.v.i v22, 0
+; CHECK-NEXT:    vmv1r.v v14, v9
+; CHECK-NEXT:    sd zero, 0(a0)
+; CHECK-NEXT:    vmv.v.i v24, 0
+; CHECK-NEXT:    vmv1r.v v15, v9
+; CHECK-NEXT:    vmv1r.v v18, v9
+; CHECK-NEXT:    li t6, 1023
+; CHECK-NEXT:    vmv.v.i v26, 0
+; CHECK-NEXT:    vmv1r.v v19, v9
+; CHECK-NEXT:    slli t6, t6, 52
+; CHECK-NEXT:    vmv.v.i v28, 0
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs2r.v v22, (a1) # vscale x 16-byte Folded Spill
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 1
+; CHECK-NEXT:    add a1, a1, a2
+; CHECK-NEXT:    vs4r.v v24, (a1) # vscale x 32-byte Folded Spill
+; CHECK-NEXT:    slli a2, a2, 1
+; CHECK-NEXT:    add a1, a1, a2
+; CHECK-NEXT:    ld a2, 0(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    vs2r.v v28, (a1) # vscale x 16-byte Folded Spill
+; CHECK-NEXT:    ld a1, 8(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    vmv1r.v v20, v9
+; CHECK-NEXT:    sd t6, 0(t5)
+; CHECK-NEXT:    vmv2r.v v16, v14
+; CHECK-NEXT:    vmv2r.v v14, v12
+; CHECK-NEXT:    vmv2r.v v12, v10
+; CHECK-NEXT:    vmv1r.v v11, v9
+; CHECK-NEXT:    vmv1r.v v21, v9
+; CHECK-NEXT:    csrr t5, vlenb
+; CHECK-NEXT:    slli t5, t5, 3
+; CHECK-NEXT:    add t5, sp, t5
+; CHECK-NEXT:    addi t5, t5, 16
+; CHECK-NEXT:    vs2r.v v18, (t5) # vscale x 16-byte Folded Spill
+; CHECK-NEXT:    csrr t6, vlenb
+; CHECK-NEXT:    slli t6, t6, 1
+; CHECK-NEXT:    add t5, t5, t6
+; CHECK-NEXT:    vs2r.v v20, (t5) # vscale x 16-byte Folded Spill
+; CHECK-NEXT:    vsetivli zero, 0, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v19, 0
+; CHECK-NEXT:    vmclr.m v10
+; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-NEXT:    vmv.v.i v6, 0
+; CHECK-NEXT:  .LBB0_1: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmv1r.v v20, v19
+; CHECK-NEXT:    vmv1r.v v3, v19
+; CHECK-NEXT:    vmv1r.v v5, v19
+; CHECK-NEXT:    vmv1r.v v2, v19
+; CHECK-NEXT:    vmv1r.v v31, v19
+; CHECK-NEXT:    vmv1r.v v30, v19
+; CHECK-NEXT:    vmv1r.v v4, v19
+; CHECK-NEXT:    vmv2r.v v22, v10
+; CHECK-NEXT:    vmv4r.v v24, v12
+; CHECK-NEXT:    vmv2r.v v28, v16
+; CHECK-NEXT:    vmv2r.v v8, v6
+; CHECK-NEXT:    vmv1r.v v18, v19
+; CHECK-NEXT:    vmv1r.v v21, v10
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, tu, ma
+; CHECK-NEXT:    vle32.v v20, (t4)
+; CHECK-NEXT:    vle32.v v3, (t1)
+; CHECK-NEXT:    vle32.v v30, (a7)
+; CHECK-NEXT:    vle64.v v8, (a4)
+; CHECK-NEXT:    vle32.v v5, (t2)
+; CHECK-NEXT:    vle32.v v2, (t3)
+; CHECK-NEXT:    vle32.v v31, (a6)
+; CHECK-NEXT:    vmv1r.v v24, v30
+; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
+; CHECK-NEXT:    vmflt.vv v21, v8, v6, v0.t
+; CHECK-NEXT:    vmv1r.v v8, v19
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, tu, mu
+; CHECK-NEXT:    vle32.v v18, (a2)
+; CHECK-NEXT:    vle32.v v8, (a3)
+; CHECK-NEXT:    vle32.v v4, (a5)
+; CHECK-NEXT:    vmv1r.v v22, v20
+; CHECK-NEXT:    csrr t5, vlenb
+; CHECK-NEXT:    slli t5, t5, 3
+; CHECK-NEXT:    add t5, sp, t5
+; CHECK-NEXT:    addi t5, t5, 16
+; CHECK-NEXT:    vl1r.v v1, (t5) # vscale x 8-byte Folded Reload
+; CHECK-NEXT:    csrr t6, vlenb
+; CHECK-NEXT:    add t5, t5, t6
+; CHECK-NEXT:    vl2r.v v2, (t5) # vscale x 16-byte Folded Reload
+; CHECK-NEXT:    slli t6, t6, 1
+; CHECK-NEXT:    add t5, t5, t6
+; CHECK-NEXT:    vl1r.v v4, (t5) # vscale x 8-byte Folded Reload
+; CHECK-NEXT:    vsseg4e32.v v1, (zero)
+; CHECK-NEXT:    vsseg8e32.v v22, (a1)
+; CHECK-NEXT:    vmv1r.v v0, v21
+; CHECK-NEXT:    vssub.vv v8, v19, v18, v0.t
+; CHECK-NEXT:    csrr t5, vlenb
+; CHECK-NEXT:    slli t5, t5, 2
+; CHECK-NEXT:    mv t6, t5
+; CHECK-NEXT:    slli t5, t5, 1
+; CHECK-NEXT:    add t5, t5, t6
+; CHECK-NEXT:    add t5, sp, t5
+; CHECK-NEXT:    addi t5, t5, 16
+; CHECK-NEXT:    vl4r.v v20, (t5) # vscale x 32-byte Folded Reload
+; CHECK-NEXT:    vsetvli zero, t0, e64, m2, ta, ma
+; CHECK-NEXT:    vsseg2e64.v v20, (zero)
+; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    addi t5, sp, 16
+; CHECK-NEXT:    vl4r.v v20, (t5) # vscale x 32-byte Folded Reload
+; CHECK-NEXT:    csrr t6, vlenb
+; CHECK-NEXT:    slli t6, t6, 2
+; CHECK-NEXT:    add t5, t5, t6
+; CHECK-NEXT:    vl4r.v v24, (t5) # vscale x 32-byte Folded Reload
+; CHECK-NEXT:    vsetivli zero, 0, e64, m2, ta, ma
+; CHECK-NEXT:    vsseg4e64.v v20, (zero), v0.t
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vsseg8e32.v v8, (a0)
+; CHECK-NEXT:    csrr t5, vlenb
+; CHECK-NEXT:    slli t5, t5, 4
+; CHECK-NEXT:    add t5, sp, t5
+; CHECK-NEXT:    addi t5, t5, 16
+; CHECK-NEXT:    vl4r.v v20, (t5) # vscale x 32-byte Folded Reload
+; CHECK-NEXT:    csrr t6, vlenb
+; CHECK-NEXT:    slli t6, t6, 2
+; CHECK-NEXT:    add t5, t5, t6
+; CHECK-NEXT:    vl4r.v v24, (t5) # vscale x 32-byte Folded Reload
+; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-NEXT:    vsseg4e64.v v20, (zero)
+; CHECK-NEXT:    j .LBB0_1
+entry:
+  store double 0.000000e+00, ptr %var_117, align 8
+  store double 1.000000e+00, ptr %arrayinit.element3061, align 8
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %2 = call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.p0.i64(<vscale x 2 x float> zeroinitializer, ptr %arrayinit.element3059, i64 0)
+  %3 = call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.p0.i64(<vscale x 2 x float> zeroinitializer, ptr %arrayinit.element3067, i64 0)
+  %4 = call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.p0.i64(<vscale x 2 x float> zeroinitializer, ptr %arrayinit.element3065, i64 0)
+  %5 = call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.p0.i64(<vscale x 2 x float> zeroinitializer, ptr %arrayinit.element3063, i64 0)
+  %6 = call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.p0.i64(<vscale x 2 x float> zeroinitializer, ptr %arrayinit.element3055, i64 0)
+  %7 = call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.p0.i64(<vscale x 2 x float> zeroinitializer, ptr %arrayinit.element3057, i64 0)
+  %8 = call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.p0.i64(<vscale x 2 x float> zeroinitializer, ptr %arrayinit.element3053, i64 0)
+  %9 = call <vscale x 2 x double> @llvm.riscv.vle.nxv2f64.p0.i64(<vscale x 2 x double> zeroinitializer, ptr %arrayinit.element3051, i64 0)
+  %10 = tail call <vscale x 2 x i32> @llvm.riscv.vle.nxv2i32.p0.i64(<vscale x 2 x i32> zeroinitializer, ptr %arrayinit.element3047, i64 0)
+  %11 = tail call <vscale x 2 x i32> @llvm.riscv.vle.nxv2i32.p0.i64(<vscale x 2 x i32> zeroinitializer, ptr %arrayinit.element3049, i64 0)
+  call void @llvm.riscv.vsseg4.triscv.vector.tuple_nxv8i8_4t.p0.i64(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) zeroinitializer, ptr null, i64 0, i64 5)
+  %12 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) zeroinitializer, <vscale x 2 x float> %8, i32 0)
+  %13 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %12, <vscale x 2 x float> %7, i32 2)
+  %14 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %13, <vscale x 2 x float> %6, i32 0)
+  %15 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %14, <vscale x 2 x float> %5, i32 0)
+  %16 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %15, <vscale x 2 x float> %4, i32 0)
+  %17 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %16, <vscale x 2 x float> %3, i32 0)
+  %18 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %17, <vscale x 2 x float> %2, i32 0)
+  call void @llvm.riscv.vsseg8.triscv.vector.tuple_nxv8i8_8t.p0.i64(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %18, ptr %arrayinit.element3045, i64 0, i64 5)
+  %19 = tail call <vscale x 2 x i1> @llvm.riscv.vmfgt.mask.nxv2f64.nxv2f64.i64(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x double> zeroinitializer, <vscale x 2 x double> %9, <vscale x 2 x i1> zeroinitializer, i64 0)
+  %20 = tail call <vscale x 2 x i32> @llvm.riscv.vssub.mask.nxv2i32.nxv2i32.i64(<vscale x 2 x i32> %11, <vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> %10, <vscale x 2 x i1> %19, i64 0, i64 0)
+  call void @llvm.riscv.vsseg2.triscv.vector.tuple_nxv16i8_2t.p0.i64(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, ptr null, i64 %var_94_i.07698, i64 6)
+  call void @llvm.riscv.vsseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv2i1.i64(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) zeroinitializer, ptr null, <vscale x 2 x i1> zeroinitializer, i64 0, i64 6)
+  %21 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2i32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) poison, <vscale x 2 x i32> %20, i32 0)
+  call void @llvm.riscv.vsseg8.triscv.vector.tuple_nxv8i8_8t.p0.i64(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %21, ptr %var_117, i64 0, i64 5)
+  call void @llvm.riscv.vsseg4.triscv.vector.tuple_nxv16i8_4t.p0.i64(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %1, ptr null, i64 0, i64 6)
+  br label %for.body
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/zvlsseg-spill.mir b/llvm/test/CodeGen/RISCV/rvv/zvlsseg-spill.mir
index dd9960d17af43..9c2fa9d0009a7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/zvlsseg-spill.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/zvlsseg-spill.mir
@@ -32,10 +32,10 @@ body: |
     ; CHECK-NEXT: $x11 = ADDI $x2, 16
     ; CHECK-NEXT: VS4R_V $v0m4, $x11, implicit $v0_v1_v2_v3_v4_v5_v6 :: (store (<vscale x 1 x s256>) into %stack.0, align 8)
     ; CHECK-NEXT: $x12 = PseudoReadVLENB
-    ; CHECK-NEXT: $x13 = SLLI $x12, 2
-    ; CHECK-NEXT: $x11 = ADD killed $x11, killed $x13
+    ; CHECK-NEXT: $x12 = SLLI killed $x12, 2
+    ; CHECK-NEXT: $x11 = ADD killed $x11, $x12
     ; CHECK-NEXT: VS2R_V $v4m2, $x11, implicit $v0_v1_v2_v3_v4_v5_v6 :: (store (<vscale x 1 x s128>) into %stack.0, align 8)
-    ; CHECK-NEXT: $x12 = SLLI killed $x12, 1
+    ; CHECK-NEXT: $x12 = SRLI killed $x12, 1
     ; CHECK-NEXT: $x11 = ADD killed $x11, killed $x12
     ; CHECK-NEXT: VS1R_V $v6, killed $x11, implicit $v0_v1_v2_v3_v4_v5_v6 :: (store (<vscale x 1 x s64>) into %stack.0)
     ; CHECK-NEXT: $x11 = ADDI $x2, 16
@@ -93,10 +93,10 @@ body: |
     ; CHECK-NEXT: $x11 = ADDI $x2, 16
     ; CHECK-NEXT: $v10m2 = VL2RE8_V $x11 :: (load (<vscale x 1 x s128>) from %stack.0, align 8)
     ; CHECK-NEXT: $x12 = PseudoReadVLENB
-    ; CHECK-NEXT: $x13 = SLLI $x12, 1
-    ; CHECK-NEXT: $x11 = ADD killed $x11, killed $x13
+    ; CHECK-NEXT: $x12 = SLLI killed $x12, 1
+    ; CHECK-NEXT: $x11 = ADD killed $x11, $x12
     ; CHECK-NEXT: $v12m4 = VL4RE8_V $x11 :: (load (<vscale x 1 x s256>) from %stack.0, align 8)
-    ; CHECK-NEXT: $x12 = SLLI killed $x12, 2
+    ; CHECK-NEXT: $x12 = SLLI killed $x12, 1
     ; CHECK-NEXT: $x11 = ADD killed $x11, killed $x12
     ; CHECK-NEXT: $v16 = VL1RE8_V killed $x11 :: (load (<vscale x 1 x s64>) from %stack.0)
     ; CHECK-NEXT: VS1R_V killed $v10, killed renamable $x10
diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-min-max.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-min-max.ll
new file mode 100644
index 0000000000000..05e06cea9967a
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-min-max.ll
@@ -0,0 +1,703 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=riscv32 -mattr=+zbb | FileCheck %s --check-prefixes=RV32I-ZBB
+; RUN: llc < %s -mtriple=riscv64 -mattr=+zbb | FileCheck %s --check-prefixes=RV64I-ZBB
+; RUN: llc < %s -mtriple=riscv32 -mattr=+zbb,+short-forward-branch-opt | \
+; RUN:   FileCheck %s --check-prefixes=RV32I-SFB-ZBB
+; RUN: llc < %s -mtriple=riscv64 -mattr=+zbb,+short-forward-branch-opt | \
+; RUN:   FileCheck %s --check-prefixes=RV64I-SFB-ZBB
+; RUN: llc < %s -mtriple=riscv32 -mattr=+zbb,+short-forward-branch-i-minmax | \
+; RUN:   FileCheck %s --check-prefixes=RV32I-SFBIMinMax-ZBB
+; RUN: llc < %s -mtriple=riscv64 -mattr=+zbb,+short-forward-branch-i-minmax | \
+; RUN:   FileCheck %s --check-prefixes=RV64I-SFBIMinMax-ZBB
+
+define i32 @select_example_smax(i32 %a, i32 %b, i1 zeroext %x, i32 %y) {
+; RV32I-ZBB-LABEL: select_example_smax:
+; RV32I-ZBB:       # %bb.0: # %entry
+; RV32I-ZBB-NEXT:    beqz a2, .LBB0_2
+; RV32I-ZBB-NEXT:  # %bb.1:
+; RV32I-ZBB-NEXT:    max a1, a0, a3
+; RV32I-ZBB-NEXT:  .LBB0_2: # %entry
+; RV32I-ZBB-NEXT:    mv a0, a1
+; RV32I-ZBB-NEXT:    ret
+;
+; RV64I-ZBB-LABEL: select_example_smax:
+; RV64I-ZBB:       # %bb.0: # %entry
+; RV64I-ZBB-NEXT:    beqz a2, .LBB0_2
+; RV64I-ZBB-NEXT:  # %bb.1:
+; RV64I-ZBB-NEXT:    sext.w a3, a3
+; RV64I-ZBB-NEXT:    sext.w a0, a0
+; RV64I-ZBB-NEXT:    max a1, a0, a3
+; RV64I-ZBB-NEXT:  .LBB0_2: # %entry
+; RV64I-ZBB-NEXT:    mv a0, a1
+; RV64I-ZBB-NEXT:    ret
+;
+; RV32I-SFB-ZBB-LABEL: select_example_smax:
+; RV32I-SFB-ZBB:       # %bb.0: # %entry
+; RV32I-SFB-ZBB-NEXT:    max a0, a0, a3
+; RV32I-SFB-ZBB-NEXT:    bnez a2, .LBB0_2
+; RV32I-SFB-ZBB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-ZBB-NEXT:    mv a0, a1
+; RV32I-SFB-ZBB-NEXT:  .LBB0_2: # %entry
+; RV32I-SFB-ZBB-NEXT:    ret
+;
+; RV64I-SFB-ZBB-LABEL: select_example_smax:
+; RV64I-SFB-ZBB:       # %bb.0: # %entry
+; RV64I-SFB-ZBB-NEXT:    sext.w a3, a3
+; RV64I-SFB-ZBB-NEXT:    sext.w a0, a0
+; RV64I-SFB-ZBB-NEXT:    max a0, a0, a3
+; RV64I-SFB-ZBB-NEXT:    bnez a2, .LBB0_2
+; RV64I-SFB-ZBB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-ZBB-NEXT:    mv a0, a1
+; RV64I-SFB-ZBB-NEXT:  .LBB0_2: # %entry
+; RV64I-SFB-ZBB-NEXT:    ret
+;
+; RV32I-SFBIMinMax-ZBB-LABEL: select_example_smax:
+; RV32I-SFBIMinMax-ZBB:       # %bb.0: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    beqz a2, .LBB0_2
+; RV32I-SFBIMinMax-ZBB-NEXT:  # %bb.1: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    max a1, a0, a3
+; RV32I-SFBIMinMax-ZBB-NEXT:  .LBB0_2: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    mv a0, a1
+; RV32I-SFBIMinMax-ZBB-NEXT:    ret
+;
+; RV64I-SFBIMinMax-ZBB-LABEL: select_example_smax:
+; RV64I-SFBIMinMax-ZBB:       # %bb.0: # %entry
+; RV64I-SFBIMinMax-ZBB-NEXT:    sext.w a3, a3
+; RV64I-SFBIMinMax-ZBB-NEXT:    sext.w a0, a0
+; RV64I-SFBIMinMax-ZBB-NEXT:    beqz a2, .LBB0_2
+; RV64I-SFBIMinMax-ZBB-NEXT:  # %bb.1: # %entry
+; RV64I-SFBIMinMax-ZBB-NEXT:    max a1, a0, a3
+; RV64I-SFBIMinMax-ZBB-NEXT:  .LBB0_2: # %entry
+; RV64I-SFBIMinMax-ZBB-NEXT:    mv a0, a1
+; RV64I-SFBIMinMax-ZBB-NEXT:    ret
+entry:
+  %res = call i32 @llvm.smax.i32(i32 %a, i32 %y)
+  %sel = select i1 %x, i32 %res, i32 %b
+  ret i32 %sel
+}
+
+define i32 @select_example_smin(i32 %a, i32 %b, i1 zeroext %x, i32 %y) {
+; RV32I-ZBB-LABEL: select_example_smin:
+; RV32I-ZBB:       # %bb.0: # %entry
+; RV32I-ZBB-NEXT:    beqz a2, .LBB1_2
+; RV32I-ZBB-NEXT:  # %bb.1:
+; RV32I-ZBB-NEXT:    min a1, a0, a3
+; RV32I-ZBB-NEXT:  .LBB1_2: # %entry
+; RV32I-ZBB-NEXT:    mv a0, a1
+; RV32I-ZBB-NEXT:    ret
+;
+; RV64I-ZBB-LABEL: select_example_smin:
+; RV64I-ZBB:       # %bb.0: # %entry
+; RV64I-ZBB-NEXT:    beqz a2, .LBB1_2
+; RV64I-ZBB-NEXT:  # %bb.1:
+; RV64I-ZBB-NEXT:    sext.w a3, a3
+; RV64I-ZBB-NEXT:    sext.w a0, a0
+; RV64I-ZBB-NEXT:    min a1, a0, a3
+; RV64I-ZBB-NEXT:  .LBB1_2: # %entry
+; RV64I-ZBB-NEXT:    mv a0, a1
+; RV64I-ZBB-NEXT:    ret
+;
+; RV32I-SFB-ZBB-LABEL: select_example_smin:
+; RV32I-SFB-ZBB:       # %bb.0: # %entry
+; RV32I-SFB-ZBB-NEXT:    min a0, a0, a3
+; RV32I-SFB-ZBB-NEXT:    bnez a2, .LBB1_2
+; RV32I-SFB-ZBB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-ZBB-NEXT:    mv a0, a1
+; RV32I-SFB-ZBB-NEXT:  .LBB1_2: # %entry
+; RV32I-SFB-ZBB-NEXT:    ret
+;
+; RV64I-SFB-ZBB-LABEL: select_example_smin:
+; RV64I-SFB-ZBB:       # %bb.0: # %entry
+; RV64I-SFB-ZBB-NEXT:    sext.w a3, a3
+; RV64I-SFB-ZBB-NEXT:    sext.w a0, a0
+; RV64I-SFB-ZBB-NEXT:    min a0, a0, a3
+; RV64I-SFB-ZBB-NEXT:    bnez a2, .LBB1_2
+; RV64I-SFB-ZBB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-ZBB-NEXT:    mv a0, a1
+; RV64I-SFB-ZBB-NEXT:  .LBB1_2: # %entry
+; RV64I-SFB-ZBB-NEXT:    ret
+;
+; RV32I-SFBIMinMax-ZBB-LABEL: select_example_smin:
+; RV32I-SFBIMinMax-ZBB:       # %bb.0: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    beqz a2, .LBB1_2
+; RV32I-SFBIMinMax-ZBB-NEXT:  # %bb.1: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    min a1, a0, a3
+; RV32I-SFBIMinMax-ZBB-NEXT:  .LBB1_2: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    mv a0, a1
+; RV32I-SFBIMinMax-ZBB-NEXT:    ret
+;
+; RV64I-SFBIMinMax-ZBB-LABEL: select_example_smin:
+; RV64I-SFBIMinMax-ZBB:       # %bb.0: # %entry
+; RV64I-SFBIMinMax-ZBB-NEXT:    sext.w a3, a3
+; RV64I-SFBIMinMax-ZBB-NEXT:    sext.w a0, a0
+; RV64I-SFBIMinMax-ZBB-NEXT:    beqz a2, .LBB1_2
+; RV64I-SFBIMinMax-ZBB-NEXT:  # %bb.1: # %entry
+; RV64I-SFBIMinMax-ZBB-NEXT:    min a1, a0, a3
+; RV64I-SFBIMinMax-ZBB-NEXT:  .LBB1_2: # %entry
+; RV64I-SFBIMinMax-ZBB-NEXT:    mv a0, a1
+; RV64I-SFBIMinMax-ZBB-NEXT:    ret
+entry:
+  %res = call i32 @llvm.smin.i32(i32 %a, i32 %y)
+  %sel = select i1 %x, i32 %res, i32 %b
+  ret i32 %sel
+}
+
+define i32 @select_example_umax(i32 %a, i32 %b, i1 zeroext %x, i32 %y) {
+; RV32I-ZBB-LABEL: select_example_umax:
+; RV32I-ZBB:       # %bb.0: # %entry
+; RV32I-ZBB-NEXT:    beqz a2, .LBB2_2
+; RV32I-ZBB-NEXT:  # %bb.1:
+; RV32I-ZBB-NEXT:    maxu a1, a0, a3
+; RV32I-ZBB-NEXT:  .LBB2_2: # %entry
+; RV32I-ZBB-NEXT:    mv a0, a1
+; RV32I-ZBB-NEXT:    ret
+;
+; RV64I-ZBB-LABEL: select_example_umax:
+; RV64I-ZBB:       # %bb.0: # %entry
+; RV64I-ZBB-NEXT:    beqz a2, .LBB2_2
+; RV64I-ZBB-NEXT:  # %bb.1:
+; RV64I-ZBB-NEXT:    sext.w a3, a3
+; RV64I-ZBB-NEXT:    sext.w a0, a0
+; RV64I-ZBB-NEXT:    maxu a1, a0, a3
+; RV64I-ZBB-NEXT:  .LBB2_2: # %entry
+; RV64I-ZBB-NEXT:    mv a0, a1
+; RV64I-ZBB-NEXT:    ret
+;
+; RV32I-SFB-ZBB-LABEL: select_example_umax:
+; RV32I-SFB-ZBB:       # %bb.0: # %entry
+; RV32I-SFB-ZBB-NEXT:    maxu a0, a0, a3
+; RV32I-SFB-ZBB-NEXT:    bnez a2, .LBB2_2
+; RV32I-SFB-ZBB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-ZBB-NEXT:    mv a0, a1
+; RV32I-SFB-ZBB-NEXT:  .LBB2_2: # %entry
+; RV32I-SFB-ZBB-NEXT:    ret
+;
+; RV64I-SFB-ZBB-LABEL: select_example_umax:
+; RV64I-SFB-ZBB:       # %bb.0: # %entry
+; RV64I-SFB-ZBB-NEXT:    sext.w a3, a3
+; RV64I-SFB-ZBB-NEXT:    sext.w a0, a0
+; RV64I-SFB-ZBB-NEXT:    maxu a0, a0, a3
+; RV64I-SFB-ZBB-NEXT:    bnez a2, .LBB2_2
+; RV64I-SFB-ZBB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-ZBB-NEXT:    mv a0, a1
+; RV64I-SFB-ZBB-NEXT:  .LBB2_2: # %entry
+; RV64I-SFB-ZBB-NEXT:    ret
+;
+; RV32I-SFBIMinMax-ZBB-LABEL: select_example_umax:
+; RV32I-SFBIMinMax-ZBB:       # %bb.0: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    beqz a2, .LBB2_2
+; RV32I-SFBIMinMax-ZBB-NEXT:  # %bb.1: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    maxu a1, a0, a3
+; RV32I-SFBIMinMax-ZBB-NEXT:  .LBB2_2: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    mv a0, a1
+; RV32I-SFBIMinMax-ZBB-NEXT:    ret
+;
+; RV64I-SFBIMinMax-ZBB-LABEL: select_example_umax:
+; RV64I-SFBIMinMax-ZBB:       # %bb.0: # %entry
+; RV64I-SFBIMinMax-ZBB-NEXT:    sext.w a3, a3
+; RV64I-SFBIMinMax-ZBB-NEXT:    sext.w a0, a0
+; RV64I-SFBIMinMax-ZBB-NEXT:    beqz a2, .LBB2_2
+; RV64I-SFBIMinMax-ZBB-NEXT:  # %bb.1: # %entry
+; RV64I-SFBIMinMax-ZBB-NEXT:    maxu a1, a0, a3
+; RV64I-SFBIMinMax-ZBB-NEXT:  .LBB2_2: # %entry
+; RV64I-SFBIMinMax-ZBB-NEXT:    mv a0, a1
+; RV64I-SFBIMinMax-ZBB-NEXT:    ret
+entry:
+  %res = call i32 @llvm.umax.i32(i32 %a, i32 %y)
+  %sel = select i1 %x, i32 %res, i32 %b
+  ret i32 %sel
+}
+
+define i32 @select_example_umin(i32 %a, i32 %b, i1 zeroext %x, i32 %y) {
+; RV32I-ZBB-LABEL: select_example_umin:
+; RV32I-ZBB:       # %bb.0: # %entry
+; RV32I-ZBB-NEXT:    beqz a2, .LBB3_2
+; RV32I-ZBB-NEXT:  # %bb.1:
+; RV32I-ZBB-NEXT:    minu a1, a0, a3
+; RV32I-ZBB-NEXT:  .LBB3_2: # %entry
+; RV32I-ZBB-NEXT:    mv a0, a1
+; RV32I-ZBB-NEXT:    ret
+;
+; RV64I-ZBB-LABEL: select_example_umin:
+; RV64I-ZBB:       # %bb.0: # %entry
+; RV64I-ZBB-NEXT:    beqz a2, .LBB3_2
+; RV64I-ZBB-NEXT:  # %bb.1:
+; RV64I-ZBB-NEXT:    sext.w a3, a3
+; RV64I-ZBB-NEXT:    sext.w a0, a0
+; RV64I-ZBB-NEXT:    minu a1, a0, a3
+; RV64I-ZBB-NEXT:  .LBB3_2: # %entry
+; RV64I-ZBB-NEXT:    mv a0, a1
+; RV64I-ZBB-NEXT:    ret
+;
+; RV32I-SFB-ZBB-LABEL: select_example_umin:
+; RV32I-SFB-ZBB:       # %bb.0: # %entry
+; RV32I-SFB-ZBB-NEXT:    minu a0, a0, a3
+; RV32I-SFB-ZBB-NEXT:    bnez a2, .LBB3_2
+; RV32I-SFB-ZBB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-ZBB-NEXT:    mv a0, a1
+; RV32I-SFB-ZBB-NEXT:  .LBB3_2: # %entry
+; RV32I-SFB-ZBB-NEXT:    ret
+;
+; RV64I-SFB-ZBB-LABEL: select_example_umin:
+; RV64I-SFB-ZBB:       # %bb.0: # %entry
+; RV64I-SFB-ZBB-NEXT:    sext.w a3, a3
+; RV64I-SFB-ZBB-NEXT:    sext.w a0, a0
+; RV64I-SFB-ZBB-NEXT:    minu a0, a0, a3
+; RV64I-SFB-ZBB-NEXT:    bnez a2, .LBB3_2
+; RV64I-SFB-ZBB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-ZBB-NEXT:    mv a0, a1
+; RV64I-SFB-ZBB-NEXT:  .LBB3_2: # %entry
+; RV64I-SFB-ZBB-NEXT:    ret
+;
+; RV32I-SFBIMinMax-ZBB-LABEL: select_example_umin:
+; RV32I-SFBIMinMax-ZBB:       # %bb.0: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    beqz a2, .LBB3_2
+; RV32I-SFBIMinMax-ZBB-NEXT:  # %bb.1: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    minu a1, a0, a3
+; RV32I-SFBIMinMax-ZBB-NEXT:  .LBB3_2: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    mv a0, a1
+; RV32I-SFBIMinMax-ZBB-NEXT:    ret
+;
+; RV64I-SFBIMinMax-ZBB-LABEL: select_example_umin:
+; RV64I-SFBIMinMax-ZBB:       # %bb.0: # %entry
+; RV64I-SFBIMinMax-ZBB-NEXT:    sext.w a3, a3
+; RV64I-SFBIMinMax-ZBB-NEXT:    sext.w a0, a0
+; RV64I-SFBIMinMax-ZBB-NEXT:    beqz a2, .LBB3_2
+; RV64I-SFBIMinMax-ZBB-NEXT:  # %bb.1: # %entry
+; RV64I-SFBIMinMax-ZBB-NEXT:    minu a1, a0, a3
+; RV64I-SFBIMinMax-ZBB-NEXT:  .LBB3_2: # %entry
+; RV64I-SFBIMinMax-ZBB-NEXT:    mv a0, a1
+; RV64I-SFBIMinMax-ZBB-NEXT:    ret
+entry:
+  %res = call i32 @llvm.umin.i32(i32 %a, i32 %y)
+  %sel = select i1 %x, i32 %res, i32 %b
+  ret i32 %sel
+}
+
+define i64 @select_example_smax_1(i64 %a, i64 %b, i1 zeroext %x, i64 %y) {
+; RV32I-ZBB-LABEL: select_example_smax_1:
+; RV32I-ZBB:       # %bb.0: # %entry
+; RV32I-ZBB-NEXT:    beq a1, a6, .LBB4_2
+; RV32I-ZBB-NEXT:  # %bb.1: # %entry
+; RV32I-ZBB-NEXT:    slt a7, a6, a1
+; RV32I-ZBB-NEXT:    beqz a7, .LBB4_3
+; RV32I-ZBB-NEXT:    j .LBB4_4
+; RV32I-ZBB-NEXT:  .LBB4_2:
+; RV32I-ZBB-NEXT:    sltu a7, a5, a0
+; RV32I-ZBB-NEXT:    bnez a7, .LBB4_4
+; RV32I-ZBB-NEXT:  .LBB4_3: # %entry
+; RV32I-ZBB-NEXT:    mv a1, a6
+; RV32I-ZBB-NEXT:    mv a0, a5
+; RV32I-ZBB-NEXT:  .LBB4_4: # %entry
+; RV32I-ZBB-NEXT:    beqz a4, .LBB4_6
+; RV32I-ZBB-NEXT:  # %bb.5: # %entry
+; RV32I-ZBB-NEXT:    ret
+; RV32I-ZBB-NEXT:  .LBB4_6: # %entry
+; RV32I-ZBB-NEXT:    mv a0, a2
+; RV32I-ZBB-NEXT:    mv a1, a3
+; RV32I-ZBB-NEXT:    ret
+;
+; RV64I-ZBB-LABEL: select_example_smax_1:
+; RV64I-ZBB:       # %bb.0: # %entry
+; RV64I-ZBB-NEXT:    beqz a2, .LBB4_2
+; RV64I-ZBB-NEXT:  # %bb.1:
+; RV64I-ZBB-NEXT:    max a1, a0, a3
+; RV64I-ZBB-NEXT:  .LBB4_2: # %entry
+; RV64I-ZBB-NEXT:    mv a0, a1
+; RV64I-ZBB-NEXT:    ret
+;
+; RV32I-SFB-ZBB-LABEL: select_example_smax_1:
+; RV32I-SFB-ZBB:       # %bb.0: # %entry
+; RV32I-SFB-ZBB-NEXT:    sltu a7, a5, a0
+; RV32I-SFB-ZBB-NEXT:    slt t0, a6, a1
+; RV32I-SFB-ZBB-NEXT:    bne a1, a6, .LBB4_2
+; RV32I-SFB-ZBB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-ZBB-NEXT:    mv t0, a7
+; RV32I-SFB-ZBB-NEXT:  .LBB4_2: # %entry
+; RV32I-SFB-ZBB-NEXT:    bnez t0, .LBB4_4
+; RV32I-SFB-ZBB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-ZBB-NEXT:    mv a1, a6
+; RV32I-SFB-ZBB-NEXT:  .LBB4_4: # %entry
+; RV32I-SFB-ZBB-NEXT:    bnez t0, .LBB4_6
+; RV32I-SFB-ZBB-NEXT:  # %bb.5: # %entry
+; RV32I-SFB-ZBB-NEXT:    mv a0, a5
+; RV32I-SFB-ZBB-NEXT:  .LBB4_6: # %entry
+; RV32I-SFB-ZBB-NEXT:    bnez a4, .LBB4_8
+; RV32I-SFB-ZBB-NEXT:  # %bb.7: # %entry
+; RV32I-SFB-ZBB-NEXT:    mv a0, a2
+; RV32I-SFB-ZBB-NEXT:  .LBB4_8: # %entry
+; RV32I-SFB-ZBB-NEXT:    bnez a4, .LBB4_10
+; RV32I-SFB-ZBB-NEXT:  # %bb.9: # %entry
+; RV32I-SFB-ZBB-NEXT:    mv a1, a3
+; RV32I-SFB-ZBB-NEXT:  .LBB4_10: # %entry
+; RV32I-SFB-ZBB-NEXT:    ret
+;
+; RV64I-SFB-ZBB-LABEL: select_example_smax_1:
+; RV64I-SFB-ZBB:       # %bb.0: # %entry
+; RV64I-SFB-ZBB-NEXT:    max a0, a0, a3
+; RV64I-SFB-ZBB-NEXT:    bnez a2, .LBB4_2
+; RV64I-SFB-ZBB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-ZBB-NEXT:    mv a0, a1
+; RV64I-SFB-ZBB-NEXT:  .LBB4_2: # %entry
+; RV64I-SFB-ZBB-NEXT:    ret
+;
+; RV32I-SFBIMinMax-ZBB-LABEL: select_example_smax_1:
+; RV32I-SFBIMinMax-ZBB:       # %bb.0: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    sltu a7, a5, a0
+; RV32I-SFBIMinMax-ZBB-NEXT:    slt t0, a6, a1
+; RV32I-SFBIMinMax-ZBB-NEXT:    bne a1, a6, .LBB4_2
+; RV32I-SFBIMinMax-ZBB-NEXT:  # %bb.1: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    mv t0, a7
+; RV32I-SFBIMinMax-ZBB-NEXT:  .LBB4_2: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    bnez t0, .LBB4_4
+; RV32I-SFBIMinMax-ZBB-NEXT:  # %bb.3: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    mv a1, a6
+; RV32I-SFBIMinMax-ZBB-NEXT:  .LBB4_4: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    bnez t0, .LBB4_6
+; RV32I-SFBIMinMax-ZBB-NEXT:  # %bb.5: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    mv a0, a5
+; RV32I-SFBIMinMax-ZBB-NEXT:  .LBB4_6: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    bnez a4, .LBB4_8
+; RV32I-SFBIMinMax-ZBB-NEXT:  # %bb.7: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    mv a0, a2
+; RV32I-SFBIMinMax-ZBB-NEXT:  .LBB4_8: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    bnez a4, .LBB4_10
+; RV32I-SFBIMinMax-ZBB-NEXT:  # %bb.9: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    mv a1, a3
+; RV32I-SFBIMinMax-ZBB-NEXT:  .LBB4_10: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    ret
+;
+; RV64I-SFBIMinMax-ZBB-LABEL: select_example_smax_1:
+; RV64I-SFBIMinMax-ZBB:       # %bb.0: # %entry
+; RV64I-SFBIMinMax-ZBB-NEXT:    beqz a2, .LBB4_2
+; RV64I-SFBIMinMax-ZBB-NEXT:  # %bb.1: # %entry
+; RV64I-SFBIMinMax-ZBB-NEXT:    max a1, a0, a3
+; RV64I-SFBIMinMax-ZBB-NEXT:  .LBB4_2: # %entry
+; RV64I-SFBIMinMax-ZBB-NEXT:    mv a0, a1
+; RV64I-SFBIMinMax-ZBB-NEXT:    ret
+entry:
+  %res = call i64 @llvm.smax.i64(i64 %a, i64 %y)
+  %sel = select i1 %x, i64 %res, i64 %b
+  ret i64 %sel
+}
+
+define i64 @select_example_smin_1(i64 %a, i64 %b, i1 zeroext %x, i64 %y) {
+; RV32I-ZBB-LABEL: select_example_smin_1:
+; RV32I-ZBB:       # %bb.0: # %entry
+; RV32I-ZBB-NEXT:    beq a1, a6, .LBB5_2
+; RV32I-ZBB-NEXT:  # %bb.1: # %entry
+; RV32I-ZBB-NEXT:    slt a7, a1, a6
+; RV32I-ZBB-NEXT:    beqz a7, .LBB5_3
+; RV32I-ZBB-NEXT:    j .LBB5_4
+; RV32I-ZBB-NEXT:  .LBB5_2:
+; RV32I-ZBB-NEXT:    sltu a7, a0, a5
+; RV32I-ZBB-NEXT:    bnez a7, .LBB5_4
+; RV32I-ZBB-NEXT:  .LBB5_3: # %entry
+; RV32I-ZBB-NEXT:    mv a1, a6
+; RV32I-ZBB-NEXT:    mv a0, a5
+; RV32I-ZBB-NEXT:  .LBB5_4: # %entry
+; RV32I-ZBB-NEXT:    beqz a4, .LBB5_6
+; RV32I-ZBB-NEXT:  # %bb.5: # %entry
+; RV32I-ZBB-NEXT:    ret
+; RV32I-ZBB-NEXT:  .LBB5_6: # %entry
+; RV32I-ZBB-NEXT:    mv a0, a2
+; RV32I-ZBB-NEXT:    mv a1, a3
+; RV32I-ZBB-NEXT:    ret
+;
+; RV64I-ZBB-LABEL: select_example_smin_1:
+; RV64I-ZBB:       # %bb.0: # %entry
+; RV64I-ZBB-NEXT:    beqz a2, .LBB5_2
+; RV64I-ZBB-NEXT:  # %bb.1:
+; RV64I-ZBB-NEXT:    min a1, a0, a3
+; RV64I-ZBB-NEXT:  .LBB5_2: # %entry
+; RV64I-ZBB-NEXT:    mv a0, a1
+; RV64I-ZBB-NEXT:    ret
+;
+; RV32I-SFB-ZBB-LABEL: select_example_smin_1:
+; RV32I-SFB-ZBB:       # %bb.0: # %entry
+; RV32I-SFB-ZBB-NEXT:    sltu a7, a0, a5
+; RV32I-SFB-ZBB-NEXT:    slt t0, a1, a6
+; RV32I-SFB-ZBB-NEXT:    bne a1, a6, .LBB5_2
+; RV32I-SFB-ZBB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-ZBB-NEXT:    mv t0, a7
+; RV32I-SFB-ZBB-NEXT:  .LBB5_2: # %entry
+; RV32I-SFB-ZBB-NEXT:    bnez t0, .LBB5_4
+; RV32I-SFB-ZBB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-ZBB-NEXT:    mv a1, a6
+; RV32I-SFB-ZBB-NEXT:  .LBB5_4: # %entry
+; RV32I-SFB-ZBB-NEXT:    bnez t0, .LBB5_6
+; RV32I-SFB-ZBB-NEXT:  # %bb.5: # %entry
+; RV32I-SFB-ZBB-NEXT:    mv a0, a5
+; RV32I-SFB-ZBB-NEXT:  .LBB5_6: # %entry
+; RV32I-SFB-ZBB-NEXT:    bnez a4, .LBB5_8
+; RV32I-SFB-ZBB-NEXT:  # %bb.7: # %entry
+; RV32I-SFB-ZBB-NEXT:    mv a0, a2
+; RV32I-SFB-ZBB-NEXT:  .LBB5_8: # %entry
+; RV32I-SFB-ZBB-NEXT:    bnez a4, .LBB5_10
+; RV32I-SFB-ZBB-NEXT:  # %bb.9: # %entry
+; RV32I-SFB-ZBB-NEXT:    mv a1, a3
+; RV32I-SFB-ZBB-NEXT:  .LBB5_10: # %entry
+; RV32I-SFB-ZBB-NEXT:    ret
+;
+; RV64I-SFB-ZBB-LABEL: select_example_smin_1:
+; RV64I-SFB-ZBB:       # %bb.0: # %entry
+; RV64I-SFB-ZBB-NEXT:    min a0, a0, a3
+; RV64I-SFB-ZBB-NEXT:    bnez a2, .LBB5_2
+; RV64I-SFB-ZBB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-ZBB-NEXT:    mv a0, a1
+; RV64I-SFB-ZBB-NEXT:  .LBB5_2: # %entry
+; RV64I-SFB-ZBB-NEXT:    ret
+;
+; RV32I-SFBIMinMax-ZBB-LABEL: select_example_smin_1:
+; RV32I-SFBIMinMax-ZBB:       # %bb.0: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    sltu a7, a0, a5
+; RV32I-SFBIMinMax-ZBB-NEXT:    slt t0, a1, a6
+; RV32I-SFBIMinMax-ZBB-NEXT:    bne a1, a6, .LBB5_2
+; RV32I-SFBIMinMax-ZBB-NEXT:  # %bb.1: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    mv t0, a7
+; RV32I-SFBIMinMax-ZBB-NEXT:  .LBB5_2: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    bnez t0, .LBB5_4
+; RV32I-SFBIMinMax-ZBB-NEXT:  # %bb.3: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    mv a1, a6
+; RV32I-SFBIMinMax-ZBB-NEXT:  .LBB5_4: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    bnez t0, .LBB5_6
+; RV32I-SFBIMinMax-ZBB-NEXT:  # %bb.5: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    mv a0, a5
+; RV32I-SFBIMinMax-ZBB-NEXT:  .LBB5_6: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    bnez a4, .LBB5_8
+; RV32I-SFBIMinMax-ZBB-NEXT:  # %bb.7: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    mv a0, a2
+; RV32I-SFBIMinMax-ZBB-NEXT:  .LBB5_8: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    bnez a4, .LBB5_10
+; RV32I-SFBIMinMax-ZBB-NEXT:  # %bb.9: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    mv a1, a3
+; RV32I-SFBIMinMax-ZBB-NEXT:  .LBB5_10: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    ret
+;
+; RV64I-SFBIMinMax-ZBB-LABEL: select_example_smin_1:
+; RV64I-SFBIMinMax-ZBB:       # %bb.0: # %entry
+; RV64I-SFBIMinMax-ZBB-NEXT:    beqz a2, .LBB5_2
+; RV64I-SFBIMinMax-ZBB-NEXT:  # %bb.1: # %entry
+; RV64I-SFBIMinMax-ZBB-NEXT:    min a1, a0, a3
+; RV64I-SFBIMinMax-ZBB-NEXT:  .LBB5_2: # %entry
+; RV64I-SFBIMinMax-ZBB-NEXT:    mv a0, a1
+; RV64I-SFBIMinMax-ZBB-NEXT:    ret
+entry:
+  %res = call i64 @llvm.smin.i64(i64 %a, i64 %y)
+  %sel = select i1 %x, i64 %res, i64 %b
+  ret i64 %sel
+}
+
+define i64 @select_example_umax_1(i64 %a, i64 %b, i1 zeroext %x, i64 %y) {
+; RV32I-ZBB-LABEL: select_example_umax_1:
+; RV32I-ZBB:       # %bb.0: # %entry
+; RV32I-ZBB-NEXT:    beq a1, a6, .LBB6_2
+; RV32I-ZBB-NEXT:  # %bb.1: # %entry
+; RV32I-ZBB-NEXT:    sltu a7, a6, a1
+; RV32I-ZBB-NEXT:    beqz a7, .LBB6_3
+; RV32I-ZBB-NEXT:    j .LBB6_4
+; RV32I-ZBB-NEXT:  .LBB6_2:
+; RV32I-ZBB-NEXT:    sltu a7, a5, a0
+; RV32I-ZBB-NEXT:    bnez a7, .LBB6_4
+; RV32I-ZBB-NEXT:  .LBB6_3: # %entry
+; RV32I-ZBB-NEXT:    mv a1, a6
+; RV32I-ZBB-NEXT:    mv a0, a5
+; RV32I-ZBB-NEXT:  .LBB6_4: # %entry
+; RV32I-ZBB-NEXT:    beqz a4, .LBB6_6
+; RV32I-ZBB-NEXT:  # %bb.5: # %entry
+; RV32I-ZBB-NEXT:    ret
+; RV32I-ZBB-NEXT:  .LBB6_6: # %entry
+; RV32I-ZBB-NEXT:    mv a0, a2
+; RV32I-ZBB-NEXT:    mv a1, a3
+; RV32I-ZBB-NEXT:    ret
+;
+; RV64I-ZBB-LABEL: select_example_umax_1:
+; RV64I-ZBB:       # %bb.0: # %entry
+; RV64I-ZBB-NEXT:    beqz a2, .LBB6_2
+; RV64I-ZBB-NEXT:  # %bb.1:
+; RV64I-ZBB-NEXT:    maxu a1, a0, a3
+; RV64I-ZBB-NEXT:  .LBB6_2: # %entry
+; RV64I-ZBB-NEXT:    mv a0, a1
+; RV64I-ZBB-NEXT:    ret
+;
+; RV32I-SFB-ZBB-LABEL: select_example_umax_1:
+; RV32I-SFB-ZBB:       # %bb.0: # %entry
+; RV32I-SFB-ZBB-NEXT:    sltu a7, a5, a0
+; RV32I-SFB-ZBB-NEXT:    sltu t0, a6, a1
+; RV32I-SFB-ZBB-NEXT:    bne a1, a6, .LBB6_2
+; RV32I-SFB-ZBB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-ZBB-NEXT:    mv t0, a7
+; RV32I-SFB-ZBB-NEXT:  .LBB6_2: # %entry
+; RV32I-SFB-ZBB-NEXT:    bnez t0, .LBB6_4
+; RV32I-SFB-ZBB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-ZBB-NEXT:    mv a1, a6
+; RV32I-SFB-ZBB-NEXT:  .LBB6_4: # %entry
+; RV32I-SFB-ZBB-NEXT:    bnez t0, .LBB6_6
+; RV32I-SFB-ZBB-NEXT:  # %bb.5: # %entry
+; RV32I-SFB-ZBB-NEXT:    mv a0, a5
+; RV32I-SFB-ZBB-NEXT:  .LBB6_6: # %entry
+; RV32I-SFB-ZBB-NEXT:    bnez a4, .LBB6_8
+; RV32I-SFB-ZBB-NEXT:  # %bb.7: # %entry
+; RV32I-SFB-ZBB-NEXT:    mv a0, a2
+; RV32I-SFB-ZBB-NEXT:  .LBB6_8: # %entry
+; RV32I-SFB-ZBB-NEXT:    bnez a4, .LBB6_10
+; RV32I-SFB-ZBB-NEXT:  # %bb.9: # %entry
+; RV32I-SFB-ZBB-NEXT:    mv a1, a3
+; RV32I-SFB-ZBB-NEXT:  .LBB6_10: # %entry
+; RV32I-SFB-ZBB-NEXT:    ret
+;
+; RV64I-SFB-ZBB-LABEL: select_example_umax_1:
+; RV64I-SFB-ZBB:       # %bb.0: # %entry
+; RV64I-SFB-ZBB-NEXT:    maxu a0, a0, a3
+; RV64I-SFB-ZBB-NEXT:    bnez a2, .LBB6_2
+; RV64I-SFB-ZBB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-ZBB-NEXT:    mv a0, a1
+; RV64I-SFB-ZBB-NEXT:  .LBB6_2: # %entry
+; RV64I-SFB-ZBB-NEXT:    ret
+;
+; RV32I-SFBIMinMax-ZBB-LABEL: select_example_umax_1:
+; RV32I-SFBIMinMax-ZBB:       # %bb.0: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    sltu a7, a5, a0
+; RV32I-SFBIMinMax-ZBB-NEXT:    sltu t0, a6, a1
+; RV32I-SFBIMinMax-ZBB-NEXT:    bne a1, a6, .LBB6_2
+; RV32I-SFBIMinMax-ZBB-NEXT:  # %bb.1: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    mv t0, a7
+; RV32I-SFBIMinMax-ZBB-NEXT:  .LBB6_2: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    bnez t0, .LBB6_4
+; RV32I-SFBIMinMax-ZBB-NEXT:  # %bb.3: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    mv a1, a6
+; RV32I-SFBIMinMax-ZBB-NEXT:  .LBB6_4: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    bnez t0, .LBB6_6
+; RV32I-SFBIMinMax-ZBB-NEXT:  # %bb.5: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    mv a0, a5
+; RV32I-SFBIMinMax-ZBB-NEXT:  .LBB6_6: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    bnez a4, .LBB6_8
+; RV32I-SFBIMinMax-ZBB-NEXT:  # %bb.7: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    mv a0, a2
+; RV32I-SFBIMinMax-ZBB-NEXT:  .LBB6_8: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    bnez a4, .LBB6_10
+; RV32I-SFBIMinMax-ZBB-NEXT:  # %bb.9: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    mv a1, a3
+; RV32I-SFBIMinMax-ZBB-NEXT:  .LBB6_10: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    ret
+;
+; RV64I-SFBIMinMax-ZBB-LABEL: select_example_umax_1:
+; RV64I-SFBIMinMax-ZBB:       # %bb.0: # %entry
+; RV64I-SFBIMinMax-ZBB-NEXT:    beqz a2, .LBB6_2
+; RV64I-SFBIMinMax-ZBB-NEXT:  # %bb.1: # %entry
+; RV64I-SFBIMinMax-ZBB-NEXT:    maxu a1, a0, a3
+; RV64I-SFBIMinMax-ZBB-NEXT:  .LBB6_2: # %entry
+; RV64I-SFBIMinMax-ZBB-NEXT:    mv a0, a1
+; RV64I-SFBIMinMax-ZBB-NEXT:    ret
+entry:
+  %res = call i64 @llvm.umax.i64(i64 %a, i64 %y)
+  %sel = select i1 %x, i64 %res, i64 %b
+  ret i64 %sel
+}
+
+define i64 @select_example_umin_1(i64 %a, i64 %b, i1 zeroext %x, i64 %y) {
+; RV32I-ZBB-LABEL: select_example_umin_1:
+; RV32I-ZBB:       # %bb.0: # %entry
+; RV32I-ZBB-NEXT:    beq a1, a6, .LBB7_2
+; RV32I-ZBB-NEXT:  # %bb.1: # %entry
+; RV32I-ZBB-NEXT:    sltu a7, a1, a6
+; RV32I-ZBB-NEXT:    beqz a7, .LBB7_3
+; RV32I-ZBB-NEXT:    j .LBB7_4
+; RV32I-ZBB-NEXT:  .LBB7_2:
+; RV32I-ZBB-NEXT:    sltu a7, a0, a5
+; RV32I-ZBB-NEXT:    bnez a7, .LBB7_4
+; RV32I-ZBB-NEXT:  .LBB7_3: # %entry
+; RV32I-ZBB-NEXT:    mv a1, a6
+; RV32I-ZBB-NEXT:    mv a0, a5
+; RV32I-ZBB-NEXT:  .LBB7_4: # %entry
+; RV32I-ZBB-NEXT:    beqz a4, .LBB7_6
+; RV32I-ZBB-NEXT:  # %bb.5: # %entry
+; RV32I-ZBB-NEXT:    ret
+; RV32I-ZBB-NEXT:  .LBB7_6: # %entry
+; RV32I-ZBB-NEXT:    mv a0, a2
+; RV32I-ZBB-NEXT:    mv a1, a3
+; RV32I-ZBB-NEXT:    ret
+;
+; RV64I-ZBB-LABEL: select_example_umin_1:
+; RV64I-ZBB:       # %bb.0: # %entry
+; RV64I-ZBB-NEXT:    beqz a2, .LBB7_2
+; RV64I-ZBB-NEXT:  # %bb.1:
+; RV64I-ZBB-NEXT:    minu a1, a0, a3
+; RV64I-ZBB-NEXT:  .LBB7_2: # %entry
+; RV64I-ZBB-NEXT:    mv a0, a1
+; RV64I-ZBB-NEXT:    ret
+;
+; RV32I-SFB-ZBB-LABEL: select_example_umin_1:
+; RV32I-SFB-ZBB:       # %bb.0: # %entry
+; RV32I-SFB-ZBB-NEXT:    sltu a7, a0, a5
+; RV32I-SFB-ZBB-NEXT:    sltu t0, a1, a6
+; RV32I-SFB-ZBB-NEXT:    bne a1, a6, .LBB7_2
+; RV32I-SFB-ZBB-NEXT:  # %bb.1: # %entry
+; RV32I-SFB-ZBB-NEXT:    mv t0, a7
+; RV32I-SFB-ZBB-NEXT:  .LBB7_2: # %entry
+; RV32I-SFB-ZBB-NEXT:    bnez t0, .LBB7_4
+; RV32I-SFB-ZBB-NEXT:  # %bb.3: # %entry
+; RV32I-SFB-ZBB-NEXT:    mv a1, a6
+; RV32I-SFB-ZBB-NEXT:  .LBB7_4: # %entry
+; RV32I-SFB-ZBB-NEXT:    bnez t0, .LBB7_6
+; RV32I-SFB-ZBB-NEXT:  # %bb.5: # %entry
+; RV32I-SFB-ZBB-NEXT:    mv a0, a5
+; RV32I-SFB-ZBB-NEXT:  .LBB7_6: # %entry
+; RV32I-SFB-ZBB-NEXT:    bnez a4, .LBB7_8
+; RV32I-SFB-ZBB-NEXT:  # %bb.7: # %entry
+; RV32I-SFB-ZBB-NEXT:    mv a0, a2
+; RV32I-SFB-ZBB-NEXT:  .LBB7_8: # %entry
+; RV32I-SFB-ZBB-NEXT:    bnez a4, .LBB7_10
+; RV32I-SFB-ZBB-NEXT:  # %bb.9: # %entry
+; RV32I-SFB-ZBB-NEXT:    mv a1, a3
+; RV32I-SFB-ZBB-NEXT:  .LBB7_10: # %entry
+; RV32I-SFB-ZBB-NEXT:    ret
+;
+; RV64I-SFB-ZBB-LABEL: select_example_umin_1:
+; RV64I-SFB-ZBB:       # %bb.0: # %entry
+; RV64I-SFB-ZBB-NEXT:    minu a0, a0, a3
+; RV64I-SFB-ZBB-NEXT:    bnez a2, .LBB7_2
+; RV64I-SFB-ZBB-NEXT:  # %bb.1: # %entry
+; RV64I-SFB-ZBB-NEXT:    mv a0, a1
+; RV64I-SFB-ZBB-NEXT:  .LBB7_2: # %entry
+; RV64I-SFB-ZBB-NEXT:    ret
+;
+; RV32I-SFBIMinMax-ZBB-LABEL: select_example_umin_1:
+; RV32I-SFBIMinMax-ZBB:       # %bb.0: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    sltu a7, a0, a5
+; RV32I-SFBIMinMax-ZBB-NEXT:    sltu t0, a1, a6
+; RV32I-SFBIMinMax-ZBB-NEXT:    bne a1, a6, .LBB7_2
+; RV32I-SFBIMinMax-ZBB-NEXT:  # %bb.1: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    mv t0, a7
+; RV32I-SFBIMinMax-ZBB-NEXT:  .LBB7_2: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    bnez t0, .LBB7_4
+; RV32I-SFBIMinMax-ZBB-NEXT:  # %bb.3: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    mv a1, a6
+; RV32I-SFBIMinMax-ZBB-NEXT:  .LBB7_4: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    bnez t0, .LBB7_6
+; RV32I-SFBIMinMax-ZBB-NEXT:  # %bb.5: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    mv a0, a5
+; RV32I-SFBIMinMax-ZBB-NEXT:  .LBB7_6: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    bnez a4, .LBB7_8
+; RV32I-SFBIMinMax-ZBB-NEXT:  # %bb.7: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    mv a0, a2
+; RV32I-SFBIMinMax-ZBB-NEXT:  .LBB7_8: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    bnez a4, .LBB7_10
+; RV32I-SFBIMinMax-ZBB-NEXT:  # %bb.9: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    mv a1, a3
+; RV32I-SFBIMinMax-ZBB-NEXT:  .LBB7_10: # %entry
+; RV32I-SFBIMinMax-ZBB-NEXT:    ret
+;
+; RV64I-SFBIMinMax-ZBB-LABEL: select_example_umin_1:
+; RV64I-SFBIMinMax-ZBB:       # %bb.0: # %entry
+; RV64I-SFBIMinMax-ZBB-NEXT:    beqz a2, .LBB7_2
+; RV64I-SFBIMinMax-ZBB-NEXT:  # %bb.1: # %entry
+; RV64I-SFBIMinMax-ZBB-NEXT:    minu a1, a0, a3
+; RV64I-SFBIMinMax-ZBB-NEXT:  .LBB7_2: # %entry
+; RV64I-SFBIMinMax-ZBB-NEXT:    mv a0, a1
+; RV64I-SFBIMinMax-ZBB-NEXT:    ret
+entry:
+  %res = call i64 @llvm.umin.i64(i64 %a, i64 %y)
+  %sel = select i1 %x, i64 %res, i64 %b
+  ret i64 %sel
+}
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveActiveMin.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveActiveMin.ll
new file mode 100644
index 0000000000000..d121c1a937a9b
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveActiveMin.ll
@@ -0,0 +1,57 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-vulkan-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-vulkan-unknown %s -o - -filetype=obj | spirv-val %}
+
+; Test lowering to spir-v backend for various types and scalar/vector
+
+; CHECK: OpCapability GroupNonUniformArithmetic
+
+; CHECK-DAG:   %[[#f16:]] = OpTypeFloat 16
+; CHECK-DAG:   %[[#f32:]] = OpTypeFloat 32
+; CHECK-DAG:   %[[#uint:]] = OpTypeInt 32 0
+; CHECK-DAG:   %[[#v4_half:]] = OpTypeVector %[[#f16]] 4
+; CHECK-DAG:   %[[#scope:]] = OpConstant %[[#uint]] 3
+
+; CHECK-LABEL: Begin function test_float
+; CHECK:   %[[#fexpr:]] = OpFunctionParameter %[[#f32]]
+define float @test_float(float %fexpr) {
+entry:
+; CHECK:   %[[#fret:]] = OpGroupNonUniformFMin %[[#f32]] %[[#scope]] Reduce %[[#fexpr]]
+  %0 = call float @llvm.spv.wave.reduce.min.f32(float %fexpr)
+  ret float %0
+}
+
+; CHECK-LABEL: Begin function test_int_signed
+; CHECK:   %[[#iexpr:]] = OpFunctionParameter %[[#uint]]
+define i32 @test_int_signed(i32 %iexpr) {
+entry:
+; CHECK:   %[[#iret:]] = OpGroupNonUniformSMin %[[#uint]] %[[#scope]] Reduce %[[#iexpr]]
+  %0 = call i32 @llvm.spv.wave.reduce.min.i32(i32 %iexpr)
+  ret i32 %0
+}
+
+; CHECK-LABEL: Begin function test_int_unsigned
+; CHECK:   %[[#iexpr:]] = OpFunctionParameter %[[#uint]]
+define i32 @test_int_unsigned(i32 %iexpr) {
+entry:
+; CHECK:   %[[#iret:]] = OpGroupNonUniformUMin %[[#uint]] %[[#scope]] Reduce %[[#iexpr]]
+  %0 = call i32 @llvm.spv.wave.reduce.umin.i32(i32 %iexpr)
+  ret i32 %0
+}
+
+; CHECK-LABEL: Begin function test_vhalf
+; CHECK:   %[[#vbexpr:]] = OpFunctionParameter %[[#v4_half]]
+define <4 x half> @test_vhalf(<4 x half> %vbexpr) {
+entry:
+; CHECK:   %[[#vhalfret:]] = OpGroupNonUniformFMin %[[#v4_half]] %[[#scope]] Reduce %[[#vbexpr]]
+  %0 = call <4 x half> @llvm.spv.wave.reduce.min.v4half(<4 x half> %vbexpr)
+  ret <4 x half> %0
+}
+
+declare float @llvm.spv.wave.reduce.min.f32(float)
+declare i32 @llvm.spv.wave.reduce.min.i32(i32)
+declare <4 x half> @llvm.spv.wave.reduce.min.v4half(<4 x half>)
+
+declare float @llvm.spv.wave.reduce.umin.f32(float)
+declare i32 @llvm.spv.wave.reduce.umin.i32(i32)
+declare <4 x half> @llvm.spv.wave.reduce.umin.v4half(<4 x half>)
+
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/issue-146942-ptr-cast.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/issue-146942-ptr-cast.ll
index ed67344842b11..4817e7450ac2e 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/issue-146942-ptr-cast.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/issue-146942-ptr-cast.ll
@@ -16,7 +16,6 @@
 define void @case1() local_unnamed_addr {
   ; CHECK: %[[#BUFFER_LOAD:]] = OpLoad %[[#FLOAT4]] %{{[0-9]+}} Aligned 16
   ; CHECK: %[[#CAST_LOAD:]] = OpBitcast %[[#INT4]] %[[#BUFFER_LOAD]]
-  ; CHECK: %[[#VEC_SHUFFLE:]] = OpVectorShuffle %[[#INT4]] %[[#CAST_LOAD]] %[[#CAST_LOAD]] 0 1 2 3
   %1 = tail call target("spirv.VulkanBuffer", [0 x <4 x float>], 12, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0v4f32_12_0t(i32 0, i32 2, i32 1, i32 0, ptr nonnull @.str)
   %2 = tail call target("spirv.VulkanBuffer", [0 x <4 x i32>], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0v4i32_12_1t(i32 0, i32 5, i32 1, i32 0, ptr nonnull @.str.2)
   %3 = tail call noundef align 16 dereferenceable(16) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0v4f32_12_0t(target("spirv.VulkanBuffer", [0 x <4 x float>], 12, 0) %1, i32 0)
@@ -29,8 +28,7 @@ define void @case1() local_unnamed_addr {
 define void @case2() local_unnamed_addr {
   ; CHECK: %[[#BUFFER_LOAD:]] = OpLoad %[[#FLOAT4]] %{{[0-9]+}} Aligned 16
   ; CHECK: %[[#CAST_LOAD:]] = OpBitcast %[[#INT4]] %[[#BUFFER_LOAD]]
-  ; CHECK: %[[#VEC_SHUFFLE:]] = OpVectorShuffle %[[#INT4]] %[[#CAST_LOAD]] %[[#CAST_LOAD]] 0 1 2 3
-  ; CHECK: %[[#VEC_TRUNCATE:]] = OpVectorShuffle %[[#INT3]] %[[#VEC_SHUFFLE]] %[[#UNDEF_INT4]] 0 1 2
+  ; CHECK: %[[#VEC_TRUNCATE:]] = OpVectorShuffle %[[#INT3]] %[[#CAST_LOAD]] %[[#UNDEF_INT4]] 0 1 2
   %1 = tail call target("spirv.VulkanBuffer", [0 x <4 x float>], 12, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0v4f32_12_0t(i32 0, i32 2, i32 1, i32 0, ptr nonnull @.str)
   %2 = tail call target("spirv.VulkanBuffer", [0 x <3 x i32>], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0v3i32_12_1t(i32 0, i32 5, i32 1, i32 0, ptr nonnull @.str.3)
   %3 = tail call noundef align 16 dereferenceable(16) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0v4f32_12_0t(target("spirv.VulkanBuffer", [0 x <4 x float>], 12, 0) %1, i32 0)
diff --git a/llvm/test/CodeGen/SPIRV/pointers/ptrcast-bitcast.ll b/llvm/test/CodeGen/SPIRV/pointers/ptrcast-bitcast.ll
index 84913283f6868..a1ec2cd1cfdd2 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/ptrcast-bitcast.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/ptrcast-bitcast.ll
@@ -26,3 +26,25 @@ entry:
   store <4 x i32> %6, ptr addrspace(11) %7, align 16
   ret void
 }
+
+; This tests a load from a pointer that has been bitcast between vector types
+; which share the same total bit-width but have different numbers of elements.
+; Tests that legalize-pointer-casts works correctly by moving the bitcast to
+; the element that was loaded.
+
+define void @main2() local_unnamed_addr #0 {
+entry:
+; CHECK:  %[[LOAD:[0-9]+]] = OpLoad %[[#v2_double]] {{.*}}
+; CHECK:  %[[BITCAST1:[0-9]+]] = OpBitcast %[[#v4_uint]] %[[LOAD]]
+; CHECK:  %[[BITCAST2:[0-9]+]] = OpBitcast %[[#v2_double]] %[[BITCAST1]]
+; CHECK: OpStore {{%[0-9]+}} %[[BITCAST2]] {{.*}}
+
+  %0 = tail call target("spirv.VulkanBuffer", [0 x <2 x double>], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0v2f64_12_1t(i32 0, i32 2, i32 1, i32 0, ptr nonnull @.str.2)
+  %2 = tail call noundef align 16 dereferenceable(16) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0v2f64_12_1t(target("spirv.VulkanBuffer", [0 x <2 x double>], 12, 1) %0, i32 0)
+  %3 = load <4 x i32>, ptr addrspace(11) %2
+  %4 = tail call noundef align 16 dereferenceable(16) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0v2f64_12_1t(target("spirv.VulkanBuffer", [0 x <2 x double>], 12, 1) %0, i32 1)
+  store <4 x i32> %3, ptr addrspace(11) %4
+  ret void
+}
+
+attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
diff --git a/llvm/test/CodeGen/SystemZ/stackmap.ll b/llvm/test/CodeGen/SystemZ/stackmap.ll
index 05b8de756c032..f414ea33a6e80 100644
--- a/llvm/test/CodeGen/SystemZ/stackmap.ll
+++ b/llvm/test/CodeGen/SystemZ/stackmap.ll
@@ -84,14 +84,14 @@
 ; CHECK-NEXT:   .short  8
 ; CHECK-NEXT:   .short  0
 ; CHECK-NEXT:   .short  0
-; CHECK-NEXT:   .long   65535
+; CHECK-NEXT:   .long   -1
 ; SmallConstant
 ; CHECK-NEXT:   .byte   4
 ; CHECK-NEXT:   .byte   0
 ; CHECK-NEXT:   .short  8
 ; CHECK-NEXT:   .short  0
 ; CHECK-NEXT:   .short  0
-; CHECK-NEXT:   .long   65535
+; CHECK-NEXT:   .long   -1
 ; SmallConstant
 ; CHECK-NEXT:   .byte   4
 ; CHECK-NEXT:   .byte   0
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll
index 9c36bae6fac13..ec257bcf123f3 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll
@@ -6,77 +6,81 @@ define void @arm_min_q31(ptr nocapture readonly %pSrc, i32 %blockSize, ptr nocap
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 ; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT:    .pad #4
+; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    ldr.w r12, [r0]
 ; CHECK-NEXT:    subs.w r9, r1, #1
 ; CHECK-NEXT:    beq .LBB0_3
 ; CHECK-NEXT:  @ %bb.1: @ %while.body.preheader
-; CHECK-NEXT:    and r8, r9, #3
+; CHECK-NEXT:    and r6, r9, #3
 ; CHECK-NEXT:    subs r7, r1, #2
 ; CHECK-NEXT:    cmp r7, #3
 ; CHECK-NEXT:    bhs .LBB0_4
 ; CHECK-NEXT:  @ %bb.2:
-; CHECK-NEXT:    movs r6, #0
-; CHECK-NEXT:    b .LBB0_6
+; CHECK-NEXT:    mov.w r10, #0
+; CHECK-NEXT:    cbnz r6, .LBB0_7
+; CHECK-NEXT:    b .LBB0_10
 ; CHECK-NEXT:  .LBB0_3:
-; CHECK-NEXT:    movs r6, #0
+; CHECK-NEXT:    mov.w r10, #0
 ; CHECK-NEXT:    b .LBB0_10
 ; CHECK-NEXT:  .LBB0_4: @ %while.body.preheader.new
 ; CHECK-NEXT:    bic r7, r9, #3
-; CHECK-NEXT:    movs r6, #1
+; CHECK-NEXT:    str r6, [sp] @ 4-byte Spill
 ; CHECK-NEXT:    subs r7, #4
+; CHECK-NEXT:    movs r6, #1
+; CHECK-NEXT:    mov.w r8, #0
+; CHECK-NEXT:    mov.w r10, #0
 ; CHECK-NEXT:    add.w lr, r6, r7, lsr #2
-; CHECK-NEXT:    movs r6, #0
-; CHECK-NEXT:    movs r7, #4
 ; CHECK-NEXT:  .LBB0_5: @ %while.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr r10, [r0, #16]!
-; CHECK-NEXT:    sub.w r9, r9, #4
-; CHECK-NEXT:    ldrd r5, r4, [r0, #-12]
-; CHECK-NEXT:    ldr r11, [r0, #-4]
+; CHECK-NEXT:    ldr r11, [r0, #16]!
+; CHECK-NEXT:    ldrd r5, r7, [r0, #-12]
+; CHECK-NEXT:    ldr r4, [r0, #-4]
 ; CHECK-NEXT:    cmp r12, r5
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    subgt r6, r7, #3
 ; CHECK-NEXT:    csel r5, r5, r12, gt
-; CHECK-NEXT:    cmp r5, r4
+; CHECK-NEXT:    csinc r6, r10, r8, le
+; CHECK-NEXT:    cmp r5, r7
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    subgt r6, r7, #2
-; CHECK-NEXT:    csel r5, r4, r5, gt
-; CHECK-NEXT:    cmp r5, r11
+; CHECK-NEXT:    addgt.w r6, r8, #2
+; CHECK-NEXT:    csel r7, r7, r5, gt
+; CHECK-NEXT:    cmp r7, r4
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    subgt r6, r7, #1
-; CHECK-NEXT:    csel r5, r11, r5, gt
-; CHECK-NEXT:    cmp r5, r10
-; CHECK-NEXT:    csel r6, r7, r6, gt
-; CHECK-NEXT:    add.w r7, r7, #4
-; CHECK-NEXT:    csel r12, r10, r5, gt
+; CHECK-NEXT:    addgt.w r6, r8, #3
+; CHECK-NEXT:    csel r7, r4, r7, gt
+; CHECK-NEXT:    add.w r8, r8, #4
+; CHECK-NEXT:    cmp r7, r11
+; CHECK-NEXT:    csel r10, r8, r6, gt
+; CHECK-NEXT:    csel r12, r11, r7, gt
 ; CHECK-NEXT:    le lr, .LBB0_5
-; CHECK-NEXT:  .LBB0_6: @ %while.end.loopexit.unr-lcssa
-; CHECK-NEXT:    cmp.w r8, #0
-; CHECK-NEXT:    beq .LBB0_10
-; CHECK-NEXT:  @ %bb.7: @ %while.body.epil
+; CHECK-NEXT:  @ %bb.6: @ %while.end.loopexit.unr-lcssa.loopexit
+; CHECK-NEXT:    ldr r6, [sp] @ 4-byte Reload
+; CHECK-NEXT:    sub.w r9, r9, r8
+; CHECK-NEXT:    cbz r6, .LBB0_10
+; CHECK-NEXT:  .LBB0_7: @ %while.body.epil
 ; CHECK-NEXT:    ldr r7, [r0, #4]
 ; CHECK-NEXT:    sub.w r1, r1, r9
 ; CHECK-NEXT:    cmp r12, r7
-; CHECK-NEXT:    csel r6, r1, r6, gt
+; CHECK-NEXT:    csel r10, r1, r10, gt
 ; CHECK-NEXT:    csel r12, r7, r12, gt
-; CHECK-NEXT:    cmp.w r8, #1
+; CHECK-NEXT:    cmp r6, #1
 ; CHECK-NEXT:    beq .LBB0_10
 ; CHECK-NEXT:  @ %bb.8: @ %while.body.epil.1
 ; CHECK-NEXT:    ldr r7, [r0, #8]
 ; CHECK-NEXT:    cmp r12, r7
-; CHECK-NEXT:    csinc r6, r6, r1, le
+; CHECK-NEXT:    csinc r10, r10, r1, le
 ; CHECK-NEXT:    csel r12, r7, r12, gt
-; CHECK-NEXT:    cmp.w r8, #2
+; CHECK-NEXT:    cmp r6, #2
 ; CHECK-NEXT:    beq .LBB0_10
 ; CHECK-NEXT:  @ %bb.9: @ %while.body.epil.2
 ; CHECK-NEXT:    ldr r0, [r0, #12]
 ; CHECK-NEXT:    cmp r12, r0
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    addgt r6, r1, #2
+; CHECK-NEXT:    addgt.w r10, r1, #2
 ; CHECK-NEXT:    csel r12, r0, r12, gt
 ; CHECK-NEXT:  .LBB0_10: @ %while.end
 ; CHECK-NEXT:    str.w r12, [r2]
-; CHECK-NEXT:    str r6, [r3]
+; CHECK-NEXT:    str.w r10, [r3]
+; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 entry:
   %0 = load i32, ptr %pSrc, align 4
diff --git a/llvm/test/CodeGen/X86/amx-tf32-internal.ll b/llvm/test/CodeGen/X86/amx-tf32-internal.ll
index 6d0f3c57c08d8..caf7a1cb7bd2d 100644
--- a/llvm/test/CodeGen/X86/amx-tf32-internal.ll
+++ b/llvm/test/CodeGen/X86/amx-tf32-internal.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+avx512f, \
-; RUN: -mattr=+amx-tf32,+amx-transpose -verify-machineinstrs | FileCheck %s
+; RUN: -mattr=+amx-tf32 -verify-machineinstrs | FileCheck %s
 
 define void @test_amx(i8* %pointer, i8* %base, i64 %stride) {
 ; CHECK-LABEL: test_amx:
@@ -20,7 +20,6 @@ define void @test_amx(i8* %pointer, i8* %base, i64 %stride) {
 ; CHECK-NEXT:    tilezero %tmm1
 ; CHECK-NEXT:    tilezero %tmm2
 ; CHECK-NEXT:    tmmultf32ps %tmm1, %tmm0, %tmm2
-; CHECK-NEXT:    ttmmultf32ps %tmm1, %tmm0, %tmm2
 ; CHECK-NEXT:    tilestored %tmm2, (%rdi,%rdx)
 ; CHECK-NEXT:    tilerelease
 ; CHECK-NEXT:    vzeroupper
@@ -31,9 +30,8 @@ define void @test_amx(i8* %pointer, i8* %base, i64 %stride) {
   %c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8)
 
   %c1 = call x86_amx @llvm.x86.tmmultf32ps.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b)
-  %c2 = call x86_amx @llvm.x86.ttmmultf32ps.internal(i16 8, i16 8, i16 8, x86_amx %c1, x86_amx %a, x86_amx %b)
 
-  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %c2)
+  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %c1)
   ret void
 }
 
@@ -43,4 +41,3 @@ declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
 
 
 declare x86_amx @llvm.x86.tmmultf32ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
-declare x86_amx @llvm.x86.ttmmultf32ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
diff --git a/llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll b/llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll
index af1a7ae102975..642c1b7317f81 100644
--- a/llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-tf32,+amx-transpose -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-tf32 -verify-machineinstrs | FileCheck %s
 
 define void @test_tmmultf32ps() {
 ; CHECK-LABEL: test_tmmultf32ps:
@@ -11,13 +11,3 @@ define void @test_tmmultf32ps() {
 }
 declare void @llvm.x86.tmmultf32ps(i8 %A, i8 %B, i8 %C)
 
-define void @test_ttmmultf32ps() {
-; CHECK-LABEL: test_ttmmultf32ps:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    ttmmultf32ps %tmm3, %tmm2, %tmm1
-; CHECK-NEXT:    retq
-  call void @llvm.x86.ttmmultf32ps(i8 1, i8 2, i8 3)
-  ret  void
-}
-declare void @llvm.x86.ttmmultf32ps(i8 %A, i8 %B, i8 %C)
-
diff --git a/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll
deleted file mode 100755
index 1f5758c804b2b..0000000000000
--- a/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll
+++ /dev/null
@@ -1,122 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O0
-; RUN: llc < %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O2
-; RUN: llc < %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs,+egpr --show-mc-encoding | FileCheck %s --check-prefix=EGPR
-
-define void @test_amx(i64 %stride, i8* %addr1) #0 {
-; CHECK-LABEL: test_amx:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    t2rpntlvwz0rs (%rsi,%rdi), %tmm0
-; CHECK-NEXT:    t2rpntlvwz0rst1 (%rsi,%rdi), %tmm2
-; CHECK-NEXT:    t2rpntlvwz1rs (%rsi,%rdi), %tmm0
-; CHECK-NEXT:    t2rpntlvwz1rst1 (%rsi,%rdi), %tmm2
-; CHECK-NEXT:    retq
-;
-; EGPR-LABEL: test_amx:
-; EGPR:       # %bb.0:
-; EGPR-NEXT:    t2rpntlvwz0rs (%rsi,%rdi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf8,0x04,0x3e]
-; EGPR-NEXT:    t2rpntlvwz0rst1 (%rsi,%rdi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf9,0x14,0x3e]
-; EGPR-NEXT:    t2rpntlvwz1rs (%rsi,%rdi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf8,0x04,0x3e]
-; EGPR-NEXT:    t2rpntlvwz1rst1 (%rsi,%rdi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf9,0x14,0x3e]
-; EGPR-NEXT:    retq # encoding: [0xc3]
-  call void @llvm.x86.t2rpntlvwz0rs(i8 1, i8* %addr1, i64 %stride)
-  call void @llvm.x86.t2rpntlvwz0rst1(i8 2, i8* %addr1, i64 %stride)
-  call void @llvm.x86.t2rpntlvwz1rs(i8 1, i8* %addr1, i64 %stride)
-  call void @llvm.x86.t2rpntlvwz1rst1(i8 2, i8* %addr1, i64 %stride)
-  ret void
-}
-declare void @llvm.x86.t2rpntlvwz0rs(i8 , i8* , i64 )
-declare void @llvm.x86.t2rpntlvwz0rst1(i8 , i8* , i64 )
-declare void @llvm.x86.t2rpntlvwz1rs(i8 , i8* , i64 )
-declare void @llvm.x86.t2rpntlvwz1rst1(i8 , i8* , i64 )
-
-define void @test_amx2(i8* %base, i64 %stride) #0 {
-; O0-LABEL: test_amx2:
-; O0:       # %bb.0:
-; O0-NEXT:    xorps %xmm0, %xmm0
-; O0-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
-; O0-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
-; O0-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
-; O0-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
-; O0-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
-; O0-NEXT:    movw $8, %ax
-; O0-NEXT:    # implicit-def: $al
-; O0-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; O0-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; O0-NEXT:    # implicit-def: $al
-; O0-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; O0-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; O0-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
-; O0-NEXT:    t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4
-; O0-NEXT:    movw $8, %ax
-; O0-NEXT:    # implicit-def: $al
-; O0-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; O0-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; O0-NEXT:    # implicit-def: $al
-; O0-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; O0-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; O0-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
-; O0-NEXT:    t2rpntlvwz1rs (%rdi,%rsi), %tmm4
-; O0-NEXT:    movw $8, %ax
-; O0-NEXT:    # implicit-def: $al
-; O0-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; O0-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; O0-NEXT:    # implicit-def: $al
-; O0-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; O0-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; O0-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
-; O0-NEXT:    t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4
-; O0-NEXT:    tilerelease
-; O0-NEXT:    retq
-;
-; O2-LABEL: test_amx2:
-; O2:       # %bb.0:
-; O2-NEXT:    xorps %xmm0, %xmm0
-; O2-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
-; O2-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
-; O2-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
-; O2-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
-; O2-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
-; O2-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
-; O2-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
-; O2-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
-; O2-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
-; O2-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
-; O2-NEXT:    movw $8, %ax
-; O2-NEXT:    t2rpntlvwz0rs (%rdi,%rsi), %tmm4
-; O2-NEXT:    t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4
-; O2-NEXT:    t2rpntlvwz1rs (%rdi,%rsi), %tmm4
-; O2-NEXT:    t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4
-; O2-NEXT:    tilerelease
-; O2-NEXT:    retq
-;
-; EGPR-LABEL: test_amx2:
-; EGPR:       # %bb.0:
-; EGPR-NEXT:    xorps %xmm0, %xmm0 # encoding: [0x0f,0x57,0xc0]
-; EGPR-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xc0]
-; EGPR-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xd0]
-; EGPR-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xe0]
-; EGPR-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xf0]
-; EGPR-NEXT:    movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xc0,0x01]
-; EGPR-NEXT:    movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf4,0x08]
-; EGPR-NEXT:    movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd8,0x08,0x00]
-; EGPR-NEXT:    movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf5,0x08]
-; EGPR-NEXT:    movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xda,0x08,0x00]
-; EGPR-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0xc0]
-; EGPR-NEXT:    movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00]
-; EGPR-NEXT:    t2rpntlvwz0rs (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf8,0x24,0x37]
-; EGPR-NEXT:    t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf9,0x24,0x37]
-; EGPR-NEXT:    t2rpntlvwz1rs (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf8,0x24,0x37]
-; EGPR-NEXT:    t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf9,0x24,0x37]
-; EGPR-NEXT:    tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0]
-; EGPR-NEXT:    retq # encoding: [0xc3]
-  call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
-  call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
-  call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
-  call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
-  ret void
-}
-declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16, i16, i16, i8*, i64)
-declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16, i16, i16, i8*, i64)
-declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16, i16, i16, i8*, i64)
-declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16, i16, i16, i8*, i64)
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll b/llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll
deleted file mode 100644
index 4f41410010302..0000000000000
--- a/llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll
+++ /dev/null
@@ -1,136 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \
-; RUN: -mattr=+amx-transpose -verify-machineinstrs | FileCheck %s
-
-@buf = dso_local global [2048 x i8] zeroinitializer, align 16
-@buf2 = dso_local global [2048 x i8] zeroinitializer, align 16
-
-define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 {
-; CHECK-LABEL: test_tile_2rpntlvwz0:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %rbp
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset %rbp, -16
-; CHECK-NEXT:    movq %rsp, %rbp
-; CHECK-NEXT:    .cfi_def_cfa_register %rbp
-; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    andq $-1024, %rsp # imm = 0xFC00
-; CHECK-NEXT:    subq $8192, %rsp # imm = 0x2000
-; CHECK-NEXT:    .cfi_offset %rbx, -24
-; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movb $1, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    # kill: def $dx killed $dx killed $edx
-; CHECK-NEXT:    movw %si, %cx
-; CHECK-NEXT:    movw %di, %ax
-; CHECK-NEXT:    # implicit-def: $al
-; CHECK-NEXT:    movb %al, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movw %dx, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    # implicit-def: $al
-; CHECK-NEXT:    movb %al, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movw %dx, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    # implicit-def: $al
-; CHECK-NEXT:    movb %al, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    # implicit-def: $cl
-; CHECK-NEXT:    movb %cl, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movw %dx, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    # implicit-def: $al
-; CHECK-NEXT:    movb %al, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    # implicit-def: $al
-; CHECK-NEXT:    movb %al, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    # implicit-def: $al
-; CHECK-NEXT:    movb %al, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    # implicit-def: $al
-; CHECK-NEXT:    movb %al, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movw %dx, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movl $buf, %esi
-; CHECK-NEXT:    movl $32, %edi
-; CHECK-NEXT:    t2rpntlvwz0 (%rsi,%rdi), %tmm4
-; CHECK-NEXT:    movabsq $64, %rbx
-; CHECK-NEXT:    tilestored %tmm5, (%rsp,%rbx) # 1024-byte Folded Spill
-; CHECK-NEXT:    tileloadd (%rsp,%rbx), %tmm0 # 1024-byte Folded Reload
-; CHECK-NEXT:    movabsq $64, %rbx
-; CHECK-NEXT:    tilestored %tmm4, 1024(%rsp,%rbx) # 1024-byte Folded Spill
-; CHECK-NEXT:    tileloadd 1024(%rsp,%rbx), %tmm1 # 1024-byte Folded Reload
-; CHECK-NEXT:    movl $64, %edi
-; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
-; CHECK-NEXT:    tilestored %tmm1, (%rsi,%rdi)
-; CHECK-NEXT:    movl $64, %edi
-; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
-; CHECK-NEXT:    tilestored %tmm0, (%rsi,%rdi)
-; CHECK-NEXT:    tilezero %tmm0
-; CHECK-NEXT:    movl $64, %edi
-; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
-; CHECK-NEXT:    tilestored %tmm0, (%rsi,%rdi)
-; CHECK-NEXT:    movl $64, %edi
-; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
-; CHECK-NEXT:    tileloadd (%rsi,%rdi), %tmm1
-; CHECK-NEXT:    movl $64, %edi
-; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
-; CHECK-NEXT:    tileloadd (%rsi,%rdi), %tmm2
-; CHECK-NEXT:    movl $64, %edi
-; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
-; CHECK-NEXT:    tileloadd (%rsi,%rdi), %tmm0
-; CHECK-NEXT:    tdpbssd %tmm2, %tmm1, %tmm0
-; CHECK-NEXT:    movl $64, %edi
-; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
-; CHECK-NEXT:    tilestored %tmm0, (%rsi,%rdi)
-; CHECK-NEXT:    movl $64, %edi
-; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
-; CHECK-NEXT:    tileloadd (%rsi,%rdi), %tmm0
-; CHECK-NEXT:    movl $buf2, %edx
-; CHECK-NEXT:    movl $32, %esi
-; CHECK-NEXT:    tilestored %tmm0, (%rdx,%rsi)
-; CHECK-NEXT:    leaq -8(%rbp), %rsp
-; CHECK-NEXT:    popq %rbx
-; CHECK-NEXT:    popq %rbp
-; CHECK-NEXT:    .cfi_def_cfa %rsp, 8
-; CHECK-NEXT:    tilerelease
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
-entry:
-  %0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr @buf, i64 32) #3
-  %1 = extractvalue { x86_amx, x86_amx } %0, 0
-  %2 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #3
-  %3 = extractvalue { x86_amx, x86_amx } %0, 1
-  %4 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #3
-  %5 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #3
-  %6 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #3
-  %7 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %6) #3
-  %8 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %2) #3
-  %9 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %4) #3
-  %10 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %7, x86_amx %8, x86_amx %9) #3
-  %11 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %10) #3
-  %12 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #3
-  tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, ptr @buf2, i64 32, x86_amx %12) #3
-  ret void
-}
-
-declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #1
-
-declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2
-
-declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3
-
-declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3
-
-declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2
-
-declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #4
-
-attributes #0 = { nounwind uwtable "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose" }
-attributes #1 = { argmemonly nofree nounwind readonly }
-attributes #2 = { nofree nosync nounwind readnone }
-attributes #3 = { nounwind }
-attributes #4 = { argmemonly nounwind writeonly }
-
-!llvm.module.flags = !{!0, !1, !2}
-
-!0 = !{i32 1, !"wchar_size", i32 4}
-!1 = !{i32 7, !"uwtable", i32 2}
-!2 = !{i32 7, !"frame-pointer", i32 2}
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir b/llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir
deleted file mode 100644
index ab12ab3a4f13d..0000000000000
--- a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir
+++ /dev/null
@@ -1,165 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \
-# RUN: -mattr=+amx-transpose -run-pass=fasttileconfig -o - %s | FileCheck %s
-
----
-name:            test_tile_2rpntlvwz0
-alignment:       16
-exposesReturnsTwice: false
-legalized:       false
-regBankSelected: false
-selected:        false
-failedISel:      false
-tracksRegLiveness: true
-hasWinCFI:       false
-callsEHReturn:   false
-callsUnwindInit: false
-hasEHContTarget: false
-hasEHScopes:     false
-hasEHFunclets:   false
-failsVerification: false
-tracksDebugUserValues: false
-registers:       []
-liveins:
-  - { reg: '$edi', virtual-reg: '' }
-  - { reg: '$esi', virtual-reg: '' }
-  - { reg: '$edx', virtual-reg: '' }
-frameInfo:
-  isFrameAddressTaken: false
-  isReturnAddressTaken: false
-  hasStackMap:     false
-  hasPatchPoint:   false
-  stackSize:       0
-  offsetAdjustment: 0
-  maxAlignment:    1024
-  adjustsStack:    false
-  hasCalls:        true
-  stackProtector:  ''
-  functionContext: ''
-  maxCallFrameSize: 4294967295
-  cvBytesOfCalleeSavedRegisters: 0
-  hasOpaqueSPAdjustment: false
-  hasVAStart:      false
-  hasMustTailInVarArgFunc: false
-  hasTailCall:     false
-  localFrameSize:  0
-  savePoint:       []
-  restorePoint:    []
-fixedStack:      []
-stack:
-  - { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 8,
-      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
-      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
-  - { id: 1, name: '', type: default, offset: 0, size: 8, alignment: 8,
-      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
-      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
-  - { id: 2, name: '', type: default, offset: 0, size: 8, alignment: 8,
-      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
-      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
-  - { id: 3, name: '', type: default, offset: 0, size: 8, alignment: 8,
-      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
-      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
-  - { id: 4, name: '', type: default, offset: 0, size: 64, alignment: 4,
-      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
-      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
-  - { id: 5, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2,
-      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
-      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
-  - { id: 6, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2,
-      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
-      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
-  - { id: 7, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8,
-      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
-      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
-callSites:       []
-debugValueSubstitutions: []
-constants:       []
-machineFunctionInfo:
-  amxProgModel: ManagedRA
-body:             |
-  bb.0.entry:
-    liveins: $rdi, $rsi, $rdx, $rax
-
-    ; CHECK-LABEL: name: test_tile_2rpntlvwz0
-    ; CHECK: liveins: $rdi, $rsi, $rdx, $rax
-    ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: renamable $zmm0 = AVX512_512_SET0
-    ; CHECK-NEXT: VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store (s512) into %stack.4, align 4)
-    ; CHECK-NEXT: MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4)
-    ; CHECK-NEXT: renamable $rcx = MOV32ri64 64
-    ; CHECK-NEXT: MOV64mr %stack.7, 1, $noreg, 0, $noreg, $rcx :: (store (s64) into %stack.7)
-    ; CHECK-NEXT: renamable $cx = MOV16ri 64
-    ; CHECK-NEXT: MOV16mr %stack.5, 1, $noreg, 0, $noreg, $cx :: (store (s16) into %stack.5)
-    ; CHECK-NEXT: renamable $cx = MOV16ri 16
-    ; CHECK-NEXT: renamable $r8w = MOV16ri 16
-    ; CHECK-NEXT: MOV16mr %stack.6, 1, $noreg, 0, $noreg, $r8w :: (store (s16) into %stack.6)
-    ; CHECK-NEXT: $al = IMPLICIT_DEF
-    ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.4 + 48, align 4)
-    ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.4 + 16, align 4)
-    ; CHECK-NEXT: $al = IMPLICIT_DEF
-    ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 50, $noreg, $al :: (store (s512) into %stack.4 + 50, align 2, basealign 4)
-    ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 20, $noreg, $cx :: (store (s512) into %stack.4 + 20, align 4)
-    ; CHECK-NEXT: $al = IMPLICIT_DEF
-    ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 49, $noreg, $al :: (store (s512) into %stack.4 + 49, align 1, basealign 4)
-    ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 18, $noreg, $di :: (store (s512) into %stack.4 + 18, align 2, basealign 4)
-    ; CHECK-NEXT: $al = IMPLICIT_DEF
-    ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.4 + 48, align 4)
-    ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.4 + 16, align 4)
-    ; CHECK-NEXT: $al = IMPLICIT_DEF
-    ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.4 + 48, align 4)
-    ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.4 + 16, align 4)
-    ; CHECK-NEXT: $al = IMPLICIT_DEF
-    ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 52, $noreg, $al :: (store (s512) into %stack.4 + 52, align 4)
-    ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 24, $noreg, $cx :: (store (s512) into %stack.4 + 24, align 4)
-    ; CHECK-NEXT: $al = IMPLICIT_DEF
-    ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 53, $noreg, $al :: (store (s512) into %stack.4 + 53, align 1, basealign 4)
-    ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 26, $noreg, $di :: (store (s512) into %stack.4 + 26, align 2, basealign 4)
-    ; CHECK-NEXT: PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.4, align 4)
-    ; CHECK-NEXT: renamable $r9 = COPY $rsi
-    ; CHECK-NEXT: $rsi = MOV64rm %stack.7, 1, $noreg, 0, $noreg :: (load (s64) from %stack.7)
-    ; CHECK-NEXT: renamable $r8 = COPY $rdi
-    ; CHECK-NEXT: $di = MOV16rm %stack.6, 1, $noreg, 0, $noreg :: (load (s16) from %stack.6)
-    ; CHECK-NEXT: renamable $r10 = COPY $rax
-    ; CHECK-NEXT: $ax = MOV16rm %stack.5, 1, $noreg, 0, $noreg :: (load (s16) from %stack.5)
-    ; CHECK-NEXT: renamable $tmm4_tmm5 = PT2RPNTLVWZ0V renamable $ax, renamable $cx, renamable $di, renamable $rdx, 1, killed renamable $r10, 0, $noreg
-    ; CHECK-NEXT: renamable $tmm0 = COPY renamable $tmm5
-    ; CHECK-NEXT: renamable $tmm1 = COPY renamable $tmm4, implicit killed $tmm4_tmm5
-    ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $cx, renamable $r9, 1, renamable $rsi, 0, $noreg, killed renamable $tmm1
-    ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $di, renamable $r8, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0
-    ; CHECK-NEXT: renamable $tmm0 = PTILEZEROV renamable $ax, renamable $cx
-    ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0
-    ; CHECK-NEXT: renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $r9, 1, renamable $rsi, 0, $noreg
-    ; CHECK-NEXT: renamable $tmm1 = PTILELOADDV renamable $ax, renamable $di, killed renamable $r8, 1, renamable $rsi, 0, $noreg
-    ; CHECK-NEXT: renamable $tmm2 = PTILELOADDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg
-    ; CHECK-NEXT: renamable $tmm0 = PTDPBSSDV renamable $ax, renamable $cx, killed renamable $di, renamable $tmm0, killed renamable $tmm1, killed renamable $tmm2
-    ; CHECK-NEXT: PTILESTOREDV killed renamable $ax, killed renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0
-    renamable $zmm0 = AVX512_512_SET0
-    VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store (s512) into %stack.4, align 4)
-    MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4)
-    renamable $rcx = MOV32ri64 64
-    MOV64mr %stack.7, 1, $noreg, 0, $noreg, $rcx :: (store (s64) into %stack.7)
-    renamable $cx = MOV16ri 64
-    MOV16mr %stack.5, 1, $noreg, 0, $noreg, $cx :: (store (s16) into %stack.5)
-    renamable $cx = MOV16ri 16
-    renamable $r8w = MOV16ri 16
-    MOV16mr %stack.6, 1, $noreg, 0, $noreg, $r8w :: (store (s16) into %stack.6)
-    PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.4, align 4)
-    renamable $r9 = COPY $rsi
-    $rsi = MOV64rm %stack.7, 1, $noreg, 0, $noreg :: (load (s64) from %stack.7)
-    renamable $r8 = COPY $rdi
-    $di = MOV16rm %stack.6, 1, $noreg, 0, $noreg :: (load (s16) from %stack.6)
-    renamable $r10 = COPY $rax
-    $ax = MOV16rm %stack.5, 1, $noreg, 0, $noreg :: (load (s16) from %stack.5)
-    renamable $tmm4_tmm5 = PT2RPNTLVWZ0V renamable $ax, renamable $cx, renamable $di, renamable $rdx, 1, killed renamable $r10, 0, $noreg
-    renamable $tmm0 = COPY renamable $tmm5
-    renamable $tmm1 = COPY renamable $tmm4, implicit killed $tmm4_tmm5
-    PTILESTOREDV renamable $ax, renamable $cx, renamable $r9, 1, renamable $rsi, 0, $noreg, killed renamable $tmm1
-    PTILESTOREDV renamable $ax, renamable $di, renamable $r8, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0
-    renamable $tmm0 = PTILEZEROV renamable $ax, renamable $cx
-    PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0
-    renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $r9, 1, renamable $rsi, 0, $noreg
-    renamable $tmm1 = PTILELOADDV renamable $ax, renamable $di, killed renamable $r8, 1, renamable $rsi, 0, $noreg
-    renamable $tmm2 = PTILELOADDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg
-    renamable $tmm0 = PTDPBSSDV renamable $ax, renamable $cx, killed renamable $di, renamable $tmm0, killed renamable $tmm1, killed renamable $tmm2
-    PTILESTOREDV killed renamable $ax, killed renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0
-...
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir b/llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir
deleted file mode 100644
index c7d241f8a98b6..0000000000000
--- a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir
+++ /dev/null
@@ -1,153 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \
-# RUN: -mattr=+amx-transpose -run-pass=greedy,tileconfig -o - %s | FileCheck %s
-
---- |
-  @buf = dso_local global [2048 x i8] zeroinitializer, align 16
-  @buf2 = dso_local global [2048 x i8] zeroinitializer, align 16
-
-  define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 {
-  entry:
-    %0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, i8* getelementptr inbounds ([2048 x i8], [2048 x i8]* @buf, i64 0, i64 0), i64 32) #5
-    %1 = extractvalue { x86_amx, x86_amx } %0, 0
-    %2 = extractvalue { x86_amx, x86_amx } %0, 1
-    %3 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #5
-    %4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %3, x86_amx %1, x86_amx %2) #5
-    tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, i8* getelementptr inbounds ([2048 x i8], [2048 x i8]* @buf2, i64 0, i64 0), i64 32, x86_amx %4) #5
-    ret void
-  }
-
-  declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, i8*, i64) #1
-
-  declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2
-
-  declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3
-
-  declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3
-
-  declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2
-
-  declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) #4
-
-  attributes #0 = { nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="8192" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose,+avx,+avx2,+avx512f,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+amx-tile,+amx-bf16,+avx512f,+amx-transpose" "tune-cpu"="generic" }
-  attributes #1 = { argmemonly nounwind readonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
-  attributes #2 = { nounwind readnone "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
-  attributes #3 = { nounwind "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
-  attributes #4 = { argmemonly nounwind writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
-  attributes #5 = { nounwind }
-
-...
----
-name:            test_tile_2rpntlvwz0
-alignment:       16
-exposesReturnsTwice: false
-legalized:       false
-regBankSelected: false
-selected:        false
-failedISel:      false
-tracksRegLiveness: true
-hasWinCFI:       false
-callsEHReturn:   false
-callsUnwindInit: false
-hasEHContTarget: false
-hasEHScopes:     false
-hasEHFunclets:   false
-failsVerification: false
-tracksDebugUserValues: false
-registers:
-  - { id: 0, class: gr32, preferred-register: '' }
-  - { id: 1, class: gr32, preferred-register: '' }
-  - { id: 2, class: gr32, preferred-register: '' }
-  - { id: 3, class: gr16, preferred-register: '' }
-  - { id: 4, class: gr16, preferred-register: '' }
-  - { id: 5, class: gr16, preferred-register: '' }
-  - { id: 6, class: gr64, preferred-register: '' }
-  - { id: 7, class: gr64_nosp, preferred-register: '' }
-  - { id: 8, class: tilepair, preferred-register: '' }
-  - { id: 9, class: tile, preferred-register: '' }
-  - { id: 10, class: tile, preferred-register: '' }
-  - { id: 11, class: tile, preferred-register: '' }
-  - { id: 12, class: tile, preferred-register: '' }
-  - { id: 13, class: gr64, preferred-register: '' }
-  - { id: 14, class: vr512, preferred-register: '' }
-liveins:
-  - { reg: '$edi', virtual-reg: '%0' }
-  - { reg: '$esi', virtual-reg: '%1' }
-  - { reg: '$edx', virtual-reg: '%2' }
-frameInfo:
-  isFrameAddressTaken: false
-  isReturnAddressTaken: false
-  hasStackMap:     false
-  hasPatchPoint:   false
-  stackSize:       0
-  offsetAdjustment: 0
-  maxAlignment:    4
-  adjustsStack:    false
-  hasCalls:        false
-  stackProtector:  ''
-  functionContext: ''
-  maxCallFrameSize: 4294967295
-  cvBytesOfCalleeSavedRegisters: 0
-  hasOpaqueSPAdjustment: false
-  hasVAStart:      false
-  hasMustTailInVarArgFunc: false
-  hasTailCall:     false
-  localFrameSize:  0
-  savePoint:       []
-  restorePoint:    []
-fixedStack:      []
-stack:
-  - { id: 0, name: '', type: default, offset: 0, size: 64, alignment: 4,
-      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
-      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
-callSites:       []
-debugValueSubstitutions: []
-constants:       []
-machineFunctionInfo:
-  amxProgModel: ManagedRA
-body:             |
-  bb.0.entry:
-    liveins: $edi, $esi, $edx
-
-
-    ; CHECK-LABEL: name: test_tile_2rpntlvwz0
-    ; CHECK: liveins: $edi, $esi, $edx
-    ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr32 = COPY $edi
-    ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0
-    ; CHECK-NEXT: VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.0, align 4)
-    ; CHECK-NEXT: MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4)
-    ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 26, $noreg, [[COPY]].sub_16bit :: (store (s512) into %stack.0 + 26, align 2, basealign 4)
-    ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 53, $noreg, [[COPY2]].sub_8bit :: (store (s512) into %stack.0 + 53, align 1, basealign 4)
-    ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 24, $noreg, [[COPY1]].sub_16bit :: (store (s512) into %stack.0 + 24, align 4)
-    ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 52, $noreg, [[COPY2]].sub_8bit :: (store (s512) into %stack.0 + 52, align 4)
-    ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 16, $noreg, [[COPY]].sub_16bit :: (store (s512) into %stack.0 + 16, align 4)
-    ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 48, $noreg, [[COPY2]].sub_8bit :: (store (s512) into %stack.0 + 48, align 4)
-    ; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4)
-    ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64 = MOV32ri64 @buf
-    ; CHECK-NEXT: [[MOV32ri64_1:%[0-9]+]]:gr64_nosp = MOV32ri64 32
-    ; CHECK-NEXT: [[PT2RPNTLVWZ0V:%[0-9]+]]:tilepair = PT2RPNTLVWZ0V [[COPY2]].sub_16bit, [[COPY1]].sub_16bit, [[COPY]].sub_16bit, [[MOV32ri64_]], 1, [[MOV32ri64_1]], 0, $noreg
-    ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY2]].sub_16bit, [[COPY1]].sub_16bit
-    ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTDPBSSDV [[COPY2]].sub_16bit, [[COPY]].sub_16bit, [[COPY1]].sub_16bit, [[PTILEZEROV]], [[PT2RPNTLVWZ0V]].sub_t0, [[PT2RPNTLVWZ0V]].sub_t1
-    ; CHECK-NEXT: [[MOV32ri64_2:%[0-9]+]]:gr64 = MOV32ri64 @buf2
-    ; CHECK-NEXT: PTILESTOREDV [[COPY2]].sub_16bit, [[COPY1]].sub_16bit, [[MOV32ri64_2]], 1, [[MOV32ri64_1]], 0, $noreg, [[PTILEZEROV]]
-    ; CHECK-NEXT: RET 0
-    %2:gr32 = COPY $edx
-    %1:gr32 = COPY $esi
-    %0:gr32 = COPY $edi
-    %14:vr512 = AVX512_512_SET0
-    VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, %14 :: (store (s512) into %stack.0, align 4)
-    MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4)
-    PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4)
-    %6:gr64 = MOV32ri64 @buf
-    %7:gr64_nosp = MOV32ri64 32
-    %8:tilepair = PT2RPNTLVWZ0V %0.sub_16bit, %1.sub_16bit, %2.sub_16bit, %6, 1, %7, 0, $noreg
-    %12:tile = PTILEZEROV %0.sub_16bit, %1.sub_16bit
-    %12:tile = PTDPBSSDV %0.sub_16bit, %2.sub_16bit, %1.sub_16bit, %12, %8.sub_t0, %8.sub_t1
-    %13:gr64 = MOV32ri64 @buf2
-    PTILESTOREDV %0.sub_16bit, %1.sub_16bit, %13, 1, %7, 0, $noreg, %12
-    RET 0
-
-...
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_copy.mir b/llvm/test/CodeGen/X86/amx_tile_pair_copy.mir
deleted file mode 100644
index 66b15aa5b3cde..0000000000000
--- a/llvm/test/CodeGen/X86/amx_tile_pair_copy.mir
+++ /dev/null
@@ -1,97 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \
-# RUN: -mattr=+amx-transpose -run-pass=lowertilecopy -o - %s | FileCheck %s
-
----
-name:            test_tile_2rpntlvwz0
-alignment:       16
-exposesReturnsTwice: false
-legalized:       false
-regBankSelected: false
-selected:        false
-failedISel:      false
-tracksRegLiveness: true
-hasWinCFI:       false
-callsEHReturn:   false
-callsUnwindInit: false
-hasEHContTarget: false
-hasEHScopes:     false
-hasEHFunclets:   false
-failsVerification: false
-tracksDebugUserValues: false
-registers:       []
-liveins:
-  - { reg: '$edi', virtual-reg: '' }
-  - { reg: '$esi', virtual-reg: '' }
-  - { reg: '$edx', virtual-reg: '' }
-  - { reg: '$cx', virtual-reg: '' }
-  - { reg: '$r9', virtual-reg: '' }
-  - { reg: '$r10', virtual-reg: '' }
-frameInfo:
-  isFrameAddressTaken: false
-  isReturnAddressTaken: false
-  hasStackMap:     false
-  hasPatchPoint:   false
-  stackSize:       0
-  offsetAdjustment: 0
-  maxAlignment:    1024
-  adjustsStack:    false
-  hasCalls:        true
-  stackProtector:  ''
-  functionContext: ''
-  maxCallFrameSize: 4294967295
-  cvBytesOfCalleeSavedRegisters: 0
-  hasOpaqueSPAdjustment: false
-  hasVAStart:      false
-  hasMustTailInVarArgFunc: false
-  hasTailCall:     false
-  localFrameSize:  0
-  savePoint:       []
-  restorePoint:    []
-fixedStack:      []
-stack:
-  - { id: 43, name: '', type: default, offset: 0, size: 64, alignment: 4,
-      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
-      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
-  - { id: 68, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8,
-      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
-      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
-callSites:       []
-debugValueSubstitutions: []
-constants:       []
-machineFunctionInfo:
-  amxProgModel: ManagedRA
-body:             |
-  bb.0.entry:
-    liveins: $edi, $esi, $edx, $cx, $di, $r8w, $r11, $r10, $rbx, $r8, $r9
-
-
-    ; CHECK-LABEL: name: test_tile_2rpntlvwz0
-    ; CHECK: liveins: $edi, $esi, $edx, $cx, $di, $r8w, $r11, $r10, $rbx, $r8, $r9
-    ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4)
-    ; CHECK-NEXT: renamable $tmm4_tmm5 = PT2RPNTLVWZ0V killed renamable $cx, killed renamable $di, killed renamable $r8w, killed renamable $r11, 1, killed renamable $rbx, 0, $noreg
-    ; CHECK-NEXT: $rax = MOV64ri 64
-    ; CHECK-NEXT: TILESTORED %stack.3, 1, $rax, 0, $noreg, $tmm5 :: (store (s8192) into %stack.3)
-    ; CHECK-NEXT: $tmm0 = TILELOADD %stack.3, 1, killed $rax, 0, $noreg :: (load (s8192) from %stack.3)
-    ; CHECK-NEXT: $rax = MOV64ri 64
-    ; CHECK-NEXT: TILESTORED %stack.2, 1, $rax, 0, $noreg, $tmm4 :: (store (s8192) into %stack.2)
-    ; CHECK-NEXT: $tmm1 = TILELOADD %stack.2, 1, killed $rax, 0, $noreg :: (load (s8192) from %stack.2)
-    ; CHECK-NEXT: renamable $r8 = MOV32ri64 64
-    ; CHECK-NEXT: MOV64mr %stack.1, 1, $noreg, 0, $noreg, $r8 :: (store (s64) into %stack.1)
-    ; CHECK-NEXT: renamable $di = MOV16ri 64
-    ; CHECK-NEXT: renamable $cx = MOV16ri 16
-    ; CHECK-NEXT: PTILESTOREDV renamable $cx, renamable $di, killed renamable $r10, 1, renamable $r8, 0, $noreg, killed renamable $tmm1
-    ; CHECK-NEXT: PTILESTOREDV killed renamable $cx, killed renamable $di, killed renamable $r9, 1, renamable $r8, 0, $noreg, killed renamable $tmm0
-    PLDTILECFGV %stack.43, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.43, align 4)
-    renamable $tmm4_tmm5 = PT2RPNTLVWZ0V killed renamable $cx, killed renamable $di, killed renamable $r8w, killed renamable $r11, 1, killed renamable $rbx, 0, $noreg
-    renamable $tmm0 = COPY renamable $tmm5
-    renamable $tmm1 = COPY renamable $tmm4, implicit killed $tmm4_tmm5
-    renamable $r8 = MOV32ri64 64
-    MOV64mr %stack.68, 1, $noreg, 0, $noreg, $r8 :: (store (s64) into %stack.68)
-    renamable $di = MOV16ri 64
-    renamable $cx = MOV16ri 16
-    PTILESTOREDV renamable $cx, renamable $di, killed renamable $r10, 1, renamable $r8, 0, $noreg, killed renamable $tmm1
-    PTILESTOREDV killed renamable $cx, killed renamable $di, killed renamable $r9, 1, renamable $r8, 0, $noreg, killed renamable $tmm0
-
-...
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll b/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll
deleted file mode 100644
index 3549875e858a9..0000000000000
--- a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll
+++ /dev/null
@@ -1,87 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-  ; RUN: opt --codegen-opt-level=0 -mtriple=x86_64 -x86-lower-amx-type %s -S | FileCheck %s
-  ; RUN: opt --codegen-opt-level=0 -mtriple=x86_64 -passes=x86-lower-amx-type %s -S | FileCheck %s
-
-  @buf = dso_local global [2048 x i8] zeroinitializer, align 16
-
-  ; Function Attrs: noinline nounwind optnone uwtable
-  define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1, ptr %m) #0 {
-; CHECK-LABEL: @test_tile_2rpntlvwz0(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = udiv i16 [[COL1:%.*]], 4
-; CHECK-NEXT:    [[TMP1:%.*]] = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 [[ROW:%.*]], i16 [[COL0:%.*]], i16 [[COL1]], ptr @buf, i64 32) #[[ATTR3:[0-9]+]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP1]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[COL0]] to i64
-; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M:%.*]], i64 [[TMP3]], x86_amx [[TMP2]])
-; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP1]], 1
-; CHECK-NEXT:    [[TMP6:%.*]] = sext i16 [[COL1]] to i64
-; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL1]], ptr [[M]], i64 [[TMP6]], x86_amx [[TMP5]])
-; CHECK-NEXT:    [[TMP8:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 [[ROW]], i16 [[COL0]]) #[[ATTR3]]
-; CHECK-NEXT:    [[TMP9:%.*]] = sext i16 [[COL0]] to i64
-; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M]], i64 [[TMP9]], x86_amx [[TMP8]])
-; CHECK-NEXT:    [[TMP11:%.*]] = sext i16 [[COL0]] to i64
-; CHECK-NEXT:    [[TMP13:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M]], i64 [[TMP11]])
-; CHECK-NEXT:    [[TMP14:%.*]] = sext i16 [[COL1]] to i64
-; CHECK-NEXT:    [[TMP16:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 [[COL1]], ptr [[M]], i64 [[TMP14]])
-; CHECK-NEXT:    [[TMP17:%.*]] = sext i16 [[COL0]] to i64
-; CHECK-NEXT:    [[TMP19:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP0]], i16 [[COL0]], ptr [[M]], i64 [[TMP17]])
-; CHECK-NEXT:    [[TMP20:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 [[ROW]], i16 [[COL0]], i16 [[COL1]], x86_amx [[TMP13]], x86_amx [[TMP16]], x86_amx [[TMP19]]) #[[ATTR3]]
-; CHECK-NEXT:    [[TMP21:%.*]] = sext i16 [[COL0]] to i64
-; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M]], i64 [[TMP21]], x86_amx [[TMP20]])
-; CHECK-NEXT:    ret void
-;
-  entry:
-
-  %0 =  call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr getelementptr inbounds ([2048 x i8], ptr @buf, i64     0, i64 0), i64 32) #7
-  %1 = extractvalue { x86_amx, x86_amx } %0, 0
-  %2 =  call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #7
-  store <256 x i32> %2, ptr %m, align 1024
-
-  %3 = extractvalue { x86_amx, x86_amx } %0, 1
-  %4 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #7
-  store <256 x i32> %4, ptr %m, align 1024
-
-  %5 = call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #7
-  %6 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #7
-  store <256 x i32> %6, ptr %m, align 64
-
-  %7 = load <256 x i32>, ptr %m, align 64
-  %8 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %7) #7
-  %9 = load <256 x i32>, ptr %m, align 64
-  %10 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %9) #7
-  %11 = load <256 x i32>, ptr %m, align 64
-  %12 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #7
-
-  %13 = call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col0, i16 %col1, x86_amx %8, x86_amx %10, x86_amx %12) #7
-  %14 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %13) #7
-  store <256 x i32> %14, ptr %m, align 64
-
-  ret void
-  }
-
-  ; Function Attrs: argmemonly nounwind readonly
-  declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #2
-
-  ; Function Attrs: nounwind readnone
-  declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #3
-
-  ; Function Attrs: nounwind
-  declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #4
-
-  ; Function Attrs: nounwind
-  declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #4
-
-  ; Function Attrs: nounwind readnone
-  declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #3
-
-  ; Function Attrs: argmemonly nounwind writeonly
-  declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #5
-
-  attributes #0 = { noinline nounwind optnone uwtable "frame-pointer"="all" "min-legal-vector-width"="8192" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose,+avx,+avx2,+avx512f,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+amx-tile,+amx-bf16,+avx512f,+amx-transpose" "tune-cpu"="generic" }
-  attributes #1 = { argmemonly nofree nounwind willreturn writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
-  attributes #2 = { argmemonly nounwind readonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
-  attributes #3 = { nounwind readnone "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
-  attributes #4 = { nounwind "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
-  attributes #5 = { argmemonly nounwind writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
-  attributes #6 = { argmemonly nofree nounwind willreturn "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
-  attributes #7 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll b/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll
deleted file mode 100644
index 96966264e0515..0000000000000
--- a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll
+++ /dev/null
@@ -1,61 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -x86-lower-amx-type %s -S | FileCheck %s
-; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -passes=x86-lower-amx-type %s -S | FileCheck %s
-
-  @buf = dso_local global [2048 x i8] zeroinitializer, align 16
-  @buf2 = dso_local global [2048 x i8] zeroinitializer, align 16
-
-  ; Function Attrs: nounwind uwtable
-  define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 {
-; CHECK-LABEL: @test_tile_2rpntlvwz0(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 [[ROW:%.*]], i16 [[COL0:%.*]], i16 [[COL1:%.*]], ptr @buf, i64 32) #[[ATTR3:[0-9]+]]
-; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP0]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP0]], 1
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_amx @llvm.x86.tilezero.internal(i16 [[ROW]], i16 [[COL0]]) #[[ATTR3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 [[ROW]], i16 [[COL1]], i16 [[COL0]], x86_amx [[TMP3]], x86_amx [[TMP1]], x86_amx [[TMP2]]) #[[ATTR3]]
-; CHECK-NEXT:    tail call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr @buf2, i64 32, x86_amx [[TMP4]]) #[[ATTR3]]
-; CHECK-NEXT:    ret void
-;
-  entry:
-  %0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr @buf, i64 32) #5
-  %1 = extractvalue { x86_amx, x86_amx } %0, 0
-  %2 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #5
-  %3 = extractvalue { x86_amx, x86_amx } %0, 1
-  %4 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #5
-  %5 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #5
-  %6 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #5
-  %7 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %6) #5
-  %8 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %2) #5
-  %9 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %4) #5
-  %10 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %7, x86_amx %8, x86_amx %9) #5
-  %11 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %10) #5
-  %12 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #5
-  tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, ptr @buf2, i64 32, x86_amx %12) #5
-  ret void
-  }
-
-  ; Function Attrs: argmemonly nounwind readonly
-  declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #1
-
-  ; Function Attrs: nounwind readnone
-  declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2
-
-  ; Function Attrs: nounwind
-  declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3
-
-  ; Function Attrs: nounwind
-  declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3
-
-  ; Function Attrs: nounwind readnone
-  declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2
-
-  ; Function Attrs: argmemonly nounwind writeonly
-  declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #4
-
-  attributes #0 = { nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="8192" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose,+avx,+avx2,+avx512f,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+amx-tile,+amx-bf16,+avx512f,+amx-transpose" "tune-cpu"="generic" }
-  attributes #1 = { argmemonly nounwind readonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
-  attributes #2 = { nounwind readnone "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
-  attributes #3 = { nounwind "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
-  attributes #4 = { argmemonly nounwind writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
-  attributes #5 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir b/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir
deleted file mode 100644
index 1e3b242bca96c..0000000000000
--- a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir
+++ /dev/null
@@ -1,134 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \
-# RUN: -mattr=+amx-transpose -run-pass=fastpretileconfig -o - %s | FileCheck %s
-
----
-name:            test_tile_2rpntlvwz0
-alignment:       16
-exposesReturnsTwice: false
-legalized:       false
-regBankSelected: false
-selected:        false
-failedISel:      false
-tracksRegLiveness: true
-hasWinCFI:       false
-callsEHReturn:   false
-callsUnwindInit: false
-hasEHContTarget: false
-hasEHScopes:     false
-hasEHFunclets:   false
-failsVerification: false
-tracksDebugUserValues: false
-registers:
-  - { id: 0, class: gr64_nosp, preferred-register: '' }
-  - { id: 1, class: gr16, preferred-register: '' }
-  - { id: 2, class: gr16, preferred-register: '' }
-  - { id: 3, class: gr16, preferred-register: '' }
-  - { id: 4, class: gr64, preferred-register: '' }
-  - { id: 5, class: gr64, preferred-register: '' }
-  - { id: 6, class: gr64, preferred-register: '' }
-  - { id: 7, class: gr64_nosp, preferred-register: '' }
-  - { id: 8, class: tilepair, preferred-register: '' }
-  - { id: 9, class: tile, preferred-register: '' }
-  - { id: 10, class: tile, preferred-register: '' }
-  - { id: 11, class: tile, preferred-register: '' }
-  - { id: 181, class: tile, preferred-register: '' }
-  - { id: 183, class: tile, preferred-register: '' }
-  - { id: 185, class: tile, preferred-register: '' }
-  - { id: 186, class: tile, preferred-register: '' }
-liveins:
-  - { reg: '$edi', virtual-reg: '%0' }
-  - { reg: '$esi', virtual-reg: '%1' }
-  - { reg: '$edx', virtual-reg: '%2' }
-frameInfo:
-  isFrameAddressTaken: false
-  isReturnAddressTaken: false
-  hasStackMap:     false
-  hasPatchPoint:   false
-  stackSize:       0
-  offsetAdjustment: 0
-  maxAlignment:    1024
-  adjustsStack:    false
-  hasCalls:        true
-  stackProtector:  ''
-  functionContext: ''
-  maxCallFrameSize: 4294967295
-  cvBytesOfCalleeSavedRegisters: 0
-  hasOpaqueSPAdjustment: false
-  hasVAStart:      false
-  hasMustTailInVarArgFunc: false
-  hasTailCall:     false
-  localFrameSize:  0
-  savePoint:       []
-  restorePoint:    []
-fixedStack:      []
-stack:
-  - { id: 18, name: '', type: default, offset: 0, size: 8, alignment: 8,
-      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
-      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
-  - { id: 19, name: '', type: default, offset: 0, size: 8, alignment: 8,
-      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
-      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
-  - { id: 20, name: '', type: default, offset: 0, size: 8, alignment: 8,
-      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
-      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
-  - { id: 21, name: '', type: default, offset: 0, size: 8,
-      alignment: 8, stack-id: default, callee-saved-register: '', callee-saved-restored: true,
-      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
-callSites:       []
-debugValueSubstitutions: []
-constants:       []
-machineFunctionInfo:
-  amxProgModel: ManagedRA
-body:             |
-  bb.0.entry:
-    liveins: $rdi, $rsi, $rdx, $rax
-
-    ; CHECK-LABEL: name: test_tile_2rpntlvwz0
-    ; CHECK: liveins: $rdi, $rsi, $rdx, $rax
-    ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0
-    ; CHECK-NEXT: VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.4, align 4)
-    ; CHECK-NEXT: MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4)
-    ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64_nosp = MOV32ri64 64
-    ; CHECK-NEXT: [[MOV16ri:%[0-9]+]]:gr16 = MOV16ri 64
-    ; CHECK-NEXT: [[MOV16ri1:%[0-9]+]]:gr16 = MOV16ri 16
-    ; CHECK-NEXT: [[MOV16ri2:%[0-9]+]]:gr16 = MOV16ri 16
-    ; CHECK-NEXT: PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def $tmm0, implicit-def $tmm1, implicit-def $tmm2, implicit-def $tmm3, implicit-def $tmm4, implicit-def $tmm5, implicit-def $tmm6, implicit-def $tmm7 :: (load (s512) from %stack.4, align 4)
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rsi
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rdi
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64 = COPY $rdx
-    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64_nosp = COPY $rax
-    ; CHECK-NEXT: [[PT2RPNTLVWZ0V:%[0-9]+]]:tilepair = PT2RPNTLVWZ0V [[MOV16ri]], [[MOV16ri1]], [[MOV16ri2]], [[COPY2]], 1, killed [[COPY3]], 0, $noreg
-    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t1
-    ; CHECK-NEXT: [[COPY5:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t0
-    ; CHECK-NEXT: PTILESTOREDV [[MOV16ri]], [[MOV16ri1]], [[COPY]], 1, [[MOV32ri64_]], 0, $noreg, killed [[COPY5]]
-    ; CHECK-NEXT: PTILESTOREDV [[MOV16ri]], [[MOV16ri2]], [[COPY1]], 1, [[MOV32ri64_]], 0, $noreg, killed [[COPY4]]
-    ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[MOV16ri]], [[MOV16ri1]]
-    ; CHECK-NEXT: PTILESTOREDV [[MOV16ri]], [[MOV16ri1]], [[COPY2]], 1, [[MOV32ri64_]], 0, $noreg, killed [[PTILEZEROV]]
-    ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri]], [[MOV16ri1]], [[COPY]], 1, [[MOV32ri64_]], 0, $noreg
-    ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri]], [[MOV16ri2]], [[COPY1]], 1, [[MOV32ri64_]], 0, $noreg
-    ; CHECK-NEXT: [[PTILELOADDV2:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri]], [[MOV16ri1]], [[COPY2]], 1, [[MOV32ri64_]], 0, $noreg
-    ; CHECK-NEXT: [[PTDPBSSDV:%[0-9]+]]:tile = PTDPBSSDV [[MOV16ri]], [[MOV16ri1]], [[MOV16ri2]], [[PTILELOADDV]], killed [[PTILELOADDV1]], killed [[PTILELOADDV2]]
-    ; CHECK-NEXT: PTILESTOREDV killed [[MOV16ri]], killed [[MOV16ri1]], killed [[COPY2]], 1, killed [[MOV32ri64_]], 0, $noreg, killed [[PTDPBSSDV]]
-    %0:gr64_nosp = MOV32ri64 64
-    %1:gr16 = MOV16ri 64
-    %2:gr16 = MOV16ri 16
-    %3:gr16 = MOV16ri 16
-    %4:gr64 = COPY $rsi
-    %5:gr64 = COPY $rdi
-    %6:gr64 = COPY $rdx
-    %7:gr64_nosp = COPY $rax
-    %8:tilepair = PT2RPNTLVWZ0V %1, %2, %3, %6, 1, killed %7, 0, $noreg
-    %9:tile = COPY %8.sub_t1
-    %10:tile = COPY %8.sub_t0
-    PTILESTOREDV %1, %2, %4, 1, %0, 0, $noreg, killed %10
-    PTILESTOREDV %1, %3, %5, 1, %0, 0, $noreg, killed %9
-    %11:tile = PTILEZEROV %1, %2
-    PTILESTOREDV %1, %2, %6, 1, %0, 0, $noreg, killed %11
-    %181:tile = PTILELOADDV %1, %2, %4, 1, %0, 0, $noreg
-    %183:tile = PTILELOADDV %1, %3, %5, 1, %0, 0, $noreg
-    %185:tile = PTILELOADDV %1, %2, %6, 1, %0, 0, $noreg
-    %186:tile = PTDPBSSDV %1, %2, %3, %181, killed %183, killed %185
-    PTILESTOREDV killed %1, killed %2, killed %6, 1, killed %0, 0, $noreg, killed %186
-...
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir b/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir
deleted file mode 100644
index ac2cdb4a50568..0000000000000
--- a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir
+++ /dev/null
@@ -1,113 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \
-# RUN: -mattr=+amx-transpose -run-pass=tilepreconfig -o - %s | FileCheck %s
-
----
-name:            test_tile_2rpntlvwz0
-alignment:       16
-exposesReturnsTwice: false
-legalized:       false
-regBankSelected: false
-selected:        false
-failedISel:      false
-tracksRegLiveness: true
-hasWinCFI:       false
-callsEHReturn:   false
-callsUnwindInit: false
-hasEHContTarget: false
-hasEHScopes:     false
-hasEHFunclets:   false
-failsVerification: false
-tracksDebugUserValues: false
-registers:
-  - { id: 0, class: gr32, preferred-register: '' }
-  - { id: 1, class: gr32, preferred-register: '' }
-  - { id: 2, class: gr32, preferred-register: '' }
-  - { id: 3, class: gr16, preferred-register: '' }
-  - { id: 4, class: gr16, preferred-register: '' }
-  - { id: 5, class: gr16, preferred-register: '' }
-  - { id: 6, class: gr64, preferred-register: '' }
-  - { id: 7, class: gr64_nosp, preferred-register: '' }
-  - { id: 8, class: tilepair, preferred-register: '' }
-  - { id: 9, class: tile, preferred-register: '' }
-  - { id: 10, class: tile, preferred-register: '' }
-  - { id: 11, class: tile, preferred-register: '' }
-  - { id: 12, class: tile, preferred-register: '' }
-  - { id: 13, class: gr64, preferred-register: '' }
-liveins:
-  - { reg: '$edi', virtual-reg: '%0' }
-  - { reg: '$esi', virtual-reg: '%1' }
-  - { reg: '$edx', virtual-reg: '%2' }
-frameInfo:
-  isFrameAddressTaken: false
-  isReturnAddressTaken: false
-  hasStackMap:     false
-  hasPatchPoint:   false
-  stackSize:       0
-  offsetAdjustment: 0
-  maxAlignment:    1
-  adjustsStack:    false
-  hasCalls:        false
-  stackProtector:  ''
-  functionContext: ''
-  maxCallFrameSize: 4294967295
-  cvBytesOfCalleeSavedRegisters: 0
-  hasOpaqueSPAdjustment: false
-  hasVAStart:      false
-  hasMustTailInVarArgFunc: false
-  hasTailCall:     false
-  localFrameSize:  0
-  savePoint:       []
-  restorePoint:    []
-fixedStack:      []
-stack:           []
-callSites:       []
-debugValueSubstitutions: []
-constants:       []
-machineFunctionInfo:
-  amxProgModel: ManagedRA
-body:             |
-  bb.0.entry:
-    liveins: $edi, $esi, $edx, $rax, $rbx
-
-    ; CHECK-LABEL: name: test_tile_2rpntlvwz0
-    ; CHECK: liveins: $edi, $esi, $edx, $rax, $rbx
-    ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0
-    ; CHECK-NEXT: VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.0, align 4)
-    ; CHECK-NEXT: MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4)
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr32 = COPY $edi
-    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit
-    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr16 = COPY [[COPY1]].sub_16bit
-    ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gr16 = COPY [[COPY2]].sub_16bit
-    ; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def $tmm0, implicit-def $tmm1, implicit-def $tmm2, implicit-def $tmm3, implicit-def $tmm4, implicit-def $tmm5, implicit-def $tmm6, implicit-def $tmm7 :: (load (s512) from %stack.0, align 4)
-    ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64 = COPY $rax
-    ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64_nosp = MOV32ri64 32
-    ; CHECK-NEXT: [[PT2RPNTLVWZ0V:%[0-9]+]]:tilepair = PT2RPNTLVWZ0V [[COPY5]], [[COPY4]], [[COPY3]], killed [[COPY6]], 1, [[MOV32ri64_]], 0, $noreg
-    ; CHECK-NEXT: [[COPY7:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t1
-    ; CHECK-NEXT: [[COPY8:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t0
-    ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY5]], [[COPY4]]
-    ; CHECK-NEXT: [[PTDPBSSDV:%[0-9]+]]:tile = PTDPBSSDV [[COPY5]], [[COPY3]], [[COPY4]], [[PTILEZEROV]], killed [[COPY8]], killed [[COPY7]]
-    ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr64 = COPY $rbx
-    ; CHECK-NEXT: PTILESTOREDV [[COPY5]], [[COPY4]], killed [[COPY9]], 1, [[MOV32ri64_]], 0, $noreg, killed [[PTDPBSSDV]]
-    ; CHECK-NEXT: RET 0
-    %2:gr32 = COPY $edx
-    %1:gr32 = COPY $esi
-    %0:gr32 = COPY $edi
-    %3:gr16 = COPY %2.sub_16bit
-    %4:gr16 = COPY %1.sub_16bit
-    %5:gr16 = COPY %0.sub_16bit
-    %6:gr64 = COPY $rax
-    %7:gr64_nosp = MOV32ri64 32
-    %8:tilepair = PT2RPNTLVWZ0V %5, %4, %3, killed %6, 1, %7, 0, $noreg
-    %9:tile = COPY %8.sub_t1
-    %10:tile = COPY %8.sub_t0
-    %11:tile = PTILEZEROV %5, %4
-    %12:tile = PTDPBSSDV %5, %3, %4, %11, killed %10, killed %9
-    %13:gr64 = COPY $rbx
-    PTILESTOREDV %5, %4, killed %13, 1, %7, 0, $noreg, killed %12
-    RET 0
-
-...
diff --git a/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll
deleted file mode 100644
index 4cfd97afe721b..0000000000000
--- a/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll
+++ /dev/null
@@ -1,371 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+amx-bf16,+amx-fp16,+amx-complex,+amx-transpose | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+amx-bf16,+amx-fp16,+amx-complex,+amx-transpose,+egpr --show-mc-encoding | FileCheck %s --check-prefix=EGPR
-
-define void @test_amx(i32 %rv32, i64 %stride, i64 %rvalue, i8* %addr1, <4 x float> %xmm) #0 {
-; CHECK-LABEL: test_amx:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    t2rpntlvwz0 (%rcx,%rsi), %tmm0
-; CHECK-NEXT:    t2rpntlvwz0t1 (%rcx,%rsi), %tmm2
-; CHECK-NEXT:    t2rpntlvwz1 (%rcx,%rsi), %tmm0
-; CHECK-NEXT:    t2rpntlvwz1t1 (%rcx,%rsi), %tmm2
-; CHECK-NEXT:    ttransposed %tmm3, %tmm1
-; CHECK-NEXT:    ttdpbf16ps %tmm3, %tmm2, %tmm1
-; CHECK-NEXT:    ttdpfp16ps %tmm6, %tmm5, %tmm4
-; CHECK-NEXT:    ttcmmimfp16ps %tmm3, %tmm2, %tmm1
-; CHECK-NEXT:    ttcmmrlfp16ps %tmm3, %tmm2, %tmm1
-; CHECK-NEXT:    tconjtcmmimfp16ps %tmm3, %tmm2, %tmm1
-; CHECK-NEXT:    tconjtfp16 %tmm2, %tmm1
-; CHECK-NEXT:    retq
-;
-; EGPR-LABEL: test_amx:
-; EGPR:       # %bb.0:
-; EGPR-NEXT:    t2rpntlvwz0 (%rcx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x04,0x31]
-; EGPR-NEXT:    t2rpntlvwz0t1 (%rcx,%rsi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x14,0x31]
-; EGPR-NEXT:    t2rpntlvwz1 (%rcx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x04,0x31]
-; EGPR-NEXT:    t2rpntlvwz1t1 (%rcx,%rsi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x31]
-; EGPR-NEXT:    ttransposed %tmm3, %tmm1 # encoding: [0xc4,0xe2,0x7a,0x5f,0xcb]
-; EGPR-NEXT:    ttdpbf16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x62,0x6c,0xca]
-; EGPR-NEXT:    ttdpfp16ps %tmm6, %tmm5, %tmm4 # encoding: [0xc4,0xe2,0x4b,0x6c,0xe5]
-; EGPR-NEXT:    ttcmmimfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x63,0x6b,0xca]
-; EGPR-NEXT:    ttcmmrlfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x62,0x6b,0xca]
-; EGPR-NEXT:    tconjtcmmimfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x60,0x6b,0xca]
-; EGPR-NEXT:    tconjtfp16 %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x79,0x6b,0xca]
-; EGPR-NEXT:    retq # encoding: [0xc3]
-  call void @llvm.x86.t2rpntlvwz0(i8 1, i8* %addr1, i64 %stride)
-  call void @llvm.x86.t2rpntlvwz0t1(i8 2, i8* %addr1, i64 %stride)
-  call void @llvm.x86.t2rpntlvwz1(i8 1, i8* %addr1, i64 %stride)
-  call void @llvm.x86.t2rpntlvwz1t1(i8 2, i8* %addr1, i64 %stride)
-  call void @llvm.x86.ttransposed(i8 1, i8 3)
-  call void @llvm.x86.ttdpbf16ps(i8 1, i8 2, i8 3)
-  call void @llvm.x86.ttdpfp16ps(i8 4, i8 5, i8 6)
-  call void @llvm.x86.ttcmmimfp16ps(i8 1, i8 2, i8 3)
-  call void @llvm.x86.ttcmmrlfp16ps(i8 1, i8 2, i8 3)
-  call void @llvm.x86.tconjtcmmimfp16ps(i8 1, i8 2, i8 3)
-  call void @llvm.x86.tconjtfp16(i8 1, i8 2)
-  ret void
-}
-
-declare void @llvm.x86.t2rpntlvwz0(i8 %tile1, i8* %addr1, i64 %stride)
-declare void @llvm.x86.t2rpntlvwz0t1(i8 %tile1, i8* %addr1, i64 %stride)
-declare void @llvm.x86.t2rpntlvwz1(i8 %tile1, i8* %addr1, i64 %stride)
-declare void @llvm.x86.t2rpntlvwz1t1(i8 %tile1, i8* %addr1, i64 %stride)
-declare void @llvm.x86.ttransposed(i8 %tile0, i8 %tile1)
-declare void @llvm.x86.ttdpbf16ps(i8 %tile0, i8 %tile1, i8 %tile2)
-declare void @llvm.x86.ttdpfp16ps(i8 %tile0, i8 %tile1, i8 %tile2)
-declare void @llvm.x86.ttcmmimfp16ps(i8 %A, i8 %B, i8 %C)
-declare void @llvm.x86.ttcmmrlfp16ps(i8 %A, i8 %B, i8 %C)
-declare void @llvm.x86.tconjtcmmimfp16ps(i8 %A, i8 %B, i8 %C)
-declare void @llvm.x86.tconjtfp16(i8 %A, i8 %B)
-
-define void @test_amx2(i8* %pointer, i8* %base, i64 %stride) #0 {
-; CHECK-LABEL: test_amx2:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    pushq %rbp
-; CHECK-NEXT:    subq $2928, %rsp # imm = 0xB70
-; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movb $1, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movw $8, %ax
-; CHECK-NEXT:    tileloadd (%rsi,%rdx), %tmm0
-; CHECK-NEXT:    tilezero %tmm1
-; CHECK-NEXT:    tilezero %tmm2
-; CHECK-NEXT:    ttdpbf16ps %tmm1, %tmm0, %tmm2
-; CHECK-NEXT:    ttdpfp16ps %tmm1, %tmm0, %tmm2
-; CHECK-NEXT:    ttcmmimfp16ps %tmm1, %tmm0, %tmm2
-; CHECK-NEXT:    ttcmmrlfp16ps %tmm1, %tmm0, %tmm2
-; CHECK-NEXT:    movabsq $64, %rbp
-; CHECK-NEXT:    tilestored %tmm2, 896(%rsp,%rbp) # 1024-byte Folded Spill
-; CHECK-NEXT:    tileloadd 896(%rsp,%rbp), %tmm3 # 1024-byte Folded Reload
-; CHECK-NEXT:    tconjtcmmimfp16ps %tmm1, %tmm0, %tmm3
-; CHECK-NEXT:    tconjtfp16 %tmm3, %tmm0
-; CHECK-NEXT:    tilestored %tmm2, (%rdi,%rdx)
-; CHECK-NEXT:    addq $2928, %rsp # imm = 0xB70
-; CHECK-NEXT:    popq %rbp
-; CHECK-NEXT:    tilerelease
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
-;
-; EGPR-LABEL: test_amx2:
-; EGPR:       # %bb.0:
-; EGPR-NEXT:    pushq %rbp # encoding: [0x55]
-; EGPR-NEXT:    subq $2928, %rsp # encoding: [0x48,0x81,0xec,0x70,0x0b,0x00,0x00]
-; EGPR-NEXT:    # imm = 0xB70
-; EGPR-NEXT:    vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0]
-; EGPR-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0x0d]
-; EGPR-NEXT:    movb $1, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x40,0x03,0x00,0x00,0x01]
-; EGPR-NEXT:    movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x70,0x03,0x00,0x00,0x08]
-; EGPR-NEXT:    movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x50,0x03,0x00,0x00,0x08,0x00]
-; EGPR-NEXT:    movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x71,0x03,0x00,0x00,0x08]
-; EGPR-NEXT:    movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x52,0x03,0x00,0x00,0x08,0x00]
-; EGPR-NEXT:    movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x72,0x03,0x00,0x00,0x08]
-; EGPR-NEXT:    movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x54,0x03,0x00,0x00,0x08,0x00]
-; EGPR-NEXT:    movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x73,0x03,0x00,0x00,0x08]
-; EGPR-NEXT:    movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x56,0x03,0x00,0x00,0x08,0x00]
-; EGPR-NEXT:    ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x84,0x24,0x40,0x03,0x00,0x00]
-; EGPR-NEXT:    movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00]
-; EGPR-NEXT:    tileloadd (%rsi,%rdx), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x04,0x16]
-; EGPR-NEXT:    tilezero %tmm1 # encoding: [0xc4,0xe2,0x7b,0x49,0xc8]
-; EGPR-NEXT:    tilezero %tmm2 # encoding: [0xc4,0xe2,0x7b,0x49,0xd0]
-; EGPR-NEXT:    ttdpbf16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x72,0x6c,0xd0]
-; EGPR-NEXT:    ttdpfp16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x73,0x6c,0xd0]
-; EGPR-NEXT:    ttcmmimfp16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x73,0x6b,0xd0]
-; EGPR-NEXT:    ttcmmrlfp16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x72,0x6b,0xd0]
-; EGPR-NEXT:    movabsq $64, %rbp # encoding: [0x48,0xbd,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
-; EGPR-NEXT:    tilestored %tmm2, 896(%rsp,%rbp) # 1024-byte Folded Spill
-; EGPR-NEXT:    # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x94,0x2c,0x80,0x03,0x00,0x00]
-; EGPR-NEXT:    tileloadd 896(%rsp,%rbp), %tmm3 # 1024-byte Folded Reload
-; EGPR-NEXT:    # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x9c,0x2c,0x80,0x03,0x00,0x00]
-; EGPR-NEXT:    tconjtcmmimfp16ps %tmm1, %tmm0, %tmm3 # encoding: [0xc4,0xe2,0x70,0x6b,0xd8]
-; EGPR-NEXT:    tconjtfp16 %tmm3, %tmm0 # encoding: [0xc4,0xe2,0x79,0x6b,0xc3]
-; EGPR-NEXT:    tilestored %tmm2, (%rdi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x14,0x17]
-; EGPR-NEXT:    addq $2928, %rsp # encoding: [0x48,0x81,0xc4,0x70,0x0b,0x00,0x00]
-; EGPR-NEXT:    # imm = 0xB70
-; EGPR-NEXT:    popq %rbp # encoding: [0x5d]
-; EGPR-NEXT:    tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0]
-; EGPR-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
-; EGPR-NEXT:    retq # encoding: [0xc3]
-
-  %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride)
-  %b = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8)
-  %c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8)
-  %c1 = call x86_amx @llvm.x86.ttdpbf16ps.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b)
-  %c2 = call x86_amx @llvm.x86.ttdpfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c1, x86_amx %a, x86_amx %b)
-  %c3 = call x86_amx @llvm.x86.ttcmmimfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c2, x86_amx %a, x86_amx %b)
-  %c4 = call x86_amx @llvm.x86.ttcmmrlfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c3, x86_amx %a, x86_amx %b)
-  %c5 = call x86_amx @llvm.x86.tconjtcmmimfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c4, x86_amx %a, x86_amx %b)
-  %c6 = call x86_amx @llvm.x86.tconjtfp16.internal(i16 8, i16 8, x86_amx %c5)
-
-  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %c4)
-  ret void
-}
-
-define void @test_amx3(i8* %pointer, i8* %base, i64 %stride) #0 {
-; CHECK-LABEL: test_amx3:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movb $0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movw $0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    movw $8, %cx
-; CHECK-NEXT:    t2rpntlvwz0 (%rsi,%rdx), %tmm4
-; CHECK-NEXT:    t2rpntlvwz0t1 (%rsi,%rdx), %tmm4
-; CHECK-NEXT:    t2rpntlvwz1 (%rsi,%rdx), %tmm4
-; CHECK-NEXT:    t2rpntlvwz1t1 (%rsi,%rdx), %tmm4
-; CHECK-NEXT:    ttransposed %tmm4, %tmm0
-; CHECK-NEXT:    tilestored %tmm0, (%rdi,%rdx)
-; CHECK-NEXT:    tilerelease
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
-;
-; EGPR-LABEL: test_amx3:
-; EGPR:       # %bb.0:
-; EGPR-NEXT:    vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0]
-; EGPR-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0xff]
-; EGPR-NEXT:    movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xc0,0x01]
-; EGPR-NEXT:    movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf0,0x08]
-; EGPR-NEXT:    movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd0,0x08,0x00]
-; EGPR-NEXT:    movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf4,0x08]
-; EGPR-NEXT:    movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd8,0x08,0x00]
-; EGPR-NEXT:    movb $0, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf5,0x00]
-; EGPR-NEXT:    movw $0, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xda,0x00,0x00]
-; EGPR-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0xc0]
-; EGPR-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; EGPR-NEXT:    movw $8, %cx # encoding: [0x66,0xb9,0x08,0x00]
-; EGPR-NEXT:    t2rpntlvwz0 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x24,0x16]
-; EGPR-NEXT:    t2rpntlvwz0t1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x24,0x16]
-; EGPR-NEXT:    t2rpntlvwz1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x24,0x16]
-; EGPR-NEXT:    t2rpntlvwz1t1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x24,0x16]
-; EGPR-NEXT:    ttransposed %tmm4, %tmm0 # encoding: [0xc4,0xe2,0x7a,0x5f,0xc4]
-; EGPR-NEXT:    tilestored %tmm0, (%rdi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x04,0x17]
-; EGPR-NEXT:    tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0]
-; EGPR-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
-; EGPR-NEXT:    retq # encoding: [0xc3]
-  %1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride)
-  %2 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride)
-  %3 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride)
-  %4 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride)
-  %5 = extractvalue { x86_amx, x86_amx } %4, 0
-  %6 = call x86_amx @llvm.x86.ttransposed.internal(i16 8, i16 8, x86_amx %5)
-  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %6)
-  ret void
-}
-
-define void @test_amx_spill(i8* %pointer, i8* %base, i64 %stride) #0 {
-; CHECK-LABEL: test_amx_spill:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    subq $6088, %rsp # imm = 0x17C8
-; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movw $8, %ax
-; CHECK-NEXT:    tileloadd (%rsi,%rdx), %tmm0
-; CHECK-NEXT:    t2rpntlvwz0 (%rsi,%rdx), %tmm4
-; CHECK-NEXT:    t2rpntlvwz0t1 (%rsi,%rdx), %tmm6
-; CHECK-NEXT:    movabsq $64, %rcx
-; CHECK-NEXT:    tilestored %tmm6, 4032(%rsp,%rcx) # 1024-byte Folded Spill
-; CHECK-NEXT:    tilestored %tmm7, 5056(%rsp,%rcx) # 1024-byte Folded Spill
-; CHECK-NEXT:    t2rpntlvwz1 (%rsi,%rdx), %tmm6
-; CHECK-NEXT:    tilestored %tmm6, 1984(%rsp,%rcx) # 1024-byte Folded Spill
-; CHECK-NEXT:    tilestored %tmm7, 3008(%rsp,%rcx) # 1024-byte Folded Spill
-; CHECK-NEXT:    t2rpntlvwz1t1 (%rsi,%rdx), %tmm6
-; CHECK-NEXT:    tilestored %tmm6, -64(%rsp,%rcx) # 1024-byte Folded Spill
-; CHECK-NEXT:    tilestored %tmm7, 960(%rsp,%rcx) # 1024-byte Folded Spill
-; CHECK-NEXT:    t2rpntlvwz0 (%rsi,%rdx), %tmm6
-; CHECK-NEXT:    tilestored %tmm4, (%rsi,%rdx)
-; CHECK-NEXT:    tilestored %tmm5, (%rsi,%rdx)
-; CHECK-NEXT:    tileloadd 4032(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
-; CHECK-NEXT:    tileloadd 5056(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
-; CHECK-NEXT:    tilestored %tmm4, (%rsi,%rdx)
-; CHECK-NEXT:    tilestored %tmm5, (%rsi,%rdx)
-; CHECK-NEXT:    tileloadd 1984(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
-; CHECK-NEXT:    tileloadd 3008(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
-; CHECK-NEXT:    tilestored %tmm4, (%rsi,%rdx)
-; CHECK-NEXT:    tilestored %tmm5, (%rsi,%rdx)
-; CHECK-NEXT:    tileloadd -64(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
-; CHECK-NEXT:    tileloadd 960(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
-; CHECK-NEXT:    tilestored %tmm4, (%rsi,%rdx)
-; CHECK-NEXT:    tilestored %tmm5, (%rsi,%rdx)
-; CHECK-NEXT:    tilestored %tmm6, (%rsi,%rdx)
-; CHECK-NEXT:    tilestored %tmm7, (%rsi,%rdx)
-; CHECK-NEXT:    addq $6088, %rsp # imm = 0x17C8
-; CHECK-NEXT:    tilerelease
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
-;
-; EGPR-LABEL: test_amx_spill:
-; EGPR:       # %bb.0:
-; EGPR-NEXT:    subq $6088, %rsp # encoding: [0x48,0x81,0xec,0xc8,0x17,0x00,0x00]
-; EGPR-NEXT:    # imm = 0x17C8
-; EGPR-NEXT:    vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0]
-; EGPR-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0xfe]
-; EGPR-NEXT:    movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0x80,0x01]
-; EGPR-NEXT:    movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb0,0x08]
-; EGPR-NEXT:    movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x90,0x08,0x00]
-; EGPR-NEXT:    movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb4,0x08]
-; EGPR-NEXT:    movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x98,0x08,0x00]
-; EGPR-NEXT:    movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb5,0x08]
-; EGPR-NEXT:    movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9a,0x08,0x00]
-; EGPR-NEXT:    movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb6,0x08]
-; EGPR-NEXT:    movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9c,0x08,0x00]
-; EGPR-NEXT:    movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb7,0x08]
-; EGPR-NEXT:    movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9e,0x08,0x00]
-; EGPR-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0x80]
-; EGPR-NEXT:    movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00]
-; EGPR-NEXT:    tileloadd (%rsi,%rdx), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x04,0x16]
-; EGPR-NEXT:    t2rpntlvwz0 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x24,0x16]
-; EGPR-NEXT:    t2rpntlvwz0t1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x34,0x16]
-; EGPR-NEXT:    movabsq $64, %rcx # encoding: [0x48,0xb9,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
-; EGPR-NEXT:    tilestored %tmm6, 4032(%rsp,%rcx) # 1024-byte Folded Spill
-; EGPR-NEXT:    # encoding: [0xc4,0xe2,0x7a,0x4b,0xb4,0x0c,0xc0,0x0f,0x00,0x00]
-; EGPR-NEXT:    tilestored %tmm7, 5056(%rsp,%rcx) # 1024-byte Folded Spill
-; EGPR-NEXT:    # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x13,0x00,0x00]
-; EGPR-NEXT:    t2rpntlvwz1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x34,0x16]
-; EGPR-NEXT:    tilestored %tmm6, 1984(%rsp,%rcx) # 1024-byte Folded Spill
-; EGPR-NEXT:    # encoding: [0xc4,0xe2,0x7a,0x4b,0xb4,0x0c,0xc0,0x07,0x00,0x00]
-; EGPR-NEXT:    tilestored %tmm7, 3008(%rsp,%rcx) # 1024-byte Folded Spill
-; EGPR-NEXT:    # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x0b,0x00,0x00]
-; EGPR-NEXT:    t2rpntlvwz1t1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x34,0x16]
-; EGPR-NEXT:    tilestored %tmm6, -64(%rsp,%rcx) # 1024-byte Folded Spill
-; EGPR-NEXT:    # encoding: [0xc4,0xe2,0x7a,0x4b,0x74,0x0c,0xc0]
-; EGPR-NEXT:    tilestored %tmm7, 960(%rsp,%rcx) # 1024-byte Folded Spill
-; EGPR-NEXT:    # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x03,0x00,0x00]
-; EGPR-NEXT:    t2rpntlvwz0 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x34,0x16]
-; EGPR-NEXT:    tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16]
-; EGPR-NEXT:    tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16]
-; EGPR-NEXT:    tileloadd 4032(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
-; EGPR-NEXT:    # encoding: [0xc4,0xe2,0x7b,0x4b,0xa4,0x0c,0xc0,0x0f,0x00,0x00]
-; EGPR-NEXT:    tileloadd 5056(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
-; EGPR-NEXT:    # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x13,0x00,0x00]
-; EGPR-NEXT:    tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16]
-; EGPR-NEXT:    tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16]
-; EGPR-NEXT:    tileloadd 1984(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
-; EGPR-NEXT:    # encoding: [0xc4,0xe2,0x7b,0x4b,0xa4,0x0c,0xc0,0x07,0x00,0x00]
-; EGPR-NEXT:    tileloadd 3008(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
-; EGPR-NEXT:    # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x0b,0x00,0x00]
-; EGPR-NEXT:    tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16]
-; EGPR-NEXT:    tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16]
-; EGPR-NEXT:    tileloadd -64(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
-; EGPR-NEXT:    # encoding: [0xc4,0xe2,0x7b,0x4b,0x64,0x0c,0xc0]
-; EGPR-NEXT:    tileloadd 960(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
-; EGPR-NEXT:    # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x03,0x00,0x00]
-; EGPR-NEXT:    tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16]
-; EGPR-NEXT:    tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16]
-; EGPR-NEXT:    tilestored %tmm6, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x34,0x16]
-; EGPR-NEXT:    tilestored %tmm7, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x3c,0x16]
-; EGPR-NEXT:    addq $6088, %rsp # encoding: [0x48,0x81,0xc4,0xc8,0x17,0x00,0x00]
-; EGPR-NEXT:    # imm = 0x17C8
-; EGPR-NEXT:    tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0]
-; EGPR-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
-; EGPR-NEXT:    retq # encoding: [0xc3]
-  %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride)
-  %b1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
-  %b2 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
-  %b3 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
-  %b4 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
-  %b5 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
-  %e11 = extractvalue { x86_amx, x86_amx } %b1, 0
-  %e12 = extractvalue { x86_amx, x86_amx } %b1, 1
-  %e21 = extractvalue { x86_amx, x86_amx } %b2, 0
-  %e22 = extractvalue { x86_amx, x86_amx } %b2, 1
-  %e31 = extractvalue { x86_amx, x86_amx } %b3, 0
-  %e32 = extractvalue { x86_amx, x86_amx } %b3, 1
-  %e41 = extractvalue { x86_amx, x86_amx } %b4, 0
-  %e42 = extractvalue { x86_amx, x86_amx } %b4, 1
-  %e51 = extractvalue { x86_amx, x86_amx } %b5, 0
-  %e52 = extractvalue { x86_amx, x86_amx } %b5, 1
-  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e11)
-  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e12)
-  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e21)
-  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e22)
-  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e31)
-  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e32)
-  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e41)
-  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e42)
-  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e51)
-  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e52)
-  ret void
-}
-
-declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
-declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
-declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, i8*, i64)
-declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16, i16, i16, i8*, i64)
-declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16, i16, i16, i8*, i64)
-declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16, i16, i16, i8*, i64)
-declare x86_amx @llvm.x86.ttransposed.internal(i16, i16, x86_amx)
-declare x86_amx @llvm.x86.ttdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
-declare x86_amx @llvm.x86.ttdpfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
-declare x86_amx @llvm.x86.ttcmmimfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
-declare x86_amx @llvm.x86.ttcmmrlfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
-declare x86_amx @llvm.x86.tconjtcmmimfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
-declare x86_amx @llvm.x86.tconjtfp16.internal(i16, i16, x86_amx)
-
-attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/and-mask-variable.ll b/llvm/test/CodeGen/X86/and-mask-variable.ll
index d89f0db6a0c5b..3e5bd6952147c 100644
--- a/llvm/test/CodeGen/X86/and-mask-variable.ll
+++ b/llvm/test/CodeGen/X86/and-mask-variable.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=-bmi,-tbm,-bmi2,+fast-bextr < %s | FileCheck %s --check-prefixes=X86-NOBMI
-; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+bmi,+tbm,+bmi2,+fast-bextr < %s | FileCheck %s --check-prefixes=X86-BMI2
-; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+bmi,-tbm,+bmi2,+fast-bextr < %s | FileCheck %s --check-prefixes=X86-BMI2
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=-bmi,-tbm,-bmi2,+fast-bextr < %s | FileCheck %s --check-prefixes=X64-NOBMI
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+bmi,+tbm,+bmi2,+fast-bextr < %s | FileCheck %s --check-prefixes=X64-BMI2
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+bmi,-tbm,+bmi2,+fast-bextr < %s | FileCheck %s --check-prefixes=X64-BMI2
+; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=-bmi,-tbm,-bmi2 < %s | FileCheck %s --check-prefixes=X86-NOBMI
+; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+bmi,+tbm,+bmi2 < %s | FileCheck %s --check-prefixes=X86-BMI2
+; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+bmi,-tbm,+bmi2 < %s | FileCheck %s --check-prefixes=X86-BMI2
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=-bmi,-tbm,-bmi2 < %s | FileCheck %s --check-prefixes=X64-NOBMI
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+bmi,+tbm,+bmi2 < %s | FileCheck %s --check-prefixes=X64-BMI2
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+bmi,-tbm,+bmi2 < %s | FileCheck %s --check-prefixes=X64-BMI2
 
 define i32 @mask_pair(i32 %x, i32 %y) nounwind {
 ; X86-NOBMI-LABEL: mask_pair:
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 3e7b73a65fe07..1173c45b4bfd8 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -1,12 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64    | FileCheck %s --check-prefixes=CHECK,CHECK-O3
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-SSE-O3
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX-O3
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX-O3
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64    | FileCheck %s --check-prefixes=CHECK,CHECK-O0
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-SSE-O0
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX-O0
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX-O0
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64    | FileCheck %s --check-prefixes=CHECK,CHECK-O3,CHECK-SSE-O3
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-O3,CHECK-SSE-O3
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-O3,CHECK-AVX-O3,CHECK-AVX2-O3
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-O3,CHECK-AVX-O3,CHECK-AVX512-O3
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64    | FileCheck %s --check-prefixes=CHECK,CHECK-O0,CHECK-SSE-O0
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-O0,CHECK-SSE-O0
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-O0,CHECK-AVX-O0,CHECK-AVX2-O0
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-O0,CHECK-AVX-O0,CHECK-AVX512-O0
 
 define void @test1(ptr %ptr, i32 %val1) {
 ; CHECK-LABEL: test1:
@@ -50,30 +50,10 @@ define <1 x i8> @atomic_vec1_i8(ptr %x) {
 ; CHECK-O3-NEXT:    movzbl (%rdi), %eax
 ; CHECK-O3-NEXT:    retq
 ;
-; CHECK-SSE-O3-LABEL: atomic_vec1_i8:
-; CHECK-SSE-O3:       # %bb.0:
-; CHECK-SSE-O3-NEXT:    movzbl (%rdi), %eax
-; CHECK-SSE-O3-NEXT:    retq
-;
-; CHECK-AVX-O3-LABEL: atomic_vec1_i8:
-; CHECK-AVX-O3:       # %bb.0:
-; CHECK-AVX-O3-NEXT:    movzbl (%rdi), %eax
-; CHECK-AVX-O3-NEXT:    retq
-;
 ; CHECK-O0-LABEL: atomic_vec1_i8:
 ; CHECK-O0:       # %bb.0:
 ; CHECK-O0-NEXT:    movb (%rdi), %al
 ; CHECK-O0-NEXT:    retq
-;
-; CHECK-SSE-O0-LABEL: atomic_vec1_i8:
-; CHECK-SSE-O0:       # %bb.0:
-; CHECK-SSE-O0-NEXT:    movb (%rdi), %al
-; CHECK-SSE-O0-NEXT:    retq
-;
-; CHECK-AVX-O0-LABEL: atomic_vec1_i8:
-; CHECK-AVX-O0:       # %bb.0:
-; CHECK-AVX-O0-NEXT:    movb (%rdi), %al
-; CHECK-AVX-O0-NEXT:    retq
   %ret = load atomic <1 x i8>, ptr %x acquire, align 1
   ret <1 x i8> %ret
 }
@@ -84,30 +64,10 @@ define <1 x i16> @atomic_vec1_i16(ptr %x) {
 ; CHECK-O3-NEXT:    movzwl (%rdi), %eax
 ; CHECK-O3-NEXT:    retq
 ;
-; CHECK-SSE-O3-LABEL: atomic_vec1_i16:
-; CHECK-SSE-O3:       # %bb.0:
-; CHECK-SSE-O3-NEXT:    movzwl (%rdi), %eax
-; CHECK-SSE-O3-NEXT:    retq
-;
-; CHECK-AVX-O3-LABEL: atomic_vec1_i16:
-; CHECK-AVX-O3:       # %bb.0:
-; CHECK-AVX-O3-NEXT:    movzwl (%rdi), %eax
-; CHECK-AVX-O3-NEXT:    retq
-;
 ; CHECK-O0-LABEL: atomic_vec1_i16:
 ; CHECK-O0:       # %bb.0:
 ; CHECK-O0-NEXT:    movw (%rdi), %ax
 ; CHECK-O0-NEXT:    retq
-;
-; CHECK-SSE-O0-LABEL: atomic_vec1_i16:
-; CHECK-SSE-O0:       # %bb.0:
-; CHECK-SSE-O0-NEXT:    movw (%rdi), %ax
-; CHECK-SSE-O0-NEXT:    retq
-;
-; CHECK-AVX-O0-LABEL: atomic_vec1_i16:
-; CHECK-AVX-O0:       # %bb.0:
-; CHECK-AVX-O0-NEXT:    movw (%rdi), %ax
-; CHECK-AVX-O0-NEXT:    retq
   %ret = load atomic <1 x i16>, ptr %x acquire, align 2
   ret <1 x i16> %ret
 }
@@ -119,35 +79,11 @@ define <1 x i32> @atomic_vec1_i8_zext(ptr %x) {
 ; CHECK-O3-NEXT:    movzbl %al, %eax
 ; CHECK-O3-NEXT:    retq
 ;
-; CHECK-SSE-O3-LABEL: atomic_vec1_i8_zext:
-; CHECK-SSE-O3:       # %bb.0:
-; CHECK-SSE-O3-NEXT:    movzbl (%rdi), %eax
-; CHECK-SSE-O3-NEXT:    movzbl %al, %eax
-; CHECK-SSE-O3-NEXT:    retq
-;
-; CHECK-AVX-O3-LABEL: atomic_vec1_i8_zext:
-; CHECK-AVX-O3:       # %bb.0:
-; CHECK-AVX-O3-NEXT:    movzbl (%rdi), %eax
-; CHECK-AVX-O3-NEXT:    movzbl %al, %eax
-; CHECK-AVX-O3-NEXT:    retq
-;
 ; CHECK-O0-LABEL: atomic_vec1_i8_zext:
 ; CHECK-O0:       # %bb.0:
 ; CHECK-O0-NEXT:    movb (%rdi), %al
 ; CHECK-O0-NEXT:    movzbl %al, %eax
 ; CHECK-O0-NEXT:    retq
-;
-; CHECK-SSE-O0-LABEL: atomic_vec1_i8_zext:
-; CHECK-SSE-O0:       # %bb.0:
-; CHECK-SSE-O0-NEXT:    movb (%rdi), %al
-; CHECK-SSE-O0-NEXT:    movzbl %al, %eax
-; CHECK-SSE-O0-NEXT:    retq
-;
-; CHECK-AVX-O0-LABEL: atomic_vec1_i8_zext:
-; CHECK-AVX-O0:       # %bb.0:
-; CHECK-AVX-O0-NEXT:    movb (%rdi), %al
-; CHECK-AVX-O0-NEXT:    movzbl %al, %eax
-; CHECK-AVX-O0-NEXT:    retq
   %ret = load atomic <1 x i8>, ptr %x acquire, align 1
   %zret = zext <1 x i8> %ret to <1 x i32>
   ret <1 x i32> %zret
@@ -160,35 +96,11 @@ define <1 x i64> @atomic_vec1_i16_sext(ptr %x) {
 ; CHECK-O3-NEXT:    movswq %ax, %rax
 ; CHECK-O3-NEXT:    retq
 ;
-; CHECK-SSE-O3-LABEL: atomic_vec1_i16_sext:
-; CHECK-SSE-O3:       # %bb.0:
-; CHECK-SSE-O3-NEXT:    movzwl (%rdi), %eax
-; CHECK-SSE-O3-NEXT:    movswq %ax, %rax
-; CHECK-SSE-O3-NEXT:    retq
-;
-; CHECK-AVX-O3-LABEL: atomic_vec1_i16_sext:
-; CHECK-AVX-O3:       # %bb.0:
-; CHECK-AVX-O3-NEXT:    movzwl (%rdi), %eax
-; CHECK-AVX-O3-NEXT:    movswq %ax, %rax
-; CHECK-AVX-O3-NEXT:    retq
-;
 ; CHECK-O0-LABEL: atomic_vec1_i16_sext:
 ; CHECK-O0:       # %bb.0:
 ; CHECK-O0-NEXT:    movw (%rdi), %ax
 ; CHECK-O0-NEXT:    movswq %ax, %rax
 ; CHECK-O0-NEXT:    retq
-;
-; CHECK-SSE-O0-LABEL: atomic_vec1_i16_sext:
-; CHECK-SSE-O0:       # %bb.0:
-; CHECK-SSE-O0-NEXT:    movw (%rdi), %ax
-; CHECK-SSE-O0-NEXT:    movswq %ax, %rax
-; CHECK-SSE-O0-NEXT:    retq
-;
-; CHECK-AVX-O0-LABEL: atomic_vec1_i16_sext:
-; CHECK-AVX-O0:       # %bb.0:
-; CHECK-AVX-O0-NEXT:    movw (%rdi), %ax
-; CHECK-AVX-O0-NEXT:    movswq %ax, %rax
-; CHECK-AVX-O0-NEXT:    retq
   %ret = load atomic <1 x i16>, ptr %x acquire, align 2
   %sret = sext <1 x i16> %ret to <1 x i64>
   ret <1 x i64> %sret
@@ -204,12 +116,6 @@ define <1 x ptr addrspace(270)> @atomic_vec1_ptr270(ptr %x) {
 }
 
 define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) {
-; CHECK-O3-LABEL: atomic_vec1_bfloat:
-; CHECK-O3:       # %bb.0:
-; CHECK-O3-NEXT:    movzwl (%rdi), %eax
-; CHECK-O3-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK-O3-NEXT:    retq
-;
 ; CHECK-SSE-O3-LABEL: atomic_vec1_bfloat:
 ; CHECK-SSE-O3:       # %bb.0:
 ; CHECK-SSE-O3-NEXT:    movzwl (%rdi), %eax
@@ -222,15 +128,6 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) {
 ; CHECK-AVX-O3-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
 ; CHECK-AVX-O3-NEXT:    retq
 ;
-; CHECK-O0-LABEL: atomic_vec1_bfloat:
-; CHECK-O0:       # %bb.0:
-; CHECK-O0-NEXT:    movw (%rdi), %cx
-; CHECK-O0-NEXT:    # implicit-def: $eax
-; CHECK-O0-NEXT:    movw %cx, %ax
-; CHECK-O0-NEXT:    # implicit-def: $xmm0
-; CHECK-O0-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK-O0-NEXT:    retq
-;
 ; CHECK-SSE-O0-LABEL: atomic_vec1_bfloat:
 ; CHECK-SSE-O0:       # %bb.0:
 ; CHECK-SSE-O0-NEXT:    movw (%rdi), %cx
@@ -283,30 +180,6 @@ define <1 x ptr> @atomic_vec1_ptr(ptr %x) nounwind {
 ; CHECK-O3-NEXT:    popq %rcx
 ; CHECK-O3-NEXT:    retq
 ;
-; CHECK-SSE-O3-LABEL: atomic_vec1_ptr:
-; CHECK-SSE-O3:       # %bb.0:
-; CHECK-SSE-O3-NEXT:    pushq %rax
-; CHECK-SSE-O3-NEXT:    movq %rdi, %rsi
-; CHECK-SSE-O3-NEXT:    movq %rsp, %rdx
-; CHECK-SSE-O3-NEXT:    movl $8, %edi
-; CHECK-SSE-O3-NEXT:    movl $2, %ecx
-; CHECK-SSE-O3-NEXT:    callq __atomic_load@PLT
-; CHECK-SSE-O3-NEXT:    movq (%rsp), %rax
-; CHECK-SSE-O3-NEXT:    popq %rcx
-; CHECK-SSE-O3-NEXT:    retq
-;
-; CHECK-AVX-O3-LABEL: atomic_vec1_ptr:
-; CHECK-AVX-O3:       # %bb.0:
-; CHECK-AVX-O3-NEXT:    pushq %rax
-; CHECK-AVX-O3-NEXT:    movq %rdi, %rsi
-; CHECK-AVX-O3-NEXT:    movq %rsp, %rdx
-; CHECK-AVX-O3-NEXT:    movl $8, %edi
-; CHECK-AVX-O3-NEXT:    movl $2, %ecx
-; CHECK-AVX-O3-NEXT:    callq __atomic_load@PLT
-; CHECK-AVX-O3-NEXT:    movq (%rsp), %rax
-; CHECK-AVX-O3-NEXT:    popq %rcx
-; CHECK-AVX-O3-NEXT:    retq
-;
 ; CHECK-O0-LABEL: atomic_vec1_ptr:
 ; CHECK-O0:       # %bb.0:
 ; CHECK-O0-NEXT:    pushq %rax
@@ -318,41 +191,11 @@ define <1 x ptr> @atomic_vec1_ptr(ptr %x) nounwind {
 ; CHECK-O0-NEXT:    movq (%rsp), %rax
 ; CHECK-O0-NEXT:    popq %rcx
 ; CHECK-O0-NEXT:    retq
-;
-; CHECK-SSE-O0-LABEL: atomic_vec1_ptr:
-; CHECK-SSE-O0:       # %bb.0:
-; CHECK-SSE-O0-NEXT:    pushq %rax
-; CHECK-SSE-O0-NEXT:    movq %rdi, %rsi
-; CHECK-SSE-O0-NEXT:    movl $8, %edi
-; CHECK-SSE-O0-NEXT:    movq %rsp, %rdx
-; CHECK-SSE-O0-NEXT:    movl $2, %ecx
-; CHECK-SSE-O0-NEXT:    callq __atomic_load@PLT
-; CHECK-SSE-O0-NEXT:    movq (%rsp), %rax
-; CHECK-SSE-O0-NEXT:    popq %rcx
-; CHECK-SSE-O0-NEXT:    retq
-;
-; CHECK-AVX-O0-LABEL: atomic_vec1_ptr:
-; CHECK-AVX-O0:       # %bb.0:
-; CHECK-AVX-O0-NEXT:    pushq %rax
-; CHECK-AVX-O0-NEXT:    movq %rdi, %rsi
-; CHECK-AVX-O0-NEXT:    movl $8, %edi
-; CHECK-AVX-O0-NEXT:    movq %rsp, %rdx
-; CHECK-AVX-O0-NEXT:    movl $2, %ecx
-; CHECK-AVX-O0-NEXT:    callq __atomic_load@PLT
-; CHECK-AVX-O0-NEXT:    movq (%rsp), %rax
-; CHECK-AVX-O0-NEXT:    popq %rcx
-; CHECK-AVX-O0-NEXT:    retq
   %ret = load atomic <1 x ptr>, ptr %x acquire, align 4
   ret <1 x ptr> %ret
 }
 
 define <1 x half> @atomic_vec1_half(ptr %x) {
-; CHECK-O3-LABEL: atomic_vec1_half:
-; CHECK-O3:       # %bb.0:
-; CHECK-O3-NEXT:    movzwl (%rdi), %eax
-; CHECK-O3-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK-O3-NEXT:    retq
-;
 ; CHECK-SSE-O3-LABEL: atomic_vec1_half:
 ; CHECK-SSE-O3:       # %bb.0:
 ; CHECK-SSE-O3-NEXT:    movzwl (%rdi), %eax
@@ -365,15 +208,6 @@ define <1 x half> @atomic_vec1_half(ptr %x) {
 ; CHECK-AVX-O3-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
 ; CHECK-AVX-O3-NEXT:    retq
 ;
-; CHECK-O0-LABEL: atomic_vec1_half:
-; CHECK-O0:       # %bb.0:
-; CHECK-O0-NEXT:    movw (%rdi), %cx
-; CHECK-O0-NEXT:    # implicit-def: $eax
-; CHECK-O0-NEXT:    movw %cx, %ax
-; CHECK-O0-NEXT:    # implicit-def: $xmm0
-; CHECK-O0-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK-O0-NEXT:    retq
-;
 ; CHECK-SSE-O0-LABEL: atomic_vec1_half:
 ; CHECK-SSE-O0:       # %bb.0:
 ; CHECK-SSE-O0-NEXT:    movw (%rdi), %cx
@@ -396,11 +230,6 @@ define <1 x half> @atomic_vec1_half(ptr %x) {
 }
 
 define <1 x float> @atomic_vec1_float(ptr %x) {
-; CHECK-O3-LABEL: atomic_vec1_float:
-; CHECK-O3:       # %bb.0:
-; CHECK-O3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-O3-NEXT:    retq
-;
 ; CHECK-SSE-O3-LABEL: atomic_vec1_float:
 ; CHECK-SSE-O3:       # %bb.0:
 ; CHECK-SSE-O3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
@@ -411,11 +240,6 @@ define <1 x float> @atomic_vec1_float(ptr %x) {
 ; CHECK-AVX-O3-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK-AVX-O3-NEXT:    retq
 ;
-; CHECK-O0-LABEL: atomic_vec1_float:
-; CHECK-O0:       # %bb.0:
-; CHECK-O0-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-O0-NEXT:    retq
-;
 ; CHECK-SSE-O0-LABEL: atomic_vec1_float:
 ; CHECK-SSE-O0:       # %bb.0:
 ; CHECK-SSE-O0-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
@@ -430,11 +254,6 @@ define <1 x float> @atomic_vec1_float(ptr %x) {
 }
 
 define <1 x double> @atomic_vec1_double_align(ptr %x) nounwind {
-; CHECK-O3-LABEL: atomic_vec1_double_align:
-; CHECK-O3:       # %bb.0:
-; CHECK-O3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-O3-NEXT:    retq
-;
 ; CHECK-SSE-O3-LABEL: atomic_vec1_double_align:
 ; CHECK-SSE-O3:       # %bb.0:
 ; CHECK-SSE-O3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
@@ -445,11 +264,6 @@ define <1 x double> @atomic_vec1_double_align(ptr %x) nounwind {
 ; CHECK-AVX-O3-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; CHECK-AVX-O3-NEXT:    retq
 ;
-; CHECK-O0-LABEL: atomic_vec1_double_align:
-; CHECK-O0:       # %bb.0:
-; CHECK-O0-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-O0-NEXT:    retq
-;
 ; CHECK-SSE-O0-LABEL: atomic_vec1_double_align:
 ; CHECK-SSE-O0:       # %bb.0:
 ; CHECK-SSE-O0-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
@@ -476,30 +290,6 @@ define <1 x i64> @atomic_vec1_i64(ptr %x) nounwind {
 ; CHECK-O3-NEXT:    popq %rcx
 ; CHECK-O3-NEXT:    retq
 ;
-; CHECK-SSE-O3-LABEL: atomic_vec1_i64:
-; CHECK-SSE-O3:       # %bb.0:
-; CHECK-SSE-O3-NEXT:    pushq %rax
-; CHECK-SSE-O3-NEXT:    movq %rdi, %rsi
-; CHECK-SSE-O3-NEXT:    movq %rsp, %rdx
-; CHECK-SSE-O3-NEXT:    movl $8, %edi
-; CHECK-SSE-O3-NEXT:    movl $2, %ecx
-; CHECK-SSE-O3-NEXT:    callq __atomic_load@PLT
-; CHECK-SSE-O3-NEXT:    movq (%rsp), %rax
-; CHECK-SSE-O3-NEXT:    popq %rcx
-; CHECK-SSE-O3-NEXT:    retq
-;
-; CHECK-AVX-O3-LABEL: atomic_vec1_i64:
-; CHECK-AVX-O3:       # %bb.0:
-; CHECK-AVX-O3-NEXT:    pushq %rax
-; CHECK-AVX-O3-NEXT:    movq %rdi, %rsi
-; CHECK-AVX-O3-NEXT:    movq %rsp, %rdx
-; CHECK-AVX-O3-NEXT:    movl $8, %edi
-; CHECK-AVX-O3-NEXT:    movl $2, %ecx
-; CHECK-AVX-O3-NEXT:    callq __atomic_load@PLT
-; CHECK-AVX-O3-NEXT:    movq (%rsp), %rax
-; CHECK-AVX-O3-NEXT:    popq %rcx
-; CHECK-AVX-O3-NEXT:    retq
-;
 ; CHECK-O0-LABEL: atomic_vec1_i64:
 ; CHECK-O0:       # %bb.0:
 ; CHECK-O0-NEXT:    pushq %rax
@@ -511,47 +301,11 @@ define <1 x i64> @atomic_vec1_i64(ptr %x) nounwind {
 ; CHECK-O0-NEXT:    movq (%rsp), %rax
 ; CHECK-O0-NEXT:    popq %rcx
 ; CHECK-O0-NEXT:    retq
-;
-; CHECK-SSE-O0-LABEL: atomic_vec1_i64:
-; CHECK-SSE-O0:       # %bb.0:
-; CHECK-SSE-O0-NEXT:    pushq %rax
-; CHECK-SSE-O0-NEXT:    movq %rdi, %rsi
-; CHECK-SSE-O0-NEXT:    movl $8, %edi
-; CHECK-SSE-O0-NEXT:    movq %rsp, %rdx
-; CHECK-SSE-O0-NEXT:    movl $2, %ecx
-; CHECK-SSE-O0-NEXT:    callq __atomic_load@PLT
-; CHECK-SSE-O0-NEXT:    movq (%rsp), %rax
-; CHECK-SSE-O0-NEXT:    popq %rcx
-; CHECK-SSE-O0-NEXT:    retq
-;
-; CHECK-AVX-O0-LABEL: atomic_vec1_i64:
-; CHECK-AVX-O0:       # %bb.0:
-; CHECK-AVX-O0-NEXT:    pushq %rax
-; CHECK-AVX-O0-NEXT:    movq %rdi, %rsi
-; CHECK-AVX-O0-NEXT:    movl $8, %edi
-; CHECK-AVX-O0-NEXT:    movq %rsp, %rdx
-; CHECK-AVX-O0-NEXT:    movl $2, %ecx
-; CHECK-AVX-O0-NEXT:    callq __atomic_load@PLT
-; CHECK-AVX-O0-NEXT:    movq (%rsp), %rax
-; CHECK-AVX-O0-NEXT:    popq %rcx
-; CHECK-AVX-O0-NEXT:    retq
   %ret = load atomic <1 x i64>, ptr %x acquire, align 4
   ret <1 x i64> %ret
 }
 
 define <1 x double> @atomic_vec1_double(ptr %x) nounwind {
-; CHECK-O3-LABEL: atomic_vec1_double:
-; CHECK-O3:       # %bb.0:
-; CHECK-O3-NEXT:    pushq %rax
-; CHECK-O3-NEXT:    movq %rdi, %rsi
-; CHECK-O3-NEXT:    movq %rsp, %rdx
-; CHECK-O3-NEXT:    movl $8, %edi
-; CHECK-O3-NEXT:    movl $2, %ecx
-; CHECK-O3-NEXT:    callq __atomic_load@PLT
-; CHECK-O3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-O3-NEXT:    popq %rax
-; CHECK-O3-NEXT:    retq
-;
 ; CHECK-SSE-O3-LABEL: atomic_vec1_double:
 ; CHECK-SSE-O3:       # %bb.0:
 ; CHECK-SSE-O3-NEXT:    pushq %rax
@@ -576,18 +330,6 @@ define <1 x double> @atomic_vec1_double(ptr %x) nounwind {
 ; CHECK-AVX-O3-NEXT:    popq %rax
 ; CHECK-AVX-O3-NEXT:    retq
 ;
-; CHECK-O0-LABEL: atomic_vec1_double:
-; CHECK-O0:       # %bb.0:
-; CHECK-O0-NEXT:    pushq %rax
-; CHECK-O0-NEXT:    movq %rdi, %rsi
-; CHECK-O0-NEXT:    movl $8, %edi
-; CHECK-O0-NEXT:    movq %rsp, %rdx
-; CHECK-O0-NEXT:    movl $2, %ecx
-; CHECK-O0-NEXT:    callq __atomic_load@PLT
-; CHECK-O0-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-O0-NEXT:    popq %rax
-; CHECK-O0-NEXT:    retq
-;
 ; CHECK-SSE-O0-LABEL: atomic_vec1_double:
 ; CHECK-SSE-O0:       # %bb.0:
 ; CHECK-SSE-O0-NEXT:    pushq %rax
@@ -616,18 +358,6 @@ define <1 x double> @atomic_vec1_double(ptr %x) nounwind {
 }
 
 define <2 x i32> @atomic_vec2_i32(ptr %x) nounwind {
-; CHECK-O3-LABEL: atomic_vec2_i32:
-; CHECK-O3:       # %bb.0:
-; CHECK-O3-NEXT:    pushq %rax
-; CHECK-O3-NEXT:    movq %rdi, %rsi
-; CHECK-O3-NEXT:    movq %rsp, %rdx
-; CHECK-O3-NEXT:    movl $8, %edi
-; CHECK-O3-NEXT:    movl $2, %ecx
-; CHECK-O3-NEXT:    callq __atomic_load@PLT
-; CHECK-O3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-O3-NEXT:    popq %rax
-; CHECK-O3-NEXT:    retq
-;
 ; CHECK-SSE-O3-LABEL: atomic_vec2_i32:
 ; CHECK-SSE-O3:       # %bb.0:
 ; CHECK-SSE-O3-NEXT:    pushq %rax
@@ -652,18 +382,6 @@ define <2 x i32> @atomic_vec2_i32(ptr %x) nounwind {
 ; CHECK-AVX-O3-NEXT:    popq %rax
 ; CHECK-AVX-O3-NEXT:    retq
 ;
-; CHECK-O0-LABEL: atomic_vec2_i32:
-; CHECK-O0:       # %bb.0:
-; CHECK-O0-NEXT:    pushq %rax
-; CHECK-O0-NEXT:    movq %rdi, %rsi
-; CHECK-O0-NEXT:    movl $8, %edi
-; CHECK-O0-NEXT:    movq %rsp, %rdx
-; CHECK-O0-NEXT:    movl $2, %ecx
-; CHECK-O0-NEXT:    callq __atomic_load@PLT
-; CHECK-O0-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; CHECK-O0-NEXT:    popq %rax
-; CHECK-O0-NEXT:    retq
-;
 ; CHECK-SSE-O0-LABEL: atomic_vec2_i32:
 ; CHECK-SSE-O0:       # %bb.0:
 ; CHECK-SSE-O0-NEXT:    pushq %rax
@@ -692,18 +410,6 @@ define <2 x i32> @atomic_vec2_i32(ptr %x) nounwind {
 }
 
 define <4 x float> @atomic_vec4_float(ptr %x) nounwind {
-; CHECK-O3-LABEL: atomic_vec4_float:
-; CHECK-O3:       # %bb.0:
-; CHECK-O3-NEXT:    subq $24, %rsp
-; CHECK-O3-NEXT:    movq %rdi, %rsi
-; CHECK-O3-NEXT:    movq %rsp, %rdx
-; CHECK-O3-NEXT:    movl $16, %edi
-; CHECK-O3-NEXT:    movl $2, %ecx
-; CHECK-O3-NEXT:    callq __atomic_load@PLT
-; CHECK-O3-NEXT:    movaps (%rsp), %xmm0
-; CHECK-O3-NEXT:    addq $24, %rsp
-; CHECK-O3-NEXT:    retq
-;
 ; CHECK-SSE-O3-LABEL: atomic_vec4_float:
 ; CHECK-SSE-O3:       # %bb.0:
 ; CHECK-SSE-O3-NEXT:    subq $24, %rsp
@@ -728,18 +434,6 @@ define <4 x float> @atomic_vec4_float(ptr %x) nounwind {
 ; CHECK-AVX-O3-NEXT:    addq $24, %rsp
 ; CHECK-AVX-O3-NEXT:    retq
 ;
-; CHECK-O0-LABEL: atomic_vec4_float:
-; CHECK-O0:       # %bb.0:
-; CHECK-O0-NEXT:    subq $24, %rsp
-; CHECK-O0-NEXT:    movq %rdi, %rsi
-; CHECK-O0-NEXT:    movl $16, %edi
-; CHECK-O0-NEXT:    movq %rsp, %rdx
-; CHECK-O0-NEXT:    movl $2, %ecx
-; CHECK-O0-NEXT:    callq __atomic_load@PLT
-; CHECK-O0-NEXT:    movaps (%rsp), %xmm0
-; CHECK-O0-NEXT:    addq $24, %rsp
-; CHECK-O0-NEXT:    retq
-;
 ; CHECK-SSE-O0-LABEL: atomic_vec4_float:
 ; CHECK-SSE-O0:       # %bb.0:
 ; CHECK-SSE-O0-NEXT:    subq $24, %rsp
@@ -768,21 +462,6 @@ define <4 x float> @atomic_vec4_float(ptr %x) nounwind {
 }
 
 define <8 x double> @atomic_vec8_double(ptr %x) nounwind {
-; CHECK-O3-LABEL: atomic_vec8_double:
-; CHECK-O3:       # %bb.0:
-; CHECK-O3-NEXT:    subq $72, %rsp
-; CHECK-O3-NEXT:    movq %rdi, %rsi
-; CHECK-O3-NEXT:    movq %rsp, %rdx
-; CHECK-O3-NEXT:    movl $64, %edi
-; CHECK-O3-NEXT:    movl $2, %ecx
-; CHECK-O3-NEXT:    callq __atomic_load@PLT
-; CHECK-O3-NEXT:    movaps (%rsp), %xmm0
-; CHECK-O3-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
-; CHECK-O3-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2
-; CHECK-O3-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm3
-; CHECK-O3-NEXT:    addq $72, %rsp
-; CHECK-O3-NEXT:    retq
-;
 ; CHECK-SSE-O3-LABEL: atomic_vec8_double:
 ; CHECK-SSE-O3:       # %bb.0:
 ; CHECK-SSE-O3-NEXT:    subq $72, %rsp
@@ -798,20 +477,30 @@ define <8 x double> @atomic_vec8_double(ptr %x) nounwind {
 ; CHECK-SSE-O3-NEXT:    addq $72, %rsp
 ; CHECK-SSE-O3-NEXT:    retq
 ;
-; CHECK-O0-LABEL: atomic_vec8_double:
-; CHECK-O0:       # %bb.0:
-; CHECK-O0-NEXT:    subq $72, %rsp
-; CHECK-O0-NEXT:    movq %rdi, %rsi
-; CHECK-O0-NEXT:    movl $64, %edi
-; CHECK-O0-NEXT:    movq %rsp, %rdx
-; CHECK-O0-NEXT:    movl $2, %ecx
-; CHECK-O0-NEXT:    callq __atomic_load@PLT
-; CHECK-O0-NEXT:    movapd (%rsp), %xmm0
-; CHECK-O0-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
-; CHECK-O0-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
-; CHECK-O0-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
-; CHECK-O0-NEXT:    addq $72, %rsp
-; CHECK-O0-NEXT:    retq
+; CHECK-AVX2-O3-LABEL: atomic_vec8_double:
+; CHECK-AVX2-O3:       # %bb.0:
+; CHECK-AVX2-O3-NEXT:    subq $72, %rsp
+; CHECK-AVX2-O3-NEXT:    movq %rdi, %rsi
+; CHECK-AVX2-O3-NEXT:    movq %rsp, %rdx
+; CHECK-AVX2-O3-NEXT:    movl $64, %edi
+; CHECK-AVX2-O3-NEXT:    movl $2, %ecx
+; CHECK-AVX2-O3-NEXT:    callq __atomic_load@PLT
+; CHECK-AVX2-O3-NEXT:    vmovups (%rsp), %ymm0
+; CHECK-AVX2-O3-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm1
+; CHECK-AVX2-O3-NEXT:    addq $72, %rsp
+; CHECK-AVX2-O3-NEXT:    retq
+;
+; CHECK-AVX512-O3-LABEL: atomic_vec8_double:
+; CHECK-AVX512-O3:       # %bb.0:
+; CHECK-AVX512-O3-NEXT:    subq $72, %rsp
+; CHECK-AVX512-O3-NEXT:    movq %rdi, %rsi
+; CHECK-AVX512-O3-NEXT:    movq %rsp, %rdx
+; CHECK-AVX512-O3-NEXT:    movl $64, %edi
+; CHECK-AVX512-O3-NEXT:    movl $2, %ecx
+; CHECK-AVX512-O3-NEXT:    callq __atomic_load@PLT
+; CHECK-AVX512-O3-NEXT:    vmovups (%rsp), %zmm0
+; CHECK-AVX512-O3-NEXT:    addq $72, %rsp
+; CHECK-AVX512-O3-NEXT:    retq
 ;
 ; CHECK-SSE-O0-LABEL: atomic_vec8_double:
 ; CHECK-SSE-O0:       # %bb.0:
@@ -827,24 +516,36 @@ define <8 x double> @atomic_vec8_double(ptr %x) nounwind {
 ; CHECK-SSE-O0-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
 ; CHECK-SSE-O0-NEXT:    addq $72, %rsp
 ; CHECK-SSE-O0-NEXT:    retq
+;
+; CHECK-AVX2-O0-LABEL: atomic_vec8_double:
+; CHECK-AVX2-O0:       # %bb.0:
+; CHECK-AVX2-O0-NEXT:    subq $72, %rsp
+; CHECK-AVX2-O0-NEXT:    movq %rdi, %rsi
+; CHECK-AVX2-O0-NEXT:    movl $64, %edi
+; CHECK-AVX2-O0-NEXT:    movq %rsp, %rdx
+; CHECK-AVX2-O0-NEXT:    movl $2, %ecx
+; CHECK-AVX2-O0-NEXT:    callq __atomic_load@PLT
+; CHECK-AVX2-O0-NEXT:    vmovupd (%rsp), %ymm0
+; CHECK-AVX2-O0-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm1
+; CHECK-AVX2-O0-NEXT:    addq $72, %rsp
+; CHECK-AVX2-O0-NEXT:    retq
+;
+; CHECK-AVX512-O0-LABEL: atomic_vec8_double:
+; CHECK-AVX512-O0:       # %bb.0:
+; CHECK-AVX512-O0-NEXT:    subq $72, %rsp
+; CHECK-AVX512-O0-NEXT:    movq %rdi, %rsi
+; CHECK-AVX512-O0-NEXT:    movl $64, %edi
+; CHECK-AVX512-O0-NEXT:    movq %rsp, %rdx
+; CHECK-AVX512-O0-NEXT:    movl $2, %ecx
+; CHECK-AVX512-O0-NEXT:    callq __atomic_load@PLT
+; CHECK-AVX512-O0-NEXT:    vmovupd (%rsp), %zmm0
+; CHECK-AVX512-O0-NEXT:    addq $72, %rsp
+; CHECK-AVX512-O0-NEXT:    retq
   %ret = load atomic <8 x double>, ptr %x acquire, align 4
   ret <8 x double> %ret
 }
 
 define <16 x bfloat> @atomic_vec16_bfloat(ptr %x) nounwind {
-; CHECK-O3-LABEL: atomic_vec16_bfloat:
-; CHECK-O3:       # %bb.0:
-; CHECK-O3-NEXT:    subq $40, %rsp
-; CHECK-O3-NEXT:    movq %rdi, %rsi
-; CHECK-O3-NEXT:    movq %rsp, %rdx
-; CHECK-O3-NEXT:    movl $32, %edi
-; CHECK-O3-NEXT:    movl $2, %ecx
-; CHECK-O3-NEXT:    callq __atomic_load@PLT
-; CHECK-O3-NEXT:    movaps (%rsp), %xmm0
-; CHECK-O3-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
-; CHECK-O3-NEXT:    addq $40, %rsp
-; CHECK-O3-NEXT:    retq
-;
 ; CHECK-SSE-O3-LABEL: atomic_vec16_bfloat:
 ; CHECK-SSE-O3:       # %bb.0:
 ; CHECK-SSE-O3-NEXT:    subq $40, %rsp
@@ -870,19 +571,6 @@ define <16 x bfloat> @atomic_vec16_bfloat(ptr %x) nounwind {
 ; CHECK-AVX-O3-NEXT:    addq $40, %rsp
 ; CHECK-AVX-O3-NEXT:    retq
 ;
-; CHECK-O0-LABEL: atomic_vec16_bfloat:
-; CHECK-O0:       # %bb.0:
-; CHECK-O0-NEXT:    subq $40, %rsp
-; CHECK-O0-NEXT:    movq %rdi, %rsi
-; CHECK-O0-NEXT:    movl $32, %edi
-; CHECK-O0-NEXT:    movq %rsp, %rdx
-; CHECK-O0-NEXT:    movl $2, %ecx
-; CHECK-O0-NEXT:    callq __atomic_load@PLT
-; CHECK-O0-NEXT:    movaps (%rsp), %xmm0
-; CHECK-O0-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
-; CHECK-O0-NEXT:    addq $40, %rsp
-; CHECK-O0-NEXT:    retq
-;
 ; CHECK-SSE-O0-LABEL: atomic_vec16_bfloat:
 ; CHECK-SSE-O0:       # %bb.0:
 ; CHECK-SSE-O0-NEXT:    subq $40, %rsp
@@ -912,21 +600,6 @@ define <16 x bfloat> @atomic_vec16_bfloat(ptr %x) nounwind {
 }
 
 define <32 x half> @atomic_vec32_half(ptr %x) nounwind {
-; CHECK-O3-LABEL: atomic_vec32_half:
-; CHECK-O3:       # %bb.0:
-; CHECK-O3-NEXT:    subq $72, %rsp
-; CHECK-O3-NEXT:    movq %rdi, %rsi
-; CHECK-O3-NEXT:    movq %rsp, %rdx
-; CHECK-O3-NEXT:    movl $64, %edi
-; CHECK-O3-NEXT:    movl $2, %ecx
-; CHECK-O3-NEXT:    callq __atomic_load@PLT
-; CHECK-O3-NEXT:    movaps (%rsp), %xmm0
-; CHECK-O3-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
-; CHECK-O3-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2
-; CHECK-O3-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm3
-; CHECK-O3-NEXT:    addq $72, %rsp
-; CHECK-O3-NEXT:    retq
-;
 ; CHECK-SSE-O3-LABEL: atomic_vec32_half:
 ; CHECK-SSE-O3:       # %bb.0:
 ; CHECK-SSE-O3-NEXT:    subq $72, %rsp
@@ -942,20 +615,30 @@ define <32 x half> @atomic_vec32_half(ptr %x) nounwind {
 ; CHECK-SSE-O3-NEXT:    addq $72, %rsp
 ; CHECK-SSE-O3-NEXT:    retq
 ;
-; CHECK-O0-LABEL: atomic_vec32_half:
-; CHECK-O0:       # %bb.0:
-; CHECK-O0-NEXT:    subq $72, %rsp
-; CHECK-O0-NEXT:    movq %rdi, %rsi
-; CHECK-O0-NEXT:    movl $64, %edi
-; CHECK-O0-NEXT:    movq %rsp, %rdx
-; CHECK-O0-NEXT:    movl $2, %ecx
-; CHECK-O0-NEXT:    callq __atomic_load@PLT
-; CHECK-O0-NEXT:    movaps (%rsp), %xmm0
-; CHECK-O0-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
-; CHECK-O0-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2
-; CHECK-O0-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm3
-; CHECK-O0-NEXT:    addq $72, %rsp
-; CHECK-O0-NEXT:    retq
+; CHECK-AVX2-O3-LABEL: atomic_vec32_half:
+; CHECK-AVX2-O3:       # %bb.0:
+; CHECK-AVX2-O3-NEXT:    subq $72, %rsp
+; CHECK-AVX2-O3-NEXT:    movq %rdi, %rsi
+; CHECK-AVX2-O3-NEXT:    movq %rsp, %rdx
+; CHECK-AVX2-O3-NEXT:    movl $64, %edi
+; CHECK-AVX2-O3-NEXT:    movl $2, %ecx
+; CHECK-AVX2-O3-NEXT:    callq __atomic_load@PLT
+; CHECK-AVX2-O3-NEXT:    vmovups (%rsp), %ymm0
+; CHECK-AVX2-O3-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm1
+; CHECK-AVX2-O3-NEXT:    addq $72, %rsp
+; CHECK-AVX2-O3-NEXT:    retq
+;
+; CHECK-AVX512-O3-LABEL: atomic_vec32_half:
+; CHECK-AVX512-O3:       # %bb.0:
+; CHECK-AVX512-O3-NEXT:    subq $72, %rsp
+; CHECK-AVX512-O3-NEXT:    movq %rdi, %rsi
+; CHECK-AVX512-O3-NEXT:    movq %rsp, %rdx
+; CHECK-AVX512-O3-NEXT:    movl $64, %edi
+; CHECK-AVX512-O3-NEXT:    movl $2, %ecx
+; CHECK-AVX512-O3-NEXT:    callq __atomic_load@PLT
+; CHECK-AVX512-O3-NEXT:    vmovups (%rsp), %zmm0
+; CHECK-AVX512-O3-NEXT:    addq $72, %rsp
+; CHECK-AVX512-O3-NEXT:    retq
 ;
 ; CHECK-SSE-O0-LABEL: atomic_vec32_half:
 ; CHECK-SSE-O0:       # %bb.0:
@@ -971,6 +654,31 @@ define <32 x half> @atomic_vec32_half(ptr %x) nounwind {
 ; CHECK-SSE-O0-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm3
 ; CHECK-SSE-O0-NEXT:    addq $72, %rsp
 ; CHECK-SSE-O0-NEXT:    retq
+;
+; CHECK-AVX2-O0-LABEL: atomic_vec32_half:
+; CHECK-AVX2-O0:       # %bb.0:
+; CHECK-AVX2-O0-NEXT:    subq $72, %rsp
+; CHECK-AVX2-O0-NEXT:    movq %rdi, %rsi
+; CHECK-AVX2-O0-NEXT:    movl $64, %edi
+; CHECK-AVX2-O0-NEXT:    movq %rsp, %rdx
+; CHECK-AVX2-O0-NEXT:    movl $2, %ecx
+; CHECK-AVX2-O0-NEXT:    callq __atomic_load@PLT
+; CHECK-AVX2-O0-NEXT:    vmovups (%rsp), %ymm0
+; CHECK-AVX2-O0-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm1
+; CHECK-AVX2-O0-NEXT:    addq $72, %rsp
+; CHECK-AVX2-O0-NEXT:    retq
+;
+; CHECK-AVX512-O0-LABEL: atomic_vec32_half:
+; CHECK-AVX512-O0:       # %bb.0:
+; CHECK-AVX512-O0-NEXT:    subq $72, %rsp
+; CHECK-AVX512-O0-NEXT:    movq %rdi, %rsi
+; CHECK-AVX512-O0-NEXT:    movl $64, %edi
+; CHECK-AVX512-O0-NEXT:    movq %rsp, %rdx
+; CHECK-AVX512-O0-NEXT:    movl $2, %ecx
+; CHECK-AVX512-O0-NEXT:    callq __atomic_load@PLT
+; CHECK-AVX512-O0-NEXT:    vmovups (%rsp), %zmm0
+; CHECK-AVX512-O0-NEXT:    addq $72, %rsp
+; CHECK-AVX512-O0-NEXT:    retq
   %ret = load atomic <32 x half>, ptr %x acquire, align 4
   ret <32 x half> %ret
 }
diff --git a/llvm/test/CodeGen/X86/basic-block-sections-bb-hash.ll b/llvm/test/CodeGen/X86/basic-block-sections-bb-hash.ll
new file mode 100644
index 0000000000000..293b48d7dc5dd
--- /dev/null
+++ b/llvm/test/CodeGen/X86/basic-block-sections-bb-hash.ll
@@ -0,0 +1,39 @@
+;; BB section test with basic block hashes.
+
+;; basic block sections Profile with bb hashes
+; RUN: echo 'v1' > %t
+; RUN: echo 'f foo' >> %t
+; RUN: echo 'g 0:10,1:9,2:1 1:8,3:8 2:2,3:2 3:11' >> %t
+; RUN: echo 'c 0 2 3' >> %t
+; RUN: echo 'h 0:64863A11B5CA0000 1:54F1E80D6B270006 2:54F1F4E66B270008 3:C8BC6041A2CB0009' >> %t
+; RUN: llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t | FileCheck %s
+;
+define void @foo(i1 zeroext) nounwind {
+  %2 = alloca i8, align 1
+  %3 = zext i1 %0 to i8
+  store i8 %3, ptr %2, align 1
+  %4 = load i8, ptr %2, align 1
+  %5 = trunc i8 %4 to i1
+  br i1 %5, label %6, label %8
+
+6:                                                ; preds = %1
+  %7 = call i32 @bar()
+  br label %10
+
+8:                                                ; preds = %1
+  %9 = call i32 @baz()
+  br label %10
+
+10:                                               ; preds = %8, %6
+  ret void
+}
+
+declare i32 @bar() #1
+
+declare i32 @baz() #1
+
+; CHECK: .section	.text.foo,"ax",@progbits
+; CHECK: callq baz
+; CHECK: retq
+; CHECK: .section	.text.split.foo,"ax",@progbits
+; CHECK: callq bar
diff --git a/llvm/test/CodeGen/X86/basic-block-sections-clusters-error.ll b/llvm/test/CodeGen/X86/basic-block-sections-clusters-error.ll
index 751ab76722c07..eb0a14b2820b4 100644
--- a/llvm/test/CodeGen/X86/basic-block-sections-clusters-error.ll
+++ b/llvm/test/CodeGen/X86/basic-block-sections-clusters-error.ll
@@ -69,6 +69,20 @@
 ; RUN: echo 'g 0:4,1:2:3' >> %t15
 ; RUN: not --crash llc < %s -O0 -mtriple=x86_64 -function-sections -basic-block-sections=%t15 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR15
 ; CHECK-ERROR15: LLVM ERROR: invalid profile {{.*}} at line 4: unsigned integer expected: '2:3'
+; RUN: echo 'v1' > %t16
+; RUN: echo 'f dummy1' >> %t16
+; RUN: echo 'c 0 1' >> %t16
+; RUN: echo 'g 0:4,1:2' >> %t16
+; RUN: echo 'h a:1111111111111111 1:ffffffffffffffff' >> %t16
+; RUN: not --crash llc < %s -O0 -mtriple=x86_64 -function-sections -basic-block-sections=%t16 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR16
+; CHECK-ERROR16: LLVM ERROR: invalid profile {{.*}} at line 5: unsigned integer expected: 'a'
+; RUN: echo 'v1' > %t17
+; RUN: echo 'f dummy1' >> %t17
+; RUN: echo 'c 0 1' >> %t17
+; RUN: echo 'g 0:4,1:2' >> %t17
+; RUN: echo 'h 0:111111111111111g 1:ffffffffffffffff' >> %t17
+; RUN: not --crash llc < %s -O0 -mtriple=x86_64 -function-sections -basic-block-sections=%t17 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR17
+; CHECK-ERROR17: LLVM ERROR: invalid profile {{.*}} at line 5: unsigned integer expected in hex format: '111111111111111g'
 
 
 define i32 @dummy1(i32 %x, i32 %y, i32 %z) {
diff --git a/llvm/test/CodeGen/X86/bfloat-calling-conv.ll b/llvm/test/CodeGen/X86/bfloat-calling-conv.ll
index ea4d32bae9ccb..d08749174f85c 100644
--- a/llvm/test/CodeGen/X86/bfloat-calling-conv.ll
+++ b/llvm/test/CodeGen/X86/bfloat-calling-conv.ll
@@ -660,8 +660,7 @@ define <3 x bfloat> @call_ret_v3bf16(ptr %ptr) #0 {
 ; SSE2-LABEL: call_ret_v3bf16:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pushq %rax
-; SSE2-NEXT:    movl 4(%rdi), %eax
-; SSE2-NEXT:    pinsrw $0, %eax, %xmm1
+; SSE2-NEXT:    pinsrw $0, 4(%rdi), %xmm1
 ; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    callq returns_v3bf16@PLT
@@ -725,8 +724,7 @@ define <3 x bfloat> @call_ret_v3bf16(ptr %ptr) #0 {
 ; AVXNECONVERT-LABEL: call_ret_v3bf16:
 ; AVXNECONVERT:       # %bb.0:
 ; AVXNECONVERT-NEXT:    pushq %rax
-; AVXNECONVERT-NEXT:    movl 4(%rdi), %eax
-; AVXNECONVERT-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVXNECONVERT-NEXT:    vpinsrw $0, 4(%rdi), %xmm0, %xmm0
 ; AVXNECONVERT-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; AVXNECONVERT-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
 ; AVXNECONVERT-NEXT:    callq returns_v3bf16@PLT
diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll
index 19d751d176b6a..8007d9dcf13bc 100644
--- a/llvm/test/CodeGen/X86/bittest-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll
@@ -7025,3 +7025,279 @@ define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind {
   %cmp = icmp ne i4096 %test, 0
   ret i1 %cmp
 }
+
+; Special Cases
+
+; Multiple uses of the stored value
+define i1 @complement_cmpz_i128(ptr %word, i32 %position) nounwind {
+; X86-LABEL: complement_cmpz_i128:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $64, %esp
+; X86-NEXT:    movzbl 12(%ebp), %ecx
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    shrb $3, %al
+; X86-NEXT:    andb $12, %al
+; X86-NEXT:    negb %al
+; X86-NEXT:    movsbl %al, %esi
+; X86-NEXT:    movl 36(%esp,%esi), %eax
+; X86-NEXT:    movl 40(%esp,%esi), %edi
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    shldl %cl, %eax, %edx
+; X86-NEXT:    movl 32(%esp,%esi), %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 44(%esp,%esi), %esi
+; X86-NEXT:    shldl %cl, %edi, %esi
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    shll %cl, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl %cl, %ebx, %eax
+; X86-NEXT:    movl 8(%ebp), %ecx
+; X86-NEXT:    xorl 12(%ecx), %esi
+; X86-NEXT:    xorl 8(%ecx), %edx
+; X86-NEXT:    xorl 4(%ecx), %eax
+; X86-NEXT:    xorl (%ecx), %edi
+; X86-NEXT:    movl %edx, 8(%ecx)
+; X86-NEXT:    movl %esi, 12(%ecx)
+; X86-NEXT:    movl %edi, (%ecx)
+; X86-NEXT:    movl %eax, 4(%ecx)
+; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    orl %eax, %edi
+; X86-NEXT:    setne %al
+; X86-NEXT:    leal -12(%ebp), %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; SSE-LABEL: complement_cmpz_i128:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movl %esi, %ecx
+; SSE-NEXT:    movl $1, %eax
+; SSE-NEXT:    xorl %edx, %edx
+; SSE-NEXT:    shldq %cl, %rax, %rdx
+; SSE-NEXT:    shlq %cl, %rax
+; SSE-NEXT:    xorl %esi, %esi
+; SSE-NEXT:    testb $64, %cl
+; SSE-NEXT:    cmovneq %rax, %rdx
+; SSE-NEXT:    cmovneq %rsi, %rax
+; SSE-NEXT:    xorq 8(%rdi), %rdx
+; SSE-NEXT:    xorq (%rdi), %rax
+; SSE-NEXT:    movq %rax, (%rdi)
+; SSE-NEXT:    movq %rdx, 8(%rdi)
+; SSE-NEXT:    orq %rdx, %rax
+; SSE-NEXT:    setne %al
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: complement_cmpz_i128:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movl %esi, %ecx
+; AVX2-NEXT:    movl $1, %eax
+; AVX2-NEXT:    xorl %edx, %edx
+; AVX2-NEXT:    shldq %cl, %rax, %rdx
+; AVX2-NEXT:    xorl %esi, %esi
+; AVX2-NEXT:    shlxq %rcx, %rax, %rax
+; AVX2-NEXT:    testb $64, %cl
+; AVX2-NEXT:    cmovneq %rax, %rdx
+; AVX2-NEXT:    cmovneq %rsi, %rax
+; AVX2-NEXT:    xorq 8(%rdi), %rdx
+; AVX2-NEXT:    xorq (%rdi), %rax
+; AVX2-NEXT:    movq %rax, (%rdi)
+; AVX2-NEXT:    movq %rdx, 8(%rdi)
+; AVX2-NEXT:    orq %rdx, %rax
+; AVX2-NEXT:    setne %al
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: complement_cmpz_i128:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    movl %esi, %ecx
+; AVX512-NEXT:    xorl %eax, %eax
+; AVX512-NEXT:    movl $1, %edx
+; AVX512-NEXT:    xorl %esi, %esi
+; AVX512-NEXT:    shldq %cl, %rdx, %rsi
+; AVX512-NEXT:    shlxq %rcx, %rdx, %rdx
+; AVX512-NEXT:    testb $64, %cl
+; AVX512-NEXT:    cmovneq %rdx, %rsi
+; AVX512-NEXT:    cmovneq %rax, %rdx
+; AVX512-NEXT:    xorq 8(%rdi), %rsi
+; AVX512-NEXT:    xorq (%rdi), %rdx
+; AVX512-NEXT:    movq %rdx, (%rdi)
+; AVX512-NEXT:    movq %rsi, 8(%rdi)
+; AVX512-NEXT:    orq %rsi, %rdx
+; AVX512-NEXT:    setne %al
+; AVX512-NEXT:    retq
+  %rem = and i32 %position, 127
+  %ofs = zext nneg i32 %rem to i128
+  %bit = shl nuw i128 1, %ofs
+  %ld = load i128, ptr %word
+  %res = xor i128 %ld, %bit
+  store i128 %res, ptr %word
+  %cmp = icmp ne i128 %res, 0
+  ret i1 %cmp
+}
+
+; Multiple loads in store chain
+define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {
+; X86-LABEL: reset_multiload_i128:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $80, %esp
+; X86-NEXT:    movzbl 12(%ebp), %ecx
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    shrb $3, %al
+; X86-NEXT:    andb $12, %al
+; X86-NEXT:    negb %al
+; X86-NEXT:    movsbl %al, %eax
+; X86-NEXT:    movl 56(%esp,%eax), %esi
+; X86-NEXT:    movl 60(%esp,%eax), %edx
+; X86-NEXT:    shldl %cl, %esi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 48(%esp,%eax), %edi
+; X86-NEXT:    movl 52(%esp,%eax), %eax
+; X86-NEXT:    shldl %cl, %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl %cl, %edi, %eax
+; X86-NEXT:    movl 8(%ebp), %ebx
+; X86-NEXT:    shll %cl, %edi
+; X86-NEXT:    movl 8(%ebx), %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    andl %esi, %ecx
+; X86-NEXT:    movl (%ebx), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    andl %edi, %esi
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    movl 12(%ebx), %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    movl 4(%ebx), %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    andl %ebx, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    notl %ecx
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    notl %ebx
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    notl %edx
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    notl %edi
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl (%eax), %eax
+; X86-NEXT:    movl 8(%ebp), %esi
+; X86-NEXT:    movl %ebx, 8(%esi)
+; X86-NEXT:    movl %ecx, 12(%esi)
+; X86-NEXT:    movl %edi, (%esi)
+; X86-NEXT:    movl %edx, 4(%esi)
+; X86-NEXT:    je .LBB22_2
+; X86-NEXT:  # %bb.1:
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:  .LBB22_2:
+; X86-NEXT:    leal -12(%ebp), %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; SSE-LABEL: reset_multiload_i128:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movl %esi, %ecx
+; SSE-NEXT:    movl $1, %esi
+; SSE-NEXT:    xorl %r8d, %r8d
+; SSE-NEXT:    shldq %cl, %rsi, %r8
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    shlq %cl, %rsi
+; SSE-NEXT:    testb $64, %cl
+; SSE-NEXT:    cmovneq %rsi, %r8
+; SSE-NEXT:    cmovneq %rax, %rsi
+; SSE-NEXT:    movq (%rdi), %rcx
+; SSE-NEXT:    movq 8(%rdi), %r9
+; SSE-NEXT:    movq %r9, %r10
+; SSE-NEXT:    andq %r8, %r10
+; SSE-NEXT:    notq %r8
+; SSE-NEXT:    movq %rcx, %r11
+; SSE-NEXT:    andq %rsi, %r11
+; SSE-NEXT:    notq %rsi
+; SSE-NEXT:    andq %r9, %r8
+; SSE-NEXT:    andq %rcx, %rsi
+; SSE-NEXT:    orq %r10, %r11
+; SSE-NEXT:    jne .LBB22_2
+; SSE-NEXT:  # %bb.1:
+; SSE-NEXT:    movl (%rdx), %eax
+; SSE-NEXT:  .LBB22_2:
+; SSE-NEXT:    movq %rsi, (%rdi)
+; SSE-NEXT:    movq %r8, 8(%rdi)
+; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: reset_multiload_i128:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movl %esi, %ecx
+; AVX-NEXT:    movl $1, %esi
+; AVX-NEXT:    xorl %r8d, %r8d
+; AVX-NEXT:    shldq %cl, %rsi, %r8
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    shlxq %rcx, %rsi, %r9
+; AVX-NEXT:    testb $64, %cl
+; AVX-NEXT:    cmovneq %r9, %r8
+; AVX-NEXT:    cmovneq %rax, %r9
+; AVX-NEXT:    movq (%rdi), %r10
+; AVX-NEXT:    movq 8(%rdi), %r11
+; AVX-NEXT:    andnq %r11, %r8, %rcx
+; AVX-NEXT:    andq %r8, %r11
+; AVX-NEXT:    andnq %r10, %r9, %rsi
+; AVX-NEXT:    andq %r9, %r10
+; AVX-NEXT:    orq %r11, %r10
+; AVX-NEXT:    jne .LBB22_2
+; AVX-NEXT:  # %bb.1:
+; AVX-NEXT:    movl (%rdx), %eax
+; AVX-NEXT:  .LBB22_2:
+; AVX-NEXT:    movq %rsi, (%rdi)
+; AVX-NEXT:    movq %rcx, 8(%rdi)
+; AVX-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX-NEXT:    retq
+  %rem = and i32 %position, 127
+  %ofs = zext nneg i32 %rem to i128
+  %bit = shl nuw i128 1, %ofs
+  %mask = xor i128 %bit, -1
+  %ld = load i128, ptr %word
+  %sel = load i32, ptr %p
+  %test = and i128 %ld, %bit
+  %res = and i128 %ld, %mask
+  %cmp = icmp eq i128 %test, 0
+  store i128 %res, ptr %word
+  %ret = select i1 %cmp, i32 %sel, i32 0
+  ret i32 %ret
+}
diff --git a/llvm/test/CodeGen/X86/ipra-reg-usage.ll b/llvm/test/CodeGen/X86/ipra-reg-usage.ll
index e73ff791dc423..f270f8fc741aa 100644
--- a/llvm/test/CodeGen/X86/ipra-reg-usage.ll
+++ b/llvm/test/CodeGen/X86/ipra-reg-usage.ll
@@ -7,7 +7,7 @@
 target triple = "x86_64-unknown-unknown"
 declare void @bar1()
 define preserve_allcc void @foo()#0 {
-; CHECK: foo Clobbered Registers: $cs $df $ds $eflags $eip $eiz $es $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hip $hsp $ip $mxcsr $rflags $rip $riz $rsp $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $r11b $r11bh $r11d $r11w $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $tmm0_tmm1 $tmm2_tmm3 $tmm4_tmm5 $tmm6_tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh
+; CHECK: foo Clobbered Registers: $cs $df $ds $eflags $eip $eiz $es $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hip $hsp $ip $mxcsr $rflags $rip $riz $rsp $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $r11b $r11bh $r11d $r11w $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh
   call void @bar1()
   call void @bar2()
   ret void
@@ -15,7 +15,7 @@ define preserve_allcc void @foo()#0 {
 declare void @bar2()
 
 define preserve_nonecc void @foo2()#0 {
-; CHECK: foo2 Clobbered Registers: $ah $al $ax $ch $cl $cs $cx $df $dh $di $dih $dil $dl $ds $dx $eax $ecx $edi $edx $eflags $eip $eiz $es $esi $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hax $hcx $hdi $hdx $hip $hsi $hsp $ip $mxcsr $rax $rcx $rdi $rdx $rflags $rip $riz $rsi $rsp $si $sih $sil $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r8 $r9 $r10 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $xmm0 $xmm1 $xmm2 $xmm3 $xmm4 $xmm5 $xmm6 $xmm7 $xmm8 $xmm9 $xmm10 $xmm11 $xmm12 $xmm13 $xmm14 $xmm15 $r8b $r9b $r10b $r11b $r8bh $r9bh $r10bh $r11bh $r8d $r9d $r10d $r11d $r8w $r9w $r10w $r11w $r8wh $r9wh $r10wh $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $tmm0_tmm1 $tmm2_tmm3 $tmm4_tmm5 $tmm6_tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh
+; CHECK: foo2 Clobbered Registers: $ah $al $ax $ch $cl $cs $cx $df $dh $di $dih $dil $dl $ds $dx $eax $ecx $edi $edx $eflags $eip $eiz $es $esi $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hax $hcx $hdi $hdx $hip $hsi $hsp $ip $mxcsr $rax $rcx $rdi $rdx $rflags $rip $riz $rsi $rsp $si $sih $sil $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r8 $r9 $r10 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $xmm0 $xmm1 $xmm2 $xmm3 $xmm4 $xmm5 $xmm6 $xmm7 $xmm8 $xmm9 $xmm10 $xmm11 $xmm12 $xmm13 $xmm14 $xmm15 $r8b $r9b $r10b $r11b $r8bh $r9bh $r10bh $r11bh $r8d $r9d $r10d $r11d $r8w $r9w $r10w $r11w $r8wh $r9wh $r10wh $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh
   call void @bar1()
   call void @bar2()
   ret void
diff --git a/llvm/test/CodeGen/X86/isel-llvm.sincos.ll b/llvm/test/CodeGen/X86/isel-llvm.sincos.ll
index 065710f91457b..8576f8f149e9a 100644
--- a/llvm/test/CodeGen/X86/isel-llvm.sincos.ll
+++ b/llvm/test/CodeGen/X86/isel-llvm.sincos.ll
@@ -3,6 +3,9 @@
 ; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel  | FileCheck %s --check-prefixes=X64,FASTISEL-X64
 ; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel=0 -fast-isel=0  | FileCheck %s --check-prefixes=X86,SDAG-X86
 ; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel=0 -fast-isel=0  | FileCheck %s --check-prefixes=X64,SDAG-X64
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.9.0 -mcpu=core2 | FileCheck %s --check-prefix=MACOS-SINCOS-STRET
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=core2 | FileCheck %s --check-prefix=MACOS-NOSINCOS-STRET
+
 ; TODO: The below RUN line will fails GISEL selection and will fallback to DAG selection due to lack of support for loads/stores in i686 mode, support is expected soon enough, for this reason the llvm/test/CodeGen/X86/GlobalISel/llvm.sincos.mir test is added for now because of the lack of support for i686 in GlobalISel.
 ; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel=1 -global-isel-abort=2 | FileCheck %s --check-prefixes=GISEL-X86
 ; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel=1 -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X64
@@ -34,6 +37,29 @@ define { float, float } @test_sincos_f32(float %Val) nounwind {
 ; X64-NEXT:    popq %rax
 ; X64-NEXT:    retq
 ;
+; MACOS-SINCOS-STRET-LABEL: test_sincos_f32:
+; MACOS-SINCOS-STRET:       ## %bb.0:
+; MACOS-SINCOS-STRET-NEXT:    pushq %rax
+; MACOS-SINCOS-STRET-NEXT:    callq ___sincosf_stret
+; MACOS-SINCOS-STRET-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; MACOS-SINCOS-STRET-NEXT:    popq %rax
+; MACOS-SINCOS-STRET-NEXT:    retq
+;
+; MACOS-NOSINCOS-STRET-LABEL: test_sincos_f32:
+; MACOS-NOSINCOS-STRET:       ## %bb.0:
+; MACOS-NOSINCOS-STRET-NEXT:    pushq %rax
+; MACOS-NOSINCOS-STRET-NEXT:    movss %xmm0, (%rsp) ## 4-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT:    callq _sinf
+; MACOS-NOSINCOS-STRET-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT:    movss (%rsp), %xmm0 ## 4-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT:    ## xmm0 = mem[0],zero,zero,zero
+; MACOS-NOSINCOS-STRET-NEXT:    callq _cosf
+; MACOS-NOSINCOS-STRET-NEXT:    movaps %xmm0, %xmm1
+; MACOS-NOSINCOS-STRET-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 4-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT:    ## xmm0 = mem[0],zero,zero,zero
+; MACOS-NOSINCOS-STRET-NEXT:    popq %rax
+; MACOS-NOSINCOS-STRET-NEXT:    retq
+;
 ; GISEL-X86-LABEL: test_sincos_f32:
 ; GISEL-X86:       # %bb.0:
 ; GISEL-X86-NEXT:    subl $28, %esp
@@ -93,6 +119,28 @@ define { double, double } @test_sincos_f64(double %Val) nounwind  {
 ; X64-NEXT:    addq $24, %rsp
 ; X64-NEXT:    retq
 ;
+; MACOS-SINCOS-STRET-LABEL: test_sincos_f64:
+; MACOS-SINCOS-STRET:       ## %bb.0:
+; MACOS-SINCOS-STRET-NEXT:    pushq %rax
+; MACOS-SINCOS-STRET-NEXT:    callq ___sincos_stret
+; MACOS-SINCOS-STRET-NEXT:    popq %rax
+; MACOS-SINCOS-STRET-NEXT:    retq
+;
+; MACOS-NOSINCOS-STRET-LABEL: test_sincos_f64:
+; MACOS-NOSINCOS-STRET:       ## %bb.0:
+; MACOS-NOSINCOS-STRET-NEXT:    subq $24, %rsp
+; MACOS-NOSINCOS-STRET-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT:    callq _sin
+; MACOS-NOSINCOS-STRET-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 8-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT:    ## xmm0 = mem[0],zero
+; MACOS-NOSINCOS-STRET-NEXT:    callq _cos
+; MACOS-NOSINCOS-STRET-NEXT:    movaps %xmm0, %xmm1
+; MACOS-NOSINCOS-STRET-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 8-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT:    ## xmm0 = mem[0],zero
+; MACOS-NOSINCOS-STRET-NEXT:    addq $24, %rsp
+; MACOS-NOSINCOS-STRET-NEXT:    retq
+;
 ; GISEL-X86-LABEL: test_sincos_f64:
 ; GISEL-X86:       # %bb.0:
 ; GISEL-X86-NEXT:    subl $44, %esp
@@ -153,6 +201,40 @@ define { x86_fp80, x86_fp80 } @test_sincos_f80(x86_fp80 %Val) nounwind {
 ; X64-NEXT:    addq $56, %rsp
 ; X64-NEXT:    retq
 ;
+; MACOS-SINCOS-STRET-LABEL: test_sincos_f80:
+; MACOS-SINCOS-STRET:       ## %bb.0:
+; MACOS-SINCOS-STRET-NEXT:    subq $40, %rsp
+; MACOS-SINCOS-STRET-NEXT:    fldt {{[0-9]+}}(%rsp)
+; MACOS-SINCOS-STRET-NEXT:    fld %st(0)
+; MACOS-SINCOS-STRET-NEXT:    fstpt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Spill
+; MACOS-SINCOS-STRET-NEXT:    fstpt (%rsp)
+; MACOS-SINCOS-STRET-NEXT:    callq _cosl
+; MACOS-SINCOS-STRET-NEXT:    fstpt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Spill
+; MACOS-SINCOS-STRET-NEXT:    fldt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Reload
+; MACOS-SINCOS-STRET-NEXT:    fstpt (%rsp)
+; MACOS-SINCOS-STRET-NEXT:    callq _sinl
+; MACOS-SINCOS-STRET-NEXT:    fldt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Reload
+; MACOS-SINCOS-STRET-NEXT:    fxch %st(1)
+; MACOS-SINCOS-STRET-NEXT:    addq $40, %rsp
+; MACOS-SINCOS-STRET-NEXT:    retq
+;
+; MACOS-NOSINCOS-STRET-LABEL: test_sincos_f80:
+; MACOS-NOSINCOS-STRET:       ## %bb.0:
+; MACOS-NOSINCOS-STRET-NEXT:    subq $40, %rsp
+; MACOS-NOSINCOS-STRET-NEXT:    fldt {{[0-9]+}}(%rsp)
+; MACOS-NOSINCOS-STRET-NEXT:    fld %st(0)
+; MACOS-NOSINCOS-STRET-NEXT:    fstpt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Spill
+; MACOS-NOSINCOS-STRET-NEXT:    fstpt (%rsp)
+; MACOS-NOSINCOS-STRET-NEXT:    callq _cosl
+; MACOS-NOSINCOS-STRET-NEXT:    fstpt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Spill
+; MACOS-NOSINCOS-STRET-NEXT:    fldt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Reload
+; MACOS-NOSINCOS-STRET-NEXT:    fstpt (%rsp)
+; MACOS-NOSINCOS-STRET-NEXT:    callq _sinl
+; MACOS-NOSINCOS-STRET-NEXT:    fldt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Reload
+; MACOS-NOSINCOS-STRET-NEXT:    fxch %st(1)
+; MACOS-NOSINCOS-STRET-NEXT:    addq $40, %rsp
+; MACOS-NOSINCOS-STRET-NEXT:    retq
+;
 ; GISEL-X86-LABEL: test_sincos_f80:
 ; GISEL-X86:       # %bb.0:
 ; GISEL-X86-NEXT:    subl $60, %esp
@@ -288,6 +370,57 @@ define void @can_fold_with_call_in_chain(float %x, ptr noalias %a, ptr noalias %
 ; SDAG-X64-NEXT:    popq %r14
 ; SDAG-X64-NEXT:    retq
 ;
+; MACOS-SINCOS-STRET-LABEL: can_fold_with_call_in_chain:
+; MACOS-SINCOS-STRET:       ## %bb.0: ## %entry
+; MACOS-SINCOS-STRET-NEXT:    pushq %r14
+; MACOS-SINCOS-STRET-NEXT:    pushq %rbx
+; MACOS-SINCOS-STRET-NEXT:    subq $40, %rsp
+; MACOS-SINCOS-STRET-NEXT:    movq %rsi, %rbx
+; MACOS-SINCOS-STRET-NEXT:    movq %rdi, %r14
+; MACOS-SINCOS-STRET-NEXT:    callq ___sincosf_stret
+; MACOS-SINCOS-STRET-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; MACOS-SINCOS-STRET-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT:    movq %r14, %rdi
+; MACOS-SINCOS-STRET-NEXT:    movq %rbx, %rsi
+; MACOS-SINCOS-STRET-NEXT:    callq _foo
+; MACOS-SINCOS-STRET-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT:    movss %xmm0, (%r14)
+; MACOS-SINCOS-STRET-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT:    movss %xmm0, (%rbx)
+; MACOS-SINCOS-STRET-NEXT:    addq $40, %rsp
+; MACOS-SINCOS-STRET-NEXT:    popq %rbx
+; MACOS-SINCOS-STRET-NEXT:    popq %r14
+; MACOS-SINCOS-STRET-NEXT:    retq
+;
+; MACOS-NOSINCOS-STRET-LABEL: can_fold_with_call_in_chain:
+; MACOS-NOSINCOS-STRET:       ## %bb.0: ## %entry
+; MACOS-NOSINCOS-STRET-NEXT:    pushq %r14
+; MACOS-NOSINCOS-STRET-NEXT:    pushq %rbx
+; MACOS-NOSINCOS-STRET-NEXT:    pushq %rax
+; MACOS-NOSINCOS-STRET-NEXT:    movq %rsi, %rbx
+; MACOS-NOSINCOS-STRET-NEXT:    movq %rdi, %r14
+; MACOS-NOSINCOS-STRET-NEXT:    movss %xmm0, (%rsp) ## 4-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT:    callq _sinf
+; MACOS-NOSINCOS-STRET-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT:    movss (%rsp), %xmm0 ## 4-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT:    ## xmm0 = mem[0],zero,zero,zero
+; MACOS-NOSINCOS-STRET-NEXT:    callq _cosf
+; MACOS-NOSINCOS-STRET-NEXT:    movss %xmm0, (%rsp) ## 4-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT:    movq %r14, %rdi
+; MACOS-NOSINCOS-STRET-NEXT:    movq %rbx, %rsi
+; MACOS-NOSINCOS-STRET-NEXT:    callq _foo
+; MACOS-NOSINCOS-STRET-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 4-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT:    ## xmm0 = mem[0],zero,zero,zero
+; MACOS-NOSINCOS-STRET-NEXT:    movss %xmm0, (%r14)
+; MACOS-NOSINCOS-STRET-NEXT:    movss (%rsp), %xmm0 ## 4-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT:    ## xmm0 = mem[0],zero,zero,zero
+; MACOS-NOSINCOS-STRET-NEXT:    movss %xmm0, (%rbx)
+; MACOS-NOSINCOS-STRET-NEXT:    addq $8, %rsp
+; MACOS-NOSINCOS-STRET-NEXT:    popq %rbx
+; MACOS-NOSINCOS-STRET-NEXT:    popq %r14
+; MACOS-NOSINCOS-STRET-NEXT:    retq
+;
 ; GISEL-X86-LABEL: can_fold_with_call_in_chain:
 ; GISEL-X86:       # %bb.0: # %entry
 ; GISEL-X86-NEXT:    pushl %ebx
diff --git a/llvm/test/CodeGen/X86/ldexp-avx512.ll b/llvm/test/CodeGen/X86/ldexp-avx512.ll
new file mode 100644
index 0000000000000..ea93a911a1ad0
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ldexp-avx512.ll
@@ -0,0 +1,467 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512VL
+
+define half @test_half(half %x, i32 %exp) nounwind {
+; CHECK-LABEL: test_half:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    vcvtph2ps %xmm0, %xmm0
+; CHECK-NEXT:    callq ldexpf@PLT
+; CHECK-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    retq
+entry:
+  %r = tail call fast half @llvm.ldexp.f16.i32(half %x, i32 %exp)
+  ret half %r
+}
+declare half @llvm.ldexp.f16.i32(half, i32) memory(none)
+
+define float @test_float(float %x, i32 %exp) nounwind {
+; CHECK-LABEL: test_float:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    jmp ldexpf@PLT # TAILCALL
+entry:
+  %r = tail call fast float @ldexpf(float %x, i32 %exp)
+  ret float %r
+}
+declare float @ldexpf(float, i32) memory(none)
+
+define double @test_double(double %x, i32 %exp) nounwind {
+; CHECK-LABEL: test_double:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    jmp ldexp@PLT # TAILCALL
+entry:
+  %r = tail call fast double @ldexp(double %x, i32 %exp)
+  ret double %r
+}
+declare double @ldexp(double, i32) memory(none)
+
+define fp128 @testExpl(fp128 %x, i32 %exp) nounwind {
+; CHECK-LABEL: testExpl:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    jmp ldexpl@PLT # TAILCALL
+entry:
+  %r = tail call fast fp128 @ldexpl(fp128 %x, i32 %exp)
+  ret fp128 %r
+}
+declare fp128 @ldexpl(fp128, i32) memory(none)
+
+define <4 x float> @test_ldexp_4xfloat(<4 x float> %x, <4 x i32> %exp) nounwind {
+; CHECK-LABEL: test_ldexp_4xfloat:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $56, %rsp
+; CHECK-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovd %xmm1, %edi
+; CHECK-NEXT:    callq ldexpf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vextractps $1, %xmm0, %edi
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq ldexpf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vextractps $2, %xmm0, %edi
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq ldexpf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vextractps $3, %xmm0, %edi
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT:    callq ldexpf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT:    addq $56, %rsp
+; CHECK-NEXT:    retq
+  %r = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> %x, <4 x i32> %exp)
+  ret <4 x float> %r
+}
+declare <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float>, <4 x i32>)
+
+define <2 x double> @test_ldexp_2xdouble(<2 x double> %x, <2 x i32> %exp) nounwind {
+; CHECK-LABEL: test_ldexp_2xdouble:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $56, %rsp
+; CHECK-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovd %xmm1, %edi
+; CHECK-NEXT:    callq ldexp@PLT
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vextractps $1, %xmm0, %edi
+; CHECK-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq ldexp@PLT
+; CHECK-NEXT:    vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-NEXT:    addq $56, %rsp
+; CHECK-NEXT:    retq
+  %r = call <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double> %x, <2 x i32> %exp)
+  ret <2 x double> %r
+}
+declare <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double>, <2 x i32>)
+
+define <8 x float> @test_ldexp_8xfloat(<8 x float> %x, <8 x i32> %exp) nounwind {
+; CHECK-LABEL: test_ldexp_8xfloat:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $120, %rsp
+; CHECK-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; CHECK-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovd %xmm1, %edi
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq ldexpf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vextractps $1, %xmm0, %edi
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq ldexpf@PLT
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vextractps $2, %xmm0, %edi
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq ldexpf@PLT
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vextractps $3, %xmm0, %edi
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT:    callq ldexpf@PLT
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    vmovd %xmm0, %edi
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq ldexpf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    vextractps $1, %xmm0, %edi
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq ldexpf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    vextractps $2, %xmm0, %edi
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq ldexpf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    vextractps $3, %xmm0, %edi
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq ldexpf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT:    addq $120, %rsp
+; CHECK-NEXT:    retq
+  %r = call <8 x float> @llvm.ldexp.v8f32.v8i32(<8 x float> %x, <8 x i32> %exp)
+  ret <8 x float> %r
+}
+declare <8 x float> @llvm.ldexp.v8f32.v8i32(<8 x float>, <8 x i32>)
+
+define <4 x double> @test_ldexp_4xdouble(<4 x double> %x, <4 x i32> %exp) nounwind {
+; CHECK-LABEL: test_ldexp_4xdouble:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $88, %rsp
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    vextractps $2, %xmm1, %edi
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq ldexp@PLT
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vextractps $3, %xmm0, %edi
+; CHECK-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq ldexp@PLT
+; CHECK-NEXT:    vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vmovd %xmm0, %edi
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq ldexp@PLT
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vextractps $1, %xmm0, %edi
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq ldexp@PLT
+; CHECK-NEXT:    vmovapd (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT:    addq $88, %rsp
+; CHECK-NEXT:    retq
+  %r = call <4 x double> @llvm.ldexp.v4f64.v4i32(<4 x double> %x, <4 x i32> %exp)
+  ret <4 x double> %r
+}
+declare <4 x double> @llvm.ldexp.v4f64.v4i32(<4 x double>, <4 x i32>)
+
+define <16 x float> @test_ldexp_16xfloat(<16 x float> %x, <16 x i32> %exp) nounwind {
+; CHECK-LABEL: test_ldexp_16xfloat:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $216, %rsp
+; CHECK-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vextracti32x4 $3, %zmm1, %xmm1
+; CHECK-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovd %xmm1, %edi
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq ldexpf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vextractps $1, %xmm0, %edi
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq ldexpf@PLT
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vextractps $2, %xmm0, %edi
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq ldexpf@PLT
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vextractps $3, %xmm0, %edi
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT:    callq ldexpf@PLT
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT:    vextractf32x4 $2, %zmm0, %xmm1
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT:    vextracti32x4 $2, %zmm0, %xmm0
+; CHECK-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovd %xmm0, %edi
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq ldexpf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vextractps $1, %xmm0, %edi
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq ldexpf@PLT
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vextractps $2, %xmm0, %edi
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq ldexpf@PLT
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vextractps $3, %xmm0, %edi
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT:    callq ldexpf@PLT
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovd %xmm0, %edi
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq ldexpf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vextractps $1, %xmm0, %edi
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    callq ldexpf@PLT
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vextractps $2, %xmm0, %edi
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq ldexpf@PLT
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vextractps $3, %xmm0, %edi
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT:    callq ldexpf@PLT
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT:    vmovd %xmm0, %edi
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq ldexpf@PLT
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT:    vextractps $1, %xmm0, %edi
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq ldexpf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT:    vextractps $2, %xmm0, %edi
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq ldexpf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT:    vextractps $3, %xmm0, %edi
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq ldexpf@PLT
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; CHECK-NEXT:    addq $216, %rsp
+; CHECK-NEXT:    retq
+  %r = call <16 x float> @llvm.ldexp.v16f32.v16i32(<16 x float> %x, <16 x i32> %exp)
+  ret <16 x float> %r
+}
+declare <16 x float> @llvm.ldexp.v16f32.v16i32(<16 x float>, <16 x i32>)
+
+define <8 x double> @test_ldexp_8xdouble(<8 x double> %x, <8 x i32> %exp) nounwind {
+; CHECK-LABEL: test_ldexp_8xdouble:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $184, %rsp
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vextractps $2, %xmm1, %edi
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq ldexp@PLT
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vextractps $3, %xmm0, %edi
+; CHECK-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq ldexp@PLT
+; CHECK-NEXT:    vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT:    vextractf32x4 $2, %zmm0, %xmm1
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vmovd %xmm0, %edi
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq ldexp@PLT
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vextractps $1, %xmm0, %edi
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    callq ldexp@PLT
+; CHECK-NEXT:    vmovapd (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    vextractps $2, %xmm0, %edi
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq ldexp@PLT
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    vextractps $3, %xmm0, %edi
+; CHECK-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq ldexp@PLT
+; CHECK-NEXT:    vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    vmovd %xmm0, %edi
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq ldexp@PLT
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    vextractps $1, %xmm0, %edi
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq ldexp@PLT
+; CHECK-NEXT:    vmovapd (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; CHECK-NEXT:    addq $184, %rsp
+; CHECK-NEXT:    retq
+  %r = call <8 x double> @llvm.ldexp.v8f64.v8i32(<8 x double> %x, <8 x i32> %exp)
+  ret <8 x double> %r
+}
+declare <8 x double> @llvm.ldexp.v8f64.v8i32(<8 x double>, <8 x i32>)
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX512: {{.*}}
+; AVX512VL: {{.*}}
diff --git a/llvm/test/CodeGen/X86/llvm.sincos.vec.ll b/llvm/test/CodeGen/X86/llvm.sincos.vec.ll
index 834dd788ff7fb..9b02438952035 100644
--- a/llvm/test/CodeGen/X86/llvm.sincos.vec.ll
+++ b/llvm/test/CodeGen/X86/llvm.sincos.vec.ll
@@ -1,59 +1,213 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp --version 5
-; RUN: llc < %s -mtriple=i386-unknown-linux-gnu  | FileCheck %s
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu  | FileCheck -check-prefix=X86 %s
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu  | FileCheck -check-prefix=X64 %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.9.0 | FileCheck --check-prefix=MACOS-SINCOS-STRET %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 | FileCheck --check-prefix=MACOS-NOSINCOS-STRET %s
 
 define void @test_sincos_v4f32(<4 x float> %x, ptr noalias %out_sin, ptr noalias %out_cos) nounwind {
-; CHECK-LABEL: test_sincos_v4f32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    pushl %edi
-; CHECK-NEXT:    pushl %esi
-; CHECK-NEXT:    subl $52, %esp
-; CHECK-NEXT:    movl 84(%esp), %esi
-; CHECK-NEXT:    flds 76(%esp)
-; CHECK-NEXT:    fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT:    flds 64(%esp)
-; CHECK-NEXT:    fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT:    flds 72(%esp)
-; CHECK-NEXT:    fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT:    flds 68(%esp)
-; CHECK-NEXT:    movl 80(%esp), %edi
-; CHECK-NEXT:    leal 40(%esp), %eax
-; CHECK-NEXT:    movl %eax, 8(%esp)
-; CHECK-NEXT:    leal 4(%edi), %eax
-; CHECK-NEXT:    movl %eax, 4(%esp)
-; CHECK-NEXT:    fstps (%esp)
-; CHECK-NEXT:    calll sincosf
-; CHECK-NEXT:    leal 44(%esp), %eax
-; CHECK-NEXT:    movl %eax, 8(%esp)
-; CHECK-NEXT:    leal 8(%edi), %eax
-; CHECK-NEXT:    movl %eax, 4(%esp)
-; CHECK-NEXT:    flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; CHECK-NEXT:    fstps (%esp)
-; CHECK-NEXT:    calll sincosf
-; CHECK-NEXT:    leal 36(%esp), %eax
-; CHECK-NEXT:    movl %eax, 8(%esp)
-; CHECK-NEXT:    movl %edi, 4(%esp)
-; CHECK-NEXT:    flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; CHECK-NEXT:    fstps (%esp)
-; CHECK-NEXT:    calll sincosf
-; CHECK-NEXT:    leal 48(%esp), %eax
-; CHECK-NEXT:    movl %eax, 8(%esp)
-; CHECK-NEXT:    addl $12, %edi
-; CHECK-NEXT:    movl %edi, 4(%esp)
-; CHECK-NEXT:    flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; CHECK-NEXT:    fstps (%esp)
-; CHECK-NEXT:    calll sincosf
-; CHECK-NEXT:    flds 36(%esp)
-; CHECK-NEXT:    flds 40(%esp)
-; CHECK-NEXT:    flds 44(%esp)
-; CHECK-NEXT:    flds 48(%esp)
-; CHECK-NEXT:    fstps 12(%esi)
-; CHECK-NEXT:    fstps 8(%esi)
-; CHECK-NEXT:    fstps 4(%esi)
-; CHECK-NEXT:    fstps (%esi)
-; CHECK-NEXT:    addl $52, %esp
-; CHECK-NEXT:    popl %esi
-; CHECK-NEXT:    popl %edi
-; CHECK-NEXT:    retl
+; X86-LABEL: test_sincos_v4f32:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $52, %esp
+; X86-NEXT:    movl 84(%esp), %esi
+; X86-NEXT:    flds 76(%esp)
+; X86-NEXT:    fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    flds 64(%esp)
+; X86-NEXT:    fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    flds 72(%esp)
+; X86-NEXT:    fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    flds 68(%esp)
+; X86-NEXT:    movl 80(%esp), %edi
+; X86-NEXT:    leal 40(%esp), %eax
+; X86-NEXT:    movl %eax, 8(%esp)
+; X86-NEXT:    leal 4(%edi), %eax
+; X86-NEXT:    movl %eax, 4(%esp)
+; X86-NEXT:    fstps (%esp)
+; X86-NEXT:    calll sincosf
+; X86-NEXT:    leal 44(%esp), %eax
+; X86-NEXT:    movl %eax, 8(%esp)
+; X86-NEXT:    leal 8(%edi), %eax
+; X86-NEXT:    movl %eax, 4(%esp)
+; X86-NEXT:    flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    fstps (%esp)
+; X86-NEXT:    calll sincosf
+; X86-NEXT:    leal 36(%esp), %eax
+; X86-NEXT:    movl %eax, 8(%esp)
+; X86-NEXT:    movl %edi, 4(%esp)
+; X86-NEXT:    flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    fstps (%esp)
+; X86-NEXT:    calll sincosf
+; X86-NEXT:    leal 48(%esp), %eax
+; X86-NEXT:    movl %eax, 8(%esp)
+; X86-NEXT:    addl $12, %edi
+; X86-NEXT:    movl %edi, 4(%esp)
+; X86-NEXT:    flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    fstps (%esp)
+; X86-NEXT:    calll sincosf
+; X86-NEXT:    flds 36(%esp)
+; X86-NEXT:    flds 40(%esp)
+; X86-NEXT:    flds 44(%esp)
+; X86-NEXT:    flds 48(%esp)
+; X86-NEXT:    fstps 12(%esi)
+; X86-NEXT:    fstps 8(%esi)
+; X86-NEXT:    fstps 4(%esi)
+; X86-NEXT:    fstps (%esi)
+; X86-NEXT:    addl $52, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_sincos_v4f32:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    subq $56, %rsp
+; X64-NEXT:    movq %rsi, %rbx
+; X64-NEXT:    movq %rdi, %r14
+; X64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; X64-NEXT:    leaq 4(%rsp), %rdi
+; X64-NEXT:    movq %rsp, %rsi
+; X64-NEXT:    callq sincosf@PLT
+; X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; X64-NEXT:    leaq 12(%rsp), %rdi
+; X64-NEXT:    leaq 8(%rsp), %rsi
+; X64-NEXT:    callq sincosf@PLT
+; X64-NEXT:    leaq 28(%rsp), %rdi
+; X64-NEXT:    leaq 24(%rsp), %rsi
+; X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-NEXT:    callq sincosf@PLT
+; X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X64-NEXT:    leaq 20(%rsp), %rdi
+; X64-NEXT:    leaq 16(%rsp), %rsi
+; X64-NEXT:    callq sincosf@PLT
+; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X64-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; X64-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X64-NEXT:    movups %xmm1, (%r14)
+; X64-NEXT:    movups %xmm0, (%rbx)
+; X64-NEXT:    addq $56, %rsp
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    popq %r14
+; X64-NEXT:    retq
+;
+; MACOS-SINCOS-STRET-LABEL: test_sincos_v4f32:
+; MACOS-SINCOS-STRET:       ## %bb.0:
+; MACOS-SINCOS-STRET-NEXT:    pushq %r14
+; MACOS-SINCOS-STRET-NEXT:    pushq %rbx
+; MACOS-SINCOS-STRET-NEXT:    subq $104, %rsp
+; MACOS-SINCOS-STRET-NEXT:    movq %rsi, %rbx
+; MACOS-SINCOS-STRET-NEXT:    movq %rdi, %r14
+; MACOS-SINCOS-STRET-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; MACOS-SINCOS-STRET-NEXT:    callq ___sincosf_stret
+; MACOS-SINCOS-STRET-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; MACOS-SINCOS-STRET-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; MACOS-SINCOS-STRET-NEXT:    callq ___sincosf_stret
+; MACOS-SINCOS-STRET-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; MACOS-SINCOS-STRET-NEXT:    unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload
+; MACOS-SINCOS-STRET-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; MACOS-SINCOS-STRET-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT:    callq ___sincosf_stret
+; MACOS-SINCOS-STRET-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT:    movaps %xmm0, %xmm1
+; MACOS-SINCOS-STRET-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
+; MACOS-SINCOS-STRET-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; MACOS-SINCOS-STRET-NEXT:    callq ___sincosf_stret
+; MACOS-SINCOS-STRET-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; MACOS-SINCOS-STRET-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; MACOS-SINCOS-STRET-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; MACOS-SINCOS-STRET-NEXT:    unpcklpd (%rsp), %xmm2 ## 16-byte Folded Reload
+; MACOS-SINCOS-STRET-NEXT:    ## xmm2 = xmm2[0],mem[0]
+; MACOS-SINCOS-STRET-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; MACOS-SINCOS-STRET-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; MACOS-SINCOS-STRET-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; MACOS-SINCOS-STRET-NEXT:    movups %xmm1, (%r14)
+; MACOS-SINCOS-STRET-NEXT:    movups %xmm2, (%rbx)
+; MACOS-SINCOS-STRET-NEXT:    addq $104, %rsp
+; MACOS-SINCOS-STRET-NEXT:    popq %rbx
+; MACOS-SINCOS-STRET-NEXT:    popq %r14
+; MACOS-SINCOS-STRET-NEXT:    retq
+;
+; MACOS-NOSINCOS-STRET-LABEL: test_sincos_v4f32:
+; MACOS-NOSINCOS-STRET:       ## %bb.0:
+; MACOS-NOSINCOS-STRET-NEXT:    pushq %r14
+; MACOS-NOSINCOS-STRET-NEXT:    pushq %rbx
+; MACOS-NOSINCOS-STRET-NEXT:    subq $104, %rsp
+; MACOS-NOSINCOS-STRET-NEXT:    movq %rsi, %rbx
+; MACOS-NOSINCOS-STRET-NEXT:    movq %rdi, %r14
+; MACOS-NOSINCOS-STRET-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; MACOS-NOSINCOS-STRET-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT:    callq _cosf
+; MACOS-NOSINCOS-STRET-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; MACOS-NOSINCOS-STRET-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT:    callq _cosf
+; MACOS-NOSINCOS-STRET-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; MACOS-NOSINCOS-STRET-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; MACOS-NOSINCOS-STRET-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT:    callq _cosf
+; MACOS-NOSINCOS-STRET-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; MACOS-NOSINCOS-STRET-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT:    callq _cosf
+; MACOS-NOSINCOS-STRET-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; MACOS-NOSINCOS-STRET-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
+; MACOS-NOSINCOS-STRET-NEXT:    ## xmm1 = xmm1[0],mem[0]
+; MACOS-NOSINCOS-STRET-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT:    callq _sinf
+; MACOS-NOSINCOS-STRET-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT:    callq _sinf
+; MACOS-NOSINCOS-STRET-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; MACOS-NOSINCOS-STRET-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; MACOS-NOSINCOS-STRET-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT:    callq _sinf
+; MACOS-NOSINCOS-STRET-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT:    callq _sinf
+; MACOS-NOSINCOS-STRET-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; MACOS-NOSINCOS-STRET-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
+; MACOS-NOSINCOS-STRET-NEXT:    ## xmm1 = xmm1[0],mem[0]
+; MACOS-NOSINCOS-STRET-NEXT:    movups %xmm1, (%r14)
+; MACOS-NOSINCOS-STRET-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT:    movups %xmm0, (%rbx)
+; MACOS-NOSINCOS-STRET-NEXT:    addq $104, %rsp
+; MACOS-NOSINCOS-STRET-NEXT:    popq %rbx
+; MACOS-NOSINCOS-STRET-NEXT:    popq %r14
+; MACOS-NOSINCOS-STRET-NEXT:    retq
   %result = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> %x)
   %result.0 = extractvalue { <4 x float>, <4 x float> } %result, 0
   %result.1 = extractvalue { <4 x float>, <4 x float> } %result, 1
@@ -63,36 +217,120 @@ define void @test_sincos_v4f32(<4 x float> %x, ptr noalias %out_sin, ptr noalias
 }
 
 define void @test_sincos_v2f64(<2 x double> %x, ptr noalias %out_sin, ptr noalias %out_cos) nounwind {
-; CHECK-LABEL: test_sincos_v2f64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    pushl %edi
-; CHECK-NEXT:    pushl %esi
-; CHECK-NEXT:    subl $52, %esp
-; CHECK-NEXT:    movl 84(%esp), %esi
-; CHECK-NEXT:    fldl 72(%esp)
-; CHECK-NEXT:    fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill
-; CHECK-NEXT:    fldl 64(%esp)
-; CHECK-NEXT:    movl 80(%esp), %edi
-; CHECK-NEXT:    leal 24(%esp), %eax
-; CHECK-NEXT:    movl %eax, 12(%esp)
-; CHECK-NEXT:    movl %edi, 8(%esp)
-; CHECK-NEXT:    fstpl (%esp)
-; CHECK-NEXT:    calll sincos
-; CHECK-NEXT:    leal 32(%esp), %eax
-; CHECK-NEXT:    movl %eax, 12(%esp)
-; CHECK-NEXT:    addl $8, %edi
-; CHECK-NEXT:    movl %edi, 8(%esp)
-; CHECK-NEXT:    fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload
-; CHECK-NEXT:    fstpl (%esp)
-; CHECK-NEXT:    calll sincos
-; CHECK-NEXT:    fldl 24(%esp)
-; CHECK-NEXT:    fldl 32(%esp)
-; CHECK-NEXT:    fstpl 8(%esi)
-; CHECK-NEXT:    fstpl (%esi)
-; CHECK-NEXT:    addl $52, %esp
-; CHECK-NEXT:    popl %esi
-; CHECK-NEXT:    popl %edi
-; CHECK-NEXT:    retl
+; X86-LABEL: test_sincos_v2f64:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $52, %esp
+; X86-NEXT:    movl 84(%esp), %esi
+; X86-NEXT:    fldl 72(%esp)
+; X86-NEXT:    fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill
+; X86-NEXT:    fldl 64(%esp)
+; X86-NEXT:    movl 80(%esp), %edi
+; X86-NEXT:    leal 24(%esp), %eax
+; X86-NEXT:    movl %eax, 12(%esp)
+; X86-NEXT:    movl %edi, 8(%esp)
+; X86-NEXT:    fstpl (%esp)
+; X86-NEXT:    calll sincos
+; X86-NEXT:    leal 32(%esp), %eax
+; X86-NEXT:    movl %eax, 12(%esp)
+; X86-NEXT:    addl $8, %edi
+; X86-NEXT:    movl %edi, 8(%esp)
+; X86-NEXT:    fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload
+; X86-NEXT:    fstpl (%esp)
+; X86-NEXT:    calll sincos
+; X86-NEXT:    fldl 24(%esp)
+; X86-NEXT:    fldl 32(%esp)
+; X86-NEXT:    fstpl 8(%esi)
+; X86-NEXT:    fstpl (%esi)
+; X86-NEXT:    addl $52, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_sincos_v2f64:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    subq $56, %rsp
+; X64-NEXT:    movq %rsi, %rbx
+; X64-NEXT:    movq %rdi, %r14
+; X64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT:    leaq 24(%rsp), %rdi
+; X64-NEXT:    leaq 16(%rsp), %rsi
+; X64-NEXT:    callq sincos@PLT
+; X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; X64-NEXT:    leaq 8(%rsp), %rdi
+; X64-NEXT:    movq %rsp, %rsi
+; X64-NEXT:    callq sincos@PLT
+; X64-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; X64-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; X64-NEXT:    movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; X64-NEXT:    movups %xmm1, (%r14)
+; X64-NEXT:    movups %xmm0, (%rbx)
+; X64-NEXT:    addq $56, %rsp
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    popq %r14
+; X64-NEXT:    retq
+;
+; MACOS-SINCOS-STRET-LABEL: test_sincos_v2f64:
+; MACOS-SINCOS-STRET:       ## %bb.0:
+; MACOS-SINCOS-STRET-NEXT:    pushq %r14
+; MACOS-SINCOS-STRET-NEXT:    pushq %rbx
+; MACOS-SINCOS-STRET-NEXT:    subq $56, %rsp
+; MACOS-SINCOS-STRET-NEXT:    movq %rsi, %rbx
+; MACOS-SINCOS-STRET-NEXT:    movq %rdi, %r14
+; MACOS-SINCOS-STRET-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT:    callq ___sincos_stret
+; MACOS-SINCOS-STRET-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; MACOS-SINCOS-STRET-NEXT:    callq ___sincos_stret
+; MACOS-SINCOS-STRET-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; MACOS-SINCOS-STRET-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; MACOS-SINCOS-STRET-NEXT:    movups %xmm1, (%r14)
+; MACOS-SINCOS-STRET-NEXT:    movups %xmm2, (%rbx)
+; MACOS-SINCOS-STRET-NEXT:    addq $56, %rsp
+; MACOS-SINCOS-STRET-NEXT:    popq %rbx
+; MACOS-SINCOS-STRET-NEXT:    popq %r14
+; MACOS-SINCOS-STRET-NEXT:    retq
+;
+; MACOS-NOSINCOS-STRET-LABEL: test_sincos_v2f64:
+; MACOS-NOSINCOS-STRET:       ## %bb.0:
+; MACOS-NOSINCOS-STRET-NEXT:    pushq %r14
+; MACOS-NOSINCOS-STRET-NEXT:    pushq %rbx
+; MACOS-NOSINCOS-STRET-NEXT:    subq $56, %rsp
+; MACOS-NOSINCOS-STRET-NEXT:    movq %rsi, %rbx
+; MACOS-NOSINCOS-STRET-NEXT:    movq %rdi, %r14
+; MACOS-NOSINCOS-STRET-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT:    callq _cos
+; MACOS-NOSINCOS-STRET-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; MACOS-NOSINCOS-STRET-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT:    callq _cos
+; MACOS-NOSINCOS-STRET-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; MACOS-NOSINCOS-STRET-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT:    callq _sin
+; MACOS-NOSINCOS-STRET-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT:    callq _sin
+; MACOS-NOSINCOS-STRET-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; MACOS-NOSINCOS-STRET-NEXT:    movups %xmm1, (%r14)
+; MACOS-NOSINCOS-STRET-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT:    movups %xmm0, (%rbx)
+; MACOS-NOSINCOS-STRET-NEXT:    addq $56, %rsp
+; MACOS-NOSINCOS-STRET-NEXT:    popq %rbx
+; MACOS-NOSINCOS-STRET-NEXT:    popq %r14
+; MACOS-NOSINCOS-STRET-NEXT:    retq
   %result = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> %x)
   %result.0 = extractvalue { <2 x double>, <2 x double> } %result, 0
   %result.1 = extractvalue { <2 x double>, <2 x double> } %result, 1
diff --git a/llvm/test/CodeGen/X86/pr165755.ll b/llvm/test/CodeGen/X86/pr165755.ll
new file mode 100644
index 0000000000000..3ab484f676c45
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr165755.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefixes=X64
+
+define i32 @PR165755(ptr %p0) {
+; X86-LABEL: PR165755:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl (%ecx), %eax
+; X86-NEXT:    movb $0, (%ecx)
+; X86-NEXT:    retl
+;
+; X64-LABEL: PR165755:
+; X64:       # %bb.0:
+; X64-NEXT:    movl (%rdi), %eax
+; X64-NEXT:    movb $0, (%rdi)
+; X64-NEXT:    retq
+  %ld64 = load i64, ptr %p0, align 8
+  store i8 0, ptr %p0, align 1
+  %ld32 = load i32, ptr %p0, align 8
+  %mask = and i32 %ld32, 32
+  %zext = zext i32 %mask to i64
+  %srl = lshr i64 %ld64, %zext
+  %res = trunc i64 %srl to i32
+  ret i32 %res
+}
diff --git a/llvm/test/CodeGen/X86/trunc-srl-load.ll b/llvm/test/CodeGen/X86/trunc-srl-load.ll
index 4dae1433b2196..d9c21d3a3f570 100644
--- a/llvm/test/CodeGen/X86/trunc-srl-load.ll
+++ b/llvm/test/CodeGen/X86/trunc-srl-load.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-unknown                   | FileCheck %s --check-prefixes=X86
-; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64    | FileCheck %s --check-prefixes=X64,SSE
-; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64,SSE
-; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64,AVX,AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64,AVX,AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64    | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64
 
 ; Tests showing for the analysis of non-constant shift amounts to improve load address math
 
@@ -12,42 +12,20 @@
 define i16 @extractSub64_16(ptr %word, i32 %idx) nounwind {
 ; X86-LABEL: extractSub64_16:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    movl 4(%eax), %esi
-; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    andb $16, %cl
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    shrl %cl, %eax
-; X86-NEXT:    shrdl %cl, %esi, %edx
-; X86-NEXT:    testb $32, %ch
-; X86-NEXT:    jne .LBB0_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:  .LBB0_2:
-; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $48, %ecx
+; X86-NEXT:    shrl $3, %ecx
+; X86-NEXT:    movzwl (%eax,%ecx), %eax
 ; X86-NEXT:    retl
 ;
-; SSE-LABEL: extractSub64_16:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movl %esi, %ecx
-; SSE-NEXT:    movq (%rdi), %rax
-; SSE-NEXT:    andb $48, %cl
-; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT:    shrq %cl, %rax
-; SSE-NEXT:    # kill: def $ax killed $ax killed $rax
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: extractSub64_16:
-; AVX:       # %bb.0:
-; AVX-NEXT:    # kill: def $esi killed $esi def $rsi
-; AVX-NEXT:    andb $48, %sil
-; AVX-NEXT:    shrxq %rsi, (%rdi), %rax
-; AVX-NEXT:    # kill: def $ax killed $ax killed $rax
-; AVX-NEXT:    retq
+; X64-LABEL: extractSub64_16:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    andl $48, %esi
+; X64-NEXT:    shrl $3, %esi
+; X64-NEXT:    movzwl (%rdi,%rsi), %eax
+; X64-NEXT:    retq
   %idx_bounds = and i32 %idx, 63
   %idx_align = and i32 %idx_bounds, -16
   %sh = zext nneg i32 %idx_align to i64
@@ -60,67 +38,20 @@ define i16 @extractSub64_16(ptr %word, i32 %idx) nounwind {
 define i16 @extractSub128_16(ptr %word, i32 %idx) nounwind {
 ; X86-LABEL: extractSub128_16:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $32, %esp
-; X86-NEXT:    movzbl 12(%ebp), %eax
-; X86-NEXT:    movl 8(%ebp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %esi
-; X86-NEXT:    movl 8(%ecx), %edi
-; X86-NEXT:    movl 12(%ecx), %ecx
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, (%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andb $16, %cl
-; X86-NEXT:    shrb $3, %al
-; X86-NEXT:    andb $12, %al
-; X86-NEXT:    movzbl %al, %edx
-; X86-NEXT:    movl (%esp,%edx), %eax
-; X86-NEXT:    movl 4(%esp,%edx), %edx
-; X86-NEXT:    shrdl %cl, %edx, %eax
-; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    leal -8(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $112, %ecx
+; X86-NEXT:    shrl $3, %ecx
+; X86-NEXT:    movzwl (%eax,%ecx), %eax
 ; X86-NEXT:    retl
 ;
-; SSE-LABEL: extractSub128_16:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movq (%rdi), %rax
-; SSE-NEXT:    movq 8(%rdi), %rdx
-; SSE-NEXT:    movl %esi, %ecx
-; SSE-NEXT:    andb $48, %cl
-; SSE-NEXT:    movq %rdx, %rdi
-; SSE-NEXT:    shrq %cl, %rdi
-; SSE-NEXT:    shrdq %cl, %rdx, %rax
-; SSE-NEXT:    testb $64, %sil
-; SSE-NEXT:    cmovneq %rdi, %rax
-; SSE-NEXT:    # kill: def $ax killed $ax killed $rax
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: extractSub128_16:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movq (%rdi), %rdx
-; AVX-NEXT:    movq 8(%rdi), %rax
-; AVX-NEXT:    movl %esi, %ecx
-; AVX-NEXT:    andb $48, %cl
-; AVX-NEXT:    shrdq %cl, %rax, %rdx
-; AVX-NEXT:    shrxq %rcx, %rax, %rax
-; AVX-NEXT:    testb $64, %sil
-; AVX-NEXT:    cmoveq %rdx, %rax
-; AVX-NEXT:    # kill: def $ax killed $ax killed $rax
-; AVX-NEXT:    retq
+; X64-LABEL: extractSub128_16:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    andl $112, %esi
+; X64-NEXT:    shrl $3, %esi
+; X64-NEXT:    movzwl (%rdi,%rsi), %eax
+; X64-NEXT:    retq
   %idx_bounds = and i32 %idx, 127
   %idx_align = and i32 %idx_bounds, -16
   %sh = zext nneg i32 %idx_align to i128
@@ -133,62 +64,20 @@ define i16 @extractSub128_16(ptr %word, i32 %idx) nounwind {
 define i32 @extractSub128_32(ptr %word, i32 %idx) nounwind {
 ; X86-LABEL: extractSub128_32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $32, %esp
-; X86-NEXT:    movzbl 12(%ebp), %eax
-; X86-NEXT:    movl 8(%ebp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %esi
-; X86-NEXT:    movl 8(%ecx), %edi
-; X86-NEXT:    movl 12(%ecx), %ecx
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, (%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    andb $96, %al
-; X86-NEXT:    shrb $3, %al
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    movl (%esp,%eax), %eax
-; X86-NEXT:    leal -8(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $96, %ecx
+; X86-NEXT:    shrl $3, %ecx
+; X86-NEXT:    movl (%eax,%ecx), %eax
 ; X86-NEXT:    retl
 ;
-; SSE-LABEL: extractSub128_32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movq (%rdi), %rax
-; SSE-NEXT:    movq 8(%rdi), %rdx
-; SSE-NEXT:    movl %esi, %ecx
-; SSE-NEXT:    andb $32, %cl
-; SSE-NEXT:    movq %rdx, %rdi
-; SSE-NEXT:    shrq %cl, %rdi
-; SSE-NEXT:    shrdq %cl, %rdx, %rax
-; SSE-NEXT:    testb $64, %sil
-; SSE-NEXT:    cmovneq %rdi, %rax
-; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: extractSub128_32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movq (%rdi), %rdx
-; AVX-NEXT:    movq 8(%rdi), %rax
-; AVX-NEXT:    movl %esi, %ecx
-; AVX-NEXT:    andb $32, %cl
-; AVX-NEXT:    shrdq %cl, %rax, %rdx
-; AVX-NEXT:    shrxq %rcx, %rax, %rax
-; AVX-NEXT:    testb $64, %sil
-; AVX-NEXT:    cmoveq %rdx, %rax
-; AVX-NEXT:    # kill: def $eax killed $eax killed $rax
-; AVX-NEXT:    retq
+; X64-LABEL: extractSub128_32:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    andl $96, %esi
+; X64-NEXT:    shrl $3, %esi
+; X64-NEXT:    movl (%rdi,%rsi), %eax
+; X64-NEXT:    retq
   %idx_bounds = and i32 %idx, 127
   %idx_align = and i32 %idx_bounds, -32
   %sh = zext nneg i32 %idx_align to i128
@@ -201,46 +90,20 @@ define i32 @extractSub128_32(ptr %word, i32 %idx) nounwind {
 define i64 @extractSub128_64(ptr %word, i32 %idx) nounwind {
 ; X86-LABEL: extractSub128_64:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $32, %esp
-; X86-NEXT:    movzbl 12(%ebp), %eax
-; X86-NEXT:    movl 8(%ebp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %esi
-; X86-NEXT:    movl 8(%ecx), %edi
-; X86-NEXT:    movl 12(%ecx), %ecx
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, (%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    andb $64, %al
-; X86-NEXT:    shrb $3, %al
-; X86-NEXT:    movzbl %al, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %eax
-; X86-NEXT:    movl 4(%esp,%ecx), %edx
-; X86-NEXT:    leal -8(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    andl $64, %edx
+; X86-NEXT:    shrl $3, %edx
+; X86-NEXT:    movl (%ecx,%edx), %eax
+; X86-NEXT:    movl 4(%ecx,%edx), %edx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: extractSub128_64:
 ; X64:       # %bb.0:
-; X64-NEXT:    testb $64, %sil
-; X64-NEXT:    je .LBB3_1
-; X64-NEXT:  # %bb.2:
-; X64-NEXT:    movq 8(%rdi), %rax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB3_1:
-; X64-NEXT:    movq (%rdi), %rax
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    andl $64, %esi
+; X64-NEXT:    shrl $3, %esi
+; X64-NEXT:    movq (%rdi,%rsi), %rax
 ; X64-NEXT:    retq
   %idx_bounds = and i32 %idx, 127
   %idx_align = and i32 %idx_bounds, -64
@@ -254,185 +117,20 @@ define i64 @extractSub128_64(ptr %word, i32 %idx) nounwind {
 define i8 @extractSub512_8(ptr %word, i32 %idx) nounwind {
 ; X86-LABEL: extractSub512_8:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $192, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl (%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 4(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 8(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 12(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 16(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 20(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 24(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 28(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 32(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 36(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 40(%eax), %ebx
-; X86-NEXT:    movl 44(%eax), %edi
-; X86-NEXT:    movl 48(%eax), %esi
-; X86-NEXT:    movl 52(%eax), %edx
-; X86-NEXT:    movl 56(%eax), %ecx
-; X86-NEXT:    movl 60(%eax), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 12(%ebp), %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    andl $24, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrl $3, %edx
-; X86-NEXT:    andl $60, %edx
-; X86-NEXT:    movl 48(%esp,%edx), %eax
-; X86-NEXT:    movl 52(%esp,%edx), %edx
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    shrdl %cl, %edx, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    leal -12(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shrl $3, %ecx
+; X86-NEXT:    andl $63, %ecx
+; X86-NEXT:    movzbl (%eax,%ecx), %eax
 ; X86-NEXT:    retl
 ;
-; SSE-LABEL: extractSub512_8:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %rax
-; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
-; SSE-NEXT:    movups (%rdi), %xmm0
-; SSE-NEXT:    movups 16(%rdi), %xmm1
-; SSE-NEXT:    movups 32(%rdi), %xmm2
-; SSE-NEXT:    movups 48(%rdi), %xmm3
-; SSE-NEXT:    xorps %xmm4, %xmm4
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movl %esi, %ecx
-; SSE-NEXT:    andl $56, %ecx
-; SSE-NEXT:    shrl $3, %esi
-; SSE-NEXT:    andl $56, %esi
-; SSE-NEXT:    movq -128(%rsp,%rsi), %rdx
-; SSE-NEXT:    shrq %cl, %rdx
-; SSE-NEXT:    movl -120(%rsp,%rsi), %eax
-; SSE-NEXT:    addl %eax, %eax
-; SSE-NEXT:    notl %ecx
-; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT:    shlq %cl, %rax
-; SSE-NEXT:    orl %edx, %eax
-; SSE-NEXT:    # kill: def $al killed $al killed $rax
-; SSE-NEXT:    popq %rcx
-; SSE-NEXT:    retq
-;
-; AVX2-LABEL: extractSub512_8:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    pushq %rax
-; AVX2-NEXT:    # kill: def $esi killed $esi def $rsi
-; AVX2-NEXT:    vmovups (%rdi), %ymm0
-; AVX2-NEXT:    vmovups 32(%rdi), %ymm1
-; AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movl %esi, %ecx
-; AVX2-NEXT:    andl $56, %ecx
-; AVX2-NEXT:    shrl $3, %esi
-; AVX2-NEXT:    andl $56, %esi
-; AVX2-NEXT:    shrxq %rcx, -128(%rsp,%rsi), %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    notl %ecx
-; AVX2-NEXT:    movl -120(%rsp,%rsi), %edx
-; AVX2-NEXT:    addl %edx, %edx
-; AVX2-NEXT:    shlxq %rcx, %rdx, %rcx
-; AVX2-NEXT:    orl %ecx, %eax
-; AVX2-NEXT:    # kill: def $al killed $al killed $rax
-; AVX2-NEXT:    popq %rcx
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: extractSub512_8:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    pushq %rax
-; AVX512-NEXT:    vmovups (%rdi), %ymm0
-; AVX512-NEXT:    vmovups 32(%rdi), %ymm1
-; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT:    # kill: def $esi killed $esi def $rsi
-; AVX512-NEXT:    movl %esi, %ecx
-; AVX512-NEXT:    andl $56, %ecx
-; AVX512-NEXT:    shrl $3, %esi
-; AVX512-NEXT:    andl $56, %esi
-; AVX512-NEXT:    shrxq %rcx, -128(%rsp,%rsi), %rax
-; AVX512-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX512-NEXT:    notl %ecx
-; AVX512-NEXT:    movl -120(%rsp,%rsi), %edx
-; AVX512-NEXT:    addl %edx, %edx
-; AVX512-NEXT:    shlxq %rcx, %rdx, %rcx
-; AVX512-NEXT:    orl %ecx, %eax
-; AVX512-NEXT:    # kill: def $al killed $al killed $rax
-; AVX512-NEXT:    popq %rcx
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; X64-LABEL: extractSub512_8:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    shrl $3, %esi
+; X64-NEXT:    andl $63, %esi
+; X64-NEXT:    movzbl (%rdi,%rsi), %eax
+; X64-NEXT:    retq
   %idx_bounds = and i32 %idx, 511
   %idx_align = and i32 %idx_bounds, -8
   %ld = load i512, ptr %word, align 8
@@ -445,152 +143,21 @@ define i8 @extractSub512_8(ptr %word, i32 %idx) nounwind {
 define i64 @extractSub512_64(ptr %word, i32 %idx) nounwind {
 ; X86-LABEL: extractSub512_64:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $192, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl (%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 4(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 8(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 12(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 16(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 20(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 24(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 28(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 32(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 36(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 40(%eax), %ebx
-; X86-NEXT:    movl 44(%eax), %edi
-; X86-NEXT:    movl 48(%eax), %esi
-; X86-NEXT:    movl 52(%eax), %edx
-; X86-NEXT:    movl 56(%eax), %ecx
-; X86-NEXT:    movl 60(%eax), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 12(%ebp), %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrl $3, %ecx
-; X86-NEXT:    andl $56, %ecx
-; X86-NEXT:    movl 48(%esp,%ecx), %eax
-; X86-NEXT:    movl 52(%esp,%ecx), %edx
-; X86-NEXT:    leal -12(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    shrl $3, %edx
+; X86-NEXT:    andl $56, %edx
+; X86-NEXT:    movl (%ecx,%edx), %eax
+; X86-NEXT:    movl 4(%ecx,%edx), %edx
 ; X86-NEXT:    retl
 ;
-; SSE-LABEL: extractSub512_64:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %rax
-; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
-; SSE-NEXT:    movups (%rdi), %xmm0
-; SSE-NEXT:    movups 16(%rdi), %xmm1
-; SSE-NEXT:    movups 32(%rdi), %xmm2
-; SSE-NEXT:    movups 48(%rdi), %xmm3
-; SSE-NEXT:    xorps %xmm4, %xmm4
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    shrl $3, %esi
-; SSE-NEXT:    andl $56, %esi
-; SSE-NEXT:    movq -128(%rsp,%rsi), %rax
-; SSE-NEXT:    popq %rcx
-; SSE-NEXT:    retq
-;
-; AVX2-LABEL: extractSub512_64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    pushq %rax
-; AVX2-NEXT:    vmovups (%rdi), %ymm0
-; AVX2-NEXT:    vmovups 32(%rdi), %ymm1
-; AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    # kill: def $esi killed $esi def $rsi
-; AVX2-NEXT:    shrl $3, %esi
-; AVX2-NEXT:    andl $56, %esi
-; AVX2-NEXT:    movq -128(%rsp,%rsi), %rax
-; AVX2-NEXT:    popq %rcx
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: extractSub512_64:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    pushq %rax
-; AVX512-NEXT:    vmovups (%rdi), %ymm0
-; AVX512-NEXT:    vmovups 32(%rdi), %ymm1
-; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT:    # kill: def $esi killed $esi def $rsi
-; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT:    shrl $3, %esi
-; AVX512-NEXT:    andl $56, %esi
-; AVX512-NEXT:    movq -128(%rsp,%rsi), %rax
-; AVX512-NEXT:    popq %rcx
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; X64-LABEL: extractSub512_64:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    shrl $3, %esi
+; X64-NEXT:    andl $56, %esi
+; X64-NEXT:    movq (%rdi,%rsi), %rax
+; X64-NEXT:    retq
   %idx_bounds = and i32 %idx, 511
   %idx_align = and i32 %idx_bounds, -64
   %sh = zext nneg i32 %idx_align to i512
@@ -603,143 +170,35 @@ define i64 @extractSub512_64(ptr %word, i32 %idx) nounwind {
 define i128 @extractSub512_128(ptr %word, i32 %idx) nounwind {
 ; X86-LABEL: extractSub512_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $192, %esp
-; X86-NEXT:    movl 12(%ebp), %eax
-; X86-NEXT:    movl (%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 4(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 8(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 12(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 16(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 20(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 24(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 28(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 32(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 36(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 40(%eax), %ebx
-; X86-NEXT:    movl 44(%eax), %edi
-; X86-NEXT:    movl 48(%eax), %esi
-; X86-NEXT:    movl 52(%eax), %edx
-; X86-NEXT:    movl 56(%eax), %ecx
-; X86-NEXT:    movl 60(%eax), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 16(%ebp), %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrl $3, %edi
-; X86-NEXT:    andl $48, %edi
-; X86-NEXT:    movl 48(%esp,%edi), %ecx
-; X86-NEXT:    movl 52(%esp,%edi), %edx
-; X86-NEXT:    movl 56(%esp,%edi), %esi
-; X86-NEXT:    movl 60(%esp,%edi), %edi
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    leal -12(%ebp), %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    shrl $3, %edx
+; X86-NEXT:    andl $48, %edx
+; X86-NEXT:    movl (%ecx,%edx), %esi
+; X86-NEXT:    movl 4(%ecx,%edx), %edi
+; X86-NEXT:    movl 8(%ecx,%edx), %ebx
+; X86-NEXT:    movl 12(%ecx,%edx), %ecx
+; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    movl %ebx, 8(%eax)
+; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    movl %esi, (%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
-; SSE-LABEL: extractSub512_128:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %rax
-; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
-; SSE-NEXT:    movups (%rdi), %xmm0
-; SSE-NEXT:    movups 16(%rdi), %xmm1
-; SSE-NEXT:    movups 32(%rdi), %xmm2
-; SSE-NEXT:    movups 48(%rdi), %xmm3
-; SSE-NEXT:    xorps %xmm4, %xmm4
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    shrl $3, %esi
-; SSE-NEXT:    andl $48, %esi
-; SSE-NEXT:    movq -128(%rsp,%rsi), %rax
-; SSE-NEXT:    movq -120(%rsp,%rsi), %rdx
-; SSE-NEXT:    popq %rcx
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: extractSub512_128:
-; AVX:       # %bb.0:
-; AVX-NEXT:    pushq %rax
-; AVX-NEXT:    # kill: def $esi killed $esi def $rsi
-; AVX-NEXT:    vmovups (%rdi), %ymm0
-; AVX-NEXT:    vmovups 32(%rdi), %ymm1
-; AVX-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT:    shrl $3, %esi
-; AVX-NEXT:    andl $48, %esi
-; AVX-NEXT:    movq -128(%rsp,%rsi), %rax
-; AVX-NEXT:    movq -120(%rsp,%rsi), %rdx
-; AVX-NEXT:    popq %rcx
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
+; X64-LABEL: extractSub512_128:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    shrl $3, %esi
+; X64-NEXT:    andl $48, %esi
+; X64-NEXT:    movq (%rdi,%rsi), %rax
+; X64-NEXT:    movq 8(%rdi,%rsi), %rdx
+; X64-NEXT:    retq
   %idx_bounds = and i32 %idx, 511
   %idx_align = and i32 %idx_bounds, -128
   %sh = zext nneg i32 %idx_align to i512
@@ -752,916 +211,21 @@ define i128 @extractSub512_128(ptr %word, i32 %idx) nounwind {
 define i64 @extractSub4096_64(ptr %word, i32 %idx) nounwind {
 ; X86-LABEL: extractSub4096_64:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $1536, %esp # imm = 0x600
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl 4(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 8(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 12(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 16(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 20(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 24(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 28(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 32(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 36(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 40(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 44(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 48(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 52(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 56(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 60(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 64(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 68(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 72(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 76(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 80(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 84(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 88(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 92(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 96(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 100(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 104(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 108(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 112(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 116(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 120(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 124(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 128(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 132(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 136(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 140(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 144(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 148(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 152(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 156(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 160(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 164(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 168(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 172(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 176(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 180(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 184(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 188(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 192(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 196(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 200(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 204(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 208(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 212(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 216(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 220(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 224(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 228(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 232(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 236(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 240(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 244(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 248(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 252(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 256(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 260(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 264(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 268(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 272(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 276(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 280(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 284(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 288(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 292(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 296(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 300(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 304(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 308(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 312(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 316(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 320(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 324(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 328(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 332(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 336(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 340(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 344(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 348(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 352(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 356(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 360(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 364(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 368(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 372(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 376(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 380(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl (%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 384(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 388(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 392(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 396(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 400(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 404(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 408(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 412(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 416(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 420(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 424(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 428(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 432(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 436(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 440(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 444(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 448(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 452(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 456(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 460(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 464(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 468(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 472(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 476(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 480(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 484(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 488(%eax), %ebx
-; X86-NEXT:    movl 492(%eax), %edi
-; X86-NEXT:    movl 496(%eax), %esi
-; X86-NEXT:    movl 500(%eax), %edx
-; X86-NEXT:    movl 504(%eax), %ecx
-; X86-NEXT:    movl 508(%eax), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $4032, %ecx # imm = 0xFC0
-; X86-NEXT:    andl 12(%ebp), %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrl $3, %ecx
-; X86-NEXT:    movl 496(%esp,%ecx), %eax
-; X86-NEXT:    movl 500(%esp,%ecx), %edx
-; X86-NEXT:    leal -12(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl $4032, %edx # imm = 0xFC0
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    shrl $3, %edx
+; X86-NEXT:    movl (%ecx,%edx), %eax
+; X86-NEXT:    movl 4(%ecx,%edx), %edx
 ; X86-NEXT:    retl
 ;
-; SSE-LABEL: extractSub4096_64:
-; SSE:       # %bb.0:
-; SSE-NEXT:    subq $1176, %rsp # imm = 0x498
-; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
-; SSE-NEXT:    movups (%rdi), %xmm0
-; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movups 16(%rdi), %xmm0
-; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movups 32(%rdi), %xmm0
-; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movups 48(%rdi), %xmm0
-; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movups 64(%rdi), %xmm0
-; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movups 80(%rdi), %xmm0
-; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movups 96(%rdi), %xmm0
-; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movups 112(%rdi), %xmm0
-; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movups 128(%rdi), %xmm0
-; SSE-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; SSE-NEXT:    movups 144(%rdi), %xmm0
-; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movups 160(%rdi), %xmm0
-; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movups 176(%rdi), %xmm0
-; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movups 192(%rdi), %xmm0
-; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movups 208(%rdi), %xmm0
-; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movups 224(%rdi), %xmm0
-; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movups 240(%rdi), %xmm0
-; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movups 256(%rdi), %xmm0
-; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movups 272(%rdi), %xmm15
-; SSE-NEXT:    movups 288(%rdi), %xmm14
-; SSE-NEXT:    movups 304(%rdi), %xmm13
-; SSE-NEXT:    movups 320(%rdi), %xmm12
-; SSE-NEXT:    movups 336(%rdi), %xmm11
-; SSE-NEXT:    movups 352(%rdi), %xmm10
-; SSE-NEXT:    movups 368(%rdi), %xmm9
-; SSE-NEXT:    movups 384(%rdi), %xmm8
-; SSE-NEXT:    movups 400(%rdi), %xmm7
-; SSE-NEXT:    movups 416(%rdi), %xmm6
-; SSE-NEXT:    movups 432(%rdi), %xmm5
-; SSE-NEXT:    movups 448(%rdi), %xmm4
-; SSE-NEXT:    movups 464(%rdi), %xmm3
-; SSE-NEXT:    movups 480(%rdi), %xmm2
-; SSE-NEXT:    movups 496(%rdi), %xmm1
-; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm3, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm5, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm6, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm7, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm8, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm9, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm10, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm11, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm12, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm13, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm14, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm15, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    andl $4032, %esi # imm = 0xFC0
-; SSE-NEXT:    shrl $3, %esi
-; SSE-NEXT:    movq 144(%rsp,%rsi), %rax
-; SSE-NEXT:    addq $1176, %rsp # imm = 0x498
-; SSE-NEXT:    retq
-;
-; AVX2-LABEL: extractSub4096_64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    subq $936, %rsp # imm = 0x3A8
-; AVX2-NEXT:    vmovups (%rdi), %ymm0
-; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovups 32(%rdi), %ymm1
-; AVX2-NEXT:    vmovups 64(%rdi), %ymm2
-; AVX2-NEXT:    vmovups 96(%rdi), %ymm3
-; AVX2-NEXT:    vmovups 128(%rdi), %ymm4
-; AVX2-NEXT:    vmovups 160(%rdi), %ymm5
-; AVX2-NEXT:    vmovups 192(%rdi), %ymm6
-; AVX2-NEXT:    vmovups 224(%rdi), %ymm7
-; AVX2-NEXT:    vmovups 256(%rdi), %ymm8
-; AVX2-NEXT:    vmovups 288(%rdi), %ymm9
-; AVX2-NEXT:    vmovups 320(%rdi), %ymm10
-; AVX2-NEXT:    vmovups 352(%rdi), %ymm11
-; AVX2-NEXT:    vmovups 384(%rdi), %ymm12
-; AVX2-NEXT:    vmovups 416(%rdi), %ymm13
-; AVX2-NEXT:    vmovups 448(%rdi), %ymm14
-; AVX2-NEXT:    vmovups 480(%rdi), %ymm15
-; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm15, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm14, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm13, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm12, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm11, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm10, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm9, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm8, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm7, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm6, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm5, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm4, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm3, (%rsp)
-; AVX2-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    # kill: def $esi killed $esi def $rsi
-; AVX2-NEXT:    andl $4032, %esi # imm = 0xFC0
-; AVX2-NEXT:    shrl $3, %esi
-; AVX2-NEXT:    movq -96(%rsp,%rsi), %rax
-; AVX2-NEXT:    addq $936, %rsp # imm = 0x3A8
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: extractSub4096_64:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    subq $904, %rsp # imm = 0x388
-; AVX512-NEXT:    # kill: def $esi killed $esi def $rsi
-; AVX512-NEXT:    vmovups (%rdi), %ymm0
-; AVX512-NEXT:    vmovups 32(%rdi), %ymm1
-; AVX512-NEXT:    vmovups 64(%rdi), %ymm2
-; AVX512-NEXT:    vmovups 96(%rdi), %ymm3
-; AVX512-NEXT:    vmovups 128(%rdi), %ymm4
-; AVX512-NEXT:    vmovups 160(%rdi), %ymm5
-; AVX512-NEXT:    vmovups 192(%rdi), %ymm6
-; AVX512-NEXT:    vmovups 224(%rdi), %ymm7
-; AVX512-NEXT:    vmovups 256(%rdi), %ymm8
-; AVX512-NEXT:    vmovups 288(%rdi), %ymm9
-; AVX512-NEXT:    vmovups 320(%rdi), %ymm10
-; AVX512-NEXT:    vmovups 352(%rdi), %ymm11
-; AVX512-NEXT:    vmovups 384(%rdi), %ymm12
-; AVX512-NEXT:    vmovups 416(%rdi), %ymm13
-; AVX512-NEXT:    andl $4032, %esi # imm = 0xFC0
-; AVX512-NEXT:    vmovups 448(%rdi), %ymm14
-; AVX512-NEXT:    vmovups 480(%rdi), %ymm15
-; AVX512-NEXT:    vxorps %xmm16, %xmm16, %xmm16
-; AVX512-NEXT:    vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm15, {{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm14, {{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm13, {{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm12, {{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm11, {{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm10, {{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm9, {{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm8, {{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm7, {{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm6, {{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm5, {{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm4, (%rsp)
-; AVX512-NEXT:    vmovups %ymm3, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT:    shrl $3, %esi
-; AVX512-NEXT:    movq -128(%rsp,%rsi), %rax
-; AVX512-NEXT:    addq $904, %rsp # imm = 0x388
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; X64-LABEL: extractSub4096_64:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    andl $4032, %esi # imm = 0xFC0
+; X64-NEXT:    shrl $3, %esi
+; X64-NEXT:    movq (%rdi,%rsi), %rax
+; X64-NEXT:    retq
   %idx_bounds = and i32 %idx, 4095
   %idx_align = and i32 %idx_bounds, -64
   %sh = zext nneg i32 %idx_align to i4096
diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
index ab1feba98b008..044327d94c0ef 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
@@ -875,28 +875,12 @@ define i1 @mask_v8i32(<8 x i32> %a0) {
 ; SSE41-NEXT:    sete %al
 ; SSE41-NEXT:    retq
 ;
-; AVX1-LABEL: mask_v8i32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0
-; AVX1-NEXT:    sete %al
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: mask_v8i32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456]
-; AVX2-NEXT:    vptest %ymm1, %ymm0
-; AVX2-NEXT:    sete %al
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: mask_v8i32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456]
-; AVX512-NEXT:    vptest %ymm1, %ymm0
-; AVX512-NEXT:    sete %al
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX-LABEL: mask_v8i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vtestps %ymm0, %ymm0
+; AVX-NEXT:    sete %al
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
   %1 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a0)
   %2 = and i32 %1, 2147483648
   %3 = icmp eq i32 %2, 0
@@ -965,33 +949,46 @@ define i1 @signtest_v8i32(<8 x i32> %a0) {
 ; SSE41-NEXT:    sete %al
 ; SSE41-NEXT:    retq
 ;
-; AVX1-LABEL: signtest_v8i32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0
-; AVX1-NEXT:    sete %al
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: signtest_v8i32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456]
-; AVX2-NEXT:    vptest %ymm1, %ymm0
-; AVX2-NEXT:    sete %al
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: signtest_v8i32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456]
-; AVX512-NEXT:    vptest %ymm1, %ymm0
-; AVX512-NEXT:    sete %al
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX-LABEL: signtest_v8i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vtestps %ymm0, %ymm0
+; AVX-NEXT:    sete %al
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
   %1 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a0)
   %2 = icmp sgt i32 %1, -1
   ret i1 %2
 }
 
+define i1 @signtest_v4i64(<4 x i64> %a0) {
+; SSE2-LABEL: signtest_v4i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    movq %xmm1, %rax
+; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    setns %al
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: signtest_v4i64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    sete %al
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: signtest_v4i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vtestpd %ymm0, %ymm0
+; AVX-NEXT:    sete %al
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+  %1 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %a0)
+  %2 = icmp sgt i64 %1, -1
+  ret i1 %2
+}
+
 define i1 @trunc_v16i16(<16 x i16> %a0) {
 ; SSE2-LABEL: trunc_v16i16:
 ; SSE2:       # %bb.0:
@@ -1162,11 +1159,11 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) {
 ; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    orl %ecx, %eax
 ; SSE2-NEXT:    testb $1, %al
-; SSE2-NEXT:    je .LBB29_2
+; SSE2-NEXT:    je .LBB30_2
 ; SSE2-NEXT:  # %bb.1:
 ; SSE2-NEXT:    xorl %eax, %eax
 ; SSE2-NEXT:    retq
-; SSE2-NEXT:  .LBB29_2:
+; SSE2-NEXT:  .LBB30_2:
 ; SSE2-NEXT:    movl $1, %eax
 ; SSE2-NEXT:    retq
 ;
@@ -1181,11 +1178,11 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) {
 ; SSE41-NEXT:    pextrd $2, %xmm1, %eax
 ; SSE41-NEXT:    orl %ecx, %eax
 ; SSE41-NEXT:    testb $1, %al
-; SSE41-NEXT:    je .LBB29_2
+; SSE41-NEXT:    je .LBB30_2
 ; SSE41-NEXT:  # %bb.1:
 ; SSE41-NEXT:    xorl %eax, %eax
 ; SSE41-NEXT:    retq
-; SSE41-NEXT:  .LBB29_2:
+; SSE41-NEXT:  .LBB30_2:
 ; SSE41-NEXT:    movl $1, %eax
 ; SSE41-NEXT:    retq
 ;
@@ -1200,11 +1197,11 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) {
 ; AVX1OR2-NEXT:    vpextrd $2, %xmm0, %eax
 ; AVX1OR2-NEXT:    orl %ecx, %eax
 ; AVX1OR2-NEXT:    testb $1, %al
-; AVX1OR2-NEXT:    je .LBB29_2
+; AVX1OR2-NEXT:    je .LBB30_2
 ; AVX1OR2-NEXT:  # %bb.1:
 ; AVX1OR2-NEXT:    xorl %eax, %eax
 ; AVX1OR2-NEXT:    retq
-; AVX1OR2-NEXT:  .LBB29_2:
+; AVX1OR2-NEXT:  .LBB30_2:
 ; AVX1OR2-NEXT:    movl $1, %eax
 ; AVX1OR2-NEXT:    retq
 ;
@@ -1219,12 +1216,12 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) {
 ; AVX512F-NEXT:    korw %k0, %k1, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
 ; AVX512F-NEXT:    testb $1, %al
-; AVX512F-NEXT:    je .LBB29_2
+; AVX512F-NEXT:    je .LBB30_2
 ; AVX512F-NEXT:  # %bb.1:
 ; AVX512F-NEXT:    xorl %eax, %eax
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
-; AVX512F-NEXT:  .LBB29_2:
+; AVX512F-NEXT:  .LBB30_2:
 ; AVX512F-NEXT:    movl $1, %eax
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -1240,12 +1237,12 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) {
 ; AVX512BW-NEXT:    korw %k0, %k1, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    testb $1, %al
-; AVX512BW-NEXT:    je .LBB29_2
+; AVX512BW-NEXT:    je .LBB30_2
 ; AVX512BW-NEXT:  # %bb.1:
 ; AVX512BW-NEXT:    xorl %eax, %eax
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-; AVX512BW-NEXT:  .LBB29_2:
+; AVX512BW-NEXT:  .LBB30_2:
 ; AVX512BW-NEXT:    movl $1, %eax
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -1259,11 +1256,11 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) {
 ; AVX512BWVL-NEXT:    korw %k0, %k1, %k0
 ; AVX512BWVL-NEXT:    kmovd %k0, %eax
 ; AVX512BWVL-NEXT:    testb $1, %al
-; AVX512BWVL-NEXT:    je .LBB29_2
+; AVX512BWVL-NEXT:    je .LBB30_2
 ; AVX512BWVL-NEXT:  # %bb.1:
 ; AVX512BWVL-NEXT:    xorl %eax, %eax
 ; AVX512BWVL-NEXT:    retq
-; AVX512BWVL-NEXT:  .LBB29_2:
+; AVX512BWVL-NEXT:  .LBB30_2:
 ; AVX512BWVL-NEXT:    movl $1, %eax
 ; AVX512BWVL-NEXT:    retq
   %1 = icmp ne <3 x i32> %a, %b
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
index 81c4d5d71084c..c3054a365c466 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
@@ -962,39 +962,22 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 }
 
 define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-NO-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2:       # %bb.0:
-; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
-; X64-NO-BMI2-NEXT:    xorps %xmm1, %xmm1
-; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
-; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movl %ecx, %eax
-; X64-NO-BMI2-NEXT:    shrb $6, %al
-; X64-NO-BMI2-NEXT:    movzbl %al, %eax
-; X64-NO-BMI2-NEXT:    movq -72(%rsp,%rax,8), %rax
-; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NO-BMI2-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-NEXT:    movb %al, (%rdx)
-; X64-NO-BMI2-NEXT:    retq
-;
-; X64-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    movups (%rdi), %xmm0
-; X64-BMI2-NEXT:    xorps %xmm1, %xmm1
-; X64-BMI2-NEXT:    shll $3, %esi
-; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movl %esi, %eax
-; X64-BMI2-NEXT:    shrb $6, %al
-; X64-BMI2-NEXT:    movzbl %al, %eax
-; X64-BMI2-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rax
-; X64-BMI2-NEXT:    movb %al, (%rdx)
-; X64-BMI2-NEXT:    retq
+; X64-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64:       # %bb.0:
+; X64-NEXT:    movups (%rdi), %xmm0
+; X64-NEXT:    xorps %xmm1, %xmm1
+; X64-NEXT:    leal (,%rsi,8), %eax
+; X64-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    shrb $6, %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    leaq -72(%rsp,%rax,8), %rax
+; X64-NEXT:    andl $7, %esi
+; X64-NEXT:    movzbl (%rsi,%rax), %eax
+; X64-NEXT:    movb %al, (%rdx)
+; X64-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
@@ -3417,7 +3400,6 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; ALL: {{.*}}
-; X64: {{.*}}
 ; X64-NO-SHLD: {{.*}}
 ; X86: {{.*}}
 ; X86-HAVE-BMI2-HAVE-SHLD: {{.*}}
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
index 8d36eef952a2b..84c2cc6d5ec31 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
@@ -1220,41 +1220,23 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; no @load_16byte_chunk_of_16byte_alloca
 
 define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-NO-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca:
-; X64-NO-BMI2:       # %bb.0:
-; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
-; X64-NO-BMI2-NEXT:    movups 16(%rdi), %xmm1
-; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
-; X64-NO-BMI2-NEXT:    xorps %xmm2, %xmm2
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movl %ecx, %eax
-; X64-NO-BMI2-NEXT:    shrb $6, %al
-; X64-NO-BMI2-NEXT:    movzbl %al, %eax
-; X64-NO-BMI2-NEXT:    movq -72(%rsp,%rax,8), %rax
-; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NO-BMI2-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-NEXT:    movb %al, (%rdx)
-; X64-NO-BMI2-NEXT:    retq
-;
-; X64-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca:
-; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    movups (%rdi), %xmm0
-; X64-BMI2-NEXT:    movups 16(%rdi), %xmm1
-; X64-BMI2-NEXT:    shll $3, %esi
-; X64-BMI2-NEXT:    xorps %xmm2, %xmm2
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movl %esi, %eax
-; X64-BMI2-NEXT:    shrb $6, %al
-; X64-BMI2-NEXT:    movzbl %al, %eax
-; X64-BMI2-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rax
-; X64-BMI2-NEXT:    movb %al, (%rdx)
-; X64-BMI2-NEXT:    retq
+; X64-LABEL: load_1byte_chunk_of_32byte_alloca:
+; X64:       # %bb.0:
+; X64-NEXT:    movups (%rdi), %xmm0
+; X64-NEXT:    movups 16(%rdi), %xmm1
+; X64-NEXT:    leal (,%rsi,8), %eax
+; X64-NEXT:    xorps %xmm2, %xmm2
+; X64-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    shrb $6, %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    leaq -72(%rsp,%rax,8), %rax
+; X64-NEXT:    andl $7, %esi
+; X64-NEXT:    movzbl (%rsi,%rax), %eax
+; X64-NEXT:    movb %al, (%rdx)
+; X64-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
@@ -2156,7 +2138,6 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
 ; no @load_32byte_chunk_of_32byte_alloca
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; ALL: {{.*}}
-; X64: {{.*}}
 ; X64-NO-SHLD: {{.*}}
 ; X86: {{.*}}
 ; X86-NO-SHLD: {{.*}}
diff --git a/llvm/test/DebugInfo/Generic/objc-property.ll b/llvm/test/DebugInfo/Generic/objc-property.ll
new file mode 100644
index 0000000000000..1ee792941bcbb
--- /dev/null
+++ b/llvm/test/DebugInfo/Generic/objc-property.ll
@@ -0,0 +1,94 @@
+; UNSUPPORTED:  target={{.*}}-aix{{.*}}
+;
+; RUN: llc -filetype=obj -o - %s | llvm-dwarfdump --debug-info - | FileCheck %s
+
+; CHECK: DW_TAG_structure_type
+; CHECK:   DW_AT_name ("Foo")
+;
+; CHECK:   0x[[AUTO_SYNTH:[0-9a-f]+]]: DW_TAG_APPLE_property
+; CHECK:     DW_AT_APPLE_property_name ("autoSynthProp")
+; CHECK:     DW_AT_APPLE_property_attribute
+; CHECK-SAME: DW_APPLE_PROPERTY_assign, DW_APPLE_PROPERTY_readwrite,
+; CHECK-SAME: DW_APPLE_PROPERTY_atomic, DW_APPLE_PROPERTY_unsafe_unretained
+;
+; CHECK:   0x[[SYNTH:[0-9a-f]+]]: DW_TAG_APPLE_property
+; CHECK:     DW_AT_APPLE_property_name ("synthProp")
+; CHECK:     DW_AT_APPLE_property_attribute
+; CHECK-SAME: DW_APPLE_PROPERTY_assign, DW_APPLE_PROPERTY_readwrite,
+; CHECK-SAME: DW_APPLE_PROPERTY_atomic, DW_APPLE_PROPERTY_unsafe_unretained
+;
+; CHECK:   0x[[GET:[0-9a-f]+]]: DW_TAG_APPLE_property
+; CHECK:     DW_AT_APPLE_property_name ("customGetterProp")
+; CHECK:     DW_AT_APPLE_property_getter   ("customGetter")
+; CHECK:     DW_AT_APPLE_property_attribute
+; CHECK-SAME: DW_APPLE_PROPERTY_getter, DW_APPLE_PROPERTY_assign, DW_APPLE_PROPERTY_readwrite,
+; CHECK-SAME: DW_APPLE_PROPERTY_atomic, DW_APPLE_PROPERTY_unsafe_unretained
+;
+; CHECK:   0x[[SET:[0-9a-f]+]]: DW_TAG_APPLE_property
+; CHECK:     DW_AT_APPLE_property_name ("customSetterProp")
+; CHECK:     DW_AT_APPLE_property_setter   ("customSetter:")
+; CHECK:     DW_AT_APPLE_property_attribute
+; CHECK-SAME: DW_APPLE_PROPERTY_assign, DW_APPLE_PROPERTY_readwrite,
+; CHECK-SAME: DW_APPLE_PROPERTY_setter, DW_APPLE_PROPERTY_atomic, DW_APPLE_PROPERTY_unsafe_unretained
+;
+; CHECK:   0x[[ACCESSORS:[0-9a-f]+]]: DW_TAG_APPLE_property
+; CHECK:     DW_AT_APPLE_property_name ("customAccessorsProp")
+; CHECK:     DW_AT_APPLE_property_getter   ("customGetter")
+; CHECK:     DW_AT_APPLE_property_setter   ("customSetter:")
+; CHECK:     DW_AT_APPLE_property_attribute
+; CHECK-SAME: DW_APPLE_PROPERTY_getter, DW_APPLE_PROPERTY_assign, DW_APPLE_PROPERTY_readwrite,
+; CHECK-SAME: DW_APPLE_PROPERTY_setter, DW_APPLE_PROPERTY_atomic, DW_APPLE_PROPERTY_unsafe_unretained
+;
+; CHECK:   DW_TAG_member
+; CHECK:     DW_AT_name ("someBackingIvar")
+; CHECK:     DW_AT_APPLE_property (0x[[SYNTH]] "synthProp")
+;
+; CHECK:   DW_TAG_member
+; CHECK:     DW_AT_name ("_autoSynthProp")
+; CHECK:     DW_AT_APPLE_property (0x[[AUTO_SYNTH]] "autoSynthProp")
+;
+; CHECK:   DW_TAG_member
+; CHECK:     DW_AT_name ("_customGetterProp")
+; CHECK:     DW_AT_APPLE_property (0x[[GET]] "customGetterProp")
+;
+; CHECK:   DW_TAG_member
+; CHECK:     DW_AT_name ("_customSetterProp")
+; CHECK:     DW_AT_APPLE_property (0x[[SET]] "customSetterProp")
+
+!llvm.module.flags = !{!0, !1}
+!llvm.dbg.cu = !{!2}
+
+!0 = !{i32 7, !"Dwarf Version", i32 5}
+!1 = !{i32 2, !"Debug Info Version", i32 3}
+!2 = distinct !DICompileUnit(language: DW_LANG_ObjC, file: !3, producer: "hand written", isOptimized: false, runtimeVersion: 2, emissionKind: FullDebug, retainedTypes: !4, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: Apple)
+!3 = !DIFile(filename: "main.m", directory: "/tmp")
+!4 = !{!5}
+!5 = !DICompositeType(tag: DW_TAG_structure_type, name: "Foo", scope: !3, file: !3, line: 1, size: 128, flags: DIFlagObjcClassComplete, elements: !6, runtimeLang: DW_LANG_ObjC)
+!6 = !{!7, !9, !10, !11, !12, !13, !14, !15, !16, !17, !24, !27, !28, !29, !30, !31, !32}
+!7 = !DIObjCProperty(name: "autoSynthProp", file: !3, line: 5, attributes: 2316, type: !8)
+!8 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!9 = !DIObjCProperty(name: "synthProp", file: !3, line: 6, attributes: 2316, type: !8)
+!10 = !DIObjCProperty(name: "customGetterProp", file: !3, line: 7, getter: "customGetter", attributes: 2318, type: !8)
+!11 = !DIObjCProperty(name: "customSetterProp", file: !3, line: 8, setter: "customSetter:", attributes: 2444, type: !8)
+!12 = !DIObjCProperty(name: "customAccessorsProp", file: !3, line: 9, setter: "customSetter:", getter: "customGetter", attributes: 2446, type: !8)
+!13 = !DIDerivedType(tag: DW_TAG_member, name: "someBackingIvar", scope: !3, file: !3, line: 2, baseType: !8, size: 32, flags: DIFlagProtected, extraData: !9)
+!14 = !DIDerivedType(tag: DW_TAG_member, name: "_autoSynthProp", scope: !3, file: !3, line: 5, baseType: !8, size: 32, flags: DIFlagPrivate, extraData: !7)
+!15 = !DIDerivedType(tag: DW_TAG_member, name: "_customGetterProp", scope: !3, file: !3, line: 7, baseType: !8, size: 32, flags: DIFlagPrivate, extraData: !10)
+!16 = !DIDerivedType(tag: DW_TAG_member, name: "_customSetterProp", scope: !3, file: !3, line: 8, baseType: !8, size: 32, flags: DIFlagPrivate, extraData: !11)
+!17 = !DISubprogram(name: "-[Foo customGetter]", scope: !5, file: !3, line: 19, type: !18, scopeLine: 19, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit)
+!18 = !DISubroutineType(types: !19)
+!19 = !{!8, !20, !21}
+!20 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !5, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer)
+!21 = !DIDerivedType(tag: DW_TAG_typedef, name: "SEL", file: !3, baseType: !22, flags: DIFlagArtificial)
+!22 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !23, size: 64)
+!23 = !DICompositeType(tag: DW_TAG_structure_type, name: "objc_selector", file: !3, flags: DIFlagFwdDecl)
+!24 = !DISubprogram(name: "-[Foo customSetter:]", scope: !5, file: !3, line: 23, type: !25, scopeLine: 23, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit)
+!25 = !DISubroutineType(types: !26)
+!26 = !{null, !20, !21, !8}
+!27 = !DISubprogram(name: "-[Foo synthProp]", scope: !5, file: !3, line: 17, type: !18, scopeLine: 17, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit)
+!28 = !DISubprogram(name: "-[Foo setSynthProp:]", scope: !5, file: !3, line: 17, type: !25, scopeLine: 17, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit)
+!29 = !DISubprogram(name: "-[Foo autoSynthProp]", scope: !5, file: !3, line: 5, type: !18, scopeLine: 5, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit)
+!30 = !DISubprogram(name: "-[Foo setAutoSynthProp:]", scope: !5, file: !3, line: 5, type: !25, scopeLine: 5, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit)
+!31 = !DISubprogram(name: "-[Foo setCustomGetterProp:]", scope: !5, file: !3, line: 7, type: !25, scopeLine: 7, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit)
+!32 = !DISubprogram(name: "-[Foo customSetterProp]", scope: !5, file: !3, line: 8, type: !18, scopeLine: 8, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit)
+
diff --git a/llvm/test/DebugInfo/PDB/Native/pdb-native-index-overflow.test b/llvm/test/DebugInfo/PDB/Native/pdb-native-index-overflow.test
new file mode 100755
index 0000000000000..aa3f6dcb9632a
--- /dev/null
+++ b/llvm/test/DebugInfo/PDB/Native/pdb-native-index-overflow.test
@@ -0,0 +1,13 @@
+; Test that the native PDB reader isn't crashed by index value bigger than
+; number of types in TPI or IPI stream
+; RUN: llvm-pdbutil dump %p/../Inputs/empty.pdb --type-index=20000000\
+; RUN:   | FileCheck -check-prefixes=TYPES,NOT_FOUND %s
+; RUN: llvm-pdbutil dump %p/../Inputs/empty.pdb --id-index=20000000\
+; RUN:   | FileCheck -check-prefixes=IDS,NOT_FOUND %s
+
+TYPES:                     Types (TPI Stream)
+IDS:                       Types (IPI Stream)
+NOT_FOUND:============================================================
+NOT_FOUND:  Showing 1 records.
+NOT_FOUND:  Type 0x1312D00 doesn't exist in TPI stream
+
diff --git a/llvm/test/DebugInfo/X86/base-type-size.ll b/llvm/test/DebugInfo/X86/base-type-size.ll
index 3a8dc37bdc65f..2f0ff2f60e95f 100644
--- a/llvm/test/DebugInfo/X86/base-type-size.ll
+++ b/llvm/test/DebugInfo/X86/base-type-size.ll
@@ -11,7 +11,10 @@
 ; CHECK: DW_TAG_base_type
 ; CHECK-NEXT: DW_AT_name      ("DW_ATE_unsigned_1")
 ; CHECK-NEXT: DW_AT_encoding  (DW_ATE_unsigned)
+;; TODO: Should this type use bit_size?
+; CHECK-NOT:  DW_AT_bit_size
 ; CHECK-NEXT: DW_AT_byte_size (0x01)
+; CHECK-NOT:  DW_AT_bit_size
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/llvm/test/DebugInfo/bit-int-size.ll b/llvm/test/DebugInfo/bit-int-size.ll
new file mode 100644
index 0000000000000..e28921dc83db3
--- /dev/null
+++ b/llvm/test/DebugInfo/bit-int-size.ll
@@ -0,0 +1,38 @@
+; RUN: %llc_dwarf %s -filetype=obj -o - | llvm-dwarfdump - | FileCheck %s
+; REQUIRES: object-emission
+
+;; Check base types with bit-sizes that don't fit fully fit within a byte
+;; multiple get both a a byte_size and bit_size attribute.
+
+; CHECK: DW_TAG_base_type
+; CHECK-NEXT: DW_AT_name      ("unsigned _BitInt")
+; CHECK-NEXT: DW_AT_encoding  (DW_ATE_unsigned)
+; CHECK-NEXT: DW_AT_byte_size (0x04)
+; CHECK-NEXT: DW_AT_bit_size  (0x11)
+
+; CHECK: DW_TAG_base_type
+; CHECK-NEXT: DW_AT_name      ("_BitInt")
+; CHECK-NEXT: DW_AT_encoding  (DW_ATE_signed)
+; CHECK-NEXT: DW_AT_byte_size (0x01)
+; CHECK-NEXT: DW_AT_bit_size  (0x02)
+
+@a = global i8 0, align 1, !dbg !0
+@b = global i8 0, align 1, !dbg !5
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!10, !11}
+!llvm.ident = !{!12}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "a", scope: !2, file: !7, line: 4, type: !9, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang version 22.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, splitDebugInlining: false, nameTableKind: None)
+!3 = !DIFile(filename: "bit-int.c", directory: "/")
+!4 = !{!0, !5}
+!5 = !DIGlobalVariableExpression(var: !6, expr: !DIExpression())
+!6 = distinct !DIGlobalVariable(name: "b", scope: !2, file: !7, line: 5, type: !8, isLocal: false, isDefinition: true)
+!7 = !DIFile(filename: "bit-int.c", directory: "/")
+!8 = !DIBasicType(name: "_BitInt", size: 8, dataSize: 2, encoding: DW_ATE_signed)
+!9 = !DIBasicType(name: "unsigned _BitInt", size: 32, dataSize: 17, encoding: DW_ATE_unsigned)
+!10 = !{i32 2, !"Debug Info Version", i32 3}
+!11 = !{i32 1, !"wchar_size", i32 4}
+!12 = !{!"clang version 22.0.0git"}
diff --git a/llvm/test/Instrumentation/TypeSanitizer/basic_outlined.ll b/llvm/test/Instrumentation/TypeSanitizer/basic_outlined.ll
new file mode 100644
index 0000000000000..1d118560f7580
--- /dev/null
+++ b/llvm/test/Instrumentation/TypeSanitizer/basic_outlined.ll
@@ -0,0 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
+; Test basic type sanitizer instrumentation.
+;
+; RUN: opt -passes='tysan' -tysan-outline-instrumentation -S %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+;.
+; CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @tysan.module_ctor, ptr null }]
+; CHECK: @__tysan_v1_Simple_20C_2b_2b_20TBAA = linkonce_odr constant { i64, i64, [16 x i8] } { i64 2, i64 0, [16 x i8] c"Simple C++ TBAA\00" }, comdat
+; CHECK: @__tysan_v1_omnipotent_20char = linkonce_odr constant { i64, i64, ptr, i64, [16 x i8] } { i64 2, i64 1, ptr @__tysan_v1_Simple_20C_2b_2b_20TBAA, i64 0, [16 x i8] c"omnipotent char\00" }, comdat
+; CHECK: @__tysan_v1_int = linkonce_odr constant { i64, i64, ptr, i64, [4 x i8] } { i64 2, i64 1, ptr @__tysan_v1_omnipotent_20char, i64 0, [4 x i8] c"int\00" }, comdat
+; CHECK: @__tysan_v1_int_o_0 = linkonce_odr constant { i64, ptr, ptr, i64 } { i64 1, ptr @__tysan_v1_int, ptr @__tysan_v1_int, i64 0 }, comdat
+; CHECK: @__tysan_shadow_memory_address = external global i64
+; CHECK: @__tysan_app_memory_mask = external global i64
+; CHECK: @__tysan_v1___ZTS1x = linkonce_odr constant { i64, i64, ptr, i64, ptr, i64, [7 x i8] } { i64 2, i64 2, ptr @__tysan_v1_int, i64 0, ptr @__tysan_v1_int, i64 4, [7 x i8] c"_ZTS1x\00" }, comdat
+; CHECK: @__tysan_v1___ZTS1v = linkonce_odr constant { i64, i64, ptr, i64, ptr, i64, ptr, i64, [7 x i8] } { i64 2, i64 3, ptr @__tysan_v1_int, i64 8, ptr @__tysan_v1_int, i64 12, ptr @__tysan_v1___ZTS1x, i64 16, [7 x i8] c"_ZTS1v\00" }, comdat
+; CHECK: @__tysan_v1___ZTS1v_o_12 = linkonce_odr constant { i64, ptr, ptr, i64 } { i64 1, ptr @__tysan_v1___ZTS1v, ptr @__tysan_v1_int, i64 12 }, comdat
+; CHECK: @llvm.used = appending global [8 x ptr] [ptr @tysan.module_ctor, ptr @__tysan_v1_Simple_20C_2b_2b_20TBAA, ptr @__tysan_v1_omnipotent_20char, ptr @__tysan_v1_int, ptr @__tysan_v1_int_o_0, ptr @__tysan_v1___ZTS1x, ptr @__tysan_v1___ZTS1v, ptr @__tysan_v1___ZTS1v_o_12], section "llvm.metadata"
+;.
+define i32 @test_load(ptr %a) sanitize_type {
+; CHECK-LABEL: @test_load(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8
+; CHECK-NEXT:    [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8
+; CHECK-NEXT:    call void @__tysan_instrument_with_shadow_update(ptr [[A:%.*]], ptr @__tysan_v1_int_o_0, i1 true, i64 4, i32 1)
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 4, !tbaa [[TBAA0:![0-9]+]]
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+entry:
+  %tmp1 = load i32, ptr %a, align 4, !tbaa !3
+  ret i32 %tmp1
+}
+
+define void @test_store(ptr %a) sanitize_type {
+; CHECK-LABEL: @test_store(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8
+; CHECK-NEXT:    [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8
+; CHECK-NEXT:    call void @__tysan_instrument_with_shadow_update(ptr [[A:%.*]], ptr @__tysan_v1___ZTS1v_o_12, i1 true, i64 4, i32 2)
+; CHECK-NEXT:    store i32 42, ptr [[A]], align 4, !tbaa [[TBAA4:![0-9]+]]
+; CHECK-NEXT:    ret void
+;
+
+entry:
+  store i32 42, ptr %a, align 4, !tbaa !6
+  ret void
+}
+
+!0 = !{!"Simple C++ TBAA"}
+!1 = !{!"omnipotent char", !0, i64 0}
+!2 = !{!"int", !1, i64 0}
+!3 = !{!2, !2, i64 0}
+!4 = !{!"_ZTS1x", !2, i64 0, !2, i64 4}
+!5 = !{!"_ZTS1v", !2, i64 8, !2, i64 12, !4, i64 16}
+!6 = !{!5, !2, i64 12}
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { sanitize_type }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind }
+;.
+; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0}
+; CHECK: [[META1]] = !{!"int", [[META2:![0-9]+]], i64 0}
+; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0}
+; CHECK: [[META3]] = !{!"Simple C++ TBAA"}
+; CHECK: [[TBAA4]] = !{[[META5:![0-9]+]], [[META1]], i64 12}
+; CHECK: [[META5]] = !{!"_ZTS1v", [[META1]], i64 8, [[META1]], i64 12, [[META6:![0-9]+]], i64 16}
+; CHECK: [[META6]] = !{!"_ZTS1x", [[META1]], i64 0, [[META1]], i64 4}
+;.
diff --git a/llvm/test/Instrumentation/TypeSanitizer/basic_verify_outlined.ll b/llvm/test/Instrumentation/TypeSanitizer/basic_verify_outlined.ll
new file mode 100644
index 0000000000000..187a41ea8a825
--- /dev/null
+++ b/llvm/test/Instrumentation/TypeSanitizer/basic_verify_outlined.ll
@@ -0,0 +1,736 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
+; Test basic type sanitizer instrumentation.
+;
+; RUN: opt -passes='tysan' -S -tysan-outline-instrumentation -tysan-verify-outlined-instrumentation -S %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+;.
+; CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @tysan.module_ctor, ptr null }]
+; CHECK: @__tysan_v1_Simple_20C_2b_2b_20TBAA = linkonce_odr constant { i64, i64, [16 x i8] } { i64 2, i64 0, [16 x i8] c"Simple C++ TBAA\00" }, comdat
+; CHECK: @__tysan_v1_omnipotent_20char = linkonce_odr constant { i64, i64, ptr, i64, [16 x i8] } { i64 2, i64 1, ptr @__tysan_v1_Simple_20C_2b_2b_20TBAA, i64 0, [16 x i8] c"omnipotent char\00" }, comdat
+; CHECK: @__tysan_v1_int = linkonce_odr constant { i64, i64, ptr, i64, [4 x i8] } { i64 2, i64 1, ptr @__tysan_v1_omnipotent_20char, i64 0, [4 x i8] c"int\00" }, comdat
+; CHECK: @__tysan_v1_int_o_0 = linkonce_odr constant { i64, ptr, ptr, i64 } { i64 1, ptr @__tysan_v1_int, ptr @__tysan_v1_int, i64 0 }, comdat
+; CHECK: @__tysan_shadow_memory_address = external global i64
+; CHECK: @__tysan_app_memory_mask = external global i64
+; CHECK: @__tysan_v1___ZTS1x = linkonce_odr constant { i64, i64, ptr, i64, ptr, i64, [7 x i8] } { i64 2, i64 2, ptr @__tysan_v1_int, i64 0, ptr @__tysan_v1_int, i64 4, [7 x i8] c"_ZTS1x\00" }, comdat
+; CHECK: @__tysan_v1___ZTS1v = linkonce_odr constant { i64, i64, ptr, i64, ptr, i64, ptr, i64, [7 x i8] } { i64 2, i64 3, ptr @__tysan_v1_int, i64 8, ptr @__tysan_v1_int, i64 12, ptr @__tysan_v1___ZTS1x, i64 16, [7 x i8] c"_ZTS1v\00" }, comdat
+; CHECK: @__tysan_v1___ZTS1v_o_12 = linkonce_odr constant { i64, ptr, ptr, i64 } { i64 1, ptr @__tysan_v1___ZTS1v, ptr @__tysan_v1_int, i64 12 }, comdat
+; CHECK: @llvm.used = appending global [8 x ptr] [ptr @tysan.module_ctor, ptr @__tysan_v1_Simple_20C_2b_2b_20TBAA, ptr @__tysan_v1_omnipotent_20char, ptr @__tysan_v1_int, ptr @__tysan_v1_int_o_0, ptr @__tysan_v1___ZTS1x, ptr @__tysan_v1___ZTS1v, ptr @__tysan_v1___ZTS1v_o_12], section "llvm.metadata"
+;.
+define i32 @test_load(ptr %a) sanitize_type {
+; CHECK-LABEL: @test_load(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[APP_MEM_MASK2:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8
+; CHECK-NEXT:    [[SHADOW_BASE1:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8
+; CHECK-NEXT:    [[APP_PTR_MASKED:%.*]] = and i64 ptrtoint (ptr @__tysan_app_memory_mask to i64), [[APP_MEM_MASK2]]
+; CHECK-NEXT:    [[APP_PTR_SHIFTED:%.*]] = shl i64 [[APP_PTR_MASKED]], 3
+; CHECK-NEXT:    [[SHADOW_PTR_INT:%.*]] = add i64 [[APP_PTR_SHIFTED]], [[SHADOW_BASE1]]
+; CHECK-NEXT:    [[SHADOW_PTR:%.*]] = inttoptr i64 [[SHADOW_PTR_INT]] to ptr
+; CHECK-NEXT:    [[SHADOW_DESC:%.*]] = load ptr, ptr [[SHADOW_PTR]], align 8
+; CHECK-NEXT:    [[BAD_DESC:%.*]] = icmp ne ptr [[SHADOW_DESC]], null
+; CHECK-NEXT:    br i1 [[BAD_DESC]], label [[TMP0:%.*]], label [[TMP42:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK:       0:
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq ptr [[SHADOW_DESC]], null
+; CHECK-NEXT:    br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP40:%.*]]
+; CHECK:       2:
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[SHADOW_PTR_INT]], 8
+; CHECK-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne ptr [[TMP5]], null
+; CHECK-NEXT:    [[TMP7:%.*]] = or i1 false, [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[SHADOW_PTR_INT]], 16
+; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne ptr [[TMP10]], null
+; CHECK-NEXT:    [[TMP12:%.*]] = or i1 [[TMP7]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[SHADOW_PTR_INT]], 24
+; CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+; CHECK-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[TMP14]], align 8
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne ptr [[TMP15]], null
+; CHECK-NEXT:    [[TMP17:%.*]] = or i1 [[TMP12]], [[TMP16]]
+; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[SHADOW_PTR_INT]], 32
+; CHECK-NEXT:    [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr
+; CHECK-NEXT:    [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 8
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne ptr [[TMP20]], null
+; CHECK-NEXT:    [[TMP22:%.*]] = or i1 [[TMP17]], [[TMP21]]
+; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[SHADOW_PTR_INT]], 40
+; CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+; CHECK-NEXT:    [[TMP25:%.*]] = load ptr, ptr [[TMP24]], align 8
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp ne ptr [[TMP25]], null
+; CHECK-NEXT:    [[TMP27:%.*]] = or i1 [[TMP22]], [[TMP26]]
+; CHECK-NEXT:    [[TMP28:%.*]] = add i64 [[SHADOW_PTR_INT]], 48
+; CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
+; CHECK-NEXT:    [[TMP30:%.*]] = load ptr, ptr [[TMP29]], align 8
+; CHECK-NEXT:    [[TMP31:%.*]] = icmp ne ptr [[TMP30]], null
+; CHECK-NEXT:    [[TMP32:%.*]] = or i1 [[TMP27]], [[TMP31]]
+; CHECK-NEXT:    [[TMP33:%.*]] = add i64 [[SHADOW_PTR_INT]], 56
+; CHECK-NEXT:    [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr
+; CHECK-NEXT:    [[TMP35:%.*]] = load ptr, ptr [[TMP34]], align 8
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp ne ptr [[TMP35]], null
+; CHECK-NEXT:    [[TMP37:%.*]] = or i1 [[TMP32]], [[TMP36]]
+; CHECK-NEXT:    br i1 [[TMP37]], label [[TMP38:%.*]], label [[TMP39:%.*]], !prof [[PROF0]]
+; CHECK:       38:
+; CHECK-NEXT:    call void @__tysan_check(ptr @__tysan_app_memory_mask, i32 8, ptr null, i32 1)
+; CHECK-NEXT:    br label [[TMP39]]
+; CHECK:       39:
+; CHECK-NEXT:    store ptr null, ptr [[SHADOW_PTR]], align 8
+; CHECK-NEXT:    [[SHADOW_BYTE_1_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 8
+; CHECK-NEXT:    [[SHADOW_BYTE_1_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET]] to ptr
+; CHECK-NEXT:    store ptr inttoptr (i64 -1 to ptr), ptr [[SHADOW_BYTE_1_PTR]], align 8
+; CHECK-NEXT:    [[SHADOW_BYTE_2_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 16
+; CHECK-NEXT:    [[SHADOW_BYTE_2_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_2_OFFSET]] to ptr
+; CHECK-NEXT:    store ptr inttoptr (i64 -2 to ptr), ptr [[SHADOW_BYTE_2_PTR]], align 8
+; CHECK-NEXT:    [[SHADOW_BYTE_3_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 24
+; CHECK-NEXT:    [[SHADOW_BYTE_3_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET]] to ptr
+; CHECK-NEXT:    store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR]], align 8
+; CHECK-NEXT:    [[SHADOW_BYTE_4_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 32
+; CHECK-NEXT:    [[SHADOW_BYTE_4_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_4_OFFSET]] to ptr
+; CHECK-NEXT:    store ptr inttoptr (i64 -4 to ptr), ptr [[SHADOW_BYTE_4_PTR]], align 8
+; CHECK-NEXT:    [[SHADOW_BYTE_5_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 40
+; CHECK-NEXT:    [[SHADOW_BYTE_5_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_5_OFFSET]] to ptr
+; CHECK-NEXT:    store ptr inttoptr (i64 -5 to ptr), ptr [[SHADOW_BYTE_5_PTR]], align 8
+; CHECK-NEXT:    [[SHADOW_BYTE_6_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 48
+; CHECK-NEXT:    [[SHADOW_BYTE_6_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_6_OFFSET]] to ptr
+; CHECK-NEXT:    store ptr inttoptr (i64 -6 to ptr), ptr [[SHADOW_BYTE_6_PTR]], align 8
+; CHECK-NEXT:    [[SHADOW_BYTE_7_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 56
+; CHECK-NEXT:    [[SHADOW_BYTE_7_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_7_OFFSET]] to ptr
+; CHECK-NEXT:    store ptr inttoptr (i64 -7 to ptr), ptr [[SHADOW_BYTE_7_PTR]], align 8
+; CHECK-NEXT:    br label [[TMP41:%.*]]
+; CHECK:       40:
+; CHECK-NEXT:    call void @__tysan_check(ptr @__tysan_app_memory_mask, i32 8, ptr null, i32 1)
+; CHECK-NEXT:    br label [[TMP41]]
+; CHECK:       41:
+; CHECK-NEXT:    br label [[TMP87:%.*]]
+; CHECK:       42:
+; CHECK-NEXT:    [[TMP43:%.*]] = add i64 [[SHADOW_PTR_INT]], 8
+; CHECK-NEXT:    [[TMP44:%.*]] = inttoptr i64 [[TMP43]] to ptr
+; CHECK-NEXT:    [[TMP45:%.*]] = load ptr, ptr [[TMP44]], align 8
+; CHECK-NEXT:    [[TMP46:%.*]] = ptrtoint ptr [[TMP45]] to i64
+; CHECK-NEXT:    [[TMP47:%.*]] = icmp sge i64 [[TMP46]], 0
+; CHECK-NEXT:    [[TMP48:%.*]] = or i1 false, [[TMP47]]
+; CHECK-NEXT:    [[TMP49:%.*]] = add i64 [[SHADOW_PTR_INT]], 16
+; CHECK-NEXT:    [[TMP50:%.*]] = inttoptr i64 [[TMP49]] to ptr
+; CHECK-NEXT:    [[TMP51:%.*]] = load ptr, ptr [[TMP50]], align 8
+; CHECK-NEXT:    [[TMP52:%.*]] = ptrtoint ptr [[TMP51]] to i64
+; CHECK-NEXT:    [[TMP53:%.*]] = icmp sge i64 [[TMP52]], 0
+; CHECK-NEXT:    [[TMP54:%.*]] = or i1 [[TMP48]], [[TMP53]]
+; CHECK-NEXT:    [[TMP55:%.*]] = add i64 [[SHADOW_PTR_INT]], 24
+; CHECK-NEXT:    [[TMP56:%.*]] = inttoptr i64 [[TMP55]] to ptr
+; CHECK-NEXT:    [[TMP57:%.*]] = load ptr, ptr [[TMP56]], align 8
+; CHECK-NEXT:    [[TMP58:%.*]] = ptrtoint ptr [[TMP57]] to i64
+; CHECK-NEXT:    [[TMP59:%.*]] = icmp sge i64 [[TMP58]], 0
+; CHECK-NEXT:    [[TMP60:%.*]] = or i1 [[TMP54]], [[TMP59]]
+; CHECK-NEXT:    [[TMP61:%.*]] = add i64 [[SHADOW_PTR_INT]], 32
+; CHECK-NEXT:    [[TMP62:%.*]] = inttoptr i64 [[TMP61]] to ptr
+; CHECK-NEXT:    [[TMP63:%.*]] = load ptr, ptr [[TMP62]], align 8
+; CHECK-NEXT:    [[TMP64:%.*]] = ptrtoint ptr [[TMP63]] to i64
+; CHECK-NEXT:    [[TMP65:%.*]] = icmp sge i64 [[TMP64]], 0
+; CHECK-NEXT:    [[TMP66:%.*]] = or i1 [[TMP60]], [[TMP65]]
+; CHECK-NEXT:    [[TMP67:%.*]] = add i64 [[SHADOW_PTR_INT]], 40
+; CHECK-NEXT:    [[TMP68:%.*]] = inttoptr i64 [[TMP67]] to ptr
+; CHECK-NEXT:    [[TMP69:%.*]] = load ptr, ptr [[TMP68]], align 8
+; CHECK-NEXT:    [[TMP70:%.*]] = ptrtoint ptr [[TMP69]] to i64
+; CHECK-NEXT:    [[TMP71:%.*]] = icmp sge i64 [[TMP70]], 0
+; CHECK-NEXT:    [[TMP72:%.*]] = or i1 [[TMP66]], [[TMP71]]
+; CHECK-NEXT:    [[TMP73:%.*]] = add i64 [[SHADOW_PTR_INT]], 48
+; CHECK-NEXT:    [[TMP74:%.*]] = inttoptr i64 [[TMP73]] to ptr
+; CHECK-NEXT:    [[TMP75:%.*]] = load ptr, ptr [[TMP74]], align 8
+; CHECK-NEXT:    [[TMP76:%.*]] = ptrtoint ptr [[TMP75]] to i64
+; CHECK-NEXT:    [[TMP77:%.*]] = icmp sge i64 [[TMP76]], 0
+; CHECK-NEXT:    [[TMP78:%.*]] = or i1 [[TMP72]], [[TMP77]]
+; CHECK-NEXT:    [[TMP79:%.*]] = add i64 [[SHADOW_PTR_INT]], 56
+; CHECK-NEXT:    [[TMP80:%.*]] = inttoptr i64 [[TMP79]] to ptr
+; CHECK-NEXT:    [[TMP81:%.*]] = load ptr, ptr [[TMP80]], align 8
+; CHECK-NEXT:    [[TMP82:%.*]] = ptrtoint ptr [[TMP81]] to i64
+; CHECK-NEXT:    [[TMP83:%.*]] = icmp sge i64 [[TMP82]], 0
+; CHECK-NEXT:    [[TMP84:%.*]] = or i1 [[TMP78]], [[TMP83]]
+; CHECK-NEXT:    br i1 [[TMP84]], label [[TMP85:%.*]], label [[TMP86:%.*]], !prof [[PROF0]]
+; CHECK:       85:
+; CHECK-NEXT:    call void @__tysan_check(ptr @__tysan_app_memory_mask, i32 8, ptr null, i32 1)
+; CHECK-NEXT:    br label [[TMP86]]
+; CHECK:       86:
+; CHECK-NEXT:    br label [[TMP87]]
+; CHECK:       87:
+; CHECK-NEXT:    [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8
+; CHECK-NEXT:    [[APP_PTR_MASKED3:%.*]] = and i64 ptrtoint (ptr @__tysan_shadow_memory_address to i64), [[APP_MEM_MASK2]]
+; CHECK-NEXT:    [[APP_PTR_SHIFTED4:%.*]] = shl i64 [[APP_PTR_MASKED3]], 3
+; CHECK-NEXT:    [[SHADOW_PTR_INT5:%.*]] = add i64 [[APP_PTR_SHIFTED4]], [[SHADOW_BASE1]]
+; CHECK-NEXT:    [[SHADOW_PTR6:%.*]] = inttoptr i64 [[SHADOW_PTR_INT5]] to ptr
+; CHECK-NEXT:    [[SHADOW_DESC7:%.*]] = load ptr, ptr [[SHADOW_PTR6]], align 8
+; CHECK-NEXT:    [[BAD_DESC8:%.*]] = icmp ne ptr [[SHADOW_DESC7]], null
+; CHECK-NEXT:    br i1 [[BAD_DESC8]], label [[TMP88:%.*]], label [[TMP130:%.*]], !prof [[PROF0]]
+; CHECK:       88:
+; CHECK-NEXT:    [[TMP89:%.*]] = icmp eq ptr [[SHADOW_DESC7]], null
+; CHECK-NEXT:    br i1 [[TMP89]], label [[TMP90:%.*]], label [[TMP128:%.*]]
+; CHECK:       90:
+; CHECK-NEXT:    [[TMP91:%.*]] = add i64 [[SHADOW_PTR_INT5]], 8
+; CHECK-NEXT:    [[TMP92:%.*]] = inttoptr i64 [[TMP91]] to ptr
+; CHECK-NEXT:    [[TMP93:%.*]] = load ptr, ptr [[TMP92]], align 8
+; CHECK-NEXT:    [[TMP94:%.*]] = icmp ne ptr [[TMP93]], null
+; CHECK-NEXT:    [[TMP95:%.*]] = or i1 false, [[TMP94]]
+; CHECK-NEXT:    [[TMP96:%.*]] = add i64 [[SHADOW_PTR_INT5]], 16
+; CHECK-NEXT:    [[TMP97:%.*]] = inttoptr i64 [[TMP96]] to ptr
+; CHECK-NEXT:    [[TMP98:%.*]] = load ptr, ptr [[TMP97]], align 8
+; CHECK-NEXT:    [[TMP99:%.*]] = icmp ne ptr [[TMP98]], null
+; CHECK-NEXT:    [[TMP100:%.*]] = or i1 [[TMP95]], [[TMP99]]
+; CHECK-NEXT:    [[TMP101:%.*]] = add i64 [[SHADOW_PTR_INT5]], 24
+; CHECK-NEXT:    [[TMP102:%.*]] = inttoptr i64 [[TMP101]] to ptr
+; CHECK-NEXT:    [[TMP103:%.*]] = load ptr, ptr [[TMP102]], align 8
+; CHECK-NEXT:    [[TMP104:%.*]] = icmp ne ptr [[TMP103]], null
+; CHECK-NEXT:    [[TMP105:%.*]] = or i1 [[TMP100]], [[TMP104]]
+; CHECK-NEXT:    [[TMP106:%.*]] = add i64 [[SHADOW_PTR_INT5]], 32
+; CHECK-NEXT:    [[TMP107:%.*]] = inttoptr i64 [[TMP106]] to ptr
+; CHECK-NEXT:    [[TMP108:%.*]] = load ptr, ptr [[TMP107]], align 8
+; CHECK-NEXT:    [[TMP109:%.*]] = icmp ne ptr [[TMP108]], null
+; CHECK-NEXT:    [[TMP110:%.*]] = or i1 [[TMP105]], [[TMP109]]
+; CHECK-NEXT:    [[TMP111:%.*]] = add i64 [[SHADOW_PTR_INT5]], 40
+; CHECK-NEXT:    [[TMP112:%.*]] = inttoptr i64 [[TMP111]] to ptr
+; CHECK-NEXT:    [[TMP113:%.*]] = load ptr, ptr [[TMP112]], align 8
+; CHECK-NEXT:    [[TMP114:%.*]] = icmp ne ptr [[TMP113]], null
+; CHECK-NEXT:    [[TMP115:%.*]] = or i1 [[TMP110]], [[TMP114]]
+; CHECK-NEXT:    [[TMP116:%.*]] = add i64 [[SHADOW_PTR_INT5]], 48
+; CHECK-NEXT:    [[TMP117:%.*]] = inttoptr i64 [[TMP116]] to ptr
+; CHECK-NEXT:    [[TMP118:%.*]] = load ptr, ptr [[TMP117]], align 8
+; CHECK-NEXT:    [[TMP119:%.*]] = icmp ne ptr [[TMP118]], null
+; CHECK-NEXT:    [[TMP120:%.*]] = or i1 [[TMP115]], [[TMP119]]
+; CHECK-NEXT:    [[TMP121:%.*]] = add i64 [[SHADOW_PTR_INT5]], 56
+; CHECK-NEXT:    [[TMP122:%.*]] = inttoptr i64 [[TMP121]] to ptr
+; CHECK-NEXT:    [[TMP123:%.*]] = load ptr, ptr [[TMP122]], align 8
+; CHECK-NEXT:    [[TMP124:%.*]] = icmp ne ptr [[TMP123]], null
+; CHECK-NEXT:    [[TMP125:%.*]] = or i1 [[TMP120]], [[TMP124]]
+; CHECK-NEXT:    br i1 [[TMP125]], label [[TMP126:%.*]], label [[TMP127:%.*]], !prof [[PROF0]]
+; CHECK:       126:
+; CHECK-NEXT:    call void @__tysan_check(ptr @__tysan_shadow_memory_address, i32 8, ptr null, i32 1)
+; CHECK-NEXT:    br label [[TMP127]]
+; CHECK:       127:
+; CHECK-NEXT:    store ptr null, ptr [[SHADOW_PTR6]], align 8
+; CHECK-NEXT:    [[SHADOW_BYTE_1_OFFSET9:%.*]] = add i64 [[SHADOW_PTR_INT5]], 8
+; CHECK-NEXT:    [[SHADOW_BYTE_1_PTR10:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET9]] to ptr
+; CHECK-NEXT:    store ptr inttoptr (i64 -1 to ptr), ptr [[SHADOW_BYTE_1_PTR10]], align 8
+; CHECK-NEXT:    [[SHADOW_BYTE_2_OFFSET11:%.*]] = add i64 [[SHADOW_PTR_INT5]], 16
+; CHECK-NEXT:    [[SHADOW_BYTE_2_PTR12:%.*]] = inttoptr i64 [[SHADOW_BYTE_2_OFFSET11]] to ptr
+; CHECK-NEXT:    store ptr inttoptr (i64 -2 to ptr), ptr [[SHADOW_BYTE_2_PTR12]], align 8
+; CHECK-NEXT:    [[SHADOW_BYTE_3_OFFSET13:%.*]] = add i64 [[SHADOW_PTR_INT5]], 24
+; CHECK-NEXT:    [[SHADOW_BYTE_3_PTR14:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET13]] to ptr
+; CHECK-NEXT:    store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR14]], align 8
+; CHECK-NEXT:    [[SHADOW_BYTE_4_OFFSET15:%.*]] = add i64 [[SHADOW_PTR_INT5]], 32
+; CHECK-NEXT:    [[SHADOW_BYTE_4_PTR16:%.*]] = inttoptr i64 [[SHADOW_BYTE_4_OFFSET15]] to ptr
+; CHECK-NEXT:    store ptr inttoptr (i64 -4 to ptr), ptr [[SHADOW_BYTE_4_PTR16]], align 8
+; CHECK-NEXT:    [[SHADOW_BYTE_5_OFFSET17:%.*]] = add i64 [[SHADOW_PTR_INT5]], 40
+; CHECK-NEXT:    [[SHADOW_BYTE_5_PTR18:%.*]] = inttoptr i64 [[SHADOW_BYTE_5_OFFSET17]] to ptr
+; CHECK-NEXT:    store ptr inttoptr (i64 -5 to ptr), ptr [[SHADOW_BYTE_5_PTR18]], align 8
+; CHECK-NEXT:    [[SHADOW_BYTE_6_OFFSET19:%.*]] = add i64 [[SHADOW_PTR_INT5]], 48
+; CHECK-NEXT:    [[SHADOW_BYTE_6_PTR20:%.*]] = inttoptr i64 [[SHADOW_BYTE_6_OFFSET19]] to ptr
+; CHECK-NEXT:    store ptr inttoptr (i64 -6 to ptr), ptr [[SHADOW_BYTE_6_PTR20]], align 8
+; CHECK-NEXT:    [[SHADOW_BYTE_7_OFFSET21:%.*]] = add i64 [[SHADOW_PTR_INT5]], 56
+; CHECK-NEXT:    [[SHADOW_BYTE_7_PTR22:%.*]] = inttoptr i64 [[SHADOW_BYTE_7_OFFSET21]] to ptr
+; CHECK-NEXT:    store ptr inttoptr (i64 -7 to ptr), ptr [[SHADOW_BYTE_7_PTR22]], align 8
+; CHECK-NEXT:    br label [[TMP129:%.*]]
+; CHECK:       128:
+; CHECK-NEXT:    call void @__tysan_check(ptr @__tysan_shadow_memory_address, i32 8, ptr null, i32 1)
+; CHECK-NEXT:    br label [[TMP129]]
+; CHECK:       129:
+; CHECK-NEXT:    br label [[TMP175:%.*]]
+; CHECK:       130:
+; CHECK-NEXT:    [[TMP131:%.*]] = add i64 [[SHADOW_PTR_INT5]], 8
+; CHECK-NEXT:    [[TMP132:%.*]] = inttoptr i64 [[TMP131]] to ptr
+; CHECK-NEXT:    [[TMP133:%.*]] = load ptr, ptr [[TMP132]], align 8
+; CHECK-NEXT:    [[TMP134:%.*]] = ptrtoint ptr [[TMP133]] to i64
+; CHECK-NEXT:    [[TMP135:%.*]] = icmp sge i64 [[TMP134]], 0
+; CHECK-NEXT:    [[TMP136:%.*]] = or i1 false, [[TMP135]]
+; CHECK-NEXT:    [[TMP137:%.*]] = add i64 [[SHADOW_PTR_INT5]], 16
+; CHECK-NEXT:    [[TMP138:%.*]] = inttoptr i64 [[TMP137]] to ptr
+; CHECK-NEXT:    [[TMP139:%.*]] = load ptr, ptr [[TMP138]], align 8
+; CHECK-NEXT:    [[TMP140:%.*]] = ptrtoint ptr [[TMP139]] to i64
+; CHECK-NEXT:    [[TMP141:%.*]] = icmp sge i64 [[TMP140]], 0
+; CHECK-NEXT:    [[TMP142:%.*]] = or i1 [[TMP136]], [[TMP141]]
+; CHECK-NEXT:    [[TMP143:%.*]] = add i64 [[SHADOW_PTR_INT5]], 24
+; CHECK-NEXT:    [[TMP144:%.*]] = inttoptr i64 [[TMP143]] to ptr
+; CHECK-NEXT:    [[TMP145:%.*]] = load ptr, ptr [[TMP144]], align 8
+; CHECK-NEXT:    [[TMP146:%.*]] = ptrtoint ptr [[TMP145]] to i64
+; CHECK-NEXT:    [[TMP147:%.*]] = icmp sge i64 [[TMP146]], 0
+; CHECK-NEXT:    [[TMP148:%.*]] = or i1 [[TMP142]], [[TMP147]]
+; CHECK-NEXT:    [[TMP149:%.*]] = add i64 [[SHADOW_PTR_INT5]], 32
+; CHECK-NEXT:    [[TMP150:%.*]] = inttoptr i64 [[TMP149]] to ptr
+; CHECK-NEXT:    [[TMP151:%.*]] = load ptr, ptr [[TMP150]], align 8
+; CHECK-NEXT:    [[TMP152:%.*]] = ptrtoint ptr [[TMP151]] to i64
+; CHECK-NEXT:    [[TMP153:%.*]] = icmp sge i64 [[TMP152]], 0
+; CHECK-NEXT:    [[TMP154:%.*]] = or i1 [[TMP148]], [[TMP153]]
+; CHECK-NEXT:    [[TMP155:%.*]] = add i64 [[SHADOW_PTR_INT5]], 40
+; CHECK-NEXT:    [[TMP156:%.*]] = inttoptr i64 [[TMP155]] to ptr
+; CHECK-NEXT:    [[TMP157:%.*]] = load ptr, ptr [[TMP156]], align 8
+; CHECK-NEXT:    [[TMP158:%.*]] = ptrtoint ptr [[TMP157]] to i64
+; CHECK-NEXT:    [[TMP159:%.*]] = icmp sge i64 [[TMP158]], 0
+; CHECK-NEXT:    [[TMP160:%.*]] = or i1 [[TMP154]], [[TMP159]]
+; CHECK-NEXT:    [[TMP161:%.*]] = add i64 [[SHADOW_PTR_INT5]], 48
+; CHECK-NEXT:    [[TMP162:%.*]] = inttoptr i64 [[TMP161]] to ptr
+; CHECK-NEXT:    [[TMP163:%.*]] = load ptr, ptr [[TMP162]], align 8
+; CHECK-NEXT:    [[TMP164:%.*]] = ptrtoint ptr [[TMP163]] to i64
+; CHECK-NEXT:    [[TMP165:%.*]] = icmp sge i64 [[TMP164]], 0
+; CHECK-NEXT:    [[TMP166:%.*]] = or i1 [[TMP160]], [[TMP165]]
+; CHECK-NEXT:    [[TMP167:%.*]] = add i64 [[SHADOW_PTR_INT5]], 56
+; CHECK-NEXT:    [[TMP168:%.*]] = inttoptr i64 [[TMP167]] to ptr
+; CHECK-NEXT:    [[TMP169:%.*]] = load ptr, ptr [[TMP168]], align 8
+; CHECK-NEXT:    [[TMP170:%.*]] = ptrtoint ptr [[TMP169]] to i64
+; CHECK-NEXT:    [[TMP171:%.*]] = icmp sge i64 [[TMP170]], 0
+; CHECK-NEXT:    [[TMP172:%.*]] = or i1 [[TMP166]], [[TMP171]]
+; CHECK-NEXT:    br i1 [[TMP172]], label [[TMP173:%.*]], label [[TMP174:%.*]], !prof [[PROF0]]
+; CHECK:       173:
+; CHECK-NEXT:    call void @__tysan_check(ptr @__tysan_shadow_memory_address, i32 8, ptr null, i32 1)
+; CHECK-NEXT:    br label [[TMP174]]
+; CHECK:       174:
+; CHECK-NEXT:    br label [[TMP175]]
+; CHECK:       175:
+; CHECK-NEXT:    [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8
+; CHECK-NEXT:    call void @__tysan_instrument_with_shadow_update(ptr [[A:%.*]], ptr @__tysan_v1_int_o_0, i1 true, i64 4, i32 1)
+; CHECK-NEXT:    [[APP_PTR_INT:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[APP_PTR_MASKED23:%.*]] = and i64 [[APP_PTR_INT]], [[APP_MEM_MASK2]]
+; CHECK-NEXT:    [[APP_PTR_SHIFTED24:%.*]] = shl i64 [[APP_PTR_MASKED23]], 3
+; CHECK-NEXT:    [[SHADOW_PTR_INT25:%.*]] = add i64 [[APP_PTR_SHIFTED24]], [[SHADOW_BASE1]]
+; CHECK-NEXT:    [[SHADOW_PTR26:%.*]] = inttoptr i64 [[SHADOW_PTR_INT25]] to ptr
+; CHECK-NEXT:    [[SHADOW_DESC27:%.*]] = load ptr, ptr [[SHADOW_PTR26]], align 8
+; CHECK-NEXT:    [[BAD_DESC28:%.*]] = icmp ne ptr [[SHADOW_DESC27]], @__tysan_v1_int_o_0
+; CHECK-NEXT:    br i1 [[BAD_DESC28]], label [[TMP176:%.*]], label [[TMP198:%.*]], !prof [[PROF0]]
+; CHECK:       176:
+; CHECK-NEXT:    [[TMP177:%.*]] = icmp eq ptr [[SHADOW_DESC27]], null
+; CHECK-NEXT:    br i1 [[TMP177]], label [[TMP178:%.*]], label [[TMP196:%.*]]
+; CHECK:       178:
+; CHECK-NEXT:    [[TMP179:%.*]] = add i64 [[SHADOW_PTR_INT25]], 8
+; CHECK-NEXT:    [[TMP180:%.*]] = inttoptr i64 [[TMP179]] to ptr
+; CHECK-NEXT:    [[TMP181:%.*]] = load ptr, ptr [[TMP180]], align 8
+; CHECK-NEXT:    [[TMP182:%.*]] = icmp ne ptr [[TMP181]], null
+; CHECK-NEXT:    [[TMP183:%.*]] = or i1 false, [[TMP182]]
+; CHECK-NEXT:    [[TMP184:%.*]] = add i64 [[SHADOW_PTR_INT25]], 16
+; CHECK-NEXT:    [[TMP185:%.*]] = inttoptr i64 [[TMP184]] to ptr
+; CHECK-NEXT:    [[TMP186:%.*]] = load ptr, ptr [[TMP185]], align 8
+; CHECK-NEXT:    [[TMP187:%.*]] = icmp ne ptr [[TMP186]], null
+; CHECK-NEXT:    [[TMP188:%.*]] = or i1 [[TMP183]], [[TMP187]]
+; CHECK-NEXT:    [[TMP189:%.*]] = add i64 [[SHADOW_PTR_INT25]], 24
+; CHECK-NEXT:    [[TMP190:%.*]] = inttoptr i64 [[TMP189]] to ptr
+; CHECK-NEXT:    [[TMP191:%.*]] = load ptr, ptr [[TMP190]], align 8
+; CHECK-NEXT:    [[TMP192:%.*]] = icmp ne ptr [[TMP191]], null
+; CHECK-NEXT:    [[TMP193:%.*]] = or i1 [[TMP188]], [[TMP192]]
+; CHECK-NEXT:    br i1 [[TMP193]], label [[TMP194:%.*]], label [[TMP195:%.*]], !prof [[PROF0]]
+; CHECK:       194:
+; CHECK-NEXT:    call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1_int_o_0, i32 1)
+; CHECK-NEXT:    br label [[TMP195]]
+; CHECK:       195:
+; CHECK-NEXT:    store ptr @__tysan_v1_int_o_0, ptr [[SHADOW_PTR26]], align 8
+; CHECK-NEXT:    [[SHADOW_BYTE_1_OFFSET29:%.*]] = add i64 [[SHADOW_PTR_INT25]], 8
+; CHECK-NEXT:    [[SHADOW_BYTE_1_PTR30:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET29]] to ptr
+; CHECK-NEXT:    store ptr inttoptr (i64 -1 to ptr), ptr [[SHADOW_BYTE_1_PTR30]], align 8
+; CHECK-NEXT:    [[SHADOW_BYTE_2_OFFSET31:%.*]] = add i64 [[SHADOW_PTR_INT25]], 16
+; CHECK-NEXT:    [[SHADOW_BYTE_2_PTR32:%.*]] = inttoptr i64 [[SHADOW_BYTE_2_OFFSET31]] to ptr
+; CHECK-NEXT:    store ptr inttoptr (i64 -2 to ptr), ptr [[SHADOW_BYTE_2_PTR32]], align 8
+; CHECK-NEXT:    [[SHADOW_BYTE_3_OFFSET33:%.*]] = add i64 [[SHADOW_PTR_INT25]], 24
+; CHECK-NEXT:    [[SHADOW_BYTE_3_PTR34:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET33]] to ptr
+; CHECK-NEXT:    store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR34]], align 8
+; CHECK-NEXT:    br label [[TMP197:%.*]]
+; CHECK:       196:
+; CHECK-NEXT:    call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1_int_o_0, i32 1)
+; CHECK-NEXT:    br label [[TMP197]]
+; CHECK:       197:
+; CHECK-NEXT:    br label [[TMP219:%.*]]
+; CHECK:       198:
+; CHECK-NEXT:    [[TMP199:%.*]] = add i64 [[SHADOW_PTR_INT25]], 8
+; CHECK-NEXT:    [[TMP200:%.*]] = inttoptr i64 [[TMP199]] to ptr
+; CHECK-NEXT:    [[TMP201:%.*]] = load ptr, ptr [[TMP200]], align 8
+; CHECK-NEXT:    [[TMP202:%.*]] = ptrtoint ptr [[TMP201]] to i64
+; CHECK-NEXT:    [[TMP203:%.*]] = icmp sge i64 [[TMP202]], 0
+; CHECK-NEXT:    [[TMP204:%.*]] = or i1 false, [[TMP203]]
+; CHECK-NEXT:    [[TMP205:%.*]] = add i64 [[SHADOW_PTR_INT25]], 16
+; CHECK-NEXT:    [[TMP206:%.*]] = inttoptr i64 [[TMP205]] to ptr
+; CHECK-NEXT:    [[TMP207:%.*]] = load ptr, ptr [[TMP206]], align 8
+; CHECK-NEXT:    [[TMP208:%.*]] = ptrtoint ptr [[TMP207]] to i64
+; CHECK-NEXT:    [[TMP209:%.*]] = icmp sge i64 [[TMP208]], 0
+; CHECK-NEXT:    [[TMP210:%.*]] = or i1 [[TMP204]], [[TMP209]]
+; CHECK-NEXT:    [[TMP211:%.*]] = add i64 [[SHADOW_PTR_INT25]], 24
+; CHECK-NEXT:    [[TMP212:%.*]] = inttoptr i64 [[TMP211]] to ptr
+; CHECK-NEXT:    [[TMP213:%.*]] = load ptr, ptr [[TMP212]], align 8
+; CHECK-NEXT:    [[TMP214:%.*]] = ptrtoint ptr [[TMP213]] to i64
+; CHECK-NEXT:    [[TMP215:%.*]] = icmp sge i64 [[TMP214]], 0
+; CHECK-NEXT:    [[TMP216:%.*]] = or i1 [[TMP210]], [[TMP215]]
+; CHECK-NEXT:    br i1 [[TMP216]], label [[TMP217:%.*]], label [[TMP218:%.*]], !prof [[PROF0]]
+; CHECK:       217:
+; CHECK-NEXT:    call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1_int_o_0, i32 1)
+; CHECK-NEXT:    br label [[TMP218]]
+; CHECK:       218:
+; CHECK-NEXT:    br label [[TMP219]]
+; CHECK:       219:
+; CHECK-NEXT:    [[WAA:%.*]] = load i32, ptr [[A]], align 4, !tbaa [[TBAA1:![0-9]+]]
+; CHECK-NEXT:    ret i32 [[WAA]]
+;
+entry:
+  %WAA = load i32, ptr %a, align 4, !tbaa !3
+  ret i32 %WAA
+}
+
+define void @test_store(ptr %a) sanitize_type {
+; CHECK-LABEL: @test_store(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[APP_MEM_MASK2:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8
+; CHECK-NEXT:    [[SHADOW_BASE1:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8
+; CHECK-NEXT:    [[APP_PTR_MASKED:%.*]] = and i64 ptrtoint (ptr @__tysan_app_memory_mask to i64), [[APP_MEM_MASK2]]
+; CHECK-NEXT:    [[APP_PTR_SHIFTED:%.*]] = shl i64 [[APP_PTR_MASKED]], 3
+; CHECK-NEXT:    [[SHADOW_PTR_INT:%.*]] = add i64 [[APP_PTR_SHIFTED]], [[SHADOW_BASE1]]
+; CHECK-NEXT:    [[SHADOW_PTR:%.*]] = inttoptr i64 [[SHADOW_PTR_INT]] to ptr
+; CHECK-NEXT:    [[SHADOW_DESC:%.*]] = load ptr, ptr [[SHADOW_PTR]], align 8
+; CHECK-NEXT:    [[BAD_DESC:%.*]] = icmp ne ptr [[SHADOW_DESC]], null
+; CHECK-NEXT:    br i1 [[BAD_DESC]], label [[TMP0:%.*]], label [[TMP42:%.*]], !prof [[PROF0]]
+; CHECK:       0:
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq ptr [[SHADOW_DESC]], null
+; CHECK-NEXT:    br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP40:%.*]]
+; CHECK:       2:
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[SHADOW_PTR_INT]], 8
+; CHECK-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne ptr [[TMP5]], null
+; CHECK-NEXT:    [[TMP7:%.*]] = or i1 false, [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[SHADOW_PTR_INT]], 16
+; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne ptr [[TMP10]], null
+; CHECK-NEXT:    [[TMP12:%.*]] = or i1 [[TMP7]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[SHADOW_PTR_INT]], 24
+; CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+; CHECK-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[TMP14]], align 8
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne ptr [[TMP15]], null
+; CHECK-NEXT:    [[TMP17:%.*]] = or i1 [[TMP12]], [[TMP16]]
+; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[SHADOW_PTR_INT]], 32
+; CHECK-NEXT:    [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr
+; CHECK-NEXT:    [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 8
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne ptr [[TMP20]], null
+; CHECK-NEXT:    [[TMP22:%.*]] = or i1 [[TMP17]], [[TMP21]]
+; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[SHADOW_PTR_INT]], 40
+; CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+; CHECK-NEXT:    [[TMP25:%.*]] = load ptr, ptr [[TMP24]], align 8
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp ne ptr [[TMP25]], null
+; CHECK-NEXT:    [[TMP27:%.*]] = or i1 [[TMP22]], [[TMP26]]
+; CHECK-NEXT:    [[TMP28:%.*]] = add i64 [[SHADOW_PTR_INT]], 48
+; CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
+; CHECK-NEXT:    [[TMP30:%.*]] = load ptr, ptr [[TMP29]], align 8
+; CHECK-NEXT:    [[TMP31:%.*]] = icmp ne ptr [[TMP30]], null
+; CHECK-NEXT:    [[TMP32:%.*]] = or i1 [[TMP27]], [[TMP31]]
+; CHECK-NEXT:    [[TMP33:%.*]] = add i64 [[SHADOW_PTR_INT]], 56
+; CHECK-NEXT:    [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr
+; CHECK-NEXT:    [[TMP35:%.*]] = load ptr, ptr [[TMP34]], align 8
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp ne ptr [[TMP35]], null
+; CHECK-NEXT:    [[TMP37:%.*]] = or i1 [[TMP32]], [[TMP36]]
+; CHECK-NEXT:    br i1 [[TMP37]], label [[TMP38:%.*]], label [[TMP39:%.*]], !prof [[PROF0]]
+; CHECK:       38:
+; CHECK-NEXT:    call void @__tysan_check(ptr @__tysan_app_memory_mask, i32 8, ptr null, i32 1)
+; CHECK-NEXT:    br label [[TMP39]]
+; CHECK:       39:
+; CHECK-NEXT:    store ptr null, ptr [[SHADOW_PTR]], align 8
+; CHECK-NEXT:    [[SHADOW_BYTE_1_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 8
+; CHECK-NEXT:    [[SHADOW_BYTE_1_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET]] to ptr
+; CHECK-NEXT:    store ptr inttoptr (i64 -1 to ptr), ptr [[SHADOW_BYTE_1_PTR]], align 8
+; CHECK-NEXT:    [[SHADOW_BYTE_2_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 16
+; CHECK-NEXT:    [[SHADOW_BYTE_2_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_2_OFFSET]] to ptr
+; CHECK-NEXT:    store ptr inttoptr (i64 -2 to ptr), ptr [[SHADOW_BYTE_2_PTR]], align 8
+; CHECK-NEXT:    [[SHADOW_BYTE_3_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 24
+; CHECK-NEXT:    [[SHADOW_BYTE_3_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET]] to ptr
+; CHECK-NEXT:    store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR]], align 8
+; CHECK-NEXT:    [[SHADOW_BYTE_4_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 32
+; CHECK-NEXT:    [[SHADOW_BYTE_4_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_4_OFFSET]] to ptr
+; CHECK-NEXT:    store ptr inttoptr (i64 -4 to ptr), ptr [[SHADOW_BYTE_4_PTR]], align 8
+; CHECK-NEXT:    [[SHADOW_BYTE_5_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 40
+; CHECK-NEXT:    [[SHADOW_BYTE_5_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_5_OFFSET]] to ptr
+; CHECK-NEXT:    store ptr inttoptr (i64 -5 to ptr), ptr [[SHADOW_BYTE_5_PTR]], align 8
+; CHECK-NEXT:    [[SHADOW_BYTE_6_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 48
+; CHECK-NEXT:    [[SHADOW_BYTE_6_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_6_OFFSET]] to ptr
+; CHECK-NEXT:    store ptr inttoptr (i64 -6 to ptr), ptr [[SHADOW_BYTE_6_PTR]], align 8
+; CHECK-NEXT:    [[SHADOW_BYTE_7_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 56
+; CHECK-NEXT:    [[SHADOW_BYTE_7_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_7_OFFSET]] to ptr
+; CHECK-NEXT:    store ptr inttoptr (i64 -7 to ptr), ptr [[SHADOW_BYTE_7_PTR]], align 8
+; CHECK-NEXT:    br label [[TMP41:%.*]]
+; CHECK:       40:
+; CHECK-NEXT:    call void @__tysan_check(ptr @__tysan_app_memory_mask, i32 8, ptr null, i32 1)
+; CHECK-NEXT:    br label [[TMP41]]
+; CHECK:       41:
+; CHECK-NEXT:    br label [[TMP87:%.*]]
+; CHECK:       42:
+; CHECK-NEXT:    [[TMP43:%.*]] = add i64 [[SHADOW_PTR_INT]], 8
+; CHECK-NEXT:    [[TMP44:%.*]] = inttoptr i64 [[TMP43]] to ptr
+; CHECK-NEXT:    [[TMP45:%.*]] = load ptr, ptr [[TMP44]], align 8
+; CHECK-NEXT:    [[TMP46:%.*]] = ptrtoint ptr [[TMP45]] to i64
+; CHECK-NEXT:    [[TMP47:%.*]] = icmp sge i64 [[TMP46]], 0
+; CHECK-NEXT:    [[TMP48:%.*]] = or i1 false, [[TMP47]]
+; CHECK-NEXT:    [[TMP49:%.*]] = add i64 [[SHADOW_PTR_INT]], 16
+; CHECK-NEXT:    [[TMP50:%.*]] = inttoptr i64 [[TMP49]] to ptr
+; CHECK-NEXT:    [[TMP51:%.*]] = load ptr, ptr [[TMP50]], align 8
+; CHECK-NEXT:    [[TMP52:%.*]] = ptrtoint ptr [[TMP51]] to i64
+; CHECK-NEXT:    [[TMP53:%.*]] = icmp sge i64 [[TMP52]], 0
+; CHECK-NEXT:    [[TMP54:%.*]] = or i1 [[TMP48]], [[TMP53]]
+; CHECK-NEXT:    [[TMP55:%.*]] = add i64 [[SHADOW_PTR_INT]], 24
+; CHECK-NEXT:    [[TMP56:%.*]] = inttoptr i64 [[TMP55]] to ptr
+; CHECK-NEXT:    [[TMP57:%.*]] = load ptr, ptr [[TMP56]], align 8
+; CHECK-NEXT:    [[TMP58:%.*]] = ptrtoint ptr [[TMP57]] to i64
+; CHECK-NEXT:    [[TMP59:%.*]] = icmp sge i64 [[TMP58]], 0
+; CHECK-NEXT:    [[TMP60:%.*]] = or i1 [[TMP54]], [[TMP59]]
+; CHECK-NEXT:    [[TMP61:%.*]] = add i64 [[SHADOW_PTR_INT]], 32
+; CHECK-NEXT:    [[TMP62:%.*]] = inttoptr i64 [[TMP61]] to ptr
+; CHECK-NEXT:    [[TMP63:%.*]] = load ptr, ptr [[TMP62]], align 8
+; CHECK-NEXT:    [[TMP64:%.*]] = ptrtoint ptr [[TMP63]] to i64
+; CHECK-NEXT:    [[TMP65:%.*]] = icmp sge i64 [[TMP64]], 0
+; CHECK-NEXT:    [[TMP66:%.*]] = or i1 [[TMP60]], [[TMP65]]
+; CHECK-NEXT:    [[TMP67:%.*]] = add i64 [[SHADOW_PTR_INT]], 40
+; CHECK-NEXT:    [[TMP68:%.*]] = inttoptr i64 [[TMP67]] to ptr
+; CHECK-NEXT:    [[TMP69:%.*]] = load ptr, ptr [[TMP68]], align 8
+; CHECK-NEXT:    [[TMP70:%.*]] = ptrtoint ptr [[TMP69]] to i64
+; CHECK-NEXT:    [[TMP71:%.*]] = icmp sge i64 [[TMP70]], 0
+; CHECK-NEXT:    [[TMP72:%.*]] = or i1 [[TMP66]], [[TMP71]]
+; CHECK-NEXT:    [[TMP73:%.*]] = add i64 [[SHADOW_PTR_INT]], 48
+; CHECK-NEXT:    [[TMP74:%.*]] = inttoptr i64 [[TMP73]] to ptr
+; CHECK-NEXT:    [[TMP75:%.*]] = load ptr, ptr [[TMP74]], align 8
+; CHECK-NEXT:    [[TMP76:%.*]] = ptrtoint ptr [[TMP75]] to i64
+; CHECK-NEXT:    [[TMP77:%.*]] = icmp sge i64 [[TMP76]], 0
+; CHECK-NEXT:    [[TMP78:%.*]] = or i1 [[TMP72]], [[TMP77]]
+; CHECK-NEXT:    [[TMP79:%.*]] = add i64 [[SHADOW_PTR_INT]], 56
+; CHECK-NEXT:    [[TMP80:%.*]] = inttoptr i64 [[TMP79]] to ptr
+; CHECK-NEXT:    [[TMP81:%.*]] = load ptr, ptr [[TMP80]], align 8
+; CHECK-NEXT:    [[TMP82:%.*]] = ptrtoint ptr [[TMP81]] to i64
+; CHECK-NEXT:    [[TMP83:%.*]] = icmp sge i64 [[TMP82]], 0
+; CHECK-NEXT:    [[TMP84:%.*]] = or i1 [[TMP78]], [[TMP83]]
+; CHECK-NEXT:    br i1 [[TMP84]], label [[TMP85:%.*]], label [[TMP86:%.*]], !prof [[PROF0]]
+; CHECK:       85:
+; CHECK-NEXT:    call void @__tysan_check(ptr @__tysan_app_memory_mask, i32 8, ptr null, i32 1)
+; CHECK-NEXT:    br label [[TMP86]]
+; CHECK:       86:
+; CHECK-NEXT:    br label [[TMP87]]
+; CHECK:       87:
+; CHECK-NEXT:    [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8
+; CHECK-NEXT:    [[APP_PTR_MASKED3:%.*]] = and i64 ptrtoint (ptr @__tysan_shadow_memory_address to i64), [[APP_MEM_MASK2]]
+; CHECK-NEXT:    [[APP_PTR_SHIFTED4:%.*]] = shl i64 [[APP_PTR_MASKED3]], 3
+; CHECK-NEXT:    [[SHADOW_PTR_INT5:%.*]] = add i64 [[APP_PTR_SHIFTED4]], [[SHADOW_BASE1]]
+; CHECK-NEXT:    [[SHADOW_PTR6:%.*]] = inttoptr i64 [[SHADOW_PTR_INT5]] to ptr
+; CHECK-NEXT:    [[SHADOW_DESC7:%.*]] = load ptr, ptr [[SHADOW_PTR6]], align 8
+; CHECK-NEXT:    [[BAD_DESC8:%.*]] = icmp ne ptr [[SHADOW_DESC7]], null
+; CHECK-NEXT:    br i1 [[BAD_DESC8]], label [[TMP88:%.*]], label [[TMP130:%.*]], !prof [[PROF0]]
+; CHECK:       88:
+; CHECK-NEXT:    [[TMP89:%.*]] = icmp eq ptr [[SHADOW_DESC7]], null
+; CHECK-NEXT:    br i1 [[TMP89]], label [[TMP90:%.*]], label [[TMP128:%.*]]
+; CHECK:       90:
+; CHECK-NEXT:    [[TMP91:%.*]] = add i64 [[SHADOW_PTR_INT5]], 8
+; CHECK-NEXT:    [[TMP92:%.*]] = inttoptr i64 [[TMP91]] to ptr
+; CHECK-NEXT:    [[TMP93:%.*]] = load ptr, ptr [[TMP92]], align 8
+; CHECK-NEXT:    [[TMP94:%.*]] = icmp ne ptr [[TMP93]], null
+; CHECK-NEXT:    [[TMP95:%.*]] = or i1 false, [[TMP94]]
+; CHECK-NEXT:    [[TMP96:%.*]] = add i64 [[SHADOW_PTR_INT5]], 16
+; CHECK-NEXT:    [[TMP97:%.*]] = inttoptr i64 [[TMP96]] to ptr
+; CHECK-NEXT:    [[TMP98:%.*]] = load ptr, ptr [[TMP97]], align 8
+; CHECK-NEXT:    [[TMP99:%.*]] = icmp ne ptr [[TMP98]], null
+; CHECK-NEXT:    [[TMP100:%.*]] = or i1 [[TMP95]], [[TMP99]]
+; CHECK-NEXT:    [[TMP101:%.*]] = add i64 [[SHADOW_PTR_INT5]], 24
+; CHECK-NEXT:    [[TMP102:%.*]] = inttoptr i64 [[TMP101]] to ptr
+; CHECK-NEXT:    [[TMP103:%.*]] = load ptr, ptr [[TMP102]], align 8
+; CHECK-NEXT:    [[TMP104:%.*]] = icmp ne ptr [[TMP103]], null
+; CHECK-NEXT:    [[TMP105:%.*]] = or i1 [[TMP100]], [[TMP104]]
+; CHECK-NEXT:    [[TMP106:%.*]] = add i64 [[SHADOW_PTR_INT5]], 32
+; CHECK-NEXT:    [[TMP107:%.*]] = inttoptr i64 [[TMP106]] to ptr
+; CHECK-NEXT:    [[TMP108:%.*]] = load ptr, ptr [[TMP107]], align 8
+; CHECK-NEXT:    [[TMP109:%.*]] = icmp ne ptr [[TMP108]], null
+; CHECK-NEXT:    [[TMP110:%.*]] = or i1 [[TMP105]], [[TMP109]]
+; CHECK-NEXT:    [[TMP111:%.*]] = add i64 [[SHADOW_PTR_INT5]], 40
+; CHECK-NEXT:    [[TMP112:%.*]] = inttoptr i64 [[TMP111]] to ptr
+; CHECK-NEXT:    [[TMP113:%.*]] = load ptr, ptr [[TMP112]], align 8
+; CHECK-NEXT:    [[TMP114:%.*]] = icmp ne ptr [[TMP113]], null
+; CHECK-NEXT:    [[TMP115:%.*]] = or i1 [[TMP110]], [[TMP114]]
+; CHECK-NEXT:    [[TMP116:%.*]] = add i64 [[SHADOW_PTR_INT5]], 48
+; CHECK-NEXT:    [[TMP117:%.*]] = inttoptr i64 [[TMP116]] to ptr
+; CHECK-NEXT:    [[TMP118:%.*]] = load ptr, ptr [[TMP117]], align 8
+; CHECK-NEXT:    [[TMP119:%.*]] = icmp ne ptr [[TMP118]], null
+; CHECK-NEXT:    [[TMP120:%.*]] = or i1 [[TMP115]], [[TMP119]]
+; CHECK-NEXT:    [[TMP121:%.*]] = add i64 [[SHADOW_PTR_INT5]], 56
+; CHECK-NEXT:    [[TMP122:%.*]] = inttoptr i64 [[TMP121]] to ptr
+; CHECK-NEXT:    [[TMP123:%.*]] = load ptr, ptr [[TMP122]], align 8
+; CHECK-NEXT:    [[TMP124:%.*]] = icmp ne ptr [[TMP123]], null
+; CHECK-NEXT:    [[TMP125:%.*]] = or i1 [[TMP120]], [[TMP124]]
+; CHECK-NEXT:    br i1 [[TMP125]], label [[TMP126:%.*]], label [[TMP127:%.*]], !prof [[PROF0]]
+; CHECK:       126:
+; CHECK-NEXT:    call void @__tysan_check(ptr @__tysan_shadow_memory_address, i32 8, ptr null, i32 1)
+; CHECK-NEXT:    br label [[TMP127]]
+; CHECK:       127:
+; CHECK-NEXT:    store ptr null, ptr [[SHADOW_PTR6]], align 8
+; CHECK-NEXT:    [[SHADOW_BYTE_1_OFFSET9:%.*]] = add i64 [[SHADOW_PTR_INT5]], 8
+; CHECK-NEXT:    [[SHADOW_BYTE_1_PTR10:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET9]] to ptr
+; CHECK-NEXT:    store ptr inttoptr (i64 -1 to ptr), ptr [[SHADOW_BYTE_1_PTR10]], align 8
+; CHECK-NEXT:    [[SHADOW_BYTE_2_OFFSET11:%.*]] = add i64 [[SHADOW_PTR_INT5]], 16
+; CHECK-NEXT:    [[SHADOW_BYTE_2_PTR12:%.*]] = inttoptr i64 [[SHADOW_BYTE_2_OFFSET11]] to ptr
+; CHECK-NEXT:    store ptr inttoptr (i64 -2 to ptr), ptr [[SHADOW_BYTE_2_PTR12]], align 8
+; CHECK-NEXT:    [[SHADOW_BYTE_3_OFFSET13:%.*]] = add i64 [[SHADOW_PTR_INT5]], 24
+; CHECK-NEXT:    [[SHADOW_BYTE_3_PTR14:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET13]] to ptr
+; CHECK-NEXT:    store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR14]], align 8
+; CHECK-NEXT:    [[SHADOW_BYTE_4_OFFSET15:%.*]] = add i64 [[SHADOW_PTR_INT5]], 32
+; CHECK-NEXT:    [[SHADOW_BYTE_4_PTR16:%.*]] = inttoptr i64 [[SHADOW_BYTE_4_OFFSET15]] to ptr
+; CHECK-NEXT:    store ptr inttoptr (i64 -4 to ptr), ptr [[SHADOW_BYTE_4_PTR16]], align 8
+; CHECK-NEXT:    [[SHADOW_BYTE_5_OFFSET17:%.*]] = add i64 [[SHADOW_PTR_INT5]], 40
+; CHECK-NEXT:    [[SHADOW_BYTE_5_PTR18:%.*]] = inttoptr i64 [[SHADOW_BYTE_5_OFFSET17]] to ptr
+; CHECK-NEXT:    store ptr inttoptr (i64 -5 to ptr), ptr [[SHADOW_BYTE_5_PTR18]], align 8
+; CHECK-NEXT:    [[SHADOW_BYTE_6_OFFSET19:%.*]] = add i64 [[SHADOW_PTR_INT5]], 48
+; CHECK-NEXT:    [[SHADOW_BYTE_6_PTR20:%.*]] = inttoptr i64 [[SHADOW_BYTE_6_OFFSET19]] to ptr
+; CHECK-NEXT:    store ptr inttoptr (i64 -6 to ptr), ptr [[SHADOW_BYTE_6_PTR20]], align 8
+; CHECK-NEXT:    [[SHADOW_BYTE_7_OFFSET21:%.*]] = add i64 [[SHADOW_PTR_INT5]], 56
+; CHECK-NEXT:    [[SHADOW_BYTE_7_PTR22:%.*]] = inttoptr i64 [[SHADOW_BYTE_7_OFFSET21]] to ptr
+; CHECK-NEXT:    store ptr inttoptr (i64 -7 to ptr), ptr [[SHADOW_BYTE_7_PTR22]], align 8
+; CHECK-NEXT:    br label [[TMP129:%.*]]
+; CHECK:       128:
+; CHECK-NEXT:    call void @__tysan_check(ptr @__tysan_shadow_memory_address, i32 8, ptr null, i32 1)
+; CHECK-NEXT:    br label [[TMP129]]
+; CHECK:       129:
+; CHECK-NEXT:    br label [[TMP175:%.*]]
+; CHECK:       130:
+; CHECK-NEXT:    [[TMP131:%.*]] = add i64 [[SHADOW_PTR_INT5]], 8
+; CHECK-NEXT:    [[TMP132:%.*]] = inttoptr i64 [[TMP131]] to ptr
+; CHECK-NEXT:    [[TMP133:%.*]] = load ptr, ptr [[TMP132]], align 8
+; CHECK-NEXT:    [[TMP134:%.*]] = ptrtoint ptr [[TMP133]] to i64
+; CHECK-NEXT:    [[TMP135:%.*]] = icmp sge i64 [[TMP134]], 0
+; CHECK-NEXT:    [[TMP136:%.*]] = or i1 false, [[TMP135]]
+; CHECK-NEXT:    [[TMP137:%.*]] = add i64 [[SHADOW_PTR_INT5]], 16
+; CHECK-NEXT:    [[TMP138:%.*]] = inttoptr i64 [[TMP137]] to ptr
+; CHECK-NEXT:    [[TMP139:%.*]] = load ptr, ptr [[TMP138]], align 8
+; CHECK-NEXT:    [[TMP140:%.*]] = ptrtoint ptr [[TMP139]] to i64
+; CHECK-NEXT:    [[TMP141:%.*]] = icmp sge i64 [[TMP140]], 0
+; CHECK-NEXT:    [[TMP142:%.*]] = or i1 [[TMP136]], [[TMP141]]
+; CHECK-NEXT:    [[TMP143:%.*]] = add i64 [[SHADOW_PTR_INT5]], 24
+; CHECK-NEXT:    [[TMP144:%.*]] = inttoptr i64 [[TMP143]] to ptr
+; CHECK-NEXT:    [[TMP145:%.*]] = load ptr, ptr [[TMP144]], align 8
+; CHECK-NEXT:    [[TMP146:%.*]] = ptrtoint ptr [[TMP145]] to i64
+; CHECK-NEXT:    [[TMP147:%.*]] = icmp sge i64 [[TMP146]], 0
+; CHECK-NEXT:    [[TMP148:%.*]] = or i1 [[TMP142]], [[TMP147]]
+; CHECK-NEXT:    [[TMP149:%.*]] = add i64 [[SHADOW_PTR_INT5]], 32
+; CHECK-NEXT:    [[TMP150:%.*]] = inttoptr i64 [[TMP149]] to ptr
+; CHECK-NEXT:    [[TMP151:%.*]] = load ptr, ptr [[TMP150]], align 8
+; CHECK-NEXT:    [[TMP152:%.*]] = ptrtoint ptr [[TMP151]] to i64
+; CHECK-NEXT:    [[TMP153:%.*]] = icmp sge i64 [[TMP152]], 0
+; CHECK-NEXT:    [[TMP154:%.*]] = or i1 [[TMP148]], [[TMP153]]
+; CHECK-NEXT:    [[TMP155:%.*]] = add i64 [[SHADOW_PTR_INT5]], 40
+; CHECK-NEXT:    [[TMP156:%.*]] = inttoptr i64 [[TMP155]] to ptr
+; CHECK-NEXT:    [[TMP157:%.*]] = load ptr, ptr [[TMP156]], align 8
+; CHECK-NEXT:    [[TMP158:%.*]] = ptrtoint ptr [[TMP157]] to i64
+; CHECK-NEXT:    [[TMP159:%.*]] = icmp sge i64 [[TMP158]], 0
+; CHECK-NEXT:    [[TMP160:%.*]] = or i1 [[TMP154]], [[TMP159]]
+; CHECK-NEXT:    [[TMP161:%.*]] = add i64 [[SHADOW_PTR_INT5]], 48
+; CHECK-NEXT:    [[TMP162:%.*]] = inttoptr i64 [[TMP161]] to ptr
+; CHECK-NEXT:    [[TMP163:%.*]] = load ptr, ptr [[TMP162]], align 8
+; CHECK-NEXT:    [[TMP164:%.*]] = ptrtoint ptr [[TMP163]] to i64
+; CHECK-NEXT:    [[TMP165:%.*]] = icmp sge i64 [[TMP164]], 0
+; CHECK-NEXT:    [[TMP166:%.*]] = or i1 [[TMP160]], [[TMP165]]
+; CHECK-NEXT:    [[TMP167:%.*]] = add i64 [[SHADOW_PTR_INT5]], 56
+; CHECK-NEXT:    [[TMP168:%.*]] = inttoptr i64 [[TMP167]] to ptr
+; CHECK-NEXT:    [[TMP169:%.*]] = load ptr, ptr [[TMP168]], align 8
+; CHECK-NEXT:    [[TMP170:%.*]] = ptrtoint ptr [[TMP169]] to i64
+; CHECK-NEXT:    [[TMP171:%.*]] = icmp sge i64 [[TMP170]], 0
+; CHECK-NEXT:    [[TMP172:%.*]] = or i1 [[TMP166]], [[TMP171]]
+; CHECK-NEXT:    br i1 [[TMP172]], label [[TMP173:%.*]], label [[TMP174:%.*]], !prof [[PROF0]]
+; CHECK:       173:
+; CHECK-NEXT:    call void @__tysan_check(ptr @__tysan_shadow_memory_address, i32 8, ptr null, i32 1)
+; CHECK-NEXT:    br label [[TMP174]]
+; CHECK:       174:
+; CHECK-NEXT:    br label [[TMP175]]
+; CHECK:       175:
+; CHECK-NEXT:    [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8
+; CHECK-NEXT:    call void @__tysan_instrument_with_shadow_update(ptr [[A:%.*]], ptr @__tysan_v1___ZTS1v_o_12, i1 true, i64 4, i32 2)
+; CHECK-NEXT:    [[APP_PTR_INT:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[APP_PTR_MASKED23:%.*]] = and i64 [[APP_PTR_INT]], [[APP_MEM_MASK2]]
+; CHECK-NEXT:    [[APP_PTR_SHIFTED24:%.*]] = shl i64 [[APP_PTR_MASKED23]], 3
+; CHECK-NEXT:    [[SHADOW_PTR_INT25:%.*]] = add i64 [[APP_PTR_SHIFTED24]], [[SHADOW_BASE1]]
+; CHECK-NEXT:    [[SHADOW_PTR26:%.*]] = inttoptr i64 [[SHADOW_PTR_INT25]] to ptr
+; CHECK-NEXT:    [[SHADOW_DESC27:%.*]] = load ptr, ptr [[SHADOW_PTR26]], align 8
+; CHECK-NEXT:    [[BAD_DESC28:%.*]] = icmp ne ptr [[SHADOW_DESC27]], @__tysan_v1___ZTS1v_o_12
+; CHECK-NEXT:    br i1 [[BAD_DESC28]], label [[TMP176:%.*]], label [[TMP198:%.*]], !prof [[PROF0]]
+; CHECK:       176:
+; CHECK-NEXT:    [[TMP177:%.*]] = icmp eq ptr [[SHADOW_DESC27]], null
+; CHECK-NEXT:    br i1 [[TMP177]], label [[TMP178:%.*]], label [[TMP196:%.*]]
+; CHECK:       178:
+; CHECK-NEXT:    [[TMP179:%.*]] = add i64 [[SHADOW_PTR_INT25]], 8
+; CHECK-NEXT:    [[TMP180:%.*]] = inttoptr i64 [[TMP179]] to ptr
+; CHECK-NEXT:    [[TMP181:%.*]] = load ptr, ptr [[TMP180]], align 8
+; CHECK-NEXT:    [[TMP182:%.*]] = icmp ne ptr [[TMP181]], null
+; CHECK-NEXT:    [[TMP183:%.*]] = or i1 false, [[TMP182]]
+; CHECK-NEXT:    [[TMP184:%.*]] = add i64 [[SHADOW_PTR_INT25]], 16
+; CHECK-NEXT:    [[TMP185:%.*]] = inttoptr i64 [[TMP184]] to ptr
+; CHECK-NEXT:    [[TMP186:%.*]] = load ptr, ptr [[TMP185]], align 8
+; CHECK-NEXT:    [[TMP187:%.*]] = icmp ne ptr [[TMP186]], null
+; CHECK-NEXT:    [[TMP188:%.*]] = or i1 [[TMP183]], [[TMP187]]
+; CHECK-NEXT:    [[TMP189:%.*]] = add i64 [[SHADOW_PTR_INT25]], 24
+; CHECK-NEXT:    [[TMP190:%.*]] = inttoptr i64 [[TMP189]] to ptr
+; CHECK-NEXT:    [[TMP191:%.*]] = load ptr, ptr [[TMP190]], align 8
+; CHECK-NEXT:    [[TMP192:%.*]] = icmp ne ptr [[TMP191]], null
+; CHECK-NEXT:    [[TMP193:%.*]] = or i1 [[TMP188]], [[TMP192]]
+; CHECK-NEXT:    br i1 [[TMP193]], label [[TMP194:%.*]], label [[TMP195:%.*]], !prof [[PROF0]]
+; CHECK:       194:
+; CHECK-NEXT:    call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1___ZTS1v_o_12, i32 2)
+; CHECK-NEXT:    br label [[TMP195]]
+; CHECK:       195:
+; CHECK-NEXT:    store ptr @__tysan_v1___ZTS1v_o_12, ptr [[SHADOW_PTR26]], align 8
+; CHECK-NEXT:    [[SHADOW_BYTE_1_OFFSET29:%.*]] = add i64 [[SHADOW_PTR_INT25]], 8
+; CHECK-NEXT:    [[SHADOW_BYTE_1_PTR30:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET29]] to ptr
+; CHECK-NEXT:    store ptr inttoptr (i64 -1 to ptr), ptr [[SHADOW_BYTE_1_PTR30]], align 8
+; CHECK-NEXT:    [[SHADOW_BYTE_2_OFFSET31:%.*]] = add i64 [[SHADOW_PTR_INT25]], 16
+; CHECK-NEXT:    [[SHADOW_BYTE_2_PTR32:%.*]] = inttoptr i64 [[SHADOW_BYTE_2_OFFSET31]] to ptr
+; CHECK-NEXT:    store ptr inttoptr (i64 -2 to ptr), ptr [[SHADOW_BYTE_2_PTR32]], align 8
+; CHECK-NEXT:    [[SHADOW_BYTE_3_OFFSET33:%.*]] = add i64 [[SHADOW_PTR_INT25]], 24
+; CHECK-NEXT:    [[SHADOW_BYTE_3_PTR34:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET33]] to ptr
+; CHECK-NEXT:    store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR34]], align 8
+; CHECK-NEXT:    br label [[TMP197:%.*]]
+; CHECK:       196:
+; CHECK-NEXT:    call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1___ZTS1v_o_12, i32 2)
+; CHECK-NEXT:    br label [[TMP197]]
+; CHECK:       197:
+; CHECK-NEXT:    br label [[TMP219:%.*]]
+; CHECK:       198:
+; CHECK-NEXT:    [[TMP199:%.*]] = add i64 [[SHADOW_PTR_INT25]], 8
+; CHECK-NEXT:    [[TMP200:%.*]] = inttoptr i64 [[TMP199]] to ptr
+; CHECK-NEXT:    [[TMP201:%.*]] = load ptr, ptr [[TMP200]], align 8
+; CHECK-NEXT:    [[TMP202:%.*]] = ptrtoint ptr [[TMP201]] to i64
+; CHECK-NEXT:    [[TMP203:%.*]] = icmp sge i64 [[TMP202]], 0
+; CHECK-NEXT:    [[TMP204:%.*]] = or i1 false, [[TMP203]]
+; CHECK-NEXT:    [[TMP205:%.*]] = add i64 [[SHADOW_PTR_INT25]], 16
+; CHECK-NEXT:    [[TMP206:%.*]] = inttoptr i64 [[TMP205]] to ptr
+; CHECK-NEXT:    [[TMP207:%.*]] = load ptr, ptr [[TMP206]], align 8
+; CHECK-NEXT:    [[TMP208:%.*]] = ptrtoint ptr [[TMP207]] to i64
+; CHECK-NEXT:    [[TMP209:%.*]] = icmp sge i64 [[TMP208]], 0
+; CHECK-NEXT:    [[TMP210:%.*]] = or i1 [[TMP204]], [[TMP209]]
+; CHECK-NEXT:    [[TMP211:%.*]] = add i64 [[SHADOW_PTR_INT25]], 24
+; CHECK-NEXT:    [[TMP212:%.*]] = inttoptr i64 [[TMP211]] to ptr
+; CHECK-NEXT:    [[TMP213:%.*]] = load ptr, ptr [[TMP212]], align 8
+; CHECK-NEXT:    [[TMP214:%.*]] = ptrtoint ptr [[TMP213]] to i64
+; CHECK-NEXT:    [[TMP215:%.*]] = icmp sge i64 [[TMP214]], 0
+; CHECK-NEXT:    [[TMP216:%.*]] = or i1 [[TMP210]], [[TMP215]]
+; CHECK-NEXT:    br i1 [[TMP216]], label [[TMP217:%.*]], label [[TMP218:%.*]], !prof [[PROF0]]
+; CHECK:       217:
+; CHECK-NEXT:    call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1___ZTS1v_o_12, i32 2)
+; CHECK-NEXT:    br label [[TMP218]]
+; CHECK:       218:
+; CHECK-NEXT:    br label [[TMP219]]
+; CHECK:       219:
+; CHECK-NEXT:    store i32 42, ptr [[A]], align 4, !tbaa [[TBAA5:![0-9]+]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  store i32 42, ptr %a, align 4, !tbaa !6
+  ret void
+}
+
+!0 = !{!"Simple C++ TBAA"}
+!1 = !{!"omnipotent char", !0, i64 0}
+!2 = !{!"int", !1, i64 0}
+!3 = !{!2, !2, i64 0}
+!4 = !{!"_ZTS1x", !2, i64 0, !2, i64 4}
+!5 = !{!"_ZTS1v", !2, i64 8, !2, i64 12, !4, i64 16}
+!6 = !{!5, !2, i64 12}
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { sanitize_type }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind }
+;.
+; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 100000}
+; CHECK: [[TBAA1]] = !{[[META2:![0-9]+]], [[META2]], i64 0}
+; CHECK: [[META2]] = !{!"int", [[META3:![0-9]+]], i64 0}
+; CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0}
+; CHECK: [[META4]] = !{!"Simple C++ TBAA"}
+; CHECK: [[TBAA5]] = !{[[META6:![0-9]+]], [[META2]], i64 12}
+; CHECK: [[META6]] = !{!"_ZTS1v", [[META2]], i64 8, [[META2]], i64 12, [[META7:![0-9]+]], i64 16}
+; CHECK: [[META7]] = !{!"_ZTS1x", [[META2]], i64 0, [[META2]], i64 4}
+;.
diff --git a/llvm/test/Instrumentation/TypeSanitizer/globals_outlined.ll b/llvm/test/Instrumentation/TypeSanitizer/globals_outlined.ll
new file mode 100644
index 0000000000000..0bd7940467415
--- /dev/null
+++ b/llvm/test/Instrumentation/TypeSanitizer/globals_outlined.ll
@@ -0,0 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --include-generated-funcs
+; RUN: opt -passes='tysan' -tysan-outline-instrumentation -S %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@global1 = global i32 0, align 4
+@global2 = global i32 0, align 4
+
+
+; CHECK-LABEL: define internal void @__tysan_set_globals_types(
+; CHECK-NEXT:     %app.mem.mask = load i64, ptr @__tysan_app_memory_mask, align 8
+; CHECK-NEXT:     %shadow.base = load i64, ptr @__tysan_shadow_memory_address, align 8
+; CHECK-NEXT:     call void @__tysan_set_shadow_type(ptr @global1, ptr @__tysan_v1_int, i64 4)
+; CHECK-NEXT:     call void @__tysan_set_shadow_type(ptr @global1, ptr @__tysan_v1_int, i64 4)
+; CHECK-NEXT:     ret void
+; CHECK-NEXT:   }
+
+!llvm.tysan.globals = !{!13, !14}
+
+!0 = !{!"Simple C++ TBAA"}
+!1 = !{!"omnipotent char", !0, i64 0}
+!2 = !{!"int", !1, i64 0}
+!13 = !{ptr @global1, !2}
+!14 = !{ptr @global1, !2}
diff --git a/llvm/test/MC/AMDGPU/buffer-op-swz-operand.s b/llvm/test/MC/AMDGPU/buffer-op-swz-operand.s
index 8bd91484d149c..4542027b0df90 100644
--- a/llvm/test/MC/AMDGPU/buffer-op-swz-operand.s
+++ b/llvm/test/MC/AMDGPU/buffer-op-swz-operand.s
@@ -2,7 +2,7 @@
 
 // CHECK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1100"
 buffer_load_dwordx4 v[0:3], v0, s[0:3], 0, offen offset:4092 slc
-// CHECK: buffer_load_b128 v[0:3], v0, s[0:3], 0 offen offset:4092 slc ; <MCInst #13135 BUFFER_LOAD_DWORDX4_OFFEN_gfx11
+// CHECK: buffer_load_b128 v[0:3], v0, s[0:3], 0 offen offset:4092 slc ; <MCInst #{{[0-9]+}} BUFFER_LOAD_DWORDX4_OFFEN_gfx11
 // CHECK-NEXT: ;  <MCOperand Reg:10104>
 // CHECK-NEXT: ;  <MCOperand Reg:486>
 // CHECK-NEXT: ;  <MCOperand Reg:7754>
@@ -11,7 +11,7 @@ buffer_load_dwordx4 v[0:3], v0, s[0:3], 0, offen offset:4092 slc
 // CHECK-NEXT: ;  <MCOperand Imm:2>
 // CHECK-NEXT: ;  <MCOperand Imm:0>>
 buffer_store_dword v0, v1, s[0:3], 0 offen slc
-// CHECK: buffer_store_b32 v0, v1, s[0:3], 0 offen slc ; <MCInst #14553 BUFFER_STORE_DWORD_OFFEN_gfx11
+// CHECK: buffer_store_b32 v0, v1, s[0:3], 0 offen slc ; <MCInst #{{[0-9]+}} BUFFER_STORE_DWORD_OFFEN_gfx11
 // CHECK-NEXT: ;  <MCOperand Reg:486>
 // CHECK-NEXT: ;  <MCOperand Reg:487>
 // CHECK-NEXT: ;  <MCOperand Reg:7754>
@@ -22,7 +22,7 @@ buffer_store_dword v0, v1, s[0:3], 0 offen slc
 
 ; tbuffer ops use autogenerate asm parsers
 tbuffer_load_format_xyzw v[0:3], v0, s[0:3], 0 format:[BUF_FMT_32_32_SINT] offen offset:4092 slc
-// CHECK: tbuffer_load_format_xyzw v[0:3], v0, s[0:3], 0 format:[BUF_FMT_32_32_SINT] offen offset:4092 slc ; <MCInst #34095 TBUFFER_LOAD_FORMAT_XYZW_OFFEN_gfx11
+// CHECK: tbuffer_load_format_xyzw v[0:3], v0, s[0:3], 0 format:[BUF_FMT_32_32_SINT] offen offset:4092 slc ; <MCInst #{{[0-9]+}} TBUFFER_LOAD_FORMAT_XYZW_OFFEN_gfx11
 // CHECK-NEXT: ;  <MCOperand Reg:10104>
 // CHECK-NEXT: ;  <MCOperand Reg:486>
 // CHECK-NEXT: ;  <MCOperand Reg:7754>
@@ -32,7 +32,7 @@ tbuffer_load_format_xyzw v[0:3], v0, s[0:3], 0 format:[BUF_FMT_32_32_SINT] offen
 // CHECK-NEXT: ;  <MCOperand Imm:2>
 // CHECK-NEXT: ;  <MCOperand Imm:0>>
 tbuffer_store_d16_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] offen slc
-// CHECK: tbuffer_store_d16_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] offen slc ; <MCInst #34264 TBUFFER_STORE_FORMAT_D16_X_OFFEN_gfx11
+// CHECK: tbuffer_store_d16_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] offen slc ; <MCInst #{{[0-9]+}} TBUFFER_STORE_FORMAT_D16_X_OFFEN_gfx11
 // CHECK-NEXT: ;  <MCOperand Reg:486>
 // CHECK-NEXT: ;  <MCOperand Reg:487>
 // CHECK-NEXT: ;  <MCOperand Reg:7754>
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vimage.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vimage.s
index fec8ba19f93fe..0a480a73cde5b 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vimage.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vimage.s
@@ -2,33 +2,33 @@
 ; RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s
 
 tensor_load_to_lds s[0:3], s[4:11]
-// GFX1250: tensor_load_to_lds s[0:3], s[4:11] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x7c,0x7c]
+// GFX1250: tensor_load_to_lds s[0:3], s[4:11] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x7c,0x7c]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
 
 tensor_load_to_lds s[0:3], s[4:11] th:TH_LOAD_BYPASS scope:SCOPE_SYS
-// GFX1250: tensor_load_to_lds s[0:3], s[4:11] th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x3c,0x00,0x00,0x04,0x7c,0x7c]
+// GFX1250: tensor_load_to_lds s[0:3], s[4:11] th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x3c,0x7c,0x00,0x04,0x7c,0x7c]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
 
 tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19]
-// GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x0c,0x10]
+// GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x0c,0x10]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
 
 tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_LOAD_NT_HT scope:SCOPE_DEV
-// GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x68,0x00,0x00,0x04,0x0c,0x10]
+// GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x68,0x7c,0x00,0x04,0x0c,0x10]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
 
 tensor_store_from_lds s[0:3], s[4:11]
-// GFX1250: tensor_store_from_lds s[0:3], s[4:11] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x7c,0x7c]
+// GFX1250: tensor_store_from_lds s[0:3], s[4:11] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x7c,0x7c]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
 
 tensor_store_from_lds s[0:3], s[4:11] th:TH_STORE_BYPASS scope:SCOPE_SYS
-// GFX1250: tensor_store_from_lds s[0:3], s[4:11] th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x3c,0x00,0x00,0x04,0x7c,0x7c]
+// GFX1250: tensor_store_from_lds s[0:3], s[4:11] th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x3c,0x7c,0x00,0x04,0x7c,0x7c]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
 
 tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19]
-// GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x0c,0x10]
+// GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x0c,0x10]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
 
 tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_STORE_NT_HT scope:SCOPE_DEV
-// GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_STORE_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x68,0x00,0x00,0x04,0x0c,0x10]
+// GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_STORE_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x68,0x7c,0x00,0x04,0x0c,0x10]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s
index d85ea799ed3d7..399a6441629ca 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --unique --version 5
 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -show-encoding -comment-column=0 %s | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-ASM %s
-// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | sed -n 's#.*\(\[0x[0-9a-fx,]\{1,\}\]\)#\1#p' | llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -disassemble -show-encoding -comment-column=0 | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-DIS %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -disassemble -show-encoding -comment-column=0 | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-DIS %s
 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding -comment-column=0 %s | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-ASM %s
-// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | sed -n 's#.*\(\[0x[0-9a-fx,]\{1,\}\]\)#\1#p' | llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding -comment-column=0 | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-DIS %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding -comment-column=0 | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-DIS %s
 
 v_bfrev_b32_e32 v5, v1
 // GFX12: v_bfrev_b32_e32 v5, v1 ; encoding: [0x01,0x71,0x0a,0x7e]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vimage.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vimage.txt
index 9afaa075ea838..800579391d8eb 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vimage.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vimage.txt
@@ -1,25 +1,25 @@
 # RUN: llvm-mc -disassemble -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | FileCheck --check-prefix=GFX1250 %s
 
-# GFX1250: tensor_load_to_lds s[0:3], s[4:11] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x7c,0x7c]
-0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x7c,0x7c
+# GFX1250: tensor_load_to_lds s[0:3], s[4:11] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x7c,0x7c]
+0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x7c,0x7c
 
-# GFX1250: tensor_load_to_lds s[0:3], s[4:11] th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x3c,0x00,0x00,0x04,0x7c,0x7c]
-0x01,0x00,0x71,0xd0,0x00,0x00,0x3c,0x00,0x00,0x04,0x7c,0x7c
+# GFX1250: tensor_load_to_lds s[0:3], s[4:11] th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x3c,0x7c,0x00,0x04,0x7c,0x7c]
+0x01,0x00,0x71,0xd0,0x00,0x00,0x3c,0x7c,0x00,0x04,0x7c,0x7c
 
-# GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x0c,0x10]
-0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x0c,0x10
+# GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x0c,0x10]
+0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x0c,0x10
 
-# GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x68,0x00,0x00,0x04,0x0c,0x10]
-0x01,0x00,0x71,0xd0,0x00,0x00,0x68,0x00,0x00,0x04,0x0c,0x10
+# GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x68,0x7c,0x00,0x04,0x0c,0x10]
+0x01,0x00,0x71,0xd0,0x00,0x00,0x68,0x7c,0x00,0x04,0x0c,0x10
 
-# GFX1250: tensor_store_from_lds s[0:3], s[4:11] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x7c,0x7c]
-0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x7c,0x7c
+# GFX1250: tensor_store_from_lds s[0:3], s[4:11] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x7c,0x7c]
+0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x7c,0x7c
 
-# GFX1250: tensor_store_from_lds s[0:3], s[4:11] th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x3c,0x00,0x00,0x04,0x7c,0x7c]
-0x01,0x40,0x71,0xd0,0x00,0x00,0x3c,0x00,0x00,0x04,0x7c,0x7c
+# GFX1250: tensor_store_from_lds s[0:3], s[4:11] th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x3c,0x7c,0x00,0x04,0x7c,0x7c]
+0x01,0x40,0x71,0xd0,0x00,0x00,0x3c,0x7c,0x00,0x04,0x7c,0x7c
 
-# GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x0c,0x10]
-0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x0c,0x10
+# GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x0c,0x10]
+0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x0c,0x10
 
-# GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_STORE_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x68,0x00,0x00,0x04,0x0c,0x10]
-0x01,0x40,0x71,0xd0,0x00,0x00,0x68,0x00,0x00,0x04,0x0c,0x10
+# GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_STORE_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x68,0x7c,0x00,0x04,0x0c,0x10]
+0x01,0x40,0x71,0xd0,0x00,0x00,0x68,0x7c,0x00,0x04,0x0c,0x10
diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt
index 054489ce51a60..f5cb4b72959f9 100644
--- a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt
+++ b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt
@@ -286,6 +286,9 @@
 #CHECK: xvmulhuh  4, 5, 7
 0xf0,0x85,0x3b,0xd0
 
+#CHECK: mtlpl 3, 4
+0x7c,0x80,0x1a,0x26
+
 #CHECK: xxmulmul 8, 3, 4, 2
 0xed,0x03,0x22,0x08
 
diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt
index 17d1413bacc3a..f0df8ce39021b 100644
--- a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt
+++ b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt
@@ -280,6 +280,9 @@
 #CHECK: xvmulhuh  4, 5, 7
 0xd0,0x3b,0x85,0xf0
 
+#CHECK: mtlpl 3, 4
+0x26,0x1a,0x80,0x7c
+
 #CHECK: xxmulmul 8, 3, 4, 2
 0x08,0x22,0x03,0xed
 
diff --git a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt
index 57e3153da401b..5c2927afbda4c 100755
--- a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt
+++ b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt
@@ -1,70 +1,6 @@
 # RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s -check-prefix=ATT
 # RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s -check-prefix=INTEL
 
-# ATT:   t2rpntlvwz0rs 268435456(%rbp,%r14,8), %tmm6
-# INTEL: t2rpntlvwz0rs tmm6, [rbp + 8*r14 + 268435456]
-0xc4,0xa5,0x78,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10
-
-# ATT:   t2rpntlvwz0rs 291(%r8,%rax,4), %tmm2
-# INTEL: t2rpntlvwz0rs tmm2, [r8 + 4*rax + 291]
-0xc4,0xc5,0x78,0xf8,0x94,0x80,0x23,0x01,0x00,0x00
-
-# ATT:   t2rpntlvwz0rs 64(%rbx), %tmm6
-# INTEL: t2rpntlvwz0rs tmm6, [rbx + 64]
-0xc4,0xe5,0x78,0xf8,0x74,0x23,0x40
-
-# ATT:   t2rpntlvwz0rs -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz0rs tmm2, [2*rbp - 32]
-0xc4,0xe5,0x78,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT:   t2rpntlvwz0rst1 268435456(%rbp,%r14,8), %tmm6
-# INTEL: t2rpntlvwz0rst1 tmm6, [rbp + 8*r14 + 268435456]
-0xc4,0xa5,0x78,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10
-
-# ATT:   t2rpntlvwz0rst1 291(%r8,%rax,4), %tmm2
-# INTEL: t2rpntlvwz0rst1 tmm2, [r8 + 4*rax + 291]
-0xc4,0xc5,0x78,0xf9,0x94,0x80,0x23,0x01,0x00,0x00
-
-# ATT:   t2rpntlvwz0rst1 64(%rbx), %tmm6
-# INTEL: t2rpntlvwz0rst1 tmm6, [rbx + 64]
-0xc4,0xe5,0x78,0xf9,0x74,0x23,0x40
-
-# ATT:   t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz0rst1 tmm2, [2*rbp - 32]
-0xc4,0xe5,0x78,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT:   t2rpntlvwz1rs 268435456(%rbp,%r14,8), %tmm6
-# INTEL: t2rpntlvwz1rs tmm6, [rbp + 8*r14 + 268435456]
-0xc4,0xa5,0x79,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10
-
-# ATT:   t2rpntlvwz1rs 291(%r8,%rax,4), %tmm2
-# INTEL: t2rpntlvwz1rs tmm2, [r8 + 4*rax + 291]
-0xc4,0xc5,0x79,0xf8,0x94,0x80,0x23,0x01,0x00,0x00
-
-# ATT:   t2rpntlvwz1rs 64(%rbx), %tmm6
-# INTEL: t2rpntlvwz1rs tmm6, [rbx + 64]
-0xc4,0xe5,0x79,0xf8,0x74,0x23,0x40
-
-# ATT:   t2rpntlvwz1rs -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz1rs tmm2, [2*rbp - 32]
-0xc4,0xe5,0x79,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT:   t2rpntlvwz1rst1 268435456(%rbp,%r14,8), %tmm6
-# INTEL: t2rpntlvwz1rst1 tmm6, [rbp + 8*r14 + 268435456]
-0xc4,0xa5,0x79,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10
-
-# ATT:   t2rpntlvwz1rst1 291(%r8,%rax,4), %tmm2
-# INTEL: t2rpntlvwz1rst1 tmm2, [r8 + 4*rax + 291]
-0xc4,0xc5,0x79,0xf9,0x94,0x80,0x23,0x01,0x00,0x00
-
-# ATT:   t2rpntlvwz1rst1 64(%rbx), %tmm6
-# INTEL: t2rpntlvwz1rst1 tmm6, [rbx + 64]
-0xc4,0xe5,0x79,0xf9,0x74,0x23,0x40
-
-# ATT:   t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz1rst1 tmm2, [2*rbp - 32]
-0xc4,0xe5,0x79,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff
-
 # ATT:   tileloaddrs 268435456(%rbp,%r14,8), %tmm6
 # INTEL: tileloaddrs tmm6, [rbp + 8*r14 + 268435456]
 0xc4,0xa2,0x7b,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10
@@ -97,70 +33,6 @@
 # INTEL: tileloaddrst1 tmm3, [2*rbp - 32]
 0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff
 
-# ATT:   t2rpntlvwz0rs 268435456(%r16,%r14,8), %tmm6
-# INTEL: t2rpntlvwz0rs tmm6, [r16 + 8*r14 + 268435456]
-0x62,0xbd,0x7c,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10
-
-# ATT:   t2rpntlvwz0rs 291(%r8,%r17,4), %tmm2
-# INTEL: t2rpntlvwz0rs tmm2, [r8 + 4*r17 + 291]
-0x62,0xd5,0x78,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00
-
-# ATT:   t2rpntlvwz0rs 64(%r18), %tmm6
-# INTEL: t2rpntlvwz0rs tmm6, [r18 + 64]
-0x62,0xfd,0x7c,0x08,0xf8,0x74,0x22,0x40
-
-# ATT:   t2rpntlvwz0rs -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz0rs tmm2, [2*rbp - 32]
-0x62,0xf5,0x7c,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT:   t2rpntlvwz0rst1 268435456(%r16,%r14,8), %tmm6
-# INTEL: t2rpntlvwz0rst1 tmm6, [r16 + 8*r14 + 268435456]
-0x62,0xbd,0x7c,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10
-
-# ATT:   t2rpntlvwz0rst1 291(%r8,%r17,4), %tmm2
-# INTEL: t2rpntlvwz0rst1 tmm2, [r8 + 4*r17 + 291]
-0x62,0xd5,0x78,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00
-
-# ATT:   t2rpntlvwz0rst1 64(%r18), %tmm6
-# INTEL: t2rpntlvwz0rst1 tmm6, [r18 + 64]
-0x62,0xfd,0x7c,0x08,0xf9,0x74,0x22,0x40
-
-# ATT:   t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz0rst1 tmm2, [2*rbp - 32]
-0x62,0xf5,0x7c,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT:   t2rpntlvwz1rs 268435456(%r16,%r14,8), %tmm6
-# INTEL: t2rpntlvwz1rs tmm6, [r16 + 8*r14 + 268435456]
-0x62,0xbd,0x7d,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10
-
-# ATT:   t2rpntlvwz1rs 291(%r8,%r17,4), %tmm2
-# INTEL: t2rpntlvwz1rs tmm2, [r8 + 4*r17 + 291]
-0x62,0xd5,0x79,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00
-
-# ATT:   t2rpntlvwz1rs 64(%r18), %tmm6
-# INTEL: t2rpntlvwz1rs tmm6, [r18 + 64]
-0x62,0xfd,0x7d,0x08,0xf8,0x74,0x22,0x40
-
-# ATT:   t2rpntlvwz1rs -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz1rs tmm2, [2*rbp - 32]
-0x62,0xf5,0x7d,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT:   t2rpntlvwz1rst1 268435456(%r16,%r14,8), %tmm6
-# INTEL: t2rpntlvwz1rst1 tmm6, [r16 + 8*r14 + 268435456]
-0x62,0xbd,0x7d,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10
-
-# ATT:   t2rpntlvwz1rst1 291(%r8,%r17,4), %tmm2
-# INTEL: t2rpntlvwz1rst1 tmm2, [r8 + 4*r17 + 291]
-0x62,0xd5,0x79,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00
-
-# ATT:   t2rpntlvwz1rst1 64(%r18), %tmm6
-# INTEL: t2rpntlvwz1rst1 tmm6, [r18 + 64]
-0x62,0xfd,0x7d,0x08,0xf9,0x74,0x22,0x40
-
-# ATT:   t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz1rst1 tmm2, [2*rbp - 32]
-0x62,0xf5,0x7d,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff
-
 # ATT:   tileloaddrs 268435456(%r16,%r14,8), %tmm6
 # INTEL: tileloaddrs tmm6, [r16 + 8*r14 + 268435456]
 0x62,0xba,0x7f,0x08,0x4a,0xb4,0xf0,0x00,0x00,0x00,0x10
diff --git a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-tf32.txt b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-tf32.txt
index f372c42982b1b..347e61cdfc4b8 100644
--- a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-tf32.txt
+++ b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-tf32.txt
@@ -9,11 +9,3 @@
 # INTEL:      tmmultf32ps tmm3, tmm2, tmm1
 0xc4,0xe2,0x71,0x48,0xda
 
-# ATT:      ttmmultf32ps %tmm4, %tmm5, %tmm6
-# INTEL:      ttmmultf32ps tmm6, tmm5, tmm4
-0xc4,0xe2,0x58,0x48,0xf5
-
-# ATT:      ttmmultf32ps %tmm1, %tmm2, %tmm3
-# INTEL:      ttmmultf32ps tmm3, tmm2, tmm1
-0xc4,0xe2,0x70,0x48,0xda
-
diff --git a/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt b/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt
deleted file mode 100644
index d768630ac1475..0000000000000
--- a/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt
+++ /dev/null
@@ -1,154 +0,0 @@
-# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
-# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
-
-# ATT:   t2rpntlvwz0 268435456(%rbp,%r14,8), %tmm4
-# INTEL: t2rpntlvwz0 tmm4, [rbp + 8*r14 + 268435456]
-0xc4,0xa2,0x78,0x6e,0xa4,0xf5,0x00,0x00,0x00,0x10
-
-# ATT:   t2rpntlvwz0 291(%r8,%rax,4), %tmm2
-# INTEL: t2rpntlvwz0 tmm2, [r8 + 4*rax + 291]
-0xc4,0xc2,0x78,0x6e,0x94,0x80,0x23,0x01,0x00,0x00
-
-# ATT:   t2rpntlvwz0 -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz0 tmm2, [2*rbp - 32]
-0xc4,0xe2,0x78,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT:   t2rpntlvwz0t1 268435456(%rbp,%r14,8), %tmm4
-# INTEL: t2rpntlvwz0t1 tmm4, [rbp + 8*r14 + 268435456]
-0xc4,0xa2,0x78,0x6f,0xa4,0xf5,0x00,0x00,0x00,0x10
-
-# ATT:   t2rpntlvwz0t1 291(%r8,%rax,4), %tmm2
-# INTEL: t2rpntlvwz0t1 tmm2, [r8 + 4*rax + 291]
-0xc4,0xc2,0x78,0x6f,0x94,0x80,0x23,0x01,0x00,0x00
-
-# ATT:   t2rpntlvwz0t1 -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz0t1 tmm2, [2*rbp - 32]
-0xc4,0xe2,0x78,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT:   t2rpntlvwz1 268435456(%rbp,%r14,8), %tmm4
-# INTEL: t2rpntlvwz1 tmm4, [rbp + 8*r14 + 268435456]
-0xc4,0xa2,0x79,0x6e,0xa4,0xf5,0x00,0x00,0x00,0x10
-
-# ATT:   t2rpntlvwz1 291(%r8,%rax,4), %tmm2
-# INTEL: t2rpntlvwz1 tmm2, [r8 + 4*rax + 291]
-0xc4,0xc2,0x79,0x6e,0x94,0x80,0x23,0x01,0x00,0x00
-
-# ATT:   t2rpntlvwz1 -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz1 tmm2, [2*rbp - 32]
-0xc4,0xe2,0x79,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT:   t2rpntlvwz1t1 268435456(%rbp,%r14,8), %tmm4
-# INTEL: t2rpntlvwz1t1 tmm4, [rbp + 8*r14 + 268435456]
-0xc4,0xa2,0x79,0x6f,0xa4,0xf5,0x00,0x00,0x00,0x10
-
-# ATT:   t2rpntlvwz1t1 291(%r8,%rax,4), %tmm2
-# INTEL: t2rpntlvwz1t1 tmm2, [r8 + 4*rax + 291]
-0xc4,0xc2,0x79,0x6f,0x94,0x80,0x23,0x01,0x00,0x00
-
-# ATT:   t2rpntlvwz1t1 -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz1t1 tmm2, [2*rbp - 32]
-0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT:   t2rpntlvwz0 268435456(%r16,%r14,8), %tmm4
-# INTEL: t2rpntlvwz0 tmm4, [r16 + 8*r14 + 268435456]
-0x62,0xba,0x7c,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10
-
-# ATT:   t2rpntlvwz0 291(%r8,%r17,4), %tmm2
-# INTEL: t2rpntlvwz0 tmm2, [r8 + 4*r17 + 291]
-0x62,0xd2,0x78,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00
-
-# ATT:   t2rpntlvwz0 -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz0 tmm2, [2*rbp - 32]
-0x62,0xf2,0x7c,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT:   t2rpntlvwz0t1 268435456(%r16,%r14,8), %tmm4
-# INTEL: t2rpntlvwz0t1 tmm4, [r16 + 8*r14 + 268435456]
-0x62,0xba,0x7c,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10
-
-# ATT:   t2rpntlvwz0t1 291(%r8,%r17,4), %tmm2
-# INTEL: t2rpntlvwz0t1 tmm2, [r8 + 4*r17 + 291]
-0x62,0xd2,0x78,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00
-
-# ATT:   t2rpntlvwz0t1 -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz0t1 tmm2, [2*rbp - 32]
-0x62,0xf2,0x7c,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT:   t2rpntlvwz1 268435456(%r16,%r14,8), %tmm4
-# INTEL: t2rpntlvwz1 tmm4, [r16 + 8*r14 + 268435456]
-0x62,0xba,0x7d,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10
-
-# ATT:   t2rpntlvwz1 291(%r8,%r17,4), %tmm2
-# INTEL: t2rpntlvwz1 tmm2, [r8 + 4*r17 + 291]
-0x62,0xd2,0x79,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00
-
-# ATT:   t2rpntlvwz1 -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz1 tmm2, [2*rbp - 32]
-0x62,0xf2,0x7d,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT:   t2rpntlvwz1t1 268435456(%r16,%r14,8), %tmm4
-# INTEL: t2rpntlvwz1t1 tmm4, [r16 + 8*r14 + 268435456]
-0x62,0xba,0x7d,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10
-
-# ATT:   t2rpntlvwz1t1 291(%r8,%r17,4), %tmm2
-# INTEL: t2rpntlvwz1t1 tmm2, [r8 + 4*r17 + 291]
-0x62,0xd2,0x79,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00
-
-# ATT:   t2rpntlvwz1t1 -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz1t1 tmm2, [2*rbp - 32]
-0x62,0xf2,0x7d,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT:   ttransposed %tmm1, %tmm2
-# INTEL: ttransposed tmm2, tmm1
-0xc4,0xe2,0x7a,0x5f,0xd1
-
-# ATT:   ttransposed %tmm2, %tmm3
-# INTEL: ttransposed tmm3, tmm2
-0xc4,0xe2,0x7a,0x5f,0xda
-
-# ATT:   ttdpbf16ps %tmm7, %tmm6, %tmm5
-# INTEL: ttdpbf16ps tmm5, tmm6, tmm7
-0xc4,0xe2,0x42,0x6c,0xee
-
-# ATT:   ttdpbf16ps %tmm1, %tmm2, %tmm3
-# INTEL: ttdpbf16ps tmm3, tmm2, tmm1
-0xc4,0xe2,0x72,0x6c,0xda
-
-# ATT:   ttdpfp16ps %tmm7, %tmm6, %tmm5
-# INTEL: ttdpfp16ps tmm5, tmm6, tmm7
-0xc4,0xe2,0x43,0x6c,0xee
-
-# ATT:   ttdpfp16ps %tmm1, %tmm2, %tmm3
-# INTEL: ttdpfp16ps tmm3, tmm2, tmm1
-0xc4,0xe2,0x73,0x6c,0xda
-
-# ATT:   ttcmmimfp16ps %tmm4, %tmm5, %tmm6
-# INTEL: ttcmmimfp16ps tmm6, tmm5, tmm4
-0xc4,0xe2,0x5b,0x6b,0xf5
-
-# ATT:   ttcmmimfp16ps %tmm1, %tmm2, %tmm3
-# INTEL: ttcmmimfp16ps tmm3, tmm2, tmm1
-0xc4,0xe2,0x73,0x6b,0xda
-
-# ATT:   ttcmmrlfp16ps %tmm4, %tmm5, %tmm6
-# INTEL: ttcmmrlfp16ps tmm6, tmm5, tmm4
-0xc4,0xe2,0x5a,0x6b,0xf5
-
-# ATT:   ttcmmrlfp16ps %tmm1, %tmm2, %tmm3
-# INTEL: ttcmmrlfp16ps tmm3, tmm2, tmm1
-0xc4,0xe2,0x72,0x6b,0xda
-
-# ATT:   tconjtcmmimfp16ps %tmm4, %tmm5, %tmm6
-# INTEL: tconjtcmmimfp16ps tmm6, tmm5, tmm4
-0xc4,0xe2,0x58,0x6b,0xf5
-
-# ATT:   tconjtcmmimfp16ps %tmm1, %tmm2, %tmm3
-# INTEL: tconjtcmmimfp16ps tmm3, tmm2, tmm1
-0xc4,0xe2,0x70,0x6b,0xda
-
-# ATT:   tconjtfp16 %tmm5, %tmm6
-# INTEL: tconjtfp16 tmm6, tmm5
-0xc4,0xe2,0x79,0x6b,0xf5
-
-# ATT:   tconjtfp16 %tmm2, %tmm3
-# INTEL: tconjtfp16 tmm3, tmm2
-0xc4,0xe2,0x79,0x6b,0xda
diff --git a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s
index e5bc1f47bf666..bc0683e38887c 100644
--- a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s
+++ b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s
@@ -403,6 +403,10 @@
 #CHECK-BE: xvmulhuh 4, 5, 7              # encoding: [0xf0,0x85,0x3b,0xd0]
 #CHECK-LE: xvmulhuh 4, 5, 7              # encoding: [0xd0,0x3b,0x85,0xf0]
 
+           mtlpl 3, 4
+#CHECK-BE: mtlpl 3, 4                     # encoding: [0x7c,0x80,0x1a,0x26]
+#CHECK-LE: mtlpl 3, 4                     # encoding: [0x26,0x1a,0x80,0x7c]
+
            xxmulmul 8, 3, 4, 2
 #CHECK-BE: xxmulmul 8, 3, 4, 2          # encoding: [0xed,0x03,0x22,0x08]
 #CHECK-LE: xxmulmul 8, 3, 4, 2          # encoding: [0x08,0x22,0x03,0xed]
diff --git a/llvm/test/MC/RISCV/xqcili-linker-relaxation.s b/llvm/test/MC/RISCV/xqcili-linker-relaxation.s
new file mode 100644
index 0000000000000..ace677979ee13
--- /dev/null
+++ b/llvm/test/MC/RISCV/xqcili-linker-relaxation.s
@@ -0,0 +1,37 @@
+# RUN: llvm-mc --triple=riscv32 -mattr=+relax,+experimental-xqcili \
+# RUN:    %s -filetype=obj -o - -riscv-add-build-attributes \
+# RUN:    | llvm-objdump -dr -M no-aliases - \
+# RUN:    | FileCheck %s
+
+## This tests that we correctly emit relocations for linker relaxation when
+## emitting `QC.E.LI` and `QC.LI`.
+
+  .section .text.ex1, "ax", @progbits
+# CHECK-LABEL: <.text.ex1>:
+  blez    a1, .L1
+# CHECK-NEXT: bge zero, a1, 0x0 <.text.ex1>
+# CHECK-NEXT: R_RISCV_BRANCH .L1{{$}}
+  qc.e.li a0, sym
+# CHECK-NEXT: qc.e.li a0, 0x0
+# CHECK-NEXT: R_RISCV_VENDOR QUALCOMM{{$}}
+# CHECK-NEXT: R_RISCV_CUSTOM194 sym{{$}}
+# CHECK-NEXT: R_RISCV_RELAX *ABS*{{$}}
+.L1:
+# CHECK: <.L1>:
+  ret
+# CHECK-NEXT: c.jr ra
+
+  .section .text.ex2, "ax", @progbits
+# CHECK-LABEL: <.text.ex2>:
+  blez    a1, .L2
+# CHECK-NEXT: bge zero, a1, 0x0 <.text.ex2>
+# CHECK-NEXT: R_RISCV_BRANCH .L2{{$}}
+  qc.li a0,  %qc.abs20(sym)
+# CHECK-NEXT: qc.li a0, 0x0
+# CHECK-NEXT: R_RISCV_VENDOR QUALCOMM{{$}}
+# CHECK-NEXT: R_RISCV_CUSTOM192 sym{{$}}
+# CHECK-NEXT: R_RISCV_RELAX *ABS*{{$}}
+.L2:
+# CHECK: <.L2>:
+  ret
+# CHECK-NEXT: c.jr ra
diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s
index 92db672e1c82d..497a1c6b7bad5 100755
--- a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s
+++ b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s
@@ -1,69 +1,5 @@
 // RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s
 
-// CHECK: t2rpntlvwz0rs 268435456(%rbp,%r14,8), %tmm6
-// CHECK: encoding: [0xc4,0xa5,0x78,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10]
-          t2rpntlvwz0rs 268435456(%rbp,%r14,8), %tmm6
-
-// CHECK: t2rpntlvwz0rs 291(%r8,%rax,4), %tmm2
-// CHECK: encoding: [0xc4,0xc5,0x78,0xf8,0x94,0x80,0x23,0x01,0x00,0x00]
-          t2rpntlvwz0rs 291(%r8,%rax,4), %tmm2
-
-// CHECK: t2rpntlvwz0rs 64(%rbx), %tmm6
-// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x74,0x23,0x40]
-          t2rpntlvwz0rs 64(%rbx), %tmm6
-
-// CHECK: t2rpntlvwz0rs -32(,%rbp,2), %tmm2
-// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff]
-          t2rpntlvwz0rs -32(,%rbp,2), %tmm2
-
-// CHECK: t2rpntlvwz0rst1 268435456(%rbp,%r14,8), %tmm6
-// CHECK: encoding: [0xc4,0xa5,0x78,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10]
-          t2rpntlvwz0rst1 268435456(%rbp,%r14,8), %tmm6
-
-// CHECK: t2rpntlvwz0rst1 291(%r8,%rax,4), %tmm2
-// CHECK: encoding: [0xc4,0xc5,0x78,0xf9,0x94,0x80,0x23,0x01,0x00,0x00]
-          t2rpntlvwz0rst1 291(%r8,%rax,4), %tmm2
-
-// CHECK: t2rpntlvwz0rst1 64(%rbx), %tmm6
-// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x74,0x23,0x40]
-          t2rpntlvwz0rst1 64(%rbx), %tmm6
-
-// CHECK: t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2
-// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff]
-          t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2
-
-// CHECK: t2rpntlvwz1rs 268435456(%rbp,%r14,8), %tmm6
-// CHECK: encoding: [0xc4,0xa5,0x79,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10]
-          t2rpntlvwz1rs 268435456(%rbp,%r14,8), %tmm6
-
-// CHECK: t2rpntlvwz1rs 291(%r8,%rax,4), %tmm2
-// CHECK: encoding: [0xc4,0xc5,0x79,0xf8,0x94,0x80,0x23,0x01,0x00,0x00]
-          t2rpntlvwz1rs 291(%r8,%rax,4), %tmm2
-
-// CHECK: t2rpntlvwz1rs 64(%rbx), %tmm6
-// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x74,0x23,0x40]
-          t2rpntlvwz1rs 64(%rbx), %tmm6
-
-// CHECK: t2rpntlvwz1rs -32(,%rbp,2), %tmm2
-// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff]
-          t2rpntlvwz1rs -32(,%rbp,2), %tmm2
-
-// CHECK: t2rpntlvwz1rst1 268435456(%rbp,%r14,8), %tmm6
-// CHECK: encoding: [0xc4,0xa5,0x79,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10]
-          t2rpntlvwz1rst1 268435456(%rbp,%r14,8), %tmm6
-
-// CHECK: t2rpntlvwz1rst1 291(%r8,%rax,4), %tmm2
-// CHECK: encoding: [0xc4,0xc5,0x79,0xf9,0x94,0x80,0x23,0x01,0x00,0x00]
-          t2rpntlvwz1rst1 291(%r8,%rax,4), %tmm2
-
-// CHECK: t2rpntlvwz1rst1 64(%rbx), %tmm6
-// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x74,0x23,0x40]
-          t2rpntlvwz1rst1 64(%rbx), %tmm6
-
-// CHECK: t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2
-// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff]
-          t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2
-
 // CHECK: tileloaddrs 268435456(%rbp,%r14,8), %tmm6
 // CHECK: encoding: [0xc4,0xa2,0x7b,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10]
           tileloaddrs 268435456(%rbp,%r14,8), %tmm6
@@ -88,70 +24,6 @@
 // CHECK: encoding: [0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff]
           tileloaddrst1 -32(,%rbp,2), %tmm3
 
-// CHECK: t2rpntlvwz0rs 268435456(%r16,%r14,8), %tmm6
-// CHECK: encoding: [0x62,0xbd,0x7c,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10]
-          t2rpntlvwz0rs 268435456(%r16,%r14,8), %tmm6
-
-// CHECK: t2rpntlvwz0rs   291(%r8,%r17,4), %tmm2
-// CHECK: encoding: [0x62,0xd5,0x78,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00]
-          t2rpntlvwz0rs 291(%r8,%r17,4), %tmm2
-
-// CHECK: t2rpntlvwz0rs   64(%r18), %tmm6
-// CHECK: encoding: [0x62,0xfd,0x7c,0x08,0xf8,0x74,0x22,0x40]
-          t2rpntlvwz0rs 64(%r18), %tmm6
-
-// CHECK: {evex}  t2rpntlvwz0rs   -32(,%rbp,2), %tmm2
-// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff]
-          {evex} t2rpntlvwz0rs -32(,%rbp,2), %tmm2
-
-// CHECK: t2rpntlvwz0rst1 268435456(%r16,%r14,8), %tmm6
-// CHECK: encoding: [0x62,0xbd,0x7c,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10]
-          t2rpntlvwz0rst1 268435456(%r16,%r14,8), %tmm6
-
-// CHECK: t2rpntlvwz0rst1   291(%r8,%r17,4), %tmm2
-// CHECK: encoding: [0x62,0xd5,0x78,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00]
-          t2rpntlvwz0rst1 291(%r8,%r17,4), %tmm2
-
-// CHECK: t2rpntlvwz0rst1   64(%r18), %tmm6
-// CHECK: encoding: [0x62,0xfd,0x7c,0x08,0xf9,0x74,0x22,0x40]
-          t2rpntlvwz0rst1 64(%r18), %tmm6
-
-// CHECK: {evex}  t2rpntlvwz0rst1   -32(,%rbp,2), %tmm2
-// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff]
-          {evex} t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2
-
-// CHECK: t2rpntlvwz1rs 268435456(%r16,%r14,8), %tmm6
-// CHECK: encoding: [0x62,0xbd,0x7d,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10]
-          t2rpntlvwz1rs 268435456(%r16,%r14,8), %tmm6
-
-// CHECK: t2rpntlvwz1rs   291(%r8,%r17,4), %tmm2
-// CHECK: encoding: [0x62,0xd5,0x79,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00]
-          t2rpntlvwz1rs 291(%r8,%r17,4), %tmm2
-
-// CHECK: t2rpntlvwz1rs   64(%r18), %tmm6
-// CHECK: encoding: [0x62,0xfd,0x7d,0x08,0xf8,0x74,0x22,0x40]
-          t2rpntlvwz1rs 64(%r18), %tmm6
-
-// CHECK: {evex}  t2rpntlvwz1rs   -32(,%rbp,2), %tmm2
-// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff]
-          {evex} t2rpntlvwz1rs -32(,%rbp,2), %tmm2
-
-// CHECK: t2rpntlvwz1rst1 268435456(%r16,%r14,8), %tmm6
-// CHECK: encoding: [0x62,0xbd,0x7d,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10]
-          t2rpntlvwz1rst1 268435456(%r16,%r14,8), %tmm6
-
-// CHECK: t2rpntlvwz1rst1   291(%r8,%r17,4), %tmm2
-// CHECK: encoding: [0x62,0xd5,0x79,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00]
-          t2rpntlvwz1rst1 291(%r8,%r17,4), %tmm2
-
-// CHECK: t2rpntlvwz1rst1   64(%r18), %tmm6
-// CHECK: encoding: [0x62,0xfd,0x7d,0x08,0xf9,0x74,0x22,0x40]
-          t2rpntlvwz1rst1 64(%r18), %tmm6
-
-// CHECK: {evex}  t2rpntlvwz1rst1   -32(,%rbp,2), %tmm2
-// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff]
-          {evex} t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2
-
 // CHECK: tileloaddrs     291(%r16,%rax,4), %tmm3
 // CHECK: encoding: [0x62,0xfa,0x7f,0x08,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00]
           tileloaddrs 291(%r16,%rax,4), %tmm3
diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s
index 140d1aa6b198e..0e030ca415a16 100755
--- a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s
+++ b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s
@@ -1,69 +1,5 @@
 // RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding  %s | FileCheck %s
 
-// CHECK: t2rpntlvwz0rs tmm6, [rbp + 8*r14 + 268435456]
-// CHECK: encoding: [0xc4,0xa5,0x78,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10]
-          t2rpntlvwz0rs tmm6, [rbp + 8*r14 + 268435456]
-
-// CHECK: t2rpntlvwz0rs tmm2, [r8 + 4*rax + 291]
-// CHECK: encoding: [0xc4,0xc5,0x78,0xf8,0x94,0x80,0x23,0x01,0x00,0x00]
-          t2rpntlvwz0rs tmm2, [r8 + 4*rax + 291]
-
-// CHECK: t2rpntlvwz0rs tmm6, [rbx + 64]
-// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x74,0x23,0x40]
-          t2rpntlvwz0rs tmm6, [rbx + 64]
-
-// CHECK: t2rpntlvwz0rs tmm2, [2*rbp - 32]
-// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff]
-          t2rpntlvwz0rs tmm2, [2*rbp - 32]
-
-// CHECK: t2rpntlvwz0rst1 tmm6, [rbp + 8*r14 + 268435456]
-// CHECK: encoding: [0xc4,0xa5,0x78,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10]
-          t2rpntlvwz0rst1 tmm6, [rbp + 8*r14 + 268435456]
-
-// CHECK: t2rpntlvwz0rst1 tmm2, [r8 + 4*rax + 291]
-// CHECK: encoding: [0xc4,0xc5,0x78,0xf9,0x94,0x80,0x23,0x01,0x00,0x00]
-          t2rpntlvwz0rst1 tmm2, [r8 + 4*rax + 291]
-
-// CHECK: t2rpntlvwz0rst1 tmm6, [rbx + 64]
-// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x74,0x23,0x40]
-          t2rpntlvwz0rst1 tmm6, [rbx + 64]
-
-// CHECK: t2rpntlvwz0rst1 tmm2, [2*rbp - 32]
-// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff]
-          t2rpntlvwz0rst1 tmm2, [2*rbp - 32]
-
-// CHECK: t2rpntlvwz1rs tmm6, [rbp + 8*r14 + 268435456]
-// CHECK: encoding: [0xc4,0xa5,0x79,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10]
-          t2rpntlvwz1rs tmm6, [rbp + 8*r14 + 268435456]
-
-// CHECK: t2rpntlvwz1rs tmm2, [r8 + 4*rax + 291]
-// CHECK: encoding: [0xc4,0xc5,0x79,0xf8,0x94,0x80,0x23,0x01,0x00,0x00]
-          t2rpntlvwz1rs tmm2, [r8 + 4*rax + 291]
-
-// CHECK: t2rpntlvwz1rs tmm6, [rbx + 64]
-// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x74,0x23,0x40]
-          t2rpntlvwz1rs tmm6, [rbx + 64]
-
-// CHECK: t2rpntlvwz1rs tmm2, [2*rbp - 32]
-// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff]
-          t2rpntlvwz1rs tmm2, [2*rbp - 32]
-
-// CHECK: t2rpntlvwz1rst1 tmm6, [rbp + 8*r14 + 268435456]
-// CHECK: encoding: [0xc4,0xa5,0x79,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10]
-          t2rpntlvwz1rst1 tmm6, [rbp + 8*r14 + 268435456]
-
-// CHECK: t2rpntlvwz1rst1 tmm2, [r8 + 4*rax + 291]
-// CHECK: encoding: [0xc4,0xc5,0x79,0xf9,0x94,0x80,0x23,0x01,0x00,0x00]
-          t2rpntlvwz1rst1 tmm2, [r8 + 4*rax + 291]
-
-// CHECK: t2rpntlvwz1rst1 tmm6, [rbx + 64]
-// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x74,0x23,0x40]
-          t2rpntlvwz1rst1 tmm6, [rbx + 64]
-
-// CHECK: t2rpntlvwz1rst1 tmm2, [2*rbp - 32]
-// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff]
-          t2rpntlvwz1rst1 tmm2, [2*rbp - 32]
-
 // CHECK: tileloaddrs tmm6, [rbp + 8*r14 + 268435456]
 // CHECK: encoding: [0xc4,0xa2,0x7b,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10]
           tileloaddrs tmm6, [rbp + 8*r14 + 268435456]
@@ -96,70 +32,6 @@
 // CHECK: encoding: [0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff]
           tileloaddrst1 tmm3, [2*rbp - 32]
 
-// CHECK: t2rpntlvwz0rs tmm6, [r16 + 8*r14 + 268435456]
-// CHECK: encoding: [0x62,0xbd,0x7c,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10]
-          t2rpntlvwz0rs tmm6, [r16 + 8*r14 + 268435456]
-
-// CHECK: t2rpntlvwz0rs tmm2, [r8 + 4*r17 + 291]
-// CHECK: encoding: [0x62,0xd5,0x78,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00]
-          t2rpntlvwz0rs tmm2, [r8 + 4*r17 + 291]
-
-// CHECK: t2rpntlvwz0rs tmm6, [r18 + 64]
-// CHECK: encoding: [0x62,0xfd,0x7c,0x08,0xf8,0x74,0x22,0x40]
-          t2rpntlvwz0rs tmm6, [r18 + 64]
-
-// CHECK: {evex} t2rpntlvwz0rs tmm2, [2*rbp - 32]
-// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff]
-          {evex} t2rpntlvwz0rs tmm2, [2*rbp - 32]
-
-// CHECK: t2rpntlvwz0rst1 tmm6, [r16 + 8*r14 + 268435456]
-// CHECK: encoding: [0x62,0xbd,0x7c,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10]
-          t2rpntlvwz0rst1 tmm6, [r16 + 8*r14 + 268435456]
-
-// CHECK: t2rpntlvwz0rst1 tmm2, [r8 + 4*r17 + 291]
-// CHECK: encoding: [0x62,0xd5,0x78,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00]
-          t2rpntlvwz0rst1 tmm2, [r8 + 4*r17 + 291]
-
-// CHECK: t2rpntlvwz0rst1 tmm6, [r18 + 64]
-// CHECK: encoding: [0x62,0xfd,0x7c,0x08,0xf9,0x74,0x22,0x40]
-          t2rpntlvwz0rst1 tmm6, [r18 + 64]
-
-// CHECK: {evex} t2rpntlvwz0rst1 tmm2, [2*rbp - 32]
-// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff]
-          {evex} t2rpntlvwz0rst1 tmm2, [2*rbp - 32]
-
-// CHECK: t2rpntlvwz1rs tmm6, [r16 + 8*r14 + 268435456]
-// CHECK: encoding: [0x62,0xbd,0x7d,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10]
-          t2rpntlvwz1rs tmm6, [r16 + 8*r14 + 268435456]
-
-// CHECK: t2rpntlvwz1rs tmm2, [r8 + 4*r17 + 291]
-// CHECK: encoding: [0x62,0xd5,0x79,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00]
-          t2rpntlvwz1rs tmm2, [r8 + 4*r17 + 291]
-
-// CHECK: t2rpntlvwz1rs tmm6, [r18 + 64]
-// CHECK: encoding: [0x62,0xfd,0x7d,0x08,0xf8,0x74,0x22,0x40]
-          t2rpntlvwz1rs tmm6, [r18 + 64]
-
-// CHECK: {evex} t2rpntlvwz1rs tmm2, [2*rbp - 32]
-// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff]
-          {evex} t2rpntlvwz1rs tmm2, [2*rbp - 32]
-
-// CHECK: t2rpntlvwz1rst1 tmm6, [r16 + 8*r14 + 268435456]
-// CHECK: encoding: [0x62,0xbd,0x7d,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10]
-          t2rpntlvwz1rst1 tmm6, [r16 + 8*r14 + 268435456]
-
-// CHECK: t2rpntlvwz1rst1 tmm2, [r8 + 4*r17 + 291]
-// CHECK: encoding: [0x62,0xd5,0x79,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00]
-          t2rpntlvwz1rst1 tmm2, [r8 + 4*r17 + 291]
-
-// CHECK: t2rpntlvwz1rst1 tmm6, [r18 + 64]
-// CHECK: encoding: [0x62,0xfd,0x7d,0x08,0xf9,0x74,0x22,0x40]
-          t2rpntlvwz1rst1 tmm6, [r18 + 64]
-
-// CHECK: {evex} t2rpntlvwz1rst1 tmm2, [2*rbp - 32]
-// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff]
-          {evex} t2rpntlvwz1rst1 tmm2, [2*rbp - 32]
-
 // CHECK: tileloaddrs     tmm6, [r16 + 8*r14 + 268435456]
 // CHECK: encoding: [0x62,0xba,0x7f,0x08,0x4a,0xb4,0xf0,0x00,0x00,0x00,0x10]
           tileloaddrs tmm6, [r16 + 8*r14 + 268435456]
diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-tf32-att.s b/llvm/test/MC/X86/AMX/x86-64-amx-tf32-att.s
index b413597cd9da7..d1d0997b7eec0 100644
--- a/llvm/test/MC/X86/AMX/x86-64-amx-tf32-att.s
+++ b/llvm/test/MC/X86/AMX/x86-64-amx-tf32-att.s
@@ -8,10 +8,3 @@
 // CHECK: encoding: [0xc4,0xe2,0x71,0x48,0xda]
                tmmultf32ps %tmm1, %tmm2, %tmm3
 
-// CHECK:      ttmmultf32ps %tmm4, %tmm5, %tmm6
-// CHECK: encoding: [0xc4,0xe2,0x58,0x48,0xf5]
-               ttmmultf32ps %tmm4, %tmm5, %tmm6
-
-// CHECK:      ttmmultf32ps %tmm1, %tmm2, %tmm3
-// CHECK: encoding: [0xc4,0xe2,0x70,0x48,0xda]
-               ttmmultf32ps %tmm1, %tmm2, %tmm3
diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-tf32-intel.s b/llvm/test/MC/X86/AMX/x86-64-amx-tf32-intel.s
index 98f55275716eb..b6c0947ee750c 100644
--- a/llvm/test/MC/X86/AMX/x86-64-amx-tf32-intel.s
+++ b/llvm/test/MC/X86/AMX/x86-64-amx-tf32-intel.s
@@ -8,10 +8,3 @@
 // CHECK: encoding: [0xc4,0xe2,0x71,0x48,0xda]
                tmmultf32ps tmm3, tmm2, tmm1
 
-// CHECK:      ttmmultf32ps tmm6, tmm5, tmm4
-// CHECK: encoding: [0xc4,0xe2,0x58,0x48,0xf5]
-               ttmmultf32ps tmm6, tmm5, tmm4
-
-// CHECK:      ttmmultf32ps tmm3, tmm2, tmm1
-// CHECK: encoding: [0xc4,0xe2,0x70,0x48,0xda]
-               ttmmultf32ps tmm3, tmm2, tmm1
diff --git a/llvm/test/MC/X86/amx-transpose-att.s b/llvm/test/MC/X86/amx-transpose-att.s
deleted file mode 100644
index 5158470f8c905..0000000000000
--- a/llvm/test/MC/X86/amx-transpose-att.s
+++ /dev/null
@@ -1,153 +0,0 @@
-// RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s
-
-// CHECK: t2rpntlvwz0     268435456(%rbp,%r14,8), %tmm4
-// CHECK: encoding: [0xc4,0xa2,0x78,0x6e,0xa4,0xf5,0x00,0x00,0x00,0x10]
-          t2rpntlvwz0 268435456(%rbp,%r14,8), %tmm4
-
-// CHECK: t2rpntlvwz0     291(%r8,%rax,4), %tmm2
-// CHECK: encoding: [0xc4,0xc2,0x78,0x6e,0x94,0x80,0x23,0x01,0x00,0x00]
-          t2rpntlvwz0 291(%r8,%rax,4), %tmm2
-
-// CHECK: t2rpntlvwz0     -32(,%rbp,2), %tmm2
-// CHECK: encoding: [0xc4,0xe2,0x78,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff]
-          t2rpntlvwz0 -32(,%rbp,2), %tmm2
-
-// CHECK: t2rpntlvwz0t1     268435456(%rbp,%r14,8), %tmm4
-// CHECK: encoding: [0xc4,0xa2,0x78,0x6f,0xa4,0xf5,0x00,0x00,0x00,0x10]
-          t2rpntlvwz0t1 268435456(%rbp,%r14,8), %tmm5
-
-// CHECK: t2rpntlvwz0t1     291(%r8,%rax,4), %tmm2
-// CHECK: encoding: [0xc4,0xc2,0x78,0x6f,0x94,0x80,0x23,0x01,0x00,0x00]
-          t2rpntlvwz0t1 291(%r8,%rax,4), %tmm2
-
-// CHECK: t2rpntlvwz0t1     -32(,%rbp,2), %tmm2
-// CHECK: encoding: [0xc4,0xe2,0x78,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff]
-          t2rpntlvwz0t1 -32(,%rbp,2), %tmm2
-
-// CHECK: t2rpntlvwz1     268435456(%rbp,%r14,8), %tmm4
-// CHECK: encoding: [0xc4,0xa2,0x79,0x6e,0xa4,0xf5,0x00,0x00,0x00,0x10]
-          t2rpntlvwz1 268435456(%rbp,%r14,8), %tmm5
-
-// CHECK: t2rpntlvwz1     291(%r8,%rax,4), %tmm2
-// CHECK: encoding: [0xc4,0xc2,0x79,0x6e,0x94,0x80,0x23,0x01,0x00,0x00]
-          t2rpntlvwz1 291(%r8,%rax,4), %tmm2
-
-// CHECK: t2rpntlvwz1     -32(,%rbp,2), %tmm2
-// CHECK: encoding: [0xc4,0xe2,0x79,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff]
-          t2rpntlvwz1 -32(,%rbp,2), %tmm2
-
-// CHECK: t2rpntlvwz1t1     268435456(%rbp,%r14,8), %tmm2
-// CHECK: encoding: [0xc4,0xa2,0x79,0x6f,0x94,0xf5,0x00,0x00,0x00,0x10]
-          t2rpntlvwz1t1 268435456(%rbp,%r14,8), %tmm3
-
-// CHECK: t2rpntlvwz1t1     291(%r8,%rax,4), %tmm2
-// CHECK: encoding: [0xc4,0xc2,0x79,0x6f,0x94,0x80,0x23,0x01,0x00,0x00]
-          t2rpntlvwz1t1 291(%r8,%rax,4), %tmm2
-
-// CHECK: t2rpntlvwz1t1     -32(,%rbp,2), %tmm2
-// CHECK: encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff]
-          t2rpntlvwz1t1 -32(,%rbp,2), %tmm2
-
-// CHECK: t2rpntlvwz0     268435456(%r16,%r14,8), %tmm4
-// CHECK: encoding: [0x62,0xba,0x7c,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10]
-          t2rpntlvwz0 268435456(%r16,%r14,8), %tmm4
-
-// CHECK: t2rpntlvwz0     291(%r8,%r17,4), %tmm2
-// CHECK: encoding: [0x62,0xd2,0x78,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00]
-          t2rpntlvwz0 291(%r8,%r17,4), %tmm2
-
-// CHECK: {evex}  t2rpntlvwz0     -32(,%rbp,2), %tmm2
-// CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff]
-          {evex} t2rpntlvwz0 -32(,%rbp,2), %tmm2
-
-// CHECK: t2rpntlvwz0t1     268435456(%r16,%r14,8), %tmm4
-// CHECK: encoding: [0x62,0xba,0x7c,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10]
-          t2rpntlvwz0t1 268435456(%r16,%r14,8), %tmm4
-
-// CHECK: t2rpntlvwz0t1     291(%r8,%r17,4), %tmm2
-// CHECK: encoding: [0x62,0xd2,0x78,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00]
-          t2rpntlvwz0t1 291(%r8,%r17,4), %tmm2
-
-// CHECK: {evex}  t2rpntlvwz0t1     -32(,%rbp,2), %tmm2
-// CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff]
-          {evex} t2rpntlvwz0t1 -32(,%rbp,2), %tmm2
-
-// CHECK: t2rpntlvwz1     268435456(%r16,%r14,8), %tmm4
-// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10]
-          t2rpntlvwz1 268435456(%r16,%r14,8), %tmm4
-
-// CHECK: t2rpntlvwz1     291(%r8,%r17,4), %tmm2
-// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00]
-          t2rpntlvwz1 291(%r8,%r17,4), %tmm2
-
-// CHECK: {evex}  t2rpntlvwz1     -32(,%rbp,2), %tmm2
-// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff]
-          {evex} t2rpntlvwz1 -32(,%rbp,2), %tmm2
-
-// CHECK: t2rpntlvwz1t1     268435456(%r16,%r14,8), %tmm4
-// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10]
-          t2rpntlvwz1t1 268435456(%r16,%r14,8), %tmm4
-
-// CHECK: t2rpntlvwz1t1     291(%r8,%r17,4), %tmm2
-// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00]
-          t2rpntlvwz1t1 291(%r8,%r17,4), %tmm2
-
-// CHECK: {evex}  t2rpntlvwz1t1     -32(,%rbp,2), %tmm2
-// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff]
-          {evex} t2rpntlvwz1t1 -32(,%rbp,2), %tmm2
-
-// CHECK: ttransposed     %tmm1, %tmm5
-// CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xe9]
-          ttransposed %tmm1, %tmm5
-
-// CHECK: ttransposed     %tmm2, %tmm3
-// CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xda]
-          ttransposed %tmm2, %tmm3
-
-// CHECK: ttdpbf16ps     %tmm1, %tmm2, %tmm5
-// CHECK: encoding: [0xc4,0xe2,0x72,0x6c,0xea]
-          ttdpbf16ps %tmm1, %tmm2, %tmm5
-
-// CHECK: ttdpbf16ps     %tmm1, %tmm2, %tmm3
-// CHECK: encoding: [0xc4,0xe2,0x72,0x6c,0xda]
-          ttdpbf16ps %tmm1, %tmm2, %tmm3
-
-// CHECK: ttdpfp16ps     %tmm3, %tmm4, %tmm5
-// CHECK: encoding: [0xc4,0xe2,0x63,0x6c,0xec]
-          ttdpfp16ps %tmm3, %tmm4, %tmm5
-
-// CHECK: ttdpfp16ps     %tmm1, %tmm2, %tmm3
-// CHECK: encoding: [0xc4,0xe2,0x73,0x6c,0xda]
-          ttdpfp16ps %tmm1, %tmm2, %tmm3
-
-// CHECK: ttcmmimfp16ps %tmm4, %tmm5, %tmm6
-// CHECK: encoding: [0xc4,0xe2,0x5b,0x6b,0xf5]
-          ttcmmimfp16ps %tmm4, %tmm5, %tmm6
-
-// CHECK: ttcmmimfp16ps %tmm1, %tmm2, %tmm3
-// CHECK: encoding: [0xc4,0xe2,0x73,0x6b,0xda]
-          ttcmmimfp16ps %tmm1, %tmm2, %tmm3
-
-// CHECK: ttcmmrlfp16ps %tmm4, %tmm5, %tmm6
-// CHECK: encoding: [0xc4,0xe2,0x5a,0x6b,0xf5]
-          ttcmmrlfp16ps %tmm4, %tmm5, %tmm6
-
-// CHECK: ttcmmrlfp16ps %tmm1, %tmm2, %tmm3
-// CHECK: encoding: [0xc4,0xe2,0x72,0x6b,0xda]
-          ttcmmrlfp16ps %tmm1, %tmm2, %tmm3
-
-// CHECK: tconjtcmmimfp16ps %tmm4, %tmm5, %tmm6
-// CHECK: encoding: [0xc4,0xe2,0x58,0x6b,0xf5]
-          tconjtcmmimfp16ps %tmm4, %tmm5, %tmm6
-
-// CHECK: tconjtcmmimfp16ps %tmm1, %tmm2, %tmm3
-// CHECK: encoding: [0xc4,0xe2,0x70,0x6b,0xda]
-          tconjtcmmimfp16ps %tmm1, %tmm2, %tmm3
-
-// CHECK: tconjtfp16 %tmm5, %tmm6
-// CHECK: encoding: [0xc4,0xe2,0x79,0x6b,0xf5]
-          tconjtfp16 %tmm5, %tmm6
-
-// CHECK: tconjtfp16 %tmm2, %tmm3
-// CHECK: encoding: [0xc4,0xe2,0x79,0x6b,0xda]
-          tconjtfp16 %tmm2, %tmm3
diff --git a/llvm/test/MC/X86/amx-transpose-intel.s b/llvm/test/MC/X86/amx-transpose-intel.s
deleted file mode 100644
index 0d2c22f67a173..0000000000000
--- a/llvm/test/MC/X86/amx-transpose-intel.s
+++ /dev/null
@@ -1,153 +0,0 @@
-// RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
-
-// CHECK: t2rpntlvwz0     tmm6, [rbp + 8*r14 + 268435456]
-// CHECK: encoding: [0xc4,0xa2,0x78,0x6e,0xb4,0xf5,0x00,0x00,0x00,0x10]
-          t2rpntlvwz0 tmm6, [rbp + 8*r14 + 268435456]
-
-// CHECK: t2rpntlvwz0     tmm2, [r8 + 4*rax + 291]
-// CHECK: encoding: [0xc4,0xc2,0x78,0x6e,0x94,0x80,0x23,0x01,0x00,0x00]
-          t2rpntlvwz0 tmm2, [r8 + 4*rax + 291]
-
-// CHECK: t2rpntlvwz0     tmm2, [2*rbp - 32]
-// CHECK: encoding: [0xc4,0xe2,0x78,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff]
-          t2rpntlvwz0 tmm2, [2*rbp - 32]
-
-// CHECK: t2rpntlvwz0t1     tmm6, [rbp + 8*r14 + 268435456]
-// CHECK: encoding: [0xc4,0xa2,0x78,0x6f,0xb4,0xf5,0x00,0x00,0x00,0x10]
-          t2rpntlvwz0t1 tmm7, [rbp + 8*r14 + 268435456]
-
-// CHECK: t2rpntlvwz0t1     tmm2, [r8 + 4*rax + 291]
-// CHECK: encoding: [0xc4,0xc2,0x78,0x6f,0x94,0x80,0x23,0x01,0x00,0x00]
-          t2rpntlvwz0t1 tmm2, [r8 + 4*rax + 291]
-
-// CHECK: t2rpntlvwz0t1     tmm2, [2*rbp - 32]
-// CHECK: encoding: [0xc4,0xe2,0x78,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff]
-          t2rpntlvwz0t1 tmm2, [2*rbp - 32]
-
-// CHECK: t2rpntlvwz1     tmm0, [rbp + 8*r14 + 268435456]
-// CHECK: encoding: [0xc4,0xa2,0x79,0x6e,0x84,0xf5,0x00,0x00,0x00,0x10]
-          t2rpntlvwz1 tmm1, [rbp + 8*r14 + 268435456]
-
-// CHECK: t2rpntlvwz1     tmm2, [r8 + 4*rax + 291]
-// CHECK: encoding: [0xc4,0xc2,0x79,0x6e,0x94,0x80,0x23,0x01,0x00,0x00]
-          t2rpntlvwz1 tmm2, [r8 + 4*rax + 291]
-
-// CHECK: t2rpntlvwz1     tmm2, [2*rbp - 32]
-// CHECK: encoding: [0xc4,0xe2,0x79,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff]
-          t2rpntlvwz1 tmm2, [2*rbp - 32]
-
-// CHECK: t2rpntlvwz1t1     tmm6, [rbp + 8*r14 + 268435456]
-// CHECK: encoding: [0xc4,0xa2,0x79,0x6f,0xb4,0xf5,0x00,0x00,0x00,0x10]
-          t2rpntlvwz1t1 tmm6, [rbp + 8*r14 + 268435456]
-
-// CHECK: t2rpntlvwz1t1     tmm2, [r8 + 4*rax + 291]
-// CHECK: encoding: [0xc4,0xc2,0x79,0x6f,0x94,0x80,0x23,0x01,0x00,0x00]
-          t2rpntlvwz1t1 tmm2, [r8 + 4*rax + 291]
-
-// CHECK: t2rpntlvwz1t1     tmm2, [2*rbp - 32]
-// CHECK: encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff]
-          t2rpntlvwz1t1 tmm2, [2*rbp - 32]
-
-// CHECK: t2rpntlvwz0     tmm4, [r16 + 8*r14 + 268435456]
-// CHECK: encoding: [0x62,0xba,0x7c,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10]
-          t2rpntlvwz0 tmm4, [r16 + 8*r14 + 268435456]
-
-// CHECK: t2rpntlvwz0     tmm2, [r8 + 4*r17 + 291]
-// CHECK: encoding: [0x62,0xd2,0x78,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00]
-          t2rpntlvwz0 tmm2, [r8 + 4*r17 + 291]
-
-// CHECK: {evex} t2rpntlvwz0     tmm2, [2*rbp - 32]
-// CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff]
-          {evex} t2rpntlvwz0 tmm2, [2*rbp - 32]
-
-// CHECK: t2rpntlvwz0t1     tmm4, [r16 + 8*r14 + 268435456]
-// CHECK: encoding: [0x62,0xba,0x7c,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10]
-          t2rpntlvwz0t1 tmm4, [r16 + 8*r14 + 268435456]
-
-// CHECK: t2rpntlvwz0t1     tmm2, [r8 + 4*r17 + 291]
-// CHECK: encoding: [0x62,0xd2,0x78,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00]
-          t2rpntlvwz0t1 tmm2, [r8 + 4*r17 + 291]
-
-// CHECK: {evex} t2rpntlvwz0t1     tmm2, [2*rbp - 32]
-// CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff]
-          {evex} t2rpntlvwz0t1 tmm2, [2*rbp - 32]
-
-// CHECK: t2rpntlvwz1     tmm4, [r16 + 8*r14 + 268435456]
-// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10]
-          t2rpntlvwz1 tmm4, [r16 + 8*r14 + 268435456]
-
-// CHECK: t2rpntlvwz1     tmm2, [r8 + 4*r17 + 291]
-// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00]
-          t2rpntlvwz1 tmm2, [r8 + 4*r17 + 291]
-
-// CHECK: {evex} t2rpntlvwz1     tmm2, [2*rbp - 32]
-// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff]
-          {evex} t2rpntlvwz1 tmm2, [2*rbp - 32]
-
-// CHECK: t2rpntlvwz1t1     tmm4, [r16 + 8*r14 + 268435456]
-// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10]
-          t2rpntlvwz1t1 tmm4, [r16 + 8*r14 + 268435456]
-
-// CHECK: t2rpntlvwz1t1     tmm2, [r8 + 4*r17 + 291]
-// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00]
-          t2rpntlvwz1t1 tmm2, [r8 + 4*r17 + 291]
-
-// CHECK: {evex} t2rpntlvwz1t1     tmm2, [2*rbp - 32]
-// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff]
-          {evex} t2rpntlvwz1t1 tmm2, [2*rbp - 32]
-
-// CHECK: ttransposed     tmm5, tmm1
-// CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xe9]
-          ttransposed tmm5, tmm1
-
-// CHECK: ttransposed     tmm3, tmm2
-// CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xda]
-          ttransposed tmm3, tmm2
-
-// CHECK: ttdpbf16ps     tmm5, tmm0, tmm4
-// CHECK: encoding: [0xc4,0xe2,0x5a,0x6c,0xe8]
-          ttdpbf16ps tmm5, tmm0, tmm4
-
-// CHECK: ttdpbf16ps     tmm3, tmm2, tmm1
-// CHECK: encoding: [0xc4,0xe2,0x72,0x6c,0xda]
-          ttdpbf16ps tmm3, tmm2, tmm1
-
-// CHECK: ttdpfp16ps     tmm1, tmm0, tmm4
-// CHECK: encoding: [0xc4,0xe2,0x5b,0x6c,0xc8]
-          ttdpfp16ps tmm1, tmm0, tmm4
-
-// CHECK: ttdpfp16ps     tmm3, tmm2, tmm1
-// CHECK: encoding: [0xc4,0xe2,0x73,0x6c,0xda]
-          ttdpfp16ps tmm3, tmm2, tmm1
-
-// CHECK: ttcmmimfp16ps tmm6, tmm5, tmm4
-// CHECK: encoding: [0xc4,0xe2,0x5b,0x6b,0xf5]
-          ttcmmimfp16ps tmm6, tmm5, tmm4
-
-// CHECK: ttcmmimfp16ps tmm3, tmm2, tmm1
-// CHECK: encoding: [0xc4,0xe2,0x73,0x6b,0xda]
-          ttcmmimfp16ps tmm3, tmm2, tmm1
-
-// CHECK: ttcmmrlfp16ps tmm6, tmm5, tmm4
-// CHECK: encoding: [0xc4,0xe2,0x5a,0x6b,0xf5]
-          ttcmmrlfp16ps tmm6, tmm5, tmm4
-
-// CHECK: ttcmmrlfp16ps tmm3, tmm2, tmm1
-// CHECK: encoding: [0xc4,0xe2,0x72,0x6b,0xda]
-          ttcmmrlfp16ps tmm3, tmm2, tmm1
-
-// CHECK: tconjtcmmimfp16ps tmm6, tmm5, tmm4
-// CHECK: encoding: [0xc4,0xe2,0x58,0x6b,0xf5]
-          tconjtcmmimfp16ps tmm6, tmm5, tmm4
-
-// CHECK: tconjtcmmimfp16ps tmm3, tmm2, tmm1
-// CHECK: encoding: [0xc4,0xe2,0x70,0x6b,0xda]
-          tconjtcmmimfp16ps tmm3, tmm2, tmm1
-
-// CHECK: tconjtfp16 tmm6, tmm5
-// CHECK: encoding: [0xc4,0xe2,0x79,0x6b,0xf5]
-          tconjtfp16 tmm6, tmm5
-
-// CHECK: tconjtfp16 tmm3, tmm2
-// CHECK: encoding: [0xc4,0xe2,0x79,0x6b,0xda]
-          tconjtfp16 tmm3, tmm2
diff --git a/llvm/test/TableGen/x86-instr-mapping.inc b/llvm/test/TableGen/x86-instr-mapping.inc
index f621979b2af95..6d2873ed4e749 100644
--- a/llvm/test/TableGen/x86-instr-mapping.inc
+++ b/llvm/test/TableGen/x86-instr-mapping.inc
@@ -167,14 +167,6 @@ static const X86TableEntry X86CompressEVEXTable[] = {
   { X86::SHRX64rm_EVEX, X86::SHRX64rm },
   { X86::SHRX64rr_EVEX, X86::SHRX64rr },
   { X86::STTILECFG_EVEX, X86::STTILECFG },
-  { X86::T2RPNTLVWZ0RST1_EVEX, X86::T2RPNTLVWZ0RST1 },
-  { X86::T2RPNTLVWZ0RS_EVEX, X86::T2RPNTLVWZ0RS },
-  { X86::T2RPNTLVWZ0T1_EVEX, X86::T2RPNTLVWZ0T1 },
-  { X86::T2RPNTLVWZ0_EVEX, X86::T2RPNTLVWZ0 },
-  { X86::T2RPNTLVWZ1RST1_EVEX, X86::T2RPNTLVWZ1RST1 },
-  { X86::T2RPNTLVWZ1RS_EVEX, X86::T2RPNTLVWZ1RS },
-  { X86::T2RPNTLVWZ1T1_EVEX, X86::T2RPNTLVWZ1T1 },
-  { X86::T2RPNTLVWZ1_EVEX, X86::T2RPNTLVWZ1 },
   { X86::TILELOADDRST1_EVEX, X86::TILELOADDRST1 },
   { X86::TILELOADDRS_EVEX, X86::TILELOADDRS },
   { X86::TILELOADDT1_EVEX, X86::TILELOADDT1 },
diff --git a/llvm/test/Transforms/DFAJumpThreading/max-outer-uses.ll b/llvm/test/Transforms/DFAJumpThreading/max-outer-uses.ll
new file mode 100644
index 0000000000000..dfcc5b1a5c3fe
--- /dev/null
+++ b/llvm/test/Transforms/DFAJumpThreading/max-outer-uses.ll
@@ -0,0 +1,326 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -passes=dfa-jump-threading -dfa-max-out-use-blocks=5 %s | FileCheck %s
+
+declare void @use(i32)
+
+define void @max_outer_uses_by_switch(i32 %cond, ptr %p) {
+; CHECK-LABEL: define void @max_outer_uses_by_switch(
+; CHECK-SAME: i32 [[COND:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[SWITCH_BB:.*]]
+; CHECK:       [[SWITCH_BB]]:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[DETERMINE:%.*]], %[[SUB_SWITCH_BB:.*]] ], [ 2, %[[CASE2:.*]] ]
+; CHECK-NEXT:    switch i32 [[PHI]], label %[[DEFAULT_DEST:.*]] [
+; CHECK-NEXT:      i32 0, label %[[CASE1:.*]]
+; CHECK-NEXT:      i32 1, label %[[CASE2]]
+; CHECK-NEXT:      i32 2, label %[[CASE3:.*]]
+; CHECK-NEXT:    ]
+; CHECK:       [[CASE1]]:
+; CHECK-NEXT:    br label %[[SUB_SWITCH_BB]]
+; CHECK:       [[CASE3]]:
+; CHECK-NEXT:    br label %[[SUB_SWITCH_BB]]
+; CHECK:       [[SUB_SWITCH_BB]]:
+; CHECK-NEXT:    [[DETERMINE]] = phi i32 [ 1, %[[CASE1]] ], [ 3, %[[CASE3]] ]
+; CHECK-NEXT:    [[DEF:%.*]] = load i32, ptr [[P]], align 4
+; CHECK-NEXT:    switch i32 [[COND]], label %[[SWITCH_BB]] [
+; CHECK-NEXT:      i32 0, label %[[OUTER1:.*]]
+; CHECK-NEXT:      i32 1, label %[[OUTER2:.*]]
+; CHECK-NEXT:      i32 2, label %[[OUTER3:.*]]
+; CHECK-NEXT:      i32 3, label %[[OUTER4:.*]]
+; CHECK-NEXT:    ]
+; CHECK:       [[CASE2]]:
+; CHECK-NEXT:    br label %[[SWITCH_BB]]
+; CHECK:       [[OUTER1]]:
+; CHECK-NEXT:    call void @use(i32 [[DEF]])
+; CHECK-NEXT:    ret void
+; CHECK:       [[OUTER2]]:
+; CHECK-NEXT:    call void @use(i32 [[DEF]])
+; CHECK-NEXT:    ret void
+; CHECK:       [[OUTER3]]:
+; CHECK-NEXT:    call void @use(i32 [[DEF]])
+; CHECK-NEXT:    ret void
+; CHECK:       [[OUTER4]]:
+; CHECK-NEXT:    call void @use(i32 [[DEF]])
+; CHECK-NEXT:    ret void
+; CHECK:       [[DEFAULT_DEST]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %switch_bb
+
+switch_bb:
+  %phi = phi i32 [ 0, %entry ], [ %determine, %sub_switch_bb ], [ 2, %case2 ]
+  switch i32 %phi, label %default_dest [
+  i32 0, label %case1
+  i32 1, label %case2
+  i32 2, label %case3
+  ]
+
+case1:
+  br label %sub_switch_bb
+
+case3:
+  br label %sub_switch_bb
+
+sub_switch_bb:
+  %determine = phi i32 [ 1, %case1 ], [ 3, %case3 ]
+  %def = load i32, ptr %p
+  switch i32 %cond, label %switch_bb [
+  i32 0, label %outer1
+  i32 1, label %outer2
+  i32 2, label %outer3
+  i32 3, label %outer4
+  ]
+
+case2:
+  br label %switch_bb
+
+outer1:
+  call void @use(i32 %def)
+  ret void
+
+outer2:
+  call void @use(i32 %def)
+  ret void
+
+outer3:
+  call void @use(i32 %def)
+  ret void
+
+outer4:
+  call void @use(i32 %def)
+  ret void
+
+default_dest:
+  ret void
+}
+
+define void @less_outer_uses_by_switch(i32 %cond, ptr %p) {
+; CHECK-LABEL: define void @less_outer_uses_by_switch(
+; CHECK-SAME: i32 [[COND:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[SWITCH_BB:.*]]
+; CHECK:       [[SWITCH_BB]]:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ poison, %[[SUB_SWITCH_BB:.*]] ]
+; CHECK-NEXT:    switch i32 [[PHI]], label %[[DEFAULT_DEST:.*]] [
+; CHECK-NEXT:      i32 0, label %[[CASE1:.*]]
+; CHECK-NEXT:      i32 1, label %[[CASE2:.*]]
+; CHECK-NEXT:      i32 2, label %[[CASE3:.*]]
+; CHECK-NEXT:    ]
+; CHECK:       [[SWITCH_BB_JT2:.*]]:
+; CHECK-NEXT:    [[PHI_JT2:%.*]] = phi i32 [ 2, %[[CASE2]] ]
+; CHECK-NEXT:    br label %[[CASE3]]
+; CHECK:       [[SWITCH_BB_JT3:.*]]:
+; CHECK-NEXT:    [[PHI_JT3:%.*]] = phi i32 [ [[DETERMINE_JT3:%.*]], %[[SUB_SWITCH_BB_JT3:.*]] ]
+; CHECK-NEXT:    br label %[[DEFAULT_DEST]]
+; CHECK:       [[SWITCH_BB_JT1:.*]]:
+; CHECK-NEXT:    [[PHI_JT1:%.*]] = phi i32 [ [[DETERMINE_JT1:%.*]], %[[SUB_SWITCH_BB_JT1:.*]] ]
+; CHECK-NEXT:    br label %[[CASE2]]
+; CHECK:       [[CASE1]]:
+; CHECK-NEXT:    br label %[[SUB_SWITCH_BB_JT1]]
+; CHECK:       [[CASE3]]:
+; CHECK-NEXT:    br label %[[SUB_SWITCH_BB_JT3]]
+; CHECK:       [[SUB_SWITCH_BB]]:
+; CHECK-NEXT:    [[DEF:%.*]] = load i32, ptr [[P]], align 4
+; CHECK-NEXT:    switch i32 [[COND]], label %[[SWITCH_BB]] [
+; CHECK-NEXT:      i32 0, label %[[OUTER1:.*]]
+; CHECK-NEXT:    ]
+; CHECK:       [[SUB_SWITCH_BB_JT3]]:
+; CHECK-NEXT:    [[DETERMINE_JT3]] = phi i32 [ 3, %[[CASE3]] ]
+; CHECK-NEXT:    [[DEF_JT3:%.*]] = load i32, ptr [[P]], align 4
+; CHECK-NEXT:    switch i32 [[COND]], label %[[SWITCH_BB_JT3]] [
+; CHECK-NEXT:      i32 0, label %[[OUTER1]]
+; CHECK-NEXT:    ]
+; CHECK:       [[SUB_SWITCH_BB_JT1]]:
+; CHECK-NEXT:    [[DETERMINE_JT1]] = phi i32 [ 1, %[[CASE1]] ]
+; CHECK-NEXT:    [[DEF_JT1:%.*]] = load i32, ptr [[P]], align 4
+; CHECK-NEXT:    switch i32 [[COND]], label %[[SWITCH_BB_JT1]] [
+; CHECK-NEXT:      i32 0, label %[[OUTER1]]
+; CHECK-NEXT:    ]
+; CHECK:       [[CASE2]]:
+; CHECK-NEXT:    br label %[[SWITCH_BB_JT2]]
+; CHECK:       [[OUTER1]]:
+; CHECK-NEXT:    [[DEF1:%.*]] = phi i32 [ [[DEF_JT3]], %[[SUB_SWITCH_BB_JT3]] ], [ [[DEF_JT1]], %[[SUB_SWITCH_BB_JT1]] ], [ [[DEF]], %[[SUB_SWITCH_BB]] ]
+; CHECK-NEXT:    call void @use(i32 [[DEF1]])
+; CHECK-NEXT:    ret void
+; CHECK:       [[DEFAULT_DEST]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %switch_bb
+
+switch_bb:
+  %phi = phi i32 [ 0, %entry ], [ %determine, %sub_switch_bb ], [ 2, %case2 ]
+  switch i32 %phi, label %default_dest [
+  i32 0, label %case1
+  i32 1, label %case2
+  i32 2, label %case3
+  ]
+
+case1:
+  br label %sub_switch_bb
+
+case3:
+  br label %sub_switch_bb
+
+sub_switch_bb:
+  %determine = phi i32 [ 1, %case1 ], [ 3, %case3 ]
+  %def = load i32, ptr %p
+  switch i32 %cond, label %switch_bb [
+  i32 0, label %outer1
+  ]
+
+case2:
+  br label %switch_bb
+
+outer1:
+  call void @use(i32 %def)
+  ret void
+
+default_dest:
+  ret void
+}
+
+
+define void @max_outer_uses_multi_preds(i32 %cond, ptr %p) {
+; CHECK-LABEL: define void @max_outer_uses_multi_preds(
+; CHECK-SAME: i32 [[COND:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[SWITCH_BB:.*]]
+; CHECK:       [[SWITCH_BB]]:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ poison, %[[SUB_SWITCH_BB:.*]] ]
+; CHECK-NEXT:    switch i32 [[PHI]], label %[[DEFAULT_DEST:.*]] [
+; CHECK-NEXT:      i32 0, label %[[CASE1:.*]]
+; CHECK-NEXT:      i32 1, label %[[CASE2:.*]]
+; CHECK-NEXT:      i32 2, label %[[CASE3:.*]]
+; CHECK-NEXT:      i32 3, label %[[CASE4:.*]]
+; CHECK-NEXT:    ]
+; CHECK:       [[SWITCH_BB_JT2:.*]]:
+; CHECK-NEXT:    [[PHI_JT2:%.*]] = phi i32 [ 2, %[[CASE2]] ]
+; CHECK-NEXT:    br label %[[CASE3]]
+; CHECK:       [[SWITCH_BB_JT3:.*]]:
+; CHECK-NEXT:    [[PHI_JT3:%.*]] = phi i32 [ [[DETERMINE_JT3:%.*]], %[[SUB_SWITCH_BB_JT3:.*]] ]
+; CHECK-NEXT:    br label %[[CASE4]]
+; CHECK:       [[SWITCH_BB_JT1:.*]]:
+; CHECK-NEXT:    [[PHI_JT1:%.*]] = phi i32 [ [[DETERMINE_JT1:%.*]], %[[SUB_SWITCH_BB_JT1:.*]] ]
+; CHECK-NEXT:    br label %[[CASE2]]
+; CHECK:       [[CASE1]]:
+; CHECK-NEXT:    br label %[[SUB_SWITCH_BB_JT1]]
+; CHECK:       [[CASE3]]:
+; CHECK-NEXT:    br label %[[SUB_SWITCH_BB_JT3]]
+; CHECK:       [[SUB_SWITCH_BB]]:
+; CHECK-NEXT:    [[DEF:%.*]] = load i32, ptr [[P]], align 4
+; CHECK-NEXT:    switch i32 [[COND]], label %[[SWITCH_BB]] [
+; CHECK-NEXT:      i32 0, label %[[OUTER1:.*]]
+; CHECK-NEXT:      i32 1, label %[[OUTER2:.*]]
+; CHECK-NEXT:      i32 2, label %[[OUTER3:.*]]
+; CHECK-NEXT:      i32 3, label %[[OUTER4:.*]]
+; CHECK-NEXT:    ]
+; CHECK:       [[SUB_SWITCH_BB_JT3]]:
+; CHECK-NEXT:    [[DETERMINE_JT3]] = phi i32 [ 3, %[[CASE3]] ]
+; CHECK-NEXT:    [[DEF_JT3:%.*]] = load i32, ptr [[P]], align 4
+; CHECK-NEXT:    switch i32 [[COND]], label %[[SWITCH_BB_JT3]] [
+; CHECK-NEXT:      i32 0, label %[[OUTER1]]
+; CHECK-NEXT:      i32 1, label %[[OUTER2]]
+; CHECK-NEXT:      i32 2, label %[[OUTER3]]
+; CHECK-NEXT:      i32 3, label %[[OUTER4]]
+; CHECK-NEXT:    ]
+; CHECK:       [[SUB_SWITCH_BB_JT1]]:
+; CHECK-NEXT:    [[DETERMINE_JT1]] = phi i32 [ 1, %[[CASE1]] ]
+; CHECK-NEXT:    [[DEF_JT1:%.*]] = load i32, ptr [[P]], align 4
+; CHECK-NEXT:    switch i32 [[COND]], label %[[SWITCH_BB_JT1]] [
+; CHECK-NEXT:      i32 0, label %[[OUTER1]]
+; CHECK-NEXT:      i32 1, label %[[OUTER2]]
+; CHECK-NEXT:      i32 2, label %[[OUTER3]]
+; CHECK-NEXT:      i32 3, label %[[OUTER4]]
+; CHECK-NEXT:    ]
+; CHECK:       [[CASE4]]:
+; CHECK-NEXT:    [[DEF1:%.*]] = load i32, ptr [[P]], align 4
+; CHECK-NEXT:    switch i32 [[COND]], label %[[OUTER4]] [
+; CHECK-NEXT:      i32 0, label %[[OUTER1]]
+; CHECK-NEXT:      i32 1, label %[[OUTER2]]
+; CHECK-NEXT:      i32 2, label %[[OUTER3]]
+; CHECK-NEXT:    ]
+; CHECK:       [[CASE2]]:
+; CHECK-NEXT:    br label %[[SWITCH_BB_JT2]]
+; CHECK:       [[OUTER1]]:
+; CHECK-NEXT:    [[PHI1:%.*]] = phi i32 [ [[DEF]], %[[SUB_SWITCH_BB]] ], [ [[DEF1]], %[[CASE4]] ], [ [[DEF_JT1]], %[[SUB_SWITCH_BB_JT1]] ], [ [[DEF_JT3]], %[[SUB_SWITCH_BB_JT3]] ]
+; CHECK-NEXT:    call void @use(i32 [[PHI1]])
+; CHECK-NEXT:    ret void
+; CHECK:       [[OUTER2]]:
+; CHECK-NEXT:    [[PHI2:%.*]] = phi i32 [ [[DEF]], %[[SUB_SWITCH_BB]] ], [ [[DEF1]], %[[CASE4]] ], [ [[DEF_JT1]], %[[SUB_SWITCH_BB_JT1]] ], [ [[DEF_JT3]], %[[SUB_SWITCH_BB_JT3]] ]
+; CHECK-NEXT:    call void @use(i32 [[PHI2]])
+; CHECK-NEXT:    ret void
+; CHECK:       [[OUTER3]]:
+; CHECK-NEXT:    [[PHI3:%.*]] = phi i32 [ [[DEF]], %[[SUB_SWITCH_BB]] ], [ [[DEF1]], %[[CASE4]] ], [ [[DEF_JT1]], %[[SUB_SWITCH_BB_JT1]] ], [ [[DEF_JT3]], %[[SUB_SWITCH_BB_JT3]] ]
+; CHECK-NEXT:    call void @use(i32 [[PHI3]])
+; CHECK-NEXT:    ret void
+; CHECK:       [[OUTER4]]:
+; CHECK-NEXT:    [[PHI4:%.*]] = phi i32 [ [[DEF]], %[[SUB_SWITCH_BB]] ], [ [[DEF1]], %[[CASE4]] ], [ [[DEF_JT1]], %[[SUB_SWITCH_BB_JT1]] ], [ [[DEF_JT3]], %[[SUB_SWITCH_BB_JT3]] ]
+; CHECK-NEXT:    call void @use(i32 [[PHI4]])
+; CHECK-NEXT:    ret void
+; CHECK:       [[DEFAULT_DEST]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %switch_bb
+
+switch_bb:
+  %phi = phi i32 [ 0, %entry ], [ %determine, %sub_switch_bb ], [ 2, %case2 ]
+  switch i32 %phi, label %default_dest [
+  i32 0, label %case1
+  i32 1, label %case2
+  i32 2, label %case3
+  i32 3, label %case4
+  ]
+
+case1:
+  br label %sub_switch_bb
+
+case3:
+  br label %sub_switch_bb
+
+sub_switch_bb:
+  %determine = phi i32 [ 1, %case1 ], [ 3, %case3 ]
+  %def = load i32, ptr %p
+  switch i32 %cond, label %switch_bb [
+  i32 0, label %outer1
+  i32 1, label %outer2
+  i32 2, label %outer3
+  i32 3, label %outer4
+  ]
+
+case4:
+  %def1 = load i32, ptr %p
+  switch i32 %cond, label %outer4 [
+  i32 0, label %outer1
+  i32 1, label %outer2
+  i32 2, label %outer3
+  ]
+
+case2:
+  br label %switch_bb
+
+outer1:
+  %phi1 = phi i32 [ %def, %sub_switch_bb ], [ %def1, %case4 ]
+  call void @use(i32 %phi1)
+  ret void
+
+outer2:
+  %phi2 = phi i32 [ %def, %sub_switch_bb ], [ %def1, %case4 ]
+  call void @use(i32 %phi2)
+  ret void
+
+outer3:
+  %phi3 = phi i32 [ %def, %sub_switch_bb ], [ %def1, %case4 ]
+  call void @use(i32 %phi3)
+  ret void
+
+outer4:
+  %phi4 = phi i32 [ %def, %sub_switch_bb ], [ %def1, %case4 ]
+  call void @use(i32 %phi4)
+  ret void
+
+default_dest:
+  ret void
+}
diff --git a/llvm/test/Transforms/DropUnnecessaryAssumes/basic.ll b/llvm/test/Transforms/DropUnnecessaryAssumes/basic.ll
index 8a6f60ba7a204..87aed77d06ef8 100644
--- a/llvm/test/Transforms/DropUnnecessaryAssumes/basic.ll
+++ b/llvm/test/Transforms/DropUnnecessaryAssumes/basic.ll
@@ -184,6 +184,18 @@ define void @type_test(ptr %x) {
   ret void
 }
 
+define void @public_type_test(ptr %x) {
+; CHECK-LABEL: define void @public_type_test(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT:    [[TEST:%.*]] = call i1 @llvm.public.type.test(ptr [[X]], metadata !"typeid")
+; CHECK-NEXT:    call void @llvm.assume(i1 [[TEST]])
+; CHECK-NEXT:    ret void
+;
+  %test = call i1 @llvm.public.type.test(ptr %x, metadata !"typeid")
+  call void @llvm.assume(i1 %test)
+  ret void
+}
+
 define void @multiple_dead_conds(i32 %x) {
 ; CHECK-LABEL: define void @multiple_dead_conds(
 ; CHECK-SAME: i32 [[X:%.*]]) {
diff --git a/llvm/test/Transforms/FixIrreducible/bug45623.ll b/llvm/test/Transforms/FixIrreducible/bug45623.ll
index 58724431ff0ee..b6dd6fb9e6fcb 100644
--- a/llvm/test/Transforms/FixIrreducible/bug45623.ll
+++ b/llvm/test/Transforms/FixIrreducible/bug45623.ll
@@ -90,3 +90,112 @@ for.end626:                                       ; preds = %for.cond616
 if.else629:                                       ; preds = %backtrack
   br label %retry
 }
+
+define void @tre_tnfa_run_backtrack_callbr(i1 %arg) {
+; CHECK-LABEL: @tre_tnfa_run_backtrack_callbr(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[RETRY:%.*]] []
+; CHECK:       retry:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[ARG:%.*]])
+; CHECK-NEXT:            to label [[RETRY_TARGET_BACKTRACK:%.*]] [label %retry.target.while.body248]
+; CHECK:       while.body248:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[ARG]])
+; CHECK-NEXT:            to label [[IF_THEN250:%.*]] [label %if.end275]
+; CHECK:       if.then250:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[FOR_COND264:%.*]] []
+; CHECK:       for.cond264:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[ARG]])
+; CHECK-NEXT:            to label [[FOR_BODY267:%.*]] [label %backtrack]
+; CHECK:       for.body267:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[FOR_COND264]] []
+; CHECK:       if.end275:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[FOR_COND342:%.*]] []
+; CHECK:       for.cond342:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[ARG]])
+; CHECK-NEXT:            to label [[FOR_BODY345:%.*]] [label %for.end580]
+; CHECK:       for.body345:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[FOR_COND342]] []
+; CHECK:       for.end580:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[BACKTRACK:%.*]] []
+; CHECK:       backtrack:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[ARG]])
+; CHECK-NEXT:            to label [[IF_THEN595:%.*]] [label %if.else629]
+; CHECK:       if.then595:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[FOR_COND616:%.*]] []
+; CHECK:       for.cond616:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[ARG]])
+; CHECK-NEXT:            to label [[FOR_BODY619:%.*]] [label %for.end626]
+; CHECK:       for.body619:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[FOR_COND616]] []
+; CHECK:       for.end626:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[FOR_END626_TARGET_WHILE_BODY248:%.*]] []
+; CHECK:       if.else629:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[RETRY]] []
+; CHECK:       for.end626.target.while.body248:
+; CHECK-NEXT:    br label [[IRR_GUARD:%.*]]
+; CHECK:       retry.target.backtrack:
+; CHECK-NEXT:    br label [[IRR_GUARD]]
+; CHECK:       retry.target.while.body248:
+; CHECK-NEXT:    br label [[IRR_GUARD]]
+; CHECK:       irr.guard:
+; CHECK-NEXT:    [[GUARD_WHILE_BODY248:%.*]] = phi i1 [ true, [[FOR_END626_TARGET_WHILE_BODY248]] ], [ false, [[RETRY_TARGET_BACKTRACK]] ], [ true, [[RETRY_TARGET_WHILE_BODY248:%.*]] ]
+; CHECK-NEXT:    br i1 [[GUARD_WHILE_BODY248]], label [[WHILE_BODY248:%.*]], label [[BACKTRACK]]
+;
+entry:
+  callbr void asm "", ""() to label %retry []
+
+retry:
+  callbr void asm "", "r,!i"(i1 %arg) to label %backtrack [label %while.body248]
+
+while.body248:                                    ; preds = %for.end626, %retry
+  callbr void asm "", "r,!i"(i1 %arg) to label %if.then250 [label %if.end275]
+
+if.then250:                                       ; preds = %while.body248
+  callbr void asm "", ""() to label %for.cond264 []
+
+for.cond264:                                      ; preds = %for.body267, %if.then250
+  callbr void asm "", "r,!i"(i1 %arg) to label %for.body267 [label %backtrack]
+
+for.body267:                                      ; preds = %for.cond264
+  callbr void asm "", ""() to label %for.cond264 []
+
+if.end275:                                        ; preds = %while.body248
+  callbr void asm "", ""() to label %for.cond342 []
+
+for.cond342:                                      ; preds = %for.body345, %if.end275
+  callbr void asm "", "r,!i"(i1 %arg) to label %for.body345 [label %for.end580]
+
+for.body345:                                      ; preds = %for.cond342
+  callbr void asm "", ""() to label %for.cond342 []
+
+for.end580:                                       ; preds = %for.cond342
+  callbr void asm "", ""() to label %backtrack []
+
+backtrack:                                        ; preds = %for.end580, %for.cond264, %retry
+  callbr void asm "", "r,!i"(i1 %arg) to label %if.then595 [label %if.else629]
+
+if.then595:                                       ; preds = %backtrack
+  callbr void asm "", ""() to label %for.cond616 []
+
+for.cond616:                                      ; preds = %for.body619, %if.then595
+  callbr void asm "", "r,!i"(i1 %arg) to label %for.body619 [label %for.end626]
+
+for.body619:                                      ; preds = %for.cond616
+  callbr void asm "", ""() to label %for.cond616 []
+
+for.end626:                                       ; preds = %for.cond616
+  callbr void asm "", ""() to label %while.body248 []
+
+if.else629:                                       ; preds = %backtrack
+  callbr void asm "", ""() to label %retry []
+}
diff --git a/llvm/test/Transforms/FixIrreducible/callbr.ll b/llvm/test/Transforms/FixIrreducible/callbr.ll
new file mode 100644
index 0000000000000..26ca6c7c12777
--- /dev/null
+++ b/llvm/test/Transforms/FixIrreducible/callbr.ll
@@ -0,0 +1,869 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes='fix-irreducible,verify<loops>' -S | FileCheck %s
+; RUN: opt < %s -passes='verify<loops>,fix-irreducible,verify<loops>' -S | FileCheck %s
+; RUN: opt < %s -passes='print<cycles>' -disable-output 2>&1 | FileCheck %s --check-prefix CYCLES-BEFORE
+; RUN: opt < %s -passes='fix-irreducible,print<cycles>' -disable-output 2>&1 | FileCheck %s --check-prefix CYCLES-AFTER
+
+; CYCLES-BEFORE:      CycleInfo for function: callbr_entry
+; CYCLES-BEFORE-NEXT:     depth=1: entries(indirect fallthrough)
+; CYCLES-AFTER:       CycleInfo for function: callbr_entry
+; CYCLES-AFTER-NEXT:      depth=1: entries(irr.guard) indirect fallthrough
+
+; CYCLES-BEFORE-NEXT: CycleInfo for function: callbr_entry_targets_with_phi_nodes
+; CYCLES-BEFORE-NEXT:     depth=1: entries(block1 block)
+; CYCLES-AFTER-NEXT:  CycleInfo for function: callbr_entry_targets_with_phi_nodes
+; CYCLES-AFTER-NEXT:      depth=1: entries(irr.guard) block1 block
+
+; CYCLES-BEFORE-NEXT: CycleInfo for function: callbr_entry_multiple_indirect_targets
+; CYCLES-BEFORE-NEXT:     depth=1: entries(indirect fallthrough)
+; CYCLES-AFTER-NEXT:  CycleInfo for function: callbr_entry_multiple_indirect_targets
+; CYCLES-AFTER-NEXT:      depth=1: entries(irr.guard) indirect fallthrough
+
+; CYCLES-BEFORE-NEXT: CycleInfo for function: callbr_entry_multiple_indirect_targets1
+; CYCLES-BEFORE-NEXT:     depth=1: entries(indirect1 indirect fallthrough)
+; CYCLES-BEFORE-NEXT:         depth=2: entries(indirect fallthrough)
+; CYCLES-AFTER-NEXT:  CycleInfo for function: callbr_entry_multiple_indirect_targets1
+; CYCLES-AFTER-NEXT:      depth=1: entries(irr.guard) indirect1 indirect fallthrough irr.guard1 irr.guard2
+; CYCLES-AFTER-NEXT:          depth=2: entries(irr.guard2) indirect fallthrough
+
+; CYCLES-BEFORE-NEXT: CycleInfo for function: callbr_header_no_indirect
+; CYCLES-BEFORE-NEXT:     depth=1: entries(fallthrough callbr)
+; CYCLES-AFTER-NEXT:  CycleInfo for function: callbr_header_no_indirect
+; CYCLES-AFTER-NEXT:      depth=1: entries(irr.guard) fallthrough callbr callbr.target.fallthrough
+
+; CYCLES-BEFORE-NEXT: CycleInfo for function: callbr_header
+; CYCLES-BEFORE-NEXT:     depth=1: entries(fallthrough callbr)
+; CYCLES-AFTER-NEXT:  CycleInfo for function: callbr_header
+; CYCLES-AFTER-NEXT:      depth=1: entries(irr.guard) fallthrough callbr callbr.target.fallthrough
+
+; CYCLES-BEFORE-NEXT: CycleInfo for function: callbr_header_multiple_indirect_targets
+; CYCLES-BEFORE-NEXT:     depth=1: entries(fallthrough callbr) indirect1
+; CYCLES-BEFORE-NEXT:         depth=2: entries(callbr) indirect1
+; CYCLES-AFTER-NEXT:  CycleInfo for function: callbr_header_multiple_indirect_targets
+; CYCLES-AFTER-NEXT:      depth=1: entries(irr.guard) fallthrough callbr indirect1 callbr.target.fallthrough
+; CYCLES-AFTER-NEXT:          depth=2: entries(callbr) indirect1
+
+; CYCLES-BEFORE-NEXT: CycleInfo for function: callbr_regular
+; CYCLES-BEFORE-NEXT:     depth=1: entries(fallthrough2 fallthrough1)
+; CYCLES-BEFORE-NEXT:     depth=1: entries(indirect2 indirect1)
+; CYCLES-BEFORE-NEXT:     depth=1: entries(nocallbr2 nocallbr1)
+; CYCLES-AFTER-NEXT:  CycleInfo for function: callbr_regular
+; CYCLES-AFTER-NEXT:      depth=1: entries(irr.guard) fallthrough2 fallthrough1
+; CYCLES-AFTER-NEXT:      depth=1: entries(irr.guard1) indirect2 indirect1
+; CYCLES-AFTER-NEXT:      depth=1: entries(irr.guard2) nocallbr2 nocallbr1
+
+; CYCLES-BEFORE-NEXT: CycleInfo for function: callbr_regular1
+; CYCLES-BEFORE-NEXT:     depth=1: entries(callbr nocallbr)
+; CYCLES-AFTER-NEXT:  CycleInfo for function: callbr_regular1
+; CYCLES-AFTER-NEXT:      depth=1: entries(irr.guard) callbr nocallbr
+
+; CYCLES-BEFORE-NEXT: CycleInfo for function: callbr_regular2
+; CYCLES-BEFORE-NEXT:     depth=1: entries(callbr nocallbr)
+; CYCLES-AFTER-NEXT:  CycleInfo for function: callbr_regular2
+; CYCLES-AFTER-NEXT:      depth=1: entries(irr.guard) callbr nocallbr
+
+; CYCLES-BEFORE-NEXT: CycleInfo for function: callbr_header_and_regular
+; CYCLES-BEFORE-NEXT:     depth=1: entries(callbr_header) callbr_regular mid
+; CYCLES-BEFORE-NEXT:         depth=2: entries(callbr_regular mid)
+; CYCLES-AFTER-NEXT:  CycleInfo for function: callbr_header_and_regular
+; CYCLES-AFTER-NEXT:      depth=1: entries(callbr_header) callbr_regular mid callbr_header.target.mid callbr_header.target.callbr_regular irr.guard
+; CYCLES-AFTER-NEXT:          depth=2: entries(irr.guard) callbr_regular mid
+
+; CYCLES-BEFORE-NEXT: CycleInfo for function: callbr_only
+; CYCLES-BEFORE-NEXT:     depth=1: entries(callbr_block callbr_header)
+; CYCLES-AFTER-NEXT:  CycleInfo for function: callbr_only
+; CYCLES-AFTER-NEXT:      depth=1: entries(irr.guard) callbr_block callbr_header callbr_header.target.callbr_block
+
+; CYCLES-BEFORE-NEXT: CycleInfo for function: entry_multiple_callbr
+; CYCLES-BEFORE-NEXT:     depth=1: entries(cb2 block block1)
+; CYCLES-BEFORE-NEXT:         depth=2: entries(block block1)
+; CYCLES-AFTER-NEXT:  CycleInfo for function: entry_multiple_callbr
+; CYCLES-AFTER-NEXT:      depth=1: entries(irr.guard) cb2 block block1 irr.guard1 cb2.target.block1 cb2.target.block irr.guard2
+; CYCLES-AFTER-NEXT:          depth=2: entries(irr.guard2) block block1
+
+; CYCLES-BEFORE-NEXT: CycleInfo for function: callbr_exit_with_separate_entries
+; CYCLES-BEFORE-NEXT:     depth=1: entries(l2 l1) cb
+; CYCLES-BEFORE-NEXT:         depth=2: entries(l1 cb)
+; CYCLES-AFTER-NEXT:  CycleInfo for function: callbr_exit_with_separate_entries
+; CYCLES-AFTER-NEXT:      depth=1: entries(irr.guard) l2 l1 cb cb.target.l1 irr.guard1
+; CYCLES-AFTER-NEXT:          depth=2: entries(irr.guard1) l1 cb cb.target.l1
+
+; CYCLES-BEFORE-NEXT: CycleInfo for function: callbr_exit_with_separate_entries1
+; CYCLES-BEFORE-NEXT:     depth=1: entries(loop2 loop1) cb
+; CYCLES-AFTER-NEXT:  CycleInfo for function: callbr_exit_with_separate_entries1
+; CYCLES-AFTER-NEXT:      depth=1: entries(irr.guard) loop2 loop1 cb cb.target.loop2
+
+; CYCLES-BEFORE-NEXT: CycleInfo for function: callbr_only_multiple
+; CYCLES-BEFORE-NEXT:     depth=1: entries(cb3 cb1 cb2)
+; CYCLES-BEFORE-NEXT:         depth=2: entries(cb1 cb2)
+; CYCLES-AFTER-NEXT:  CycleInfo for function: callbr_only_multiple
+; CYCLES-AFTER-NEXT:      depth=1: entries(irr.guard) cb3 cb1 cb2 cb2.target.cb3 cb1.target.cb3 irr.guard1 cb2.target.cb1 cb3.target.cb1 irr.guard2
+; CYCLES-AFTER-NEXT:          depth=2: entries(irr.guard2) cb1 cb2 cb2.target.cb1
+
+; CYCLES-BEFORE-NEXT: CycleInfo for function: callbr_bypass
+; CYCLES-BEFORE-NEXT:     depth=1: entries(l1 cb) l2
+; CYCLES-BEFORE-NEXT:         depth=2: entries(cb l2)
+; CYCLES-AFTER-NEXT:  CycleInfo for function: callbr_bypass
+; CYCLES-AFTER-NEXT:      depth=1: entries(irr.guard) l1 cb l2 cb.target.l1 irr.guard1
+; CYCLES-AFTER-NEXT:          depth=2: entries(irr.guard1) cb l2
+
+; CYCLES-BEFORE-NEXT: CycleInfo for function: callbr_multiple_with_exit
+; CYCLES-BEFORE-NEXT:     depth=1: entries(l3 l1 l2)
+; CYCLES-BEFORE-NEXT:         depth=2: entries(l1 l2)
+; CYCLES-AFTER-NEXT:  CycleInfo for function: callbr_multiple_with_exit
+; CYCLES-AFTER-NEXT:      depth=1: entries(irr.guard) l3 l1 l2 irr.guard1 irr.guard2
+; CYCLES-AFTER-NEXT:          depth=2: entries(irr.guard2) l1 l2
+
+; CYCLES-BEFORE-NEXT: CycleInfo for function: callbr_nested
+; CYCLES-BEFORE-NEXT:     depth=1: entries(bb bh)
+; CYCLES-BEFORE-NEXT:     depth=1: entries(b h)
+; CYCLES-AFTER-NEXT:  CycleInfo for function: callbr_nested
+; CYCLES-AFTER-NEXT:      depth=1: entries(irr.guard) bb bh
+; CYCLES-AFTER-NEXT:      depth=1: entries(irr.guard1) b h
+
+; Fix the irreducible loop in which callbr is the entry (see description at the
+; top of FixIrreducible.cpp).
+define void @callbr_entry(i1 %c) {
+; CHECK-LABEL: define void @callbr_entry(
+; CHECK-SAME: i1 [[C:%.*]]) {
+; CHECK-NEXT:  [[CALLBR:.*:]]
+; CHECK-NEXT:    callbr void asm "", "!i"()
+; CHECK-NEXT:            to label %[[CALLBR_TARGET_FALLTHROUGH:.*]] [label %callbr.target.indirect]
+; CHECK:       [[FALLTHROUGH:.*]]:
+; CHECK-NEXT:    br i1 [[C]], label %[[IRR_GUARD:.*]], label %[[RET:.*]]
+; CHECK:       [[INDIRECT:.*]]:
+; CHECK-NEXT:    br label %[[FALLTHROUGH]]
+; CHECK:       [[RET]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[CALLBR_TARGET_FALLTHROUGH]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD]]
+; CHECK:       [[CALLBR_TARGET_INDIRECT:.*]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD]]
+; CHECK:       [[IRR_GUARD]]:
+; CHECK-NEXT:    [[GUARD_INDIRECT:%.*]] = phi i1 [ true, %[[FALLTHROUGH]] ], [ false, %[[CALLBR_TARGET_FALLTHROUGH]] ], [ true, %[[CALLBR_TARGET_INDIRECT]] ]
+; CHECK-NEXT:    br i1 [[GUARD_INDIRECT]], label %[[INDIRECT]], label %[[FALLTHROUGH]]
+;
+callbr:
+  callbr void asm "", "!i"() to label %fallthrough [label %indirect]
+fallthrough:
+  br i1 %c, label %indirect, label %ret
+indirect:
+  br label %fallthrough
+ret:
+  ret void
+}
+
+define i32 @callbr_entry_targets_with_phi_nodes(i1 %c) {
+; CHECK-LABEL: define i32 @callbr_entry_targets_with_phi_nodes(
+; CHECK-SAME: i1 [[C:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    callbr void asm "", "!i"()
+; CHECK-NEXT:            to label %[[ENTRY_TARGET_BLOCK:.*]] [label %entry.target.block1]
+; CHECK:       [[BLOCK:.*]]:
+; CHECK-NEXT:    [[A:%.*]] = phi i32 [ 1, %[[BLOCK1:.*]] ], [ [[A_MOVED:%.*]], %[[IRR_GUARD:.*]] ]
+; CHECK-NEXT:    br label %[[IRR_GUARD]]
+; CHECK:       [[BLOCK1]]:
+; CHECK-NEXT:    br i1 [[C]], label %[[BLOCK]], label %[[RET:.*]]
+; CHECK:       [[RET]]:
+; CHECK-NEXT:    ret i32 [[B_MOVED:%.*]]
+; CHECK:       [[ENTRY_TARGET_BLOCK]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD]]
+; CHECK:       [[ENTRY_TARGET_BLOCK1:.*]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD]]
+; CHECK:       [[IRR_GUARD]]:
+; CHECK-NEXT:    [[A_MOVED]] = phi i32 [ poison, %[[BLOCK]] ], [ 42, %[[ENTRY_TARGET_BLOCK]] ], [ poison, %[[ENTRY_TARGET_BLOCK1]] ]
+; CHECK-NEXT:    [[B_MOVED]] = phi i32 [ [[A]], %[[BLOCK]] ], [ poison, %[[ENTRY_TARGET_BLOCK]] ], [ 43, %[[ENTRY_TARGET_BLOCK1]] ]
+; CHECK-NEXT:    [[GUARD_BLOCK1:%.*]] = phi i1 [ true, %[[BLOCK]] ], [ false, %[[ENTRY_TARGET_BLOCK]] ], [ true, %[[ENTRY_TARGET_BLOCK1]] ]
+; CHECK-NEXT:    br i1 [[GUARD_BLOCK1]], label %[[BLOCK1]], label %[[BLOCK]]
+;
+entry:
+  callbr void asm "", "!i"() to label %block [label %block1]
+block:
+  %a = phi i32 [42, %entry], [1, %block1]
+  br label %block1
+block1:
+  %b = phi i32 [43, %entry], [%a, %block]
+  br i1 %c, label %block, label %ret
+ret:
+  ret i32 %b
+}
+
+define void @callbr_entry_multiple_indirect_targets(i1 %c) {
+; CHECK-LABEL: define void @callbr_entry_multiple_indirect_targets(
+; CHECK-SAME: i1 [[C:%.*]]) {
+; CHECK-NEXT:  [[CALLBR:.*:]]
+; CHECK-NEXT:    callbr void asm "", "!i,!i,!i"()
+; CHECK-NEXT:            to label %[[CALLBR_TARGET_FALLTHROUGH:.*]] [label %[[CALLBR_TARGET_INDIRECT:.*]], label %[[INDIRECT1:.*]], label %indirect2]
+; CHECK:       [[INDIRECT3:.*]]:
+; CHECK-NEXT:    br i1 [[C]], label %[[IRR_GUARD:.*]], label %[[RET:.*]]
+; CHECK:       [[INDIRECT:.*]]:
+; CHECK-NEXT:    br label %[[INDIRECT3]]
+; CHECK:       [[INDIRECT1]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD]]
+; CHECK:       [[INDIRECT2:.*:]]
+; CHECK-NEXT:    br label %[[RET]]
+; CHECK:       [[RET]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[CALLBR_TARGET_FALLTHROUGH]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD]]
+; CHECK:       [[CALLBR_TARGET_INDIRECT]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD]]
+; CHECK:       [[IRR_GUARD]]:
+; CHECK-NEXT:    [[GUARD_INDIRECT:%.*]] = phi i1 [ true, %[[INDIRECT3]] ], [ true, %[[INDIRECT1]] ], [ false, %[[CALLBR_TARGET_FALLTHROUGH]] ], [ true, %[[CALLBR_TARGET_INDIRECT]] ]
+; CHECK-NEXT:    br i1 [[GUARD_INDIRECT]], label %[[INDIRECT]], label %[[INDIRECT3]]
+;
+callbr:
+  callbr void asm "", "!i,!i,!i"() to label %fallthrough [label %indirect, label %indirect1, label %indirect2]
+fallthrough:
+  br i1 %c, label %indirect, label %ret
+indirect:
+  br label %fallthrough
+indirect1:
+  br label %indirect
+indirect2:
+  br label %ret
+ret:
+  ret void
+}
+
+define void @callbr_entry_multiple_indirect_targets1(i1 %c, i1 %d) {
+; CHECK-LABEL: define void @callbr_entry_multiple_indirect_targets1(
+; CHECK-SAME: i1 [[C:%.*]], i1 [[D:%.*]]) {
+; CHECK-NEXT:  [[CALLBR:.*:]]
+; CHECK-NEXT:    callbr void asm "", "!i,!i,!i"()
+; CHECK-NEXT:            to label %[[CALLBR_TARGET_FALLTHROUGH:.*]] [label %[[CALLBR_TARGET_INDIRECT:.*]], label %[[CALLBR_TARGET_INDIRECT1:.*]], label %indirect2]
+; CHECK:       [[INDIRECT3:.*]]:
+; CHECK-NEXT:    br i1 [[C]], label %[[IRR_GUARD2:.*]], label %[[RET:.*]]
+; CHECK:       [[INDIRECT:.*]]:
+; CHECK-NEXT:    br i1 [[D]], label %[[INDIRECT3]], label %[[IRR_GUARD:.*]]
+; CHECK:       [[INDIRECT1:.*]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD2]]
+; CHECK:       [[INDIRECT2:.*:]]
+; CHECK-NEXT:    br label %[[RET]]
+; CHECK:       [[RET]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[CALLBR_TARGET_FALLTHROUGH]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD]]
+; CHECK:       [[CALLBR_TARGET_INDIRECT]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD]]
+; CHECK:       [[CALLBR_TARGET_INDIRECT1]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD]]
+; CHECK:       [[IRR_GUARD]]:
+; CHECK-NEXT:    [[GUARD_INDIRECT1:%.*]] = phi i1 [ true, %[[INDIRECT]] ], [ false, %[[CALLBR_TARGET_FALLTHROUGH]] ], [ false, %[[CALLBR_TARGET_INDIRECT]] ], [ true, %[[CALLBR_TARGET_INDIRECT1]] ]
+; CHECK-NEXT:    [[GUARD_FALLTHROUGH:%.*]] = phi i1 [ false, %[[INDIRECT]] ], [ true, %[[CALLBR_TARGET_FALLTHROUGH]] ], [ false, %[[CALLBR_TARGET_INDIRECT]] ], [ false, %[[CALLBR_TARGET_INDIRECT1]] ]
+; CHECK-NEXT:    [[GUARD_FALLTHROUGH_INV:%.*]] = xor i1 [[GUARD_FALLTHROUGH]], true
+; CHECK-NEXT:    br i1 [[GUARD_INDIRECT1]], label %[[INDIRECT1]], label %[[IRR_GUARD1:.*]]
+; CHECK:       [[IRR_GUARD1]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD2]]
+; CHECK:       [[IRR_GUARD2]]:
+; CHECK-NEXT:    [[GUARD_INDIRECT:%.*]] = phi i1 [ true, %[[INDIRECT3]] ], [ [[GUARD_FALLTHROUGH_INV]], %[[IRR_GUARD1]] ], [ true, %[[INDIRECT1]] ]
+; CHECK-NEXT:    br i1 [[GUARD_INDIRECT]], label %[[INDIRECT]], label %[[INDIRECT3]]
+;
+callbr:
+  callbr void asm "", "!i,!i,!i"() to label %fallthrough [label %indirect, label %indirect1, label %indirect2]
+fallthrough:
+  br i1 %c, label %indirect, label %ret
+indirect:
+  br i1 %d, label %fallthrough, label %indirect1
+indirect1:
+  br label %indirect
+indirect2:
+  br label %ret
+ret:
+  ret void
+}
+
+; Fix the irreducible loop in which callbr is the header (see the example at the
+; top of FixIrreducible.cpp).
+define void @callbr_header_no_indirect(i1 %c, i1 %d) {
+; CHECK-LABEL: define void @callbr_header_no_indirect(
+; CHECK-SAME: i1 [[C:%.*]], i1 [[D:%.*]]) {
+; CHECK-NEXT:    [[D_INV:%.*]] = xor i1 [[D]], true
+; CHECK-NEXT:    br label %[[IRR_GUARD:.*]]
+; CHECK:       [[CALLBR:.*]]:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label %[[CALLBR_TARGET_FALLTHROUGH:.*]] []
+; CHECK:       [[FALLTHROUGH:.*]]:
+; CHECK-NEXT:    br i1 [[C]], label %[[CALLBR]], label %[[RET:.*]]
+; CHECK:       [[RET]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[CALLBR_TARGET_FALLTHROUGH]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD]]
+; CHECK:       [[IRR_GUARD]]:
+; CHECK-NEXT:    [[GUARD_FALLTHROUGH:%.*]] = phi i1 [ true, %[[CALLBR_TARGET_FALLTHROUGH]] ], [ [[D_INV]], [[TMP0:%.*]] ]
+; CHECK-NEXT:    br i1 [[GUARD_FALLTHROUGH]], label %[[FALLTHROUGH]], label %[[CALLBR]]
+;
+  br i1 %d, label %callbr, label %fallthrough
+callbr:
+  callbr void asm "", ""() to label %fallthrough []
+fallthrough:
+  br i1 %c, label %callbr, label %ret
+ret:
+  ret void
+}
+
+; Fix the irreducible loop in which callbr is the header.
+define void @callbr_header(i1 %c, i1 %d) {
+; CHECK-LABEL: define void @callbr_header(
+; CHECK-SAME: i1 [[C:%.*]], i1 [[D:%.*]]) {
+; CHECK-NEXT:    [[D_INV:%.*]] = xor i1 [[D]], true
+; CHECK-NEXT:    br label %[[IRR_GUARD:.*]]
+; CHECK:       [[CALLBR:.*]]:
+; CHECK-NEXT:    callbr void asm "", "!i"()
+; CHECK-NEXT:            to label %[[CALLBR_TARGET_FALLTHROUGH:.*]] [label %indirect]
+; CHECK:       [[INDIRECT:.*:]]
+; CHECK-NEXT:    br label %[[RET:.*]]
+; CHECK:       [[FALLTHROUGH:.*]]:
+; CHECK-NEXT:    br i1 [[C]], label %[[CALLBR]], label %[[RET]]
+; CHECK:       [[RET]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[CALLBR_TARGET_FALLTHROUGH]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD]]
+; CHECK:       [[IRR_GUARD]]:
+; CHECK-NEXT:    [[GUARD_FALLTHROUGH:%.*]] = phi i1 [ true, %[[CALLBR_TARGET_FALLTHROUGH]] ], [ [[D_INV]], [[TMP0:%.*]] ]
+; CHECK-NEXT:    br i1 [[GUARD_FALLTHROUGH]], label %[[FALLTHROUGH]], label %[[CALLBR]]
+;
+  br i1 %d, label %callbr, label %fallthrough
+callbr:
+  callbr void asm "", "!i"() to label %fallthrough [label %indirect]
+indirect:
+  br label %ret
+fallthrough:
+  br i1 %c, label %callbr, label %ret
+ret:
+  ret void
+}
+
+define void @callbr_header_multiple_indirect_targets(i1 %c, i1 %d) {
+; CHECK-LABEL: define void @callbr_header_multiple_indirect_targets(
+; CHECK-SAME: i1 [[C:%.*]], i1 [[D:%.*]]) {
+; CHECK-NEXT:    [[D_INV:%.*]] = xor i1 [[D]], true
+; CHECK-NEXT:    br label %[[IRR_GUARD:.*]]
+; CHECK:       [[CALLBR:.*]]:
+; CHECK-NEXT:    callbr void asm "", "!i,!i"()
+; CHECK-NEXT:            to label %[[CALLBR_TARGET_FALLTHROUGH:.*]] [label %[[INDIRECT1:.*]], label %indirect1]
+; CHECK:       [[INDIRECT1]]:
+; CHECK-NEXT:    br label %[[RET:.*]]
+; CHECK:       [[INDIRECT2:.*:]]
+; CHECK-NEXT:    br label %[[CALLBR]]
+; CHECK:       [[FALLTHROUGH:.*]]:
+; CHECK-NEXT:    br i1 [[C]], label %[[CALLBR]], label %[[RET]]
+; CHECK:       [[RET]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[CALLBR_TARGET_FALLTHROUGH]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD]]
+; CHECK:       [[IRR_GUARD]]:
+; CHECK-NEXT:    [[GUARD_FALLTHROUGH:%.*]] = phi i1 [ true, %[[CALLBR_TARGET_FALLTHROUGH]] ], [ [[D_INV]], [[TMP0:%.*]] ]
+; CHECK-NEXT:    br i1 [[GUARD_FALLTHROUGH]], label %[[FALLTHROUGH]], label %[[CALLBR]]
+;
+  br i1 %d, label %callbr, label %fallthrough
+callbr:
+  callbr void asm "", "!i,!i"() to label %fallthrough [label %indirect, label %indirect1]
+indirect:
+  br label %ret
+indirect1:
+  br label %callbr
+fallthrough:
+  br i1 %c, label %callbr, label %ret
+ret:
+  ret void
+}
+
+; Fix the three usual irreducible loops (callbr isn't a part of one of them):
+; - fallthrough, fallthrough1, fallthrough2
+; - indirect, indirect1, indirect2
+; - nocallbr, nocallbr1, nocallbr2
+define void @callbr_regular(i1 %c, i1 %d) {
+; CHECK-LABEL: define void @callbr_regular(
+; CHECK-SAME: i1 [[C:%.*]], i1 [[D:%.*]]) {
+; CHECK-NEXT:    [[C_INV:%.*]] = xor i1 [[C]], true
+; CHECK-NEXT:    br i1 [[D]], label %[[CALLBR:.*]], label %[[NOCALLBR:.*]]
+; CHECK:       [[CALLBR]]:
+; CHECK-NEXT:    callbr void asm "", "!i"()
+; CHECK-NEXT:            to label %[[FALLTHROUGH:.*]] [label %indirect]
+; CHECK:       [[FALLTHROUGH]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD:.*]]
+; CHECK:       [[FALLTHROUGH1:.*]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD]]
+; CHECK:       [[FALLTHROUGH2:.*]]:
+; CHECK-NEXT:    br i1 [[D]], label %[[FALLTHROUGH1]], label %[[RET:.*]]
+; CHECK:       [[INDIRECT:.*]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD1:.*]]
+; CHECK:       [[INDIRECT1:.*]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD1]]
+; CHECK:       [[INDIRECT2:.*]]:
+; CHECK-NEXT:    br i1 [[D]], label %[[INDIRECT1]], label %[[RET]]
+; CHECK:       [[NOCALLBR]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD2:.*]]
+; CHECK:       [[NOCALLBR1:.*]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD2]]
+; CHECK:       [[NOCALLBR2:.*]]:
+; CHECK-NEXT:    br i1 [[D]], label %[[NOCALLBR1]], label %[[RET]]
+; CHECK:       [[RET]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[IRR_GUARD]]:
+; CHECK-NEXT:    [[GUARD_FALLTHROUGH2:%.*]] = phi i1 [ true, %[[FALLTHROUGH1]] ], [ [[C_INV]], %[[FALLTHROUGH]] ]
+; CHECK-NEXT:    br i1 [[GUARD_FALLTHROUGH2]], label %[[FALLTHROUGH2]], label %[[FALLTHROUGH1]]
+; CHECK:       [[IRR_GUARD1]]:
+; CHECK-NEXT:    [[GUARD_INDIRECT2:%.*]] = phi i1 [ true, %[[INDIRECT1]] ], [ [[C_INV]], %[[INDIRECT]] ]
+; CHECK-NEXT:    br i1 [[GUARD_INDIRECT2]], label %[[INDIRECT2]], label %[[INDIRECT1]]
+; CHECK:       [[IRR_GUARD2]]:
+; CHECK-NEXT:    [[GUARD_NOCALLBR2:%.*]] = phi i1 [ true, %[[NOCALLBR1]] ], [ [[C_INV]], %[[NOCALLBR]] ]
+; CHECK-NEXT:    br i1 [[GUARD_NOCALLBR2]], label %[[NOCALLBR2]], label %[[NOCALLBR1]]
+;
+  br i1 %d, label %callbr, label %nocallbr
+callbr:
+  callbr void asm "", "!i"() to label %fallthrough [label %indirect]
+fallthrough:
+  br i1 %c, label %fallthrough1, label %fallthrough2
+fallthrough1:
+  br label %fallthrough2
+fallthrough2:
+  br i1 %d, label %fallthrough1, label %ret
+indirect:
+  br i1 %c, label %indirect1, label %indirect2
+indirect1:
+  br label %indirect2
+indirect2:
+  br i1 %d, label %indirect1, label %ret
+nocallbr:
+  br i1 %c, label %nocallbr1, label %nocallbr2
+nocallbr1:
+  br label %nocallbr2
+nocallbr2:
+  br i1 %d, label %nocallbr1, label %ret
+ret:
+  ret void
+}
+
+; Fix an irreducible loop in which callbr is a regular block (neither entry nor
+; header). See the example at the top of FixIrreducible.cpp.
+define void @callbr_regular1(i1 %c) {
+; CHECK-LABEL: define void @callbr_regular1(
+; CHECK-SAME: i1 [[C:%.*]]) {
+; CHECK-NEXT:    [[C_INV:%.*]] = xor i1 [[C]], true
+; CHECK-NEXT:    br label %[[IRR_GUARD:.*]]
+; CHECK:       [[NOCALLBR:.*]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD]]
+; CHECK:       [[CALLBR:.*]]:
+; CHECK-NEXT:    callbr void asm "", "!i"()
+; CHECK-NEXT:            to label %[[RET:.*]] [label %nocallbr]
+; CHECK:       [[RET]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[IRR_GUARD]]:
+; CHECK-NEXT:    [[GUARD_CALLBR:%.*]] = phi i1 [ true, %[[NOCALLBR]] ], [ [[C_INV]], [[TMP0:%.*]] ]
+; CHECK-NEXT:    br i1 [[GUARD_CALLBR]], label %[[CALLBR]], label %[[NOCALLBR]]
+;
+  br i1 %c, label %nocallbr, label %callbr
+nocallbr:
+  br label %callbr
+callbr:
+  callbr void asm "", "!i"() to label %ret [label %nocallbr]
+ret:
+  ret void
+}
+
+; Fix an irreducible loop in which callbr is a regular block (neither entry nor
+; header). See the example at the top of FixIrreducible.cpp.
+define void @callbr_regular2(i1 %c) {
+; CHECK-LABEL: define void @callbr_regular2(
+; CHECK-SAME: i1 [[C:%.*]]) {
+; CHECK-NEXT:    [[C_INV:%.*]] = xor i1 [[C]], true
+; CHECK-NEXT:    br label %[[IRR_GUARD:.*]]
+; CHECK:       [[NOCALLBR:.*]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD]]
+; CHECK:       [[CALLBR:.*]]:
+; CHECK-NEXT:    callbr void asm "", "!i"()
+; CHECK-NEXT:            to label %[[NOCALLBR]] [label %ret]
+; CHECK:       [[RET:.*:]]
+; CHECK-NEXT:    ret void
+; CHECK:       [[IRR_GUARD]]:
+; CHECK-NEXT:    [[GUARD_CALLBR:%.*]] = phi i1 [ true, %[[NOCALLBR]] ], [ [[C_INV]], [[TMP0:%.*]] ]
+; CHECK-NEXT:    br i1 [[GUARD_CALLBR]], label %[[CALLBR]], label %[[NOCALLBR]]
+;
+  br i1 %c, label %nocallbr, label %callbr
+nocallbr:
+  br label %callbr
+callbr:
+  callbr void asm "", "!i"() to label %nocallbr [label %ret]
+ret:
+  ret void
+}
+
+; Fix an irreducible loop with two callbr blocks, one as header and one as regular block.
+define void @callbr_header_and_regular(i1 %c) {
+; CHECK-LABEL: define void @callbr_header_and_regular(
+; CHECK-SAME: i1 [[C:%.*]]) {
+; CHECK-NEXT:    br label %[[CALLBR_HEADER:.*]]
+; CHECK:       [[CALLBR_HEADER]]:
+; CHECK-NEXT:    callbr void asm "", "!i"()
+; CHECK-NEXT:            to label %[[CALLBR_HEADER_TARGET_MID:.*]] [label %callbr_header.target.callbr_regular]
+; CHECK:       [[MID:.*]]:
+; CHECK-NEXT:    br i1 [[C]], label %[[IRR_GUARD:.*]], label %[[RET:.*]]
+; CHECK:       [[CALLBR_REGULAR:.*]]:
+; CHECK-NEXT:    callbr void asm "", "!i"()
+; CHECK-NEXT:            to label %[[CALLBR_HEADER]] [label %mid]
+; CHECK:       [[RET]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[CALLBR_HEADER_TARGET_MID]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD]]
+; CHECK:       [[CALLBR_HEADER_TARGET_CALLBR_REGULAR:.*]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD]]
+; CHECK:       [[IRR_GUARD]]:
+; CHECK-NEXT:    [[GUARD_CALLBR_REGULAR:%.*]] = phi i1 [ true, %[[MID]] ], [ false, %[[CALLBR_HEADER_TARGET_MID]] ], [ true, %[[CALLBR_HEADER_TARGET_CALLBR_REGULAR]] ]
+; CHECK-NEXT:    br i1 [[GUARD_CALLBR_REGULAR]], label %[[CALLBR_REGULAR]], label %[[MID]]
+;
+  br label %callbr_header
+callbr_header:
+  callbr void asm "", "!i"() to label %mid [label %callbr_regular]
+mid:
+  br i1 %c, label %callbr_regular, label %ret
+callbr_regular:
+  callbr void asm "", "!i"() to label %callbr_header [label %mid]
+ret:
+  ret void
+}
+
+; Fix an irreducible loop consisting only of callbr blocks (and ret). See the
+; example at the top of FixIrreducible.cpp.
+define void @callbr_only(i1 %c) {
+; CHECK-LABEL: define void @callbr_only(
+; CHECK-SAME: i1 [[C:%.*]]) {
+; CHECK-NEXT:  [[CALLBR:.*:]]
+; CHECK-NEXT:    callbr void asm "", "!i"()
+; CHECK-NEXT:            to label %[[CALLBR_ENTRY_TARGET_CALLBR_HEADER:.*]] [label %callbr_entry.target.callbr_block]
+; CHECK:       [[CALLBR_HEADER:.*]]:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label %[[CALLBR_HEADER_TARGET_CALLBR_BLOCK:.*]] []
+; CHECK:       [[CALLBR_BLOCK:.*]]:
+; CHECK-NEXT:    callbr void asm "", "!i"()
+; CHECK-NEXT:            to label %[[CALLBR_HEADER]] [label %ret]
+; CHECK:       [[RET:.*:]]
+; CHECK-NEXT:    ret void
+; CHECK:       [[CALLBR_HEADER_TARGET_CALLBR_BLOCK]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD:.*]]
+; CHECK:       [[CALLBR_ENTRY_TARGET_CALLBR_HEADER]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD]]
+; CHECK:       [[CALLBR_ENTRY_TARGET_CALLBR_BLOCK:.*]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD]]
+; CHECK:       [[IRR_GUARD]]:
+; CHECK-NEXT:    [[GUARD_CALLBR_BLOCK:%.*]] = phi i1 [ true, %[[CALLBR_HEADER_TARGET_CALLBR_BLOCK]] ], [ false, %[[CALLBR_ENTRY_TARGET_CALLBR_HEADER]] ], [ true, %[[CALLBR_ENTRY_TARGET_CALLBR_BLOCK]] ]
+; CHECK-NEXT:    br i1 [[GUARD_CALLBR_BLOCK]], label %[[CALLBR_BLOCK]], label %[[CALLBR_HEADER]]
+;
+callbr_entry:
+  callbr void asm "", "!i"() to label %callbr_header [label %callbr_block]
+callbr_header:
+  callbr void asm "", ""() to label %callbr_block []
+callbr_block:
+  callbr void asm "", "!i"() to label %callbr_header [label %ret]
+ret:
+  ret void
+}
+
+; Irreducible loop: entry leading to multiple callbr blocks.
+define void @entry_multiple_callbr(i1 %a, i1 %b, i1 %c) {
+; CHECK-LABEL: define void @entry_multiple_callbr(
+; CHECK-SAME: i1 [[A:%.*]], i1 [[B:%.*]], i1 [[C:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 [[A]], label %[[CB1:.*]], label %[[IRR_GUARD:.*]]
+; CHECK:       [[CB1]]:
+; CHECK-NEXT:    callbr void asm "", "!i,!i"()
+; CHECK-NEXT:            to label %[[CB1_TARGET_BLOCK:.*]] [label %[[CB1_TARGET_CB2:.*]], label %cb1.target.block1]
+; CHECK:       [[BLOCK:.*]]:
+; CHECK-NEXT:    br i1 [[B]], label %[[IRR_GUARD]], label %[[BLOCK1:.*]]
+; CHECK:       [[CB2:.*]]:
+; CHECK-NEXT:    callbr void asm "", "!i"()
+; CHECK-NEXT:            to label %[[CB2_TARGET_BLOCK1:.*]] [label %cb2.target.block]
+; CHECK:       [[BLOCK1]]:
+; CHECK-NEXT:    br i1 [[C]], label %[[IRR_GUARD2:.*]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[CB1_TARGET_BLOCK]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD]]
+; CHECK:       [[CB1_TARGET_CB2]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD]]
+; CHECK:       [[CB1_TARGET_BLOCK1:.*]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD]]
+; CHECK:       [[IRR_GUARD]]:
+; CHECK-NEXT:    [[GUARD_CB2:%.*]] = phi i1 [ true, %[[BLOCK]] ], [ false, %[[CB1_TARGET_BLOCK]] ], [ true, %[[CB1_TARGET_CB2]] ], [ false, %[[CB1_TARGET_BLOCK1]] ], [ true, %[[ENTRY]] ]
+; CHECK-NEXT:    [[GUARD_BLOCK:%.*]] = phi i1 [ false, %[[BLOCK]] ], [ true, %[[CB1_TARGET_BLOCK]] ], [ false, %[[CB1_TARGET_CB2]] ], [ false, %[[CB1_TARGET_BLOCK1]] ], [ false, %[[ENTRY]] ]
+; CHECK-NEXT:    br i1 [[GUARD_CB2]], label %[[CB2]], label %[[IRR_GUARD1:.*]]
+; CHECK:       [[IRR_GUARD1]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD2]]
+; CHECK:       [[CB2_TARGET_BLOCK1]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD2]]
+; CHECK:       [[CB2_TARGET_BLOCK:.*]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD2]]
+; CHECK:       [[IRR_GUARD2]]:
+; CHECK-NEXT:    [[GUARD_BLOCK3:%.*]] = phi i1 [ true, %[[BLOCK1]] ], [ [[GUARD_BLOCK]], %[[IRR_GUARD1]] ], [ false, %[[CB2_TARGET_BLOCK1]] ], [ true, %[[CB2_TARGET_BLOCK]] ]
+; CHECK-NEXT:    br i1 [[GUARD_BLOCK3]], label %[[BLOCK]], label %[[BLOCK1]]
+;
+entry:
+  br i1 %a, label %cb1, label %cb2
+cb1:
+  callbr void asm "", "!i,!i"() to label %block [label %cb2, label %block1]
+block:
+  br i1 %b, label %cb2, label %block1
+cb2:
+  callbr void asm "", "!i"() to label %block1 [label %block]
+block1:
+  br i1 %c, label %block, label %exit
+exit:
+  ret void
+}
+
+; Irreducible loop: callbr as loop exit, with multiple entries
+define void @callbr_exit_with_separate_entries(i1 %a, i1 %b, i1 %c) {
+; CHECK-LABEL: define void @callbr_exit_with_separate_entries(
+; CHECK-SAME: i1 [[A:%.*]], i1 [[B:%.*]], i1 [[C:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[C_INV:%.*]] = xor i1 [[C]], true
+; CHECK-NEXT:    [[A_INV:%.*]] = xor i1 [[A]], true
+; CHECK-NEXT:    br label %[[IRR_GUARD:.*]]
+; CHECK:       [[L1:.*]]:
+; CHECK-NEXT:    br i1 [[B]], label %[[CB:.*]], label %[[IRR_GUARD]]
+; CHECK:       [[L2:.*]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD1:.*]]
+; CHECK:       [[CB]]:
+; CHECK-NEXT:    callbr void asm "", "!i"()
+; CHECK-NEXT:            to label %[[EXIT:.*]] [label %cb.target.l1]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[IRR_GUARD]]:
+; CHECK-NEXT:    [[GUARD_L2:%.*]] = phi i1 [ true, %[[L1]] ], [ [[A_INV]], %[[ENTRY]] ]
+; CHECK-NEXT:    br i1 [[GUARD_L2]], label %[[L2]], label %[[IRR_GUARD1]]
+; CHECK:       [[CB_TARGET_L1:.*]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD1]]
+; CHECK:       [[IRR_GUARD1]]:
+; CHECK-NEXT:    [[GUARD_L1:%.*]] = phi i1 [ true, %[[CB_TARGET_L1]] ], [ true, %[[IRR_GUARD]] ], [ [[C_INV]], %[[L2]] ]
+; CHECK-NEXT:    br i1 [[GUARD_L1]], label %[[L1]], label %[[CB]]
+;
+entry:
+  br i1 %a, label %l1, label %l2
+l1:
+  br i1 %b, label %cb, label %l2
+l2:
+  br i1 %c, label %cb, label %l1
+cb:
+  callbr void asm "", "!i"() to label %exit [label %l1]
+exit:
+  ret void
+}
+
+define void @callbr_exit_with_separate_entries1(i1 %a, i1 %b) {
+; CHECK-LABEL: define void @callbr_exit_with_separate_entries1(
+; CHECK-SAME: i1 [[A:%.*]], i1 [[B:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[A_INV:%.*]] = xor i1 [[A]], true
+; CHECK-NEXT:    br label %[[IRR_GUARD:.*]]
+; CHECK:       [[LOOP1:.*]]:
+; CHECK-NEXT:    br i1 [[B]], label %[[CB:.*]], label %[[IRR_GUARD]]
+; CHECK:       [[LOOP2:.*]]:
+; CHECK-NEXT:    br label %[[LOOP1]]
+; CHECK:       [[CB]]:
+; CHECK-NEXT:    callbr void asm "", "!i"()
+; CHECK-NEXT:            to label %[[EXIT:.*]] [label %cb.target.loop2]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[CB_TARGET_LOOP2:.*]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD]]
+; CHECK:       [[IRR_GUARD]]:
+; CHECK-NEXT:    [[GUARD_LOOP2:%.*]] = phi i1 [ true, %[[CB_TARGET_LOOP2]] ], [ true, %[[LOOP1]] ], [ [[A_INV]], %[[ENTRY]] ]
+; CHECK-NEXT:    br i1 [[GUARD_LOOP2]], label %[[LOOP2]], label %[[LOOP1]]
+;
+entry:
+  br i1 %a, label %loop1, label %loop2
+loop1:
+  br i1 %b, label %cb, label %loop2
+loop2:
+  br label %loop1
+cb:
+  callbr void asm "", "!i"() to label %exit [label %loop2]
+exit:
+  ret void
+}
+
+; Irreducible loop: all blocks are callbrs, with cross-edges
+define void @callbr_only_multiple(i1 %a, i1 %b, i1 %c) {
+; CHECK-LABEL: define void @callbr_only_multiple(
+; CHECK-SAME: i1 [[A:%.*]], i1 [[B:%.*]], i1 [[C:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    callbr void asm "", "!i,!i"()
+; CHECK-NEXT:            to label %[[ENTRY_TARGET_CB1:.*]] [label %[[ENTRY_TARGET_CB2:.*]], label %entry.target.cb3]
+; CHECK:       [[CB1:.*]]:
+; CHECK-NEXT:    callbr void asm "", "!i"()
+; CHECK-NEXT:            to label %[[CB2:.*]] [label %cb1.target.cb3]
+; CHECK:       [[CB2]]:
+; CHECK-NEXT:    callbr void asm "", "!i"()
+; CHECK-NEXT:            to label %[[CB2_TARGET_CB3:.*]] [label %cb2.target.cb1]
+; CHECK:       [[CB3:.*]]:
+; CHECK-NEXT:    callbr void asm "", "!i"()
+; CHECK-NEXT:            to label %[[CB3_TARGET_CB1:.*]] [label %exit]
+; CHECK:       [[EXIT:.*:]]
+; CHECK-NEXT:    ret void
+; CHECK:       [[CB2_TARGET_CB3]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD:.*]]
+; CHECK:       [[CB1_TARGET_CB3:.*]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD]]
+; CHECK:       [[ENTRY_TARGET_CB1]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD]]
+; CHECK:       [[ENTRY_TARGET_CB2]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD]]
+; CHECK:       [[ENTRY_TARGET_CB3:.*]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD]]
+; CHECK:       [[IRR_GUARD]]:
+; CHECK-NEXT:    [[GUARD_CB3:%.*]] = phi i1 [ true, %[[CB2_TARGET_CB3]] ], [ true, %[[CB1_TARGET_CB3]] ], [ false, %[[ENTRY_TARGET_CB1]] ], [ false, %[[ENTRY_TARGET_CB2]] ], [ true, %[[ENTRY_TARGET_CB3]] ]
+; CHECK-NEXT:    [[GUARD_CB1:%.*]] = phi i1 [ false, %[[CB2_TARGET_CB3]] ], [ false, %[[CB1_TARGET_CB3]] ], [ true, %[[ENTRY_TARGET_CB1]] ], [ false, %[[ENTRY_TARGET_CB2]] ], [ false, %[[ENTRY_TARGET_CB3]] ]
+; CHECK-NEXT:    br i1 [[GUARD_CB3]], label %[[CB3]], label %[[IRR_GUARD1:.*]]
+; CHECK:       [[IRR_GUARD1]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD2:.*]]
+; CHECK:       [[CB2_TARGET_CB1:.*]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD2]]
+; CHECK:       [[CB3_TARGET_CB1]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD2]]
+; CHECK:       [[IRR_GUARD2]]:
+; CHECK-NEXT:    [[GUARD_CB13:%.*]] = phi i1 [ true, %[[CB2_TARGET_CB1]] ], [ [[GUARD_CB1]], %[[IRR_GUARD1]] ], [ true, %[[CB3_TARGET_CB1]] ]
+; CHECK-NEXT:    br i1 [[GUARD_CB13]], label %[[CB1]], label %[[CB2]]
+;
+entry:
+  callbr void asm "", "!i,!i"() to label %cb1 [label %cb2, label %cb3]
+cb1:
+  callbr void asm "", "!i"() to label %cb2 [label %cb3]
+cb2:
+  callbr void asm "", "!i"() to label %cb3 [label %cb1]
+cb3:
+  callbr void asm "", "!i"() to label %cb1 [label %exit]
+exit:
+  ret void
+}
+
+; Irreducible loop: callbr as a "bypass" block
+define void @callbr_bypass(i1 %a, i1 %b, i1 %c) {
+; CHECK-LABEL: define void @callbr_bypass(
+; CHECK-SAME: i1 [[A:%.*]], i1 [[B:%.*]], i1 [[C:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[B_INV:%.*]] = xor i1 [[B]], true
+; CHECK-NEXT:    [[A_INV:%.*]] = xor i1 [[A]], true
+; CHECK-NEXT:    br label %[[IRR_GUARD:.*]]
+; CHECK:       [[CB:.*]]:
+; CHECK-NEXT:    callbr void asm "", "!i"()
+; CHECK-NEXT:            to label %[[L2:.*]] [label %cb.target.l1]
+; CHECK:       [[L1:.*]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD1:.*]]
+; CHECK:       [[L2]]:
+; CHECK-NEXT:    br i1 [[C]], label %[[IRR_GUARD1]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[CB_TARGET_L1:.*]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD]]
+; CHECK:       [[IRR_GUARD]]:
+; CHECK-NEXT:    [[GUARD_L1:%.*]] = phi i1 [ true, %[[CB_TARGET_L1]] ], [ [[A_INV]], %[[ENTRY]] ]
+; CHECK-NEXT:    br i1 [[GUARD_L1]], label %[[L1]], label %[[IRR_GUARD1]]
+; CHECK:       [[IRR_GUARD1]]:
+; CHECK-NEXT:    [[GUARD_CB:%.*]] = phi i1 [ true, %[[L2]] ], [ true, %[[IRR_GUARD]] ], [ [[B_INV]], %[[L1]] ]
+; CHECK-NEXT:    br i1 [[GUARD_CB]], label %[[CB]], label %[[L2]]
+;
+entry:
+  br i1 %a, label %cb, label %l1
+cb:
+  callbr void asm "", "!i"() to label %l2 [label %l1]
+l1:
+  br i1 %b, label %l2, label %cb
+l2:
+  br i1 %c, label %cb, label %exit
+exit:
+  ret void
+}
+
+; Irreducible loop: callbr with multiple indirect targets, some looping, some exiting
+define void @callbr_multiple_with_exit(i1 %a, i1 %b, i1 %c) {
+; CHECK-LABEL: define void @callbr_multiple_with_exit(
+; CHECK-SAME: i1 [[A:%.*]], i1 [[B:%.*]], i1 [[C:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    callbr void asm "", "!i,!i,!i"()
+; CHECK-NEXT:            to label %[[ENTRY_TARGET_L1:.*]] [label %[[ENTRY_TARGET_L2:.*]], label %[[EXIT:.*]], label %entry.target.l3]
+; CHECK:       [[L1:.*]]:
+; CHECK-NEXT:    br i1 [[A]], label %[[L2:.*]], label %[[IRR_GUARD:.*]]
+; CHECK:       [[L2]]:
+; CHECK-NEXT:    br i1 [[B]], label %[[IRR_GUARD2:.*]], label %[[EXIT]]
+; CHECK:       [[L3:.*]]:
+; CHECK-NEXT:    br i1 [[C]], label %[[IRR_GUARD2]], label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[ENTRY_TARGET_L1]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD]]
+; CHECK:       [[ENTRY_TARGET_L2]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD]]
+; CHECK:       [[ENTRY_TARGET_L3:.*]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD]]
+; CHECK:       [[IRR_GUARD]]:
+; CHECK-NEXT:    [[GUARD_L3:%.*]] = phi i1 [ true, %[[L1]] ], [ false, %[[ENTRY_TARGET_L1]] ], [ false, %[[ENTRY_TARGET_L2]] ], [ true, %[[ENTRY_TARGET_L3]] ]
+; CHECK-NEXT:    [[GUARD_L1:%.*]] = phi i1 [ false, %[[L1]] ], [ true, %[[ENTRY_TARGET_L1]] ], [ false, %[[ENTRY_TARGET_L2]] ], [ false, %[[ENTRY_TARGET_L3]] ]
+; CHECK-NEXT:    br i1 [[GUARD_L3]], label %[[L3]], label %[[IRR_GUARD1:.*]]
+; CHECK:       [[IRR_GUARD1]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD2]]
+; CHECK:       [[IRR_GUARD2]]:
+; CHECK-NEXT:    [[GUARD_L13:%.*]] = phi i1 [ true, %[[L2]] ], [ [[GUARD_L1]], %[[IRR_GUARD1]] ], [ true, %[[L3]] ]
+; CHECK-NEXT:    br i1 [[GUARD_L13]], label %[[L1]], label %[[L2]]
+;
+entry:
+  callbr void asm "", "!i,!i,!i"() to label %l1 [label %l2, label %exit, label %l3]
+l1:
+  br i1 %a, label %l2, label %l3
+l2:
+  br i1 %b, label %l1, label %exit
+l3:
+  br i1 %c, label %l1, label %exit
+exit:
+  ret void
+}
+
+define void @callbr_nested(i1 %c, i1 %d) {
+; CHECK-LABEL: define void @callbr_nested(
+; CHECK-SAME: i1 [[C:%.*]], i1 [[D:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    callbr void asm "", "!i"()
+; CHECK-NEXT:            to label %[[ENTRY_TARGET_H:.*]] [label %entry.target.b]
+; CHECK:       [[H:.*]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD1:.*]]
+; CHECK:       [[B:.*]]:
+; CHECK-NEXT:    callbr void asm "", "!i,!i"()
+; CHECK-NEXT:            to label %[[H]] [label %[[B_TARGET_BH:.*]], label %b.target.bb]
+; CHECK:       [[BH:.*]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD:.*]]
+; CHECK:       [[BB:.*]]:
+; CHECK-NEXT:    br i1 [[C]], label %[[BH]], label %[[RET:.*]]
+; CHECK:       [[RET]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[B_TARGET_BH]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD]]
+; CHECK:       [[B_TARGET_BB:.*]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD]]
+; CHECK:       [[IRR_GUARD]]:
+; CHECK-NEXT:    [[GUARD_BB:%.*]] = phi i1 [ true, %[[BH]] ], [ false, %[[B_TARGET_BH]] ], [ true, %[[B_TARGET_BB]] ]
+; CHECK-NEXT:    br i1 [[GUARD_BB]], label %[[BB]], label %[[BH]]
+; CHECK:       [[ENTRY_TARGET_H]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD1]]
+; CHECK:       [[ENTRY_TARGET_B:.*]]:
+; CHECK-NEXT:    br label %[[IRR_GUARD1]]
+; CHECK:       [[IRR_GUARD1]]:
+; CHECK-NEXT:    [[GUARD_B:%.*]] = phi i1 [ true, %[[H]] ], [ false, %[[ENTRY_TARGET_H]] ], [ true, %[[ENTRY_TARGET_B]] ]
+; CHECK-NEXT:    br i1 [[GUARD_B]], label %[[B]], label %[[H]]
+;
+entry:
+  callbr void asm "","!i"() to label %h [label %b]
+h:
+  br label %b
+b:
+  callbr void asm "","!i,!i"() to label %h [label %bh, label %bb]
+bh:
+  br label %bb
+bb:
+  br i1 %c, label %bh, label %ret
+ret:
+  ret void
+}
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; LOOPS-AFTER: {{.*}}
+; LOOPS-BEFORE: {{.*}}
diff --git a/llvm/test/Transforms/FixIrreducible/nested.ll b/llvm/test/Transforms/FixIrreducible/nested.ll
index 0cc6b473d62f6..c9161cc14f208 100644
--- a/llvm/test/Transforms/FixIrreducible/nested.ll
+++ b/llvm/test/Transforms/FixIrreducible/nested.ll
@@ -50,6 +50,69 @@ exit:
   ret void
 }
 
+define void @nested_irr_top_level_callbr(i1 %Pred0, i1 %Pred1, i1 %Pred2, i1 %Pred3, i1 %Pred4, i1 %Pred5) {
+; CHECK-LABEL: @nested_irr_top_level_callbr(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED0:%.*]])
+; CHECK-NEXT:            to label [[ENTRY_TARGET_A1:%.*]] [label %entry.target.A2]
+; CHECK:       A1:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED1:%.*]])
+; CHECK-NEXT:            to label [[A1_TARGET_B1:%.*]] [label %A1.target.B2]
+; CHECK:       B1:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED2:%.*]])
+; CHECK-NEXT:            to label [[B1_TARGET_B2:%.*]] [label %A3]
+; CHECK:       B2:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED3:%.*]])
+; CHECK-NEXT:            to label [[B1:%.*]] [label %A3]
+; CHECK:       A3:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED4:%.*]])
+; CHECK-NEXT:            to label [[A3_TARGET_A2:%.*]] [label %exit]
+; CHECK:       A2:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED5:%.*]])
+; CHECK-NEXT:            to label [[A1:%.*]] [label %exit]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+; CHECK:       A3.target.A2:
+; CHECK-NEXT:    br label [[IRR_GUARD:%.*]]
+; CHECK:       entry.target.A1:
+; CHECK-NEXT:    br label [[IRR_GUARD]]
+; CHECK:       entry.target.A2:
+; CHECK-NEXT:    br label [[IRR_GUARD]]
+; CHECK:       irr.guard:
+; CHECK-NEXT:    [[GUARD_A2:%.*]] = phi i1 [ true, [[A3_TARGET_A2]] ], [ false, [[ENTRY_TARGET_A1]] ], [ true, [[ENTRY_TARGET_A2:%.*]] ]
+; CHECK-NEXT:    br i1 [[GUARD_A2]], label [[A2:%.*]], label [[A1]]
+; CHECK:       B1.target.B2:
+; CHECK-NEXT:    br label [[IRR_GUARD1:%.*]]
+; CHECK:       A1.target.B1:
+; CHECK-NEXT:    br label [[IRR_GUARD1]]
+; CHECK:       A1.target.B2:
+; CHECK-NEXT:    br label [[IRR_GUARD1]]
+; CHECK:       irr.guard1:
+; CHECK-NEXT:    [[GUARD_B2:%.*]] = phi i1 [ true, [[B1_TARGET_B2]] ], [ false, [[A1_TARGET_B1]] ], [ true, [[A1_TARGET_B2:%.*]] ]
+; CHECK-NEXT:    br i1 [[GUARD_B2]], label [[B2:%.*]], label [[B1]]
+;
+entry:
+  callbr void asm "", "r,!i"(i1 %Pred0) to label %A1 [label %A2]
+
+A1:
+  callbr void asm "", "r,!i"(i1 %Pred1) to label %B1 [label %B2]
+
+B1:
+  callbr void asm "", "r,!i"(i1 %Pred2) to label %B2 [label %A3]
+
+B2:
+  callbr void asm "", "r,!i"(i1 %Pred3) to label %B1 [label %A3]
+
+A3:
+  callbr void asm "", "r,!i"(i1 %Pred4) to label %A2 [label %exit]
+
+A2:
+  callbr void asm "", "r,!i"(i1 %Pred5) to label %A1 [label %exit]
+
+exit:
+  ret void
+}
+
 define void @nested_irr_in_loop(i1 %Pred0, i1 %Pred1, i1 %Pred2, i1 %Pred3, i1 %Pred4, i1 %Pred5, i1 %Pred6) {
 ; CHECK-LABEL: @nested_irr_in_loop(
 ; CHECK-NEXT:  entry:
@@ -107,6 +170,80 @@ exit:
   ret void
 }
 
+define void @nested_irr_in_loop_callbr(i1 %Pred0, i1 %Pred1, i1 %Pred2, i1 %Pred3, i1 %Pred4, i1 %Pred5, i1 %Pred6) {
+; CHECK-LABEL: @nested_irr_in_loop_callbr(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[H1:%.*]]
+; CHECK:       H1:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED0:%.*]])
+; CHECK-NEXT:            to label [[H1_TARGET_A1:%.*]] [label %H1.target.A2]
+; CHECK:       A1:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED1:%.*]])
+; CHECK-NEXT:            to label [[A1_TARGET_B1:%.*]] [label %A1.target.B2]
+; CHECK:       B1:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED2:%.*]])
+; CHECK-NEXT:            to label [[B1_TARGET_B2:%.*]] [label %A3]
+; CHECK:       B2:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED3:%.*]])
+; CHECK-NEXT:            to label [[B1:%.*]] [label %A3]
+; CHECK:       A3:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED4:%.*]])
+; CHECK-NEXT:            to label [[A3_TARGET_A2:%.*]] [label %L1]
+; CHECK:       A2:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED5:%.*]])
+; CHECK-NEXT:            to label [[A1:%.*]] [label %L1]
+; CHECK:       L1:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED6:%.*]])
+; CHECK-NEXT:            to label [[EXIT:%.*]] [label %H1]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+; CHECK:       A3.target.A2:
+; CHECK-NEXT:    br label [[IRR_GUARD:%.*]]
+; CHECK:       H1.target.A1:
+; CHECK-NEXT:    br label [[IRR_GUARD]]
+; CHECK:       H1.target.A2:
+; CHECK-NEXT:    br label [[IRR_GUARD]]
+; CHECK:       irr.guard:
+; CHECK-NEXT:    [[GUARD_A2:%.*]] = phi i1 [ true, [[A3_TARGET_A2]] ], [ false, [[H1_TARGET_A1]] ], [ true, [[H1_TARGET_A2:%.*]] ]
+; CHECK-NEXT:    br i1 [[GUARD_A2]], label [[A2:%.*]], label [[A1]]
+; CHECK:       B1.target.B2:
+; CHECK-NEXT:    br label [[IRR_GUARD1:%.*]]
+; CHECK:       A1.target.B1:
+; CHECK-NEXT:    br label [[IRR_GUARD1]]
+; CHECK:       A1.target.B2:
+; CHECK-NEXT:    br label [[IRR_GUARD1]]
+; CHECK:       irr.guard1:
+; CHECK-NEXT:    [[GUARD_B2:%.*]] = phi i1 [ true, [[B1_TARGET_B2]] ], [ false, [[A1_TARGET_B1]] ], [ true, [[A1_TARGET_B2:%.*]] ]
+; CHECK-NEXT:    br i1 [[GUARD_B2]], label [[B2:%.*]], label [[B1]]
+;
+entry:
+  br label %H1
+
+H1:
+  callbr void asm "", "r,!i"(i1 %Pred0) to label %A1 [label %A2]
+
+A1:
+  callbr void asm "", "r,!i"(i1 %Pred1) to label %B1 [label %B2]
+
+B1:
+  callbr void asm "", "r,!i"(i1 %Pred2) to label %B2 [label %A3]
+
+B2:
+  callbr void asm "", "r,!i"(i1 %Pred3) to label %B1 [label %A3]
+
+A3:
+  callbr void asm "", "r,!i"(i1 %Pred4) to label %A2 [label %L1]
+
+A2:
+  callbr void asm "", "r,!i"(i1 %Pred5) to label %A1 [label %L1]
+
+L1:
+  callbr void asm "", "r,!i"(i1 %Pred6) to label %exit [label %H1]
+
+exit:
+  ret void
+}
+
 define void @loop_in_irr(i1 %Pred0, i1 %Pred1, i1 %Pred2) {
 ; CHECK-LABEL: @loop_in_irr(
 ; CHECK-NEXT:  entry:
@@ -150,6 +287,60 @@ exit:
   ret void
 }
 
+define void @loop_in_irr_callbr(i1 %Pred0, i1 %Pred1, i1 %Pred2) {
+; CHECK-LABEL: @loop_in_irr_callbr(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED0:%.*]])
+; CHECK-NEXT:            to label [[ENTRY_TARGET_A1:%.*]] [label %entry.target.A2]
+; CHECK:       A1:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[H1:%.*]] []
+; CHECK:       H1:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[L1:%.*]] []
+; CHECK:       L1:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED1:%.*]])
+; CHECK-NEXT:            to label [[H1]] [label %A3]
+; CHECK:       A3:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED2:%.*]])
+; CHECK-NEXT:            to label [[A3_TARGET_A2:%.*]] [label %exit]
+; CHECK:       A2:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[A1:%.*]] []
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+; CHECK:       A3.target.A2:
+; CHECK-NEXT:    br label [[IRR_GUARD:%.*]]
+; CHECK:       entry.target.A1:
+; CHECK-NEXT:    br label [[IRR_GUARD]]
+; CHECK:       entry.target.A2:
+; CHECK-NEXT:    br label [[IRR_GUARD]]
+; CHECK:       irr.guard:
+; CHECK-NEXT:    [[GUARD_A2:%.*]] = phi i1 [ true, [[A3_TARGET_A2]] ], [ false, [[ENTRY_TARGET_A1]] ], [ true, [[ENTRY_TARGET_A2:%.*]] ]
+; CHECK-NEXT:    br i1 [[GUARD_A2]], label [[A2:%.*]], label [[A1]]
+;
+entry:
+  callbr void asm "", "r,!i"(i1 %Pred0) to label %A1 [label %A2]
+
+A1:
+  callbr void asm "", ""() to label %H1 []
+
+H1:
+  callbr void asm "", ""() to label %L1 []
+
+L1:
+  callbr void asm "", "r,!i"(i1 %Pred1) to label %H1 [label %A3]
+
+A3:
+  callbr void asm "", "r,!i"(i1 %Pred2) to label %A2 [label %exit]
+
+A2:
+  callbr void asm "", ""() to label %A1 []
+
+exit:
+  ret void
+}
+
 define void @loop_in_irr_shared_entry(i1 %Pred0, i1 %Pred1, i1 %Pred2) {
 ; CHECK-LABEL: @loop_in_irr_shared_entry(
 ; CHECK-NEXT:  entry:
@@ -188,6 +379,54 @@ exit:
   ret void
 }
 
+define void @loop_in_irr_shared_entry_callbr(i1 %Pred0, i1 %Pred1, i1 %Pred2) {
+; CHECK-LABEL: @loop_in_irr_shared_entry_callbr(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED0:%.*]])
+; CHECK-NEXT:            to label [[ENTRY_TARGET_H1:%.*]] [label %entry.target.A2]
+; CHECK:       H1:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[L1:%.*]] []
+; CHECK:       L1:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED1:%.*]])
+; CHECK-NEXT:            to label [[H1:%.*]] [label %A3]
+; CHECK:       A3:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED2:%.*]])
+; CHECK-NEXT:            to label [[A3_TARGET_A2:%.*]] [label %exit]
+; CHECK:       A2:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[H1]] []
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+; CHECK:       A3.target.A2:
+; CHECK-NEXT:    br label [[IRR_GUARD:%.*]]
+; CHECK:       entry.target.H1:
+; CHECK-NEXT:    br label [[IRR_GUARD]]
+; CHECK:       entry.target.A2:
+; CHECK-NEXT:    br label [[IRR_GUARD]]
+; CHECK:       irr.guard:
+; CHECK-NEXT:    [[GUARD_A2:%.*]] = phi i1 [ true, [[A3_TARGET_A2]] ], [ false, [[ENTRY_TARGET_H1]] ], [ true, [[ENTRY_TARGET_A2:%.*]] ]
+; CHECK-NEXT:    br i1 [[GUARD_A2]], label [[A2:%.*]], label [[H1]]
+;
+entry:
+  callbr void asm "", "r,!i"(i1 %Pred0) to label %H1 [label %A2]
+
+H1:
+  callbr void asm "", ""() to label %L1 []
+
+L1:
+  callbr void asm "", "r,!i"(i1 %Pred1) to label %H1 [label %A3]
+
+A3:
+  callbr void asm "", "r,!i"(i1 %Pred2) to label %A2 [label %exit]
+
+A2:
+  callbr void asm "", ""() to label %H1 []
+
+exit:
+  ret void
+}
+
 define void @loop_in_irr_shared_header(i1 %Pred0, i1 %Pred1, i1 %Pred2) {
 ; CHECK-LABEL: @loop_in_irr_shared_header(
 ; CHECK-NEXT:  entry:
@@ -226,6 +465,56 @@ exit:
   ret void
 }
 
+define void @loop_in_irr_shared_header_callbr(i1 %Pred0, i1 %Pred1, i1 %Pred2) {
+; CHECK-LABEL: @loop_in_irr_shared_header_callbr(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED0:%.*]])
+; CHECK-NEXT:            to label [[ENTRY_TARGET_A2:%.*]] [label %entry.target.H1]
+; CHECK:       H1:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[L1:%.*]] []
+; CHECK:       L1:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED1:%.*]])
+; CHECK-NEXT:            to label [[L1_TARGET_H1:%.*]] [label %A3]
+; CHECK:       A3:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED2:%.*]])
+; CHECK-NEXT:            to label [[A2:%.*]] [label %exit]
+; CHECK:       A2:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[A2_TARGET_H1:%.*]] []
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+; CHECK:       A2.target.H1:
+; CHECK-NEXT:    br label [[IRR_GUARD:%.*]]
+; CHECK:       L1.target.H1:
+; CHECK-NEXT:    br label [[IRR_GUARD]]
+; CHECK:       entry.target.A2:
+; CHECK-NEXT:    br label [[IRR_GUARD]]
+; CHECK:       entry.target.H1:
+; CHECK-NEXT:    br label [[IRR_GUARD]]
+; CHECK:       irr.guard:
+; CHECK-NEXT:    [[GUARD_H1:%.*]] = phi i1 [ true, [[A2_TARGET_H1]] ], [ true, [[L1_TARGET_H1]] ], [ false, [[ENTRY_TARGET_A2]] ], [ true, [[ENTRY_TARGET_H1:%.*]] ]
+; CHECK-NEXT:    br i1 [[GUARD_H1]], label [[H1:%.*]], label [[A2]]
+;
+entry:
+  callbr void asm "", "r,!i"(i1 %Pred0) to label %A2 [label %H1]
+
+H1:
+  callbr void asm "", ""() to label %L1 []
+
+L1:
+  callbr void asm "", "r,!i"(i1 %Pred1) to label %H1 [label %A3]
+
+A3:
+  callbr void asm "", "r,!i"(i1 %Pred2) to label %A2 [label %exit]
+
+A2:
+  callbr void asm "", ""() to label %H1 []
+
+exit:
+  ret void
+}
+
 define void @loop_irr_loop_shared_header(i1 %Pred0, i1 %Pred1, i1 %Pred2, i1 %Pred3) {
 ; CHECK-LABEL: @loop_irr_loop_shared_header(
 ; CHECK-NEXT:  entry:
@@ -269,6 +558,62 @@ exit:
   ret void
 }
 
+define void @loop_irr_loop_shared_header_callbr(i1 %Pred0, i1 %Pred1, i1 %Pred2, i1 %Pred3) {
+; CHECK-LABEL: @loop_irr_loop_shared_header_callbr(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[H2:%.*]] []
+; CHECK:       H2:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED0:%.*]])
+; CHECK-NEXT:            to label [[H2_TARGET_A2:%.*]] [label %H2.target.H1]
+; CHECK:       H1:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED1:%.*]])
+; CHECK-NEXT:            to label [[A3:%.*]] [label %H1.target.H1]
+; CHECK:       A3:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED2:%.*]])
+; CHECK-NEXT:            to label [[A2:%.*]] [label %L2]
+; CHECK:       A2:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[A2_TARGET_H1:%.*]] []
+; CHECK:       L2:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED3:%.*]])
+; CHECK-NEXT:            to label [[H2]] [label %exit]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+; CHECK:       A2.target.H1:
+; CHECK-NEXT:    br label [[IRR_GUARD:%.*]]
+; CHECK:       H1.target.H1:
+; CHECK-NEXT:    br label [[IRR_GUARD]]
+; CHECK:       H2.target.A2:
+; CHECK-NEXT:    br label [[IRR_GUARD]]
+; CHECK:       H2.target.H1:
+; CHECK-NEXT:    br label [[IRR_GUARD]]
+; CHECK:       irr.guard:
+; CHECK-NEXT:    [[GUARD_H1:%.*]] = phi i1 [ true, [[A2_TARGET_H1]] ], [ true, [[H1_TARGET_H1:%.*]] ], [ false, [[H2_TARGET_A2]] ], [ true, [[H2_TARGET_H1:%.*]] ]
+; CHECK-NEXT:    br i1 [[GUARD_H1]], label [[H1:%.*]], label [[A2]]
+;
+entry:
+  callbr void asm "", ""() to label %H2 []
+
+H2:
+  callbr void asm "", "r,!i"(i1 %Pred0) to label %A2 [label %H1]
+
+H1:
+  callbr void asm "", "r,!i"(i1 %Pred1) to label %A3 [label %H1]
+
+A3:
+  callbr void asm "", "r,!i"(i1 %Pred2) to label %A2 [label %L2]
+
+A2:
+  callbr void asm "", ""() to label %H1 []
+
+L2:
+  callbr void asm "", "r,!i"(i1 %Pred3) to label %H2 [label %exit]
+
+exit:
+  ret void
+}
+
 define void @siblings_top_level(i1 %Pred0, i1 %Pred1, i1 %Pred2, i1 %Pred3, i1 %Pred4, i1 %Pred5, i1 %Pred6) {
 ; CHECK-LABEL: @siblings_top_level(
 ; CHECK-NEXT:  entry:
@@ -336,6 +681,93 @@ exit:
   ret void
 }
 
+define void @siblings_top_level_callbr(i1 %Pred0, i1 %Pred1, i1 %Pred2, i1 %Pred3, i1 %Pred4, i1 %Pred5, i1 %Pred6) {
+; CHECK-LABEL: @siblings_top_level_callbr(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED0:%.*]])
+; CHECK-NEXT:            to label [[H1:%.*]] [label %fork1]
+; CHECK:       H1:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED1:%.*]])
+; CHECK-NEXT:            to label [[H1_TARGET_A1:%.*]] [label %H1.target.A2]
+; CHECK:       A1:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[A1_TARGET_A2:%.*]] []
+; CHECK:       A2:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED2:%.*]])
+; CHECK-NEXT:            to label [[A1:%.*]] [label %L1]
+; CHECK:       L1:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED3:%.*]])
+; CHECK-NEXT:            to label [[H1]] [label %exit]
+; CHECK:       fork1:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED4:%.*]])
+; CHECK-NEXT:            to label [[FORK1_TARGET_B1:%.*]] [label %fork1.target.B2]
+; CHECK:       B1:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[H2:%.*]] []
+; CHECK:       H2:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[L2:%.*]] []
+; CHECK:       L2:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED5:%.*]])
+; CHECK-NEXT:            to label [[H2]] [label %L2.target.B2]
+; CHECK:       B2:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED6:%.*]])
+; CHECK-NEXT:            to label [[B1:%.*]] [label %exit]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+; CHECK:       A1.target.A2:
+; CHECK-NEXT:    br label [[IRR_GUARD:%.*]]
+; CHECK:       H1.target.A1:
+; CHECK-NEXT:    br label [[IRR_GUARD]]
+; CHECK:       H1.target.A2:
+; CHECK-NEXT:    br label [[IRR_GUARD]]
+; CHECK:       irr.guard:
+; CHECK-NEXT:    [[GUARD_A2:%.*]] = phi i1 [ true, [[A1_TARGET_A2]] ], [ false, [[H1_TARGET_A1]] ], [ true, [[H1_TARGET_A2:%.*]] ]
+; CHECK-NEXT:    br i1 [[GUARD_A2]], label [[A2:%.*]], label [[A1]]
+; CHECK:       L2.target.B2:
+; CHECK-NEXT:    br label [[IRR_GUARD1:%.*]]
+; CHECK:       fork1.target.B1:
+; CHECK-NEXT:    br label [[IRR_GUARD1]]
+; CHECK:       fork1.target.B2:
+; CHECK-NEXT:    br label [[IRR_GUARD1]]
+; CHECK:       irr.guard1:
+; CHECK-NEXT:    [[GUARD_B2:%.*]] = phi i1 [ true, [[L2_TARGET_B2:%.*]] ], [ false, [[FORK1_TARGET_B1]] ], [ true, [[FORK1_TARGET_B2:%.*]] ]
+; CHECK-NEXT:    br i1 [[GUARD_B2]], label [[B2:%.*]], label [[B1]]
+;
+entry:
+  callbr void asm "", "r,!i"(i1 %Pred0) to label %H1 [label %fork1]
+
+H1:
+  callbr void asm "", "r,!i"(i1 %Pred1) to label %A1 [label %A2]
+
+A1:
+  callbr void asm "", ""() to label %A2 []
+
+A2:
+  callbr void asm "", "r,!i"(i1 %Pred2) to label %A1 [label %L1]
+
+L1:
+  callbr void asm "", "r,!i"(i1 %Pred3) to label %H1 [label %exit]
+
+fork1:
+  callbr void asm "", "r,!i"(i1 %Pred4) to label %B1 [label %B2]
+
+B1:
+  callbr void asm "", ""() to label %H2 []
+
+H2:
+  callbr void asm "", ""() to label %L2 []
+
+L2:
+  callbr void asm "", "r,!i"(i1 %Pred5) to label %H2 [label %B2]
+
+B2:
+  callbr void asm "", "r,!i"(i1 %Pred6) to label %B1 [label %exit]
+
+exit:
+  ret void
+}
+
 define void @siblings_in_loop(i1 %Pred0, i1 %Pred1, i1 %Pred2, i1 %Pred3, i1 %Pred4, i1 %Pred5, i1 %Pred6, i1 %Pred7) {
 ; CHECK-LABEL: @siblings_in_loop(
 ; CHECK-NEXT:  entry:
@@ -413,6 +845,105 @@ exit:
   ret void
 }
 
+define void @siblings_in_loop_callbr(i1 %Pred0, i1 %Pred1, i1 %Pred2, i1 %Pred3, i1 %Pred4, i1 %Pred5, i1 %Pred6, i1 %Pred7) {
+; CHECK-LABEL: @siblings_in_loop_callbr(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[H0:%.*]] []
+; CHECK:       H0:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED0:%.*]])
+; CHECK-NEXT:            to label [[H1:%.*]] [label %fork1]
+; CHECK:       H1:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED1:%.*]])
+; CHECK-NEXT:            to label [[H1_TARGET_A1:%.*]] [label %H1.target.A2]
+; CHECK:       A1:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[A1_TARGET_A2:%.*]] []
+; CHECK:       A2:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED2:%.*]])
+; CHECK-NEXT:            to label [[A1:%.*]] [label %L1]
+; CHECK:       L1:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED3:%.*]])
+; CHECK-NEXT:            to label [[H1]] [label %L0]
+; CHECK:       fork1:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED4:%.*]])
+; CHECK-NEXT:            to label [[FORK1_TARGET_B1:%.*]] [label %fork1.target.B2]
+; CHECK:       B1:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[H2:%.*]] []
+; CHECK:       H2:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[L2:%.*]] []
+; CHECK:       L2:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED5:%.*]])
+; CHECK-NEXT:            to label [[H2]] [label %L2.target.B2]
+; CHECK:       B2:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED6:%.*]])
+; CHECK-NEXT:            to label [[B1:%.*]] [label %L0]
+; CHECK:       L0:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED7:%.*]])
+; CHECK-NEXT:            to label [[EXIT:%.*]] [label %H0]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+; CHECK:       A1.target.A2:
+; CHECK-NEXT:    br label [[IRR_GUARD:%.*]]
+; CHECK:       H1.target.A1:
+; CHECK-NEXT:    br label [[IRR_GUARD]]
+; CHECK:       H1.target.A2:
+; CHECK-NEXT:    br label [[IRR_GUARD]]
+; CHECK:       irr.guard:
+; CHECK-NEXT:    [[GUARD_A2:%.*]] = phi i1 [ true, [[A1_TARGET_A2]] ], [ false, [[H1_TARGET_A1]] ], [ true, [[H1_TARGET_A2:%.*]] ]
+; CHECK-NEXT:    br i1 [[GUARD_A2]], label [[A2:%.*]], label [[A1]]
+; CHECK:       L2.target.B2:
+; CHECK-NEXT:    br label [[IRR_GUARD1:%.*]]
+; CHECK:       fork1.target.B1:
+; CHECK-NEXT:    br label [[IRR_GUARD1]]
+; CHECK:       fork1.target.B2:
+; CHECK-NEXT:    br label [[IRR_GUARD1]]
+; CHECK:       irr.guard1:
+; CHECK-NEXT:    [[GUARD_B2:%.*]] = phi i1 [ true, [[L2_TARGET_B2:%.*]] ], [ false, [[FORK1_TARGET_B1]] ], [ true, [[FORK1_TARGET_B2:%.*]] ]
+; CHECK-NEXT:    br i1 [[GUARD_B2]], label [[B2:%.*]], label [[B1]]
+;
+entry:
+  callbr void asm "", ""() to label %H0 []
+
+H0:
+  callbr void asm "", "r,!i"(i1 %Pred0) to label %H1 [label %fork1]
+
+H1:
+  callbr void asm "", "r,!i"(i1 %Pred1) to label %A1 [label %A2]
+
+A1:
+  callbr void asm "", ""() to label %A2 []
+
+A2:
+  callbr void asm "", "r,!i"(i1 %Pred2) to label %A1 [label %L1]
+
+L1:
+  callbr void asm "", "r,!i"(i1 %Pred3) to label %H1 [label %L0]
+
+fork1:
+  callbr void asm "", "r,!i"(i1 %Pred4) to label %B1 [label %B2]
+
+B1:
+  callbr void asm "", ""() to label %H2 []
+
+H2:
+  callbr void asm "", ""() to label %L2 []
+
+L2:
+  callbr void asm "", "r,!i"(i1 %Pred5) to label %H2 [label %B2]
+
+B2:
+  callbr void asm "", "r,!i"(i1 %Pred6) to label %B1 [label %L0]
+
+L0:
+  callbr void asm "", "r,!i"(i1 %Pred7) to label %exit [label %H0]
+
+exit:
+  ret void
+}
+
 define void @irr_in_irr_shared_entry(i1 %Pred0, i1 %Pred1, i1 %Pred2, i1 %Pred3, i1 %Pred4, i1 %Pred5, i1 %Pred6, i1 %Pred7, i1 %Pred8, i1 %Pred9, i1 %Pred10, i1 %Pred11, i1 %Pred12, i1 %Pred13) {
 ; CHECK-LABEL: @irr_in_irr_shared_entry(
 ; CHECK-NEXT:  entry:
@@ -527,3 +1058,148 @@ if.end8.i:
 exit:
   ret void
 }
+
+define void @irr_in_irr_shared_entry_callbr(i1 %Pred0, i1 %Pred1, i1 %Pred2, i1 %Pred3, i1 %Pred4, i1 %Pred5, i1 %Pred6, i1 %Pred7, i1 %Pred8, i1 %Pred9, i1 %Pred10, i1 %Pred11, i1 %Pred12, i1 %Pred13) {
+; CHECK-LABEL: @irr_in_irr_shared_entry_callbr(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED0:%.*]])
+; CHECK-NEXT:            to label [[IF_END:%.*]] [label %if.then]
+; CHECK:       if.end:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED1:%.*]])
+; CHECK-NEXT:            to label [[IF_THEN7:%.*]] [label %if.else]
+; CHECK:       if.then7:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[IF_END16:%.*]] []
+; CHECK:       if.else:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[IF_END16]] []
+; CHECK:       if.end16:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED2:%.*]])
+; CHECK-NEXT:            to label [[WHILE_COND_PREHEADER:%.*]] [label %if.then39]
+; CHECK:       while.cond.preheader:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[WHILE_COND:%.*]] []
+; CHECK:       while.cond:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED3:%.*]])
+; CHECK-NEXT:            to label [[WHILE_COND_TARGET_COND_TRUE49:%.*]] [label %lor.rhs]
+; CHECK:       cond.true49:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED4:%.*]])
+; CHECK-NEXT:            to label [[IF_THEN69:%.*]] [label %cond.true49.target.while.body63]
+; CHECK:       while.body63:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED5:%.*]])
+; CHECK-NEXT:            to label [[EXIT:%.*]] [label %while.cond47]
+; CHECK:       while.cond47:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED6:%.*]])
+; CHECK-NEXT:            to label [[COND_TRUE49:%.*]] [label %while.cond47.target.cond.end61]
+; CHECK:       cond.end61:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED7:%.*]])
+; CHECK-NEXT:            to label [[COND_END61_TARGET_WHILE_BODY63:%.*]] [label %while.cond]
+; CHECK:       if.then69:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED8:%.*]])
+; CHECK-NEXT:            to label [[EXIT]] [label %while.cond]
+; CHECK:       lor.rhs:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED9:%.*]])
+; CHECK-NEXT:            to label [[LOR_RHS_TARGET_COND_END61:%.*]] [label %while.end76]
+; CHECK:       while.end76:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[EXIT]] []
+; CHECK:       if.then39:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED10:%.*]])
+; CHECK-NEXT:            to label [[EXIT]] [label %if.end.i145]
+; CHECK:       if.end.i145:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED11:%.*]])
+; CHECK-NEXT:            to label [[EXIT]] [label %if.end8.i149]
+; CHECK:       if.end8.i149:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[EXIT]] []
+; CHECK:       if.then:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED12:%.*]])
+; CHECK-NEXT:            to label [[EXIT]] [label %if.end.i]
+; CHECK:       if.end.i:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PRED13:%.*]])
+; CHECK-NEXT:            to label [[EXIT]] [label %if.end8.i]
+; CHECK:       if.end8.i:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[EXIT]] []
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+; CHECK:       while.cond47.target.cond.end61:
+; CHECK-NEXT:    br label [[IRR_GUARD:%.*]]
+; CHECK:       lor.rhs.target.cond.end61:
+; CHECK-NEXT:    br label [[IRR_GUARD]]
+; CHECK:       while.cond.target.cond.true49:
+; CHECK-NEXT:    br label [[IRR_GUARD]]
+; CHECK:       irr.guard:
+; CHECK-NEXT:    [[GUARD_COND_END61:%.*]] = phi i1 [ true, [[WHILE_COND47_TARGET_COND_END61:%.*]] ], [ true, [[LOR_RHS_TARGET_COND_END61]] ], [ false, [[WHILE_COND_TARGET_COND_TRUE49]] ]
+; CHECK-NEXT:    br i1 [[GUARD_COND_END61]], label [[COND_END61:%.*]], label [[IRR_GUARD1:%.*]]
+; CHECK:       cond.true49.target.while.body63:
+; CHECK-NEXT:    br label [[IRR_GUARD1]]
+; CHECK:       cond.end61.target.while.body63:
+; CHECK-NEXT:    br label [[IRR_GUARD1]]
+; CHECK:       irr.guard1:
+; CHECK-NEXT:    [[GUARD_WHILE_BODY63:%.*]] = phi i1 [ true, [[COND_TRUE49_TARGET_WHILE_BODY63:%.*]] ], [ true, [[COND_END61_TARGET_WHILE_BODY63]] ], [ false, [[IRR_GUARD]] ]
+; CHECK-NEXT:    br i1 [[GUARD_WHILE_BODY63]], label [[WHILE_BODY63:%.*]], label [[COND_TRUE49]]
+;
+entry:
+  callbr void asm "", "r,!i"(i1 %Pred0) to label %if.end [label %if.then]
+
+if.end:
+  callbr void asm "", "r,!i"(i1 %Pred1) to label %if.then7 [label %if.else]
+
+if.then7:
+  callbr void asm "", ""() to label %if.end16 []
+
+if.else:
+  callbr void asm "", ""() to label %if.end16 []
+
+if.end16:
+  callbr void asm "", "r,!i"(i1 %Pred2) to label %while.cond.preheader [label %if.then39]
+
+while.cond.preheader:
+  callbr void asm "", ""() to label %while.cond []
+
+while.cond:
+  callbr void asm "", "r,!i"(i1 %Pred3) to label %cond.true49 [label %lor.rhs]
+
+cond.true49:
+  callbr void asm "", "r,!i"(i1 %Pred4) to label %if.then69 [label %while.body63]
+
+while.body63:
+  callbr void asm "", "r,!i"(i1 %Pred5) to label %exit [label %while.cond47]
+
+while.cond47:
+  callbr void asm "", "r,!i"(i1 %Pred6) to label %cond.true49 [label %cond.end61]
+
+cond.end61:
+  callbr void asm "", "r,!i"(i1 %Pred7) to label %while.body63 [label %while.cond]
+
+if.then69:
+  callbr void asm "", "r,!i"(i1 %Pred8) to label %exit [label %while.cond]
+
+lor.rhs:
+  callbr void asm "", "r,!i"(i1 %Pred9) to label %cond.end61 [label %while.end76]
+
+while.end76:
+  callbr void asm "", ""() to label %exit []
+
+if.then39:
+  callbr void asm "", "r,!i"(i1 %Pred10) to label %exit [label %if.end.i145]
+
+if.end.i145:
+  callbr void asm "", "r,!i"(i1 %Pred11) to label %exit [label %if.end8.i149]
+
+if.end8.i149:
+  callbr void asm "", ""() to label %exit []
+
+if.then:
+  callbr void asm "", "r,!i"(i1 %Pred12) to label %exit [label %if.end.i]
+
+if.end.i:
+  callbr void asm "", "r,!i"(i1 %Pred13) to label %exit [label %if.end8.i]
+
+if.end8.i:
+  callbr void asm "", ""() to label %exit []
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/FixIrreducible/unreachable.ll b/llvm/test/Transforms/FixIrreducible/unreachable.ll
index defbefb3ba812..845cf507c7fc0 100644
--- a/llvm/test/Transforms/FixIrreducible/unreachable.ll
+++ b/llvm/test/Transforms/FixIrreducible/unreachable.ll
@@ -25,3 +25,26 @@ loop.latch:
 loop.exit:
   ret void
 }
+
+; CHECK-LABEL: @unreachable_callbr(
+; CHECK: entry:
+; CHECK-NOT: irr.guard:
+define void @unreachable_callbr(i32 %n, i1 %arg) {
+entry:
+  callbr void asm "", ""() to label %loop.body []
+
+loop.body:
+  callbr void asm "", ""() to label %inner.block []
+
+unreachable.block:
+  callbr void asm "", ""() to label %inner.block []
+
+inner.block:
+  callbr void asm "", "r,!i"(i1 %arg) to label %loop.exit [label %loop.latch]
+
+loop.latch:
+  callbr void asm "", ""() to label %loop.body []
+
+loop.exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/GVN/assume-equal.ll b/llvm/test/Transforms/GVN/assume-equal.ll
index bbbc5c58584a6..a38980169fc52 100644
--- a/llvm/test/Transforms/GVN/assume-equal.ll
+++ b/llvm/test/Transforms/GVN/assume-equal.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt < %s -passes=gvn -S | FileCheck %s
 
+target datalayout = "p1:64:64:64:32"
+
 %struct.A = type { ptr }
 @_ZTV1A = available_externally unnamed_addr constant [4 x ptr] [ptr null, ptr @_ZTI1A, ptr @_ZN1A3fooEv, ptr @_ZN1A3barEv], align 8
 @_ZTI1A = external constant ptr
@@ -372,6 +374,20 @@ define i1 @assume_ptr_eq_different_prov_does_not_matter_icmp(ptr %p, ptr %p2) {
   ret i1 %c
 }
 
+define i1 @assume_ptr_eq_different_prov_does_not_matter_icmp_addrsize(ptr addrspace(1) %p, ptr addrspace(1) %p2) {
+; CHECK-LABEL: define i1 @assume_ptr_eq_different_prov_does_not_matter_icmp_addrsize(
+; CHECK-SAME: ptr addrspace(1) [[P:%.*]], ptr addrspace(1) [[P2:%.*]]) {
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq ptr addrspace(1) [[P]], [[P2]]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT:    [[C:%.*]] = icmp eq ptr addrspace(1) [[P]], null
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %cmp = icmp eq ptr addrspace(1) %p, %p2
+  call void @llvm.assume(i1 %cmp)
+  %c = icmp eq ptr addrspace(1) %p2, null
+  ret i1 %c
+}
+
 ; This is not correct, as it may change the provenance exposed by ptrtoint.
 ; We still allow it for now.
 define i64 @assume_ptr_eq_different_prov_does_not_matter_ptrtoint(ptr %p, ptr %p2) {
@@ -388,6 +404,20 @@ define i64 @assume_ptr_eq_different_prov_does_not_matter_ptrtoint(ptr %p, ptr %p
   ret i64 %int
 }
 
+define i64 @assume_ptr_eq_different_prov_does_not_matter_ptrtoint_addrsize(ptr addrspace(1) %p, ptr addrspace(1) %p2) {
+; CHECK-LABEL: define i64 @assume_ptr_eq_different_prov_does_not_matter_ptrtoint_addrsize(
+; CHECK-SAME: ptr addrspace(1) [[P:%.*]], ptr addrspace(1) [[P2:%.*]]) {
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq ptr addrspace(1) [[P]], [[P2]]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT:    [[INT:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64
+; CHECK-NEXT:    ret i64 [[INT]]
+;
+  %cmp = icmp eq ptr addrspace(1) %p, %p2
+  call void @llvm.assume(i1 %cmp)
+  %int = ptrtoint ptr addrspace(1) %p2 to i64
+  ret i64 %int
+}
+
 define i64 @assume_ptr_eq_different_prov_does_not_matter_ptrtoaddr(ptr %p, ptr %p2) {
 ; CHECK-LABEL: define i64 @assume_ptr_eq_different_prov_does_not_matter_ptrtoaddr(
 ; CHECK-SAME: ptr [[P:%.*]], ptr [[P2:%.*]]) {
@@ -402,6 +432,20 @@ define i64 @assume_ptr_eq_different_prov_does_not_matter_ptrtoaddr(ptr %p, ptr %
   ret i64 %int
 }
 
+define i32 @assume_ptr_eq_different_prov_does_not_matter_ptrtoaddr_addrsize(ptr addrspace(1) %p, ptr addrspace(1) %p2) {
+; CHECK-LABEL: define i32 @assume_ptr_eq_different_prov_does_not_matter_ptrtoaddr_addrsize(
+; CHECK-SAME: ptr addrspace(1) [[P:%.*]], ptr addrspace(1) [[P2:%.*]]) {
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq ptr addrspace(1) [[P]], [[P2]]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT:    [[INT:%.*]] = ptrtoaddr ptr addrspace(1) [[P]] to i32
+; CHECK-NEXT:    ret i32 [[INT]]
+;
+  %cmp = icmp eq ptr addrspace(1) %p, %p2
+  call void @llvm.assume(i1 %cmp)
+  %int = ptrtoaddr ptr addrspace(1) %p2 to i32
+  ret i32 %int
+}
+
 define i8 @assume_ptr_eq_same_prov(ptr %p, i64 %x) {
 ; CHECK-LABEL: define i8 @assume_ptr_eq_same_prov(
 ; CHECK-SAME: ptr [[P:%.*]], i64 [[X:%.*]]) {
diff --git a/llvm/test/Transforms/IndVarSimplify/AMDGPU/addrspace-7-doesnt-crash.ll b/llvm/test/Transforms/IndVarSimplify/AMDGPU/addrspace-7-doesnt-crash.ll
index 08dcf1d7a0091..8e932e0c00d4f 100644
--- a/llvm/test/Transforms/IndVarSimplify/AMDGPU/addrspace-7-doesnt-crash.ll
+++ b/llvm/test/Transforms/IndVarSimplify/AMDGPU/addrspace-7-doesnt-crash.ll
@@ -7,11 +7,11 @@ define void @f(ptr addrspace(7) %arg) {
 ; CHECK-LABEL: define void @f
 ; CHECK-SAME: (ptr addrspace(7) [[ARG:%.*]]) {
 ; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr addrspace(7) [[ARG]], i32 8
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
 ; CHECK-NEXT:    br i1 false, label [[BB2:%.*]], label [[BB1]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr addrspace(7) [[ARG]], i32 8
 ; CHECK-NEXT:    br label [[BB3:%.*]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    [[I4:%.*]] = load i32, ptr addrspace(7) [[SCEVGEP]], align 4
diff --git a/llvm/test/Transforms/IndVarSimplify/ARM/code-size.ll b/llvm/test/Transforms/IndVarSimplify/ARM/code-size.ll
index 2003b1a72206d..3c6535da486aa 100644
--- a/llvm/test/Transforms/IndVarSimplify/ARM/code-size.ll
+++ b/llvm/test/Transforms/IndVarSimplify/ARM/code-size.ll
@@ -4,33 +4,31 @@
 
 define i32 @remove_loop(i32 %size) #0 {
 ; CHECK-V8M-LABEL: @remove_loop(
-; CHECK-V8M-SAME: i32 [[SIZE:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-V8M-NEXT:  entry:
-; CHECK-V8M-NEXT:    br label %[[WHILE_COND:.*]]
-; CHECK-V8M:       while.cond:
-; CHECK-V8M-NEXT:    br i1 false, label %[[WHILE_COND]], label %[[WHILE_END:.*]]
-; CHECK-V8M:       while.end:
-; CHECK-V8M-NEXT:    [[TMP0:%.*]] = add i32 [[SIZE]], 31
+; CHECK-V8M-NEXT:    [[TMP0:%.*]] = add i32 [[SIZE:%.*]], 31
 ; CHECK-V8M-NEXT:    [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[SIZE]], i32 31)
 ; CHECK-V8M-NEXT:    [[TMP1:%.*]] = sub i32 [[TMP0]], [[UMIN]]
 ; CHECK-V8M-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP1]], 5
 ; CHECK-V8M-NEXT:    [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 5
 ; CHECK-V8M-NEXT:    [[TMP4:%.*]] = sub i32 [[SIZE]], [[TMP3]]
+; CHECK-V8M-NEXT:    br label [[WHILE_COND:%.*]]
+; CHECK-V8M:       while.cond:
+; CHECK-V8M-NEXT:    br i1 false, label [[WHILE_COND]], label [[WHILE_END:%.*]]
+; CHECK-V8M:       while.end:
 ; CHECK-V8M-NEXT:    ret i32 [[TMP4]]
 ;
 ; CHECK-V8A-LABEL: @remove_loop(
-; CHECK-V8A-SAME: i32 [[SIZE:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-V8A-NEXT:  entry:
-; CHECK-V8A-NEXT:    br label %[[WHILE_COND:.*]]
-; CHECK-V8A:       while.cond:
-; CHECK-V8A-NEXT:    br i1 false, label %[[WHILE_COND]], label %[[WHILE_END:.*]]
-; CHECK-V8A:       while.end:
-; CHECK-V8A-NEXT:    [[TMP0:%.*]] = add i32 [[SIZE]], 31
+; CHECK-V8A-NEXT:    [[TMP0:%.*]] = add i32 [[SIZE:%.*]], 31
 ; CHECK-V8A-NEXT:    [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[SIZE]], i32 31)
 ; CHECK-V8A-NEXT:    [[TMP1:%.*]] = sub i32 [[TMP0]], [[UMIN]]
 ; CHECK-V8A-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP1]], 5
 ; CHECK-V8A-NEXT:    [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 5
 ; CHECK-V8A-NEXT:    [[TMP4:%.*]] = sub i32 [[SIZE]], [[TMP3]]
+; CHECK-V8A-NEXT:    br label [[WHILE_COND:%.*]]
+; CHECK-V8A:       while.cond:
+; CHECK-V8A-NEXT:    br i1 false, label [[WHILE_COND]], label [[WHILE_END:%.*]]
+; CHECK-V8A:       while.end:
 ; CHECK-V8A-NEXT:    ret i32 [[TMP4]]
 ;
 entry:
diff --git a/llvm/test/Transforms/IndVarSimplify/ARM/indvar-unroll-imm-cost.ll b/llvm/test/Transforms/IndVarSimplify/ARM/indvar-unroll-imm-cost.ll
index 2261423766792..382f026e7de6a 100644
--- a/llvm/test/Transforms/IndVarSimplify/ARM/indvar-unroll-imm-cost.ll
+++ b/llvm/test/Transforms/IndVarSimplify/ARM/indvar-unroll-imm-cost.ll
@@ -77,6 +77,8 @@ define dso_local arm_aapcscc void @test(ptr nocapture %pDest, ptr nocapture read
 ; CHECK-NEXT:    [[CMP2780:%.*]] = icmp ugt i32 [[ADD25]], [[J_0_LCSSA]]
 ; CHECK-NEXT:    br i1 [[CMP2780]], label [[FOR_BODY29_PREHEADER:%.*]], label [[FOR_END40]]
 ; CHECK:       for.body29.preheader:
+; CHECK-NEXT:    [[TMP10:%.*]] = sub nsw i32 [[ADD25]], [[J_0_LCSSA]]
+; CHECK-NEXT:    [[SCEVGEP93:%.*]] = getelementptr i16, ptr [[PSRCB_ADDR_1_LCSSA]], i32 [[TMP10]]
 ; CHECK-NEXT:    br label [[FOR_BODY29:%.*]]
 ; CHECK:       for.body29:
 ; CHECK-NEXT:    [[J_184:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY29]] ], [ [[J_0_LCSSA]], [[FOR_BODY29_PREHEADER]] ]
@@ -100,8 +102,6 @@ define dso_local arm_aapcscc void @test(ptr nocapture %pDest, ptr nocapture read
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[ADD25]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END40_LOOPEXIT:%.*]], label [[FOR_BODY29]]
 ; CHECK:       for.end40.loopexit:
-; CHECK-NEXT:    [[TMP10:%.*]] = sub nsw i32 [[ADD25]], [[J_0_LCSSA]]
-; CHECK-NEXT:    [[SCEVGEP93:%.*]] = getelementptr i16, ptr [[PSRCB_ADDR_1_LCSSA]], i32 [[TMP10]]
 ; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i16, ptr [[PSRCA_ADDR_1_LCSSA]], i32 [[TMP10]]
 ; CHECK-NEXT:    [[SCEVGEP94:%.*]] = getelementptr i32, ptr [[PDEST_ADDR_1_LCSSA]], i32 [[TMP10]]
 ; CHECK-NEXT:    br label [[FOR_END40]]
diff --git a/llvm/test/Transforms/IndVarSimplify/X86/inner-loop-by-latch-cond.ll b/llvm/test/Transforms/IndVarSimplify/X86/inner-loop-by-latch-cond.ll
index 0fa6e34cf186e..0eb9debce8177 100644
--- a/llvm/test/Transforms/IndVarSimplify/X86/inner-loop-by-latch-cond.ll
+++ b/llvm/test/Transforms/IndVarSimplify/X86/inner-loop-by-latch-cond.ll
@@ -14,6 +14,7 @@ define void @test(i64 %a) {
 ; CHECK:       outer_header:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[OUTER_LATCH:%.*]] ], [ 21, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ 20, [[ENTRY]] ], [ [[I_NEXT:%.*]], [[OUTER_LATCH]] ]
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; CHECK-NEXT:    br label [[INNER_HEADER:%.*]]
 ; CHECK:       inner_header:
 ; CHECK-NEXT:    [[J:%.*]] = phi i64 [ 1, [[OUTER_HEADER]] ], [ [[J_NEXT:%.*]], [[INNER_HEADER]] ]
@@ -22,7 +23,6 @@ define void @test(i64 %a) {
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[J_NEXT]], [[INDVARS_IV]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[INNER_HEADER]], label [[OUTER_LATCH]]
 ; CHECK:       outer_latch:
-; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; CHECK-NEXT:    [[COND2:%.*]] = icmp ne i64 [[I_NEXT]], 40
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    br i1 [[COND2]], label [[OUTER_HEADER]], label [[RETURN:%.*]]
diff --git a/llvm/test/Transforms/IndVarSimplify/exit-count-select.ll b/llvm/test/Transforms/IndVarSimplify/exit-count-select.ll
index 1592b84480e3f..829092f2f4bd4 100644
--- a/llvm/test/Transforms/IndVarSimplify/exit-count-select.ll
+++ b/llvm/test/Transforms/IndVarSimplify/exit-count-select.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=indvars -S | FileCheck %s
+; RUN: opt < %s -passes='require<scalar-evolution>,indvars,loop-mssa(licm)' -S | FileCheck %s
 
 define i32 @logical_and_2ops(i32 %n, i32 %m) {
 ; CHECK-LABEL: @logical_and_2ops(
@@ -56,10 +56,10 @@ define i32 @logical_and_3ops(i32 %n, i32 %m, i32 %k) {
 ; CHECK:       loop:
 ; CHECK-NEXT:    br i1 false, label [[LOOP]], label [[EXIT:%.*]]
 ; CHECK:       exit:
-; CHECK-NEXT:    [[TMP0:%.*]] = freeze i32 [[K:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = freeze i32 [[M:%.*]]
-; CHECK-NEXT:    [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP0]], i32 [[TMP1]])
-; CHECK-NEXT:    [[UMIN1:%.*]] = call i32 @llvm.umin.i32(i32 [[UMIN]], i32 [[N:%.*]])
+; CHECK-NEXT:    [[N:%.*]] = freeze i32 [[K:%.*]]
+; CHECK-NEXT:    [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP1]], i32 [[N]])
+; CHECK-NEXT:    [[UMIN1:%.*]] = call i32 @llvm.umin.i32(i32 [[UMIN]], i32 [[N1:%.*]])
 ; CHECK-NEXT:    ret i32 [[UMIN1]]
 ;
 entry:
@@ -84,10 +84,10 @@ define i32 @logical_or_3ops(i32 %n, i32 %m, i32 %k) {
 ; CHECK:       loop:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[LOOP]]
 ; CHECK:       exit:
-; CHECK-NEXT:    [[TMP0:%.*]] = freeze i32 [[K:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = freeze i32 [[M:%.*]]
-; CHECK-NEXT:    [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP0]], i32 [[TMP1]])
-; CHECK-NEXT:    [[UMIN1:%.*]] = call i32 @llvm.umin.i32(i32 [[UMIN]], i32 [[N:%.*]])
+; CHECK-NEXT:    [[N:%.*]] = freeze i32 [[K:%.*]]
+; CHECK-NEXT:    [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP1]], i32 [[N]])
+; CHECK-NEXT:    [[UMIN1:%.*]] = call i32 @llvm.umin.i32(i32 [[UMIN]], i32 [[N1:%.*]])
 ; CHECK-NEXT:    ret i32 [[UMIN1]]
 ;
 entry:
diff --git a/llvm/test/Transforms/IndVarSimplify/finite-exit-comparisons.ll b/llvm/test/Transforms/IndVarSimplify/finite-exit-comparisons.ll
index e006d9f6696ca..f798eb281f51a 100644
--- a/llvm/test/Transforms/IndVarSimplify/finite-exit-comparisons.ll
+++ b/llvm/test/Transforms/IndVarSimplify/finite-exit-comparisons.ll
@@ -932,6 +932,9 @@ for.end:                                          ; preds = %for.body, %entry
 define i16 @ult_multiuse_profit(i16 %n.raw, i8 %start) mustprogress {
 ; CHECK-LABEL: @ult_multiuse_profit(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[START:%.*]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[TMP2]] to i16
+; CHECK-NEXT:    [[UMAX:%.*]] = call i16 @llvm.umax.i16(i16 [[TMP1]], i16 254)
 ; CHECK-NEXT:    [[TMP0:%.*]] = trunc i16 254 to i8
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
@@ -940,9 +943,6 @@ define i16 @ult_multiuse_profit(i16 %n.raw, i8 %start) mustprogress {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i8 [[IV_NEXT]], [[TMP0]]
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[START:%.*]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i16
-; CHECK-NEXT:    [[UMAX:%.*]] = call i16 @llvm.umax.i16(i16 [[TMP2]], i16 254)
 ; CHECK-NEXT:    ret i16 [[UMAX]]
 ;
 entry:
diff --git a/llvm/test/Transforms/IndVarSimplify/loop-guard-order.ll b/llvm/test/Transforms/IndVarSimplify/loop-guard-order.ll
index 14ee00d77197c..2763860e79875 100644
--- a/llvm/test/Transforms/IndVarSimplify/loop-guard-order.ll
+++ b/llvm/test/Transforms/IndVarSimplify/loop-guard-order.ll
@@ -114,7 +114,7 @@ define i32 @urem_order1(i32 %n) {
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
 ; CHECK-NEXT:    call void @foo()
-; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 3
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw i32 [[IV]], 3
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT_LOOPEXIT:.*]], label %[[LOOP]]
 ; CHECK:       [[EXIT_LOOPEXIT]]:
@@ -205,13 +205,12 @@ define i64 @test_loop_with_div_order_1(i64 %n) {
 ; CHECK-NEXT:    [[PARITY_CHECK:%.*]] = icmp eq i64 [[IS_ODD]], 0
 ; CHECK-NEXT:    br i1 [[PARITY_CHECK]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT]]
 ; CHECK:       [[LOOP_PREHEADER]]:
-; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[UPPER_BOUND]], i64 1)
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
 ; CHECK-NEXT:    [[DUMMY:%.*]] = load volatile i64, ptr null, align 8
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[UMAX]]
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[UPPER_BOUND]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[LOOP]], label %[[EXIT_LOOPEXIT:.*]]
 ; CHECK:       [[EXIT_LOOPEXIT]]:
 ; CHECK-NEXT:    br label %[[EXIT]]
diff --git a/llvm/test/Transforms/IndVarSimplify/pr116483.ll b/llvm/test/Transforms/IndVarSimplify/pr116483.ll
index 093e25a3caa81..e9e0d22bf960a 100644
--- a/llvm/test/Transforms/IndVarSimplify/pr116483.ll
+++ b/llvm/test/Transforms/IndVarSimplify/pr116483.ll
@@ -4,16 +4,16 @@
 define i32 @test() {
 ; CHECK-LABEL: define i32 @test() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    br label %[[LOOP_BODY:.*]]
-; CHECK:       [[LOOP_BODY]]:
-; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[LOOP_BODY]]
-; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    [[XOR:%.*]] = xor i32 0, 3
 ; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[XOR]], 329
 ; CHECK-NEXT:    [[CONV:%.*]] = trunc i32 [[MUL]] to i16
 ; CHECK-NEXT:    [[SEXT:%.*]] = shl i16 [[CONV]], 8
 ; CHECK-NEXT:    [[CONV1:%.*]] = ashr i16 [[SEXT]], 8
 ; CHECK-NEXT:    [[CONV3:%.*]] = zext i16 [[CONV1]] to i32
+; CHECK-NEXT:    br label %[[LOOP_BODY:.*]]
+; CHECK:       [[LOOP_BODY]]:
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[LOOP_BODY]]
+; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret i32 [[CONV3]]
 ;
 entry:
diff --git a/llvm/test/Transforms/IndVarSimplify/pr24783.ll b/llvm/test/Transforms/IndVarSimplify/pr24783.ll
index c521bcaf59d49..37ecf42ea0fd3 100644
--- a/llvm/test/Transforms/IndVarSimplify/pr24783.ll
+++ b/llvm/test/Transforms/IndVarSimplify/pr24783.ll
@@ -7,11 +7,11 @@ target triple = "powerpc64-unknown-linux-gnu"
 define void @f(ptr %end.s, ptr %loc, i32 %p) {
 ; CHECK-LABEL: @f(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[END:%.*]] = getelementptr inbounds i32, ptr [[END_S:%.*]], i32 [[P:%.*]]
 ; CHECK-NEXT:    br label [[WHILE_BODY_I:%.*]]
 ; CHECK:       while.body.i:
 ; CHECK-NEXT:    br i1 true, label [[LOOP_EXIT:%.*]], label [[WHILE_BODY_I]]
 ; CHECK:       loop.exit:
-; CHECK-NEXT:    [[END:%.*]] = getelementptr inbounds i32, ptr [[END_S:%.*]], i32 [[P:%.*]]
 ; CHECK-NEXT:    store ptr [[END]], ptr [[LOC:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/IndVarSimplify/pr39673.ll b/llvm/test/Transforms/IndVarSimplify/pr39673.ll
index 7b093b34b91ad..3cee1ab7be881 100644
--- a/llvm/test/Transforms/IndVarSimplify/pr39673.ll
+++ b/llvm/test/Transforms/IndVarSimplify/pr39673.ll
@@ -148,6 +148,7 @@ loop2.end:                                       ; preds = %loop2
 define i16 @neg_loop_carried(i16 %arg) {
 ; CHECK-LABEL: @neg_loop_carried(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i16 [[ARG:%.*]], 2
 ; CHECK-NEXT:    br label [[LOOP1:%.*]]
 ; CHECK:       loop1:
 ; CHECK-NEXT:    [[L1:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[L1_ADD:%.*]], [[LOOP1]] ]
@@ -155,7 +156,6 @@ define i16 @neg_loop_carried(i16 %arg) {
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i16 [[L1_ADD]], 2
 ; CHECK-NEXT:    br i1 [[CMP1]], label [[LOOP1]], label [[LOOP2_PREHEADER:%.*]]
 ; CHECK:       loop2.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = add i16 [[ARG:%.*]], 2
 ; CHECK-NEXT:    br label [[LOOP2:%.*]]
 ; CHECK:       loop2:
 ; CHECK-NEXT:    [[K2:%.*]] = phi i16 [ [[K2_ADD:%.*]], [[LOOP2]] ], [ [[TMP0]], [[LOOP2_PREHEADER]] ]
diff --git a/llvm/test/Transforms/IndVarSimplify/pr63763.ll b/llvm/test/Transforms/IndVarSimplify/pr63763.ll
index 427db1e67410a..a5fde67d6140a 100644
--- a/llvm/test/Transforms/IndVarSimplify/pr63763.ll
+++ b/llvm/test/Transforms/IndVarSimplify/pr63763.ll
@@ -16,13 +16,13 @@ define i32 @test(i1 %c) {
 ; CHECK-NEXT:    [[CONV2:%.*]] = ashr exact i32 [[SEXT]], 24
 ; CHECK-NEXT:    [[INVARIANT_OP:%.*]] = sub nsw i32 7, [[CONV2]]
 ; CHECK-NEXT:    call void @use(i32 [[INVARIANT_OP]])
+; CHECK-NEXT:    [[SEXT_US:%.*]] = shl i32 [[SEL]], 24
+; CHECK-NEXT:    [[CONV2_US:%.*]] = ashr exact i32 [[SEXT_US]], 24
+; CHECK-NEXT:    [[INVARIANT_OP_US:%.*]] = sub nsw i32 7, [[CONV2_US]]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[LOOP]]
 ; CHECK:       exit:
-; CHECK-NEXT:    [[SEXT_US:%.*]] = shl i32 [[SEL]], 24
-; CHECK-NEXT:    [[CONV2_US:%.*]] = ashr exact i32 [[SEXT_US]], 24
-; CHECK-NEXT:    [[INVARIANT_OP_US:%.*]] = sub nsw i32 7, [[CONV2_US]]
 ; CHECK-NEXT:    ret i32 [[INVARIANT_OP_US]]
 ;
 entry:
diff --git a/llvm/test/Transforms/IndVarSimplify/replace-loop-exit-folds.ll b/llvm/test/Transforms/IndVarSimplify/replace-loop-exit-folds.ll
index b3162de0f2245..7cdc98a6c4f7c 100644
--- a/llvm/test/Transforms/IndVarSimplify/replace-loop-exit-folds.ll
+++ b/llvm/test/Transforms/IndVarSimplify/replace-loop-exit-folds.ll
@@ -4,22 +4,21 @@
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 
 define i32 @remove_loop(i32 %size) {
-; CHECK-LABEL: define i32 @remove_loop(
-; CHECK-SAME: i32 [[SIZE:%.*]]) {
-; CHECK-NEXT:  [[ENTRY:.*]]:
-; CHECK-NEXT:    br label %[[WHILE_COND:.*]]
-; CHECK:       [[WHILE_COND]]:
-; CHECK-NEXT:    [[SIZE_ADDR_0:%.*]] = phi i32 [ [[SIZE]], %[[ENTRY]] ], [ [[SUB:%.*]], %[[WHILE_COND]] ]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[SIZE_ADDR_0]], 31
-; CHECK-NEXT:    [[SUB]] = add i32 [[SIZE_ADDR_0]], -32
-; CHECK-NEXT:    br i1 [[CMP]], label %[[WHILE_COND]], label %[[WHILE_END:.*]]
-; CHECK:       [[WHILE_END]]:
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[SIZE]], 31
+; CHECK-LABEL: @remove_loop(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[SIZE:%.*]], 31
 ; CHECK-NEXT:    [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[SIZE]], i32 31)
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 [[TMP0]], [[UMIN]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP1]], 5
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 5
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub i32 [[SIZE]], [[TMP3]]
+; CHECK-NEXT:    br label [[WHILE_COND:%.*]]
+; CHECK:       while.cond:
+; CHECK-NEXT:    [[SIZE_ADDR_0:%.*]] = phi i32 [ [[SIZE]], [[ENTRY:%.*]] ], [ [[SUB:%.*]], [[WHILE_COND]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[SIZE_ADDR_0]], 31
+; CHECK-NEXT:    [[SUB]] = add i32 [[SIZE_ADDR_0]], -32
+; CHECK-NEXT:    br i1 [[CMP]], label [[WHILE_COND]], label [[WHILE_END:%.*]]
+; CHECK:       while.end:
 ; CHECK-NEXT:    ret i32 [[TMP4]]
 ;
 entry:
diff --git a/llvm/test/Transforms/IndVarSimplify/rewrite-loop-exit-values-phi.ll b/llvm/test/Transforms/IndVarSimplify/rewrite-loop-exit-values-phi.ll
index 84ae79d53e25e..41fce3681c3a3 100644
--- a/llvm/test/Transforms/IndVarSimplify/rewrite-loop-exit-values-phi.ll
+++ b/llvm/test/Transforms/IndVarSimplify/rewrite-loop-exit-values-phi.ll
@@ -76,6 +76,10 @@ define i64 @narow_canonical_iv_wide_multiplied_iv(i32 %x, i64 %y, ptr %0) {
 ; CHECK-LABEL: @narow_canonical_iv_wide_multiplied_iv(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[SMAX:%.*]] = tail call i32 @llvm.smax.i32(i32 [[X:%.*]], i32 1)
+; CHECK-NEXT:    [[TMP1:%.*]] = zext nneg i32 [[SMAX]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[Y:%.*]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = add nuw nsw i64 [[TMP3]], 1
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -84,10 +88,6 @@ define i64 @narow_canonical_iv_wide_multiplied_iv(i32 %x, i64 %y, ptr %0) {
 ; CHECK-NEXT:    [[EC:%.*]] = icmp ne i32 [[IV_NEXT]], [[SMAX]]
 ; CHECK-NEXT:    br i1 [[EC]], label [[LOOP]], label [[EXIT:%.*]]
 ; CHECK:       exit:
-; CHECK-NEXT:    [[TMP1:%.*]] = zext nneg i32 [[SMAX]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[Y:%.*]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 1
-; CHECK-NEXT:    [[TMP6:%.*]] = add nuw nsw i64 [[TMP3]], 1
 ; CHECK-NEXT:    ret i64 [[TMP6]]
 ;
 entry:
diff --git a/llvm/test/Transforms/IndVarSimplify/scev-expander-preserve-lcssa.ll b/llvm/test/Transforms/IndVarSimplify/scev-expander-preserve-lcssa.ll
index 14e06fe06b412..aca553e536119 100644
--- a/llvm/test/Transforms/IndVarSimplify/scev-expander-preserve-lcssa.ll
+++ b/llvm/test/Transforms/IndVarSimplify/scev-expander-preserve-lcssa.ll
@@ -23,8 +23,8 @@ define void @test1(i8 %x, ptr %ptr) {
 ; CHECK-NEXT:    br label [[WHILE_COND192:%.*]]
 ; CHECK:       while.cond192:
 ; CHECK-NEXT:    switch i8 [[X:%.*]], label [[WHILE_BODY205:%.*]] [
-; CHECK-NEXT:    i8 59, label [[WHILE_COND215_PREHEADER:%.*]]
-; CHECK-NEXT:    i8 10, label [[IF_END224_LOOPEXIT1:%.*]]
+; CHECK-NEXT:      i8 59, label [[WHILE_COND215_PREHEADER:%.*]]
+; CHECK-NEXT:      i8 10, label [[IF_END224_LOOPEXIT1:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       while.cond215.preheader:
 ; CHECK-NEXT:    br label [[WHILE_COND215:%.*]]
@@ -103,8 +103,8 @@ define void @test2(i16 %x)  {
 ; CHECK-NEXT:    br label [[FOR_COND:%.*]]
 ; CHECK:       for.cond:
 ; CHECK-NEXT:    switch i16 [[X:%.*]], label [[RETURN_LOOPEXIT1:%.*]] [
-; CHECK-NEXT:    i16 41, label [[FOR_END:%.*]]
-; CHECK-NEXT:    i16 43, label [[FOR_COND]]
+; CHECK-NEXT:      i16 41, label [[FOR_END:%.*]]
+; CHECK-NEXT:      i16 43, label [[FOR_COND]]
 ; CHECK-NEXT:    ]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    [[I_0_LCSSA2:%.*]] = phi i32 [ 0, [[FOR_COND]] ]
@@ -336,6 +336,7 @@ if.end1824:                                       ; preds = %for.end1326
 define void @test5(ptr %header, i32 %conv, i8 %n) {
 ; CHECK-LABEL: @test5(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SHL:%.*]] = shl nuw nsw i32 [[CONV:%.*]], 2
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    br label [[FOR_INNER:%.*]]
@@ -358,7 +359,6 @@ define void @test5(ptr %header, i32 %conv, i8 %n) {
 ; CHECK-NEXT:    br i1 false, label [[FOR_BODY]], label [[WHILE_COND_PREHEADER:%.*]]
 ; CHECK:       while.cond.preheader:
 ; CHECK-NEXT:    [[ADD85_LCSSA:%.*]] = phi i32 [ [[ADD85]], [[FOR_INC]] ]
-; CHECK-NEXT:    [[SHL:%.*]] = shl nuw nsw i32 [[CONV:%.*]], 2
 ; CHECK-NEXT:    br label [[WHILE_COND:%.*]]
 ; CHECK:       while.cond:
 ; CHECK-NEXT:    [[POS_8:%.*]] = phi i32 [ [[INC114:%.*]], [[WHILE_BODY:%.*]] ], [ [[ADD85_LCSSA]], [[WHILE_COND_PREHEADER]] ]
@@ -427,8 +427,8 @@ define void @test6(i8 %x) {
 ; CHECK-NEXT:    br label [[WHILE_COND192:%.*]]
 ; CHECK:       while.cond192:
 ; CHECK-NEXT:    switch i8 [[X:%.*]], label [[WHILE_BODY205:%.*]] [
-; CHECK-NEXT:    i8 59, label [[WHILE_COND215_PREHEADER:%.*]]
-; CHECK-NEXT:    i8 10, label [[IF_END224:%.*]]
+; CHECK-NEXT:      i8 59, label [[WHILE_COND215_PREHEADER:%.*]]
+; CHECK-NEXT:      i8 10, label [[IF_END224:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       while.cond215.preheader:
 ; CHECK-NEXT:    [[I_7_LCSSA:%.*]] = phi i32 [ 0, [[WHILE_COND192]] ]
diff --git a/llvm/test/Transforms/IndVarSimplify/scev-invalidation.ll b/llvm/test/Transforms/IndVarSimplify/scev-invalidation.ll
index a92d328df99ca..ad69812838569 100644
--- a/llvm/test/Transforms/IndVarSimplify/scev-invalidation.ll
+++ b/llvm/test/Transforms/IndVarSimplify/scev-invalidation.ll
@@ -46,12 +46,12 @@ for.end106:                                       ; preds = %for.cond
 define i32 @test_pr58439(i32 %a) {
 ; CHECK-LABEL: @test_pr58439(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[A:%.*]], 1
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    br i1 false, label [[LOOP]], label [[EXIT:%.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[C_EXT_LCSSA:%.*]] = phi i32 [ 0, [[LOOP]] ]
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[A:%.*]], 1
 ; CHECK-NEXT:    [[RES:%.*]] = add i32 [[C_EXT_LCSSA]], [[OR]]
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
@@ -76,6 +76,7 @@ define i8 @l(i32 %inc, i1 %tobool.not.i) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
 ; CHECK:       outer.header:
+; CHECK-NEXT:    [[AND:%.*]] = and i32 1, [[INC:%.*]]
 ; CHECK-NEXT:    br label [[INNER:%.*]]
 ; CHECK:       inner:
 ; CHECK-NEXT:    [[C_05_I:%.*]] = phi i32 [ [[INC_I:%.*]], [[INNER]] ], [ 0, [[OUTER_HEADER]] ]
@@ -86,7 +87,6 @@ define i8 @l(i32 %inc, i1 %tobool.not.i) {
 ; CHECK:       outer.latch:
 ; CHECK-NEXT:    [[C_05_I_LCSSA:%.*]] = phi i32 [ [[C_05_I]], [[INNER]] ]
 ; CHECK-NEXT:    [[LCSSA:%.*]] = phi i32 [ 0, [[INNER]] ]
-; CHECK-NEXT:    [[AND:%.*]] = and i32 1, [[INC:%.*]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[AND]] to i8
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[C_05_I_LCSSA]] to i8
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub i8 [[TMP0]], [[TMP1]]
diff --git a/llvm/test/Transforms/IndVarSimplify/sentinel.ll b/llvm/test/Transforms/IndVarSimplify/sentinel.ll
index 523414167956b..4f12308f3b01a 100644
--- a/llvm/test/Transforms/IndVarSimplify/sentinel.ll
+++ b/llvm/test/Transforms/IndVarSimplify/sentinel.ll
@@ -9,19 +9,19 @@ define void @test(i1 %arg) personality ptr @snork {
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    br label [[BB4:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[INDVARS_IV_NEXT:%.*]] = add i32 [[INDVARS_IV:%.*]], 1
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[TMP6:%.*]], [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 [[TMP0]], [[SMAX:%.*]]
 ; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[BB2:%.*]], label [[BB4]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[TMP3:%.*]] = phi i32 [ [[TMP1]], [[BB1:%.*]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = phi i32 [ [[TMP1:%.*]], [[BB1:%.*]] ]
 ; CHECK-NEXT:    ret void
 ; CHECK:       bb4:
-; CHECK-NEXT:    [[INDVARS_IV]] = phi i32 [ [[INDVARS_IV_NEXT]], [[BB1]] ], [ undef, [[BB:%.*]] ]
-; CHECK-NEXT:    [[SMAX]] = call i32 @llvm.smax.i32(i32 [[INDVARS_IV]], i32 36)
-; CHECK-NEXT:    [[TMP6]] = invoke i32 @quux() [ "deopt"(i32 0, i32 0, i32 0, i32 180, i32 0, i32 25, i32 0, i32 7, ptr null, i32 7, ptr null, i32 7, ptr null, i32 3, i32 [[INDVARS_IV]], i32 3, i32 undef, i32 7, ptr null, i32 3, i32 undef, i32 3, i32 undef, i32 3, i32 undef, i32 3, i32 undef, i32 4, double undef, i32 7, ptr null, i32 4, i64 undef, i32 7, ptr null, i32 0, ptr addrspace(1) undef, i32 3, i32 undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 7, ptr null) ]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[BB1]] ], [ undef, [[BB:%.*]] ]
+; CHECK-NEXT:    [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[INDVARS_IV]], i32 36)
+; CHECK-NEXT:    [[TMP6:%.*]] = invoke i32 @quux() [ "deopt"(i32 0, i32 0, i32 0, i32 180, i32 0, i32 25, i32 0, i32 7, ptr null, i32 7, ptr null, i32 7, ptr null, i32 3, i32 [[INDVARS_IV]], i32 3, i32 undef, i32 7, ptr null, i32 3, i32 undef, i32 3, i32 undef, i32 3, i32 undef, i32 3, i32 undef, i32 4, double undef, i32 7, ptr null, i32 4, i64 undef, i32 7, ptr null, i32 0, ptr addrspace(1) undef, i32 3, i32 undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 7, ptr null) ]
 ; CHECK-NEXT:            to label [[BB7:%.*]] unwind label [[BB15:%.*]]
 ; CHECK:       bb7:
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[TMP6]], [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP1]] = sub i32 [[TMP0]], [[SMAX]]
 ; CHECK-NEXT:    br label [[BB9:%.*]]
 ; CHECK:       bb9:
 ; CHECK-NEXT:    br i1 true, label [[BB1]], label [[BB9]]
diff --git a/llvm/test/Transforms/IndVarSimplify/sink-from-preheader.ll b/llvm/test/Transforms/IndVarSimplify/sink-from-preheader.ll
deleted file mode 100644
index 89583f9131518..0000000000000
--- a/llvm/test/Transforms/IndVarSimplify/sink-from-preheader.ll
+++ /dev/null
@@ -1,32 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=indvars -indvars-predicate-loops=0 -S | FileCheck %s
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
-target triple = "i386-apple-darwin10.0"
-
-; We make sinking here, Changed flag should be set properly.
-define i32 @test(i32 %a, i32 %b, i32 %N) {
-; CHECK-LABEL: @test(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]]
-; CHECK:       exit:
-; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    ret i32 [[ADD]]
-;
-entry:
-  %add = add i32 %a, %b
-  br label %loop
-
-loop:
-  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
-  %iv.next = add i32 %iv, 1
-  %cmp = icmp slt i32 %iv.next, %N
-  br i1 %cmp, label %loop, label %exit
-
-exit:
-  ret i32 %add
-}
diff --git a/llvm/test/Transforms/IndVarSimplify/sink-trapping.ll b/llvm/test/Transforms/IndVarSimplify/sink-trapping.ll
deleted file mode 100644
index d2478be5a8fcc..0000000000000
--- a/llvm/test/Transforms/IndVarSimplify/sink-trapping.ll
+++ /dev/null
@@ -1,19 +0,0 @@
-; RUN: opt < %s -passes=indvars -S | FileCheck %s
-
-declare i1 @b()
-
-define i32 @a(i32 %x) nounwind {
-for.body.preheader:
-    %y = sdiv i32 10, %x
-	br label %for.body
-
-for.body:
-    %cmp = call i1 @b()
-	br i1 %cmp, label %for.body, label %for.end.loopexit
-
-for.end.loopexit:
-	ret i32 %y
-}
-; CHECK: for.end.loopexit:
-; CHECK: sdiv
-; CHECK: ret
diff --git a/llvm/test/Transforms/IndVarSimplify/zext-nuw.ll b/llvm/test/Transforms/IndVarSimplify/zext-nuw.ll
index 17921afc5ff06..abe7a3e618dd8 100644
--- a/llvm/test/Transforms/IndVarSimplify/zext-nuw.ll
+++ b/llvm/test/Transforms/IndVarSimplify/zext-nuw.ll
@@ -24,13 +24,13 @@ define void @_Z3fn1v() {
 ; CHECK-NEXT:    [[X8:%.*]] = icmp ult i32 0, 4
 ; CHECK-NEXT:    br i1 [[X8]], label [[DOTPREHEADER_LR_PH:%.*]], label [[X22]]
 ; CHECK:       .preheader.lr.ph:
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[K_09]], i64 [[TMP5]]
 ; CHECK-NEXT:    br label [[DOTPREHEADER:%.*]]
 ; CHECK:       .preheader:
 ; CHECK-NEXT:    br label [[X17:%.*]]
 ; CHECK:       x17:
 ; CHECK-NEXT:    br i1 false, label [[DOTPREHEADER]], label [[DOT_CRIT_EDGE_8:%.*]]
 ; CHECK:       ._crit_edge.8:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[K_09]], i64 [[TMP5]]
 ; CHECK-NEXT:    br label [[X22]]
 ; CHECK:       x22:
 ; CHECK-NEXT:    [[K_1_LCSSA:%.*]] = phi ptr [ [[SCEVGEP]], [[DOT_CRIT_EDGE_8]] ], [ [[K_09]], [[DOTPREHEADER4]] ]
diff --git a/llvm/test/Transforms/InstCombine/assume.ll b/llvm/test/Transforms/InstCombine/assume.ll
index 7b0b871513513..cc87d6542fa12 100644
--- a/llvm/test/Transforms/InstCombine/assume.ll
+++ b/llvm/test/Transforms/InstCombine/assume.ll
@@ -10,8 +10,8 @@ declare void @llvm.assume(i1) #1
 
 ; Check that the assume has not been removed:
 
-define i32 @foo1(ptr %a) #0 {
-; DEFAULT-LABEL: @foo1(
+define i32 @align_to_bundle(ptr %a) #0 {
+; DEFAULT-LABEL: @align_to_bundle(
 ; DEFAULT-NEXT:    [[T0:%.*]] = load i32, ptr [[A:%.*]], align 4
 ; DEFAULT-NEXT:    [[PTRINT:%.*]] = ptrtoint ptr [[A]] to i64
 ; DEFAULT-NEXT:    [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31
@@ -19,7 +19,7 @@ define i32 @foo1(ptr %a) #0 {
 ; DEFAULT-NEXT:    tail call void @llvm.assume(i1 [[MASKCOND]])
 ; DEFAULT-NEXT:    ret i32 [[T0]]
 ;
-; BUNDLES-LABEL: @foo1(
+; BUNDLES-LABEL: @align_to_bundle(
 ; BUNDLES-NEXT:    [[T0:%.*]] = load i32, ptr [[A:%.*]], align 4
 ; BUNDLES-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 32) ]
 ; BUNDLES-NEXT:    ret i32 [[T0]]
@@ -32,6 +32,28 @@ define i32 @foo1(ptr %a) #0 {
   ret i32 %t0
 }
 
+define i32 @align_to_bundle_ptrtoaddr(ptr %a) #0 {
+; DEFAULT-LABEL: @align_to_bundle_ptrtoaddr(
+; DEFAULT-NEXT:    [[T0:%.*]] = load i32, ptr [[A:%.*]], align 4
+; DEFAULT-NEXT:    [[PTRINT:%.*]] = ptrtoaddr ptr [[A]] to i64
+; DEFAULT-NEXT:    [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31
+; DEFAULT-NEXT:    [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
+; DEFAULT-NEXT:    tail call void @llvm.assume(i1 [[MASKCOND]])
+; DEFAULT-NEXT:    ret i32 [[T0]]
+;
+; BUNDLES-LABEL: @align_to_bundle_ptrtoaddr(
+; BUNDLES-NEXT:    [[T0:%.*]] = load i32, ptr [[A:%.*]], align 4
+; BUNDLES-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 32) ]
+; BUNDLES-NEXT:    ret i32 [[T0]]
+;
+  %t0 = load i32, ptr %a, align 4
+  %ptrint = ptrtoaddr ptr %a to i64
+  %maskedptr = and i64 %ptrint, 31
+  %maskcond = icmp eq i64 %maskedptr, 0
+  tail call void @llvm.assume(i1 %maskcond)
+  ret i32 %t0
+}
+
 define i32 @align_assume_trunc_cond(ptr %a) #0 {
 ; DEFAULT-LABEL: @align_assume_trunc_cond(
 ; DEFAULT-NEXT:    [[T0:%.*]] = load i32, ptr [[A:%.*]], align 4
diff --git a/llvm/test/Transforms/InstCombine/or.ll b/llvm/test/Transforms/InstCombine/or.ll
index 6b090e982af0a..f61a1970d3aa4 100644
--- a/llvm/test/Transforms/InstCombine/or.ll
+++ b/llvm/test/Transforms/InstCombine/or.ll
@@ -2113,3 +2113,98 @@ define <4 x i32> @or_zext_nneg_minus_constant_splat(<4 x i8> %a) {
   %or = or <4 x i32> %zext, splat (i32 -9)
   ret <4 x i32> %or
 }
+
+define i8 @or_positive_minus_non_positive_to_abs(i8 %a){
+; CHECK-LABEL: @or_positive_minus_non_positive_to_abs(
+; CHECK-NEXT:    [[TMP2:%.*]] = call i8 @llvm.abs.i8(i8 [[A:%.*]], i1 false)
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %b = icmp sgt i8 %a, 0
+  %mask = sext i1 %b to i8
+  %neg = sub i8 0, %a
+  %mask_inv = xor i8 %mask, -1
+  %c = and i8 %neg, %mask_inv
+  %d = and i8 %a, %mask
+  %or = or i8 %c, %d
+  ret i8 %or
+}
+
+; TODO: Fold to smax https://alive2.llvm.org/ce/z/wDiDh2
+define i8 @or_select_smax_neg_to_abs(i8 %a){
+; CHECK-LABEL: @or_select_smax_neg_to_abs(
+; CHECK-NEXT:    [[SGT0:%.*]] = icmp sgt i8 [[A:%.*]], 0
+; CHECK-NEXT:    [[NEG:%.*]] = sub nsw i8 0, [[A]]
+; CHECK-NEXT:    [[OR:%.*]] = select i1 [[SGT0]], i8 0, i8 [[NEG]]
+; CHECK-NEXT:    ret i8 [[OR]]
+;
+  %sgt0 = icmp sgt i8 %a, 0
+  %neg = sub nsw i8 0, %a
+  %sel = select i1 %sgt0, i8 0, i8 %neg
+  ret i8 %sel
+}
+
+; TODO: Fold to abs https://alive2.llvm.org/ce/z/DybfHG
+define i8 @or_select_smax_smax_to_abs(i8 %a){
+; CHECK-LABEL: @or_select_smax_smax_to_abs(
+; CHECK-NEXT:    [[NEG:%.*]] = sub nsw i8 0, [[A:%.*]]
+; CHECK-NEXT:    [[SEL:%.*]] = call i8 @llvm.smax.i8(i8 [[NEG]], i8 0)
+; CHECK-NEXT:    [[MAX:%.*]] = call i8 @llvm.smax.i8(i8 [[A]], i8 0)
+; CHECK-NEXT:    [[OR:%.*]] = or i8 [[SEL]], [[MAX]]
+; CHECK-NEXT:    ret i8 [[OR]]
+;
+  %neg = sub nsw i8 0, %a
+  %sel = call i8 @llvm.smax.i8(i8 %neg, i8 0)
+  %max = call i8 @llvm.smax.i8(i8 %a, i8 0)
+  %or = or i8 %sel, %max
+  ret i8 %or
+}
+
+declare i8 @llvm.abs.i8(i8, i1)
+declare <2 x i8> @llvm.abs.v2i8(<2 x i8>, i1)
+
+define <2 x i8> @or_sgt_select_smax_to_abs(<2 x i8> %a){
+; CHECK-LABEL: @or_sgt_select_smax_to_abs(
+; CHECK-NEXT:    [[OR:%.*]] = call <2 x i8> @llvm.abs.v2i8(<2 x i8> [[A:%.*]], i1 false)
+; CHECK-NEXT:    ret <2 x i8> [[OR]]
+;
+  %sgt0 = icmp sgt <2 x i8> %a, zeroinitializer
+  %neg = sub <2 x i8> zeroinitializer, %a
+  %sel = select <2 x i1> %sgt0, <2 x i8> zeroinitializer, <2 x i8> %neg
+  %max = call <2 x i8> @llvm.smax.v2i8(<2 x i8> %a, <2 x i8> zeroinitializer)
+  %or = or <2 x i8> %sel, %max
+  ret <2 x i8> %or
+}
+
+define <2 x i8> @or_slt_select_smax_to_abs(<2 x i8> %a){
+; CHECK-LABEL: @or_slt_select_smax_to_abs(
+; CHECK-NEXT:    [[OR:%.*]] = call <2 x i8> @llvm.abs.v2i8(<2 x i8> [[A:%.*]], i1 false)
+; CHECK-NEXT:    ret <2 x i8> [[OR]]
+;
+  %slt0 = icmp slt <2 x i8> %a, zeroinitializer
+  %neg = sub <2 x i8> zeroinitializer, %a
+  %sel = select <2 x i1> %slt0, <2 x i8> %neg, <2 x i8> zeroinitializer
+  %max = call <2 x i8> @llvm.smax.v2i8(<2 x i8> %a, <2 x i8> zeroinitializer)
+  %or = or <2 x i8> %sel, %max
+  ret <2 x i8> %or
+}
+
+; negative test - %d has multiple uses. %or is not folded to abs.
+
+define <2 x i8> @or_select_smax_multi_uses(<2 x i8> %a){
+; CHECK-LABEL: @or_select_smax_multi_uses(
+; CHECK-NEXT:    [[B:%.*]] = icmp sgt <2 x i8> [[A:%.*]], zeroinitializer
+; CHECK-NEXT:    [[NEG:%.*]] = sub <2 x i8> zeroinitializer, [[A]]
+; CHECK-NEXT:    [[C:%.*]] = select <2 x i1> [[B]], <2 x i8> zeroinitializer, <2 x i8> [[NEG]]
+; CHECK-NEXT:    [[D:%.*]] = call <2 x i8> @llvm.smax.v2i8(<2 x i8> [[A]], <2 x i8> zeroinitializer)
+; CHECK-NEXT:    [[OR1:%.*]] = or <2 x i8> [[C]], [[D]]
+; CHECK-NEXT:    [[OR:%.*]] = add <2 x i8> [[OR1]], [[D]]
+; CHECK-NEXT:    ret <2 x i8> [[OR]]
+;
+  %sgt0 = icmp sgt <2 x i8> %a, zeroinitializer
+  %neg = sub <2 x i8> zeroinitializer, %a
+  %sel = select <2 x i1> %sgt0, <2 x i8> zeroinitializer, <2 x i8> %neg
+  %max = call <2 x i8> @llvm.smax.v2i8(<2 x i8> %a, <2 x i8> zeroinitializer)
+  %or = or <2 x i8> %sel, %max
+  %add = add <2 x i8> %or, %max
+  ret <2 x i8> %add
+}
diff --git a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
index a7434a28c4164..adf3aa12623b9 100644
--- a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
+++ b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
@@ -237,3 +237,75 @@ define ptr addrspace(1) @gep_sub_ptrtoaddr_different_obj_addrsize(ptr addrspace(
   call void @use.i32(i32 %addr)
   ret ptr addrspace(1) %gep
 }
+
+define i64 @ptrtoaddr_of_ptrmask(ptr %p, i64 %mask) {
+; CHECK-LABEL: define i64 @ptrtoaddr_of_ptrmask(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[MASK:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoaddr ptr [[P]] to i64
+; CHECK-NEXT:    [[ADDR:%.*]] = and i64 [[MASK]], [[TMP1]]
+; CHECK-NEXT:    ret i64 [[ADDR]]
+;
+  %masked = call ptr @llvm.ptrmask(ptr %p, i64 %mask)
+  %addr = ptrtoaddr ptr %masked to i64
+  ret i64 %addr
+}
+
+define i32 @ptrtoaddr_of_ptrmask_addrsize(ptr addrspace(1) %p, i32 %mask) {
+; CHECK-LABEL: define i32 @ptrtoaddr_of_ptrmask_addrsize(
+; CHECK-SAME: ptr addrspace(1) [[P:%.*]], i32 [[MASK:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoaddr ptr addrspace(1) [[P]] to i32
+; CHECK-NEXT:    [[ADDR:%.*]] = and i32 [[MASK]], [[TMP1]]
+; CHECK-NEXT:    ret i32 [[ADDR]]
+;
+  %masked = call ptr addrspace(1) @llvm.ptrmask(ptr addrspace(1) %p, i32 %mask)
+  %addr = ptrtoaddr ptr addrspace(1) %masked to i32
+  ret i32 %addr
+}
+
+define i64 @ptrtoaddr_of_gep_of_inttoptr(i64 %int, i64 %offset) {
+; CHECK-LABEL: define i64 @ptrtoaddr_of_gep_of_inttoptr(
+; CHECK-SAME: i64 [[INT:%.*]], i64 [[OFFSET:%.*]]) {
+; CHECK-NEXT:    [[ADDR:%.*]] = add i64 [[INT]], [[OFFSET]]
+; CHECK-NEXT:    ret i64 [[ADDR]]
+;
+  %ptr = inttoptr i64 %int to ptr
+  %gep = getelementptr i8, ptr %ptr, i64 %offset
+  %addr = ptrtoaddr ptr %gep to i64
+  ret i64 %addr
+}
+
+; FIXME: This could be supported by truncating %int before performing the
+; arithmetic.
+define i32 @ptrtoaddr_of_gep_of_inttoptr_addrsize(i64 %int, i32 %offset) {
+; CHECK-LABEL: define i32 @ptrtoaddr_of_gep_of_inttoptr_addrsize(
+; CHECK-SAME: i64 [[INT:%.*]], i32 [[OFFSET:%.*]]) {
+; CHECK-NEXT:    [[PTR:%.*]] = inttoptr i64 [[INT]] to ptr addrspace(1)
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PTR]], i32 [[OFFSET]]
+; CHECK-NEXT:    [[ADDR:%.*]] = ptrtoaddr ptr addrspace(1) [[GEP]] to i32
+; CHECK-NEXT:    ret i32 [[ADDR]]
+;
+  %ptr = inttoptr i64 %int to ptr addrspace(1)
+  %gep = getelementptr i8, ptr addrspace(1) %ptr, i32 %offset
+  %addr = ptrtoaddr ptr addrspace(1) %gep to i32
+  ret i32 %addr
+}
+
+define i64 @ptrtoaddr_of_gep_of_null(i64 %offset) {
+; CHECK-LABEL: define i64 @ptrtoaddr_of_gep_of_null(
+; CHECK-SAME: i64 [[OFFSET:%.*]]) {
+; CHECK-NEXT:    ret i64 [[OFFSET]]
+;
+  %gep = getelementptr i8, ptr null, i64 %offset
+  %addr = ptrtoaddr ptr %gep to i64
+  ret i64 %addr
+}
+
+define i32 @ptrtoaddr_of_gep_of_null_addrsize(i32 %offset) {
+; CHECK-LABEL: define i32 @ptrtoaddr_of_gep_of_null_addrsize(
+; CHECK-SAME: i32 [[OFFSET:%.*]]) {
+; CHECK-NEXT:    ret i32 [[OFFSET]]
+;
+  %gep = getelementptr i8, ptr addrspace(1) null, i32 %offset
+  %addr = ptrtoaddr ptr addrspace(1) %gep to i32
+  ret i32 %addr
+}
diff --git a/llvm/test/Transforms/InstCombine/vec_extract_var_elt-inseltpoison.ll b/llvm/test/Transforms/InstCombine/vec_extract_var_elt-inseltpoison.ll
deleted file mode 100644
index 9fcac802378f6..0000000000000
--- a/llvm/test/Transforms/InstCombine/vec_extract_var_elt-inseltpoison.ll
+++ /dev/null
@@ -1,26 +0,0 @@
-; RUN: opt < %s -passes=instcombine -S | FileCheck %s
-
-define void @test (float %b, ptr %p)  {
-; CHECK: extractelement
-; CHECK: fptosi
-  %1 = load <8 x float> , ptr %p
-  %2 = bitcast <8 x float> %1 to <8 x i32>
-  %3 = bitcast <8 x i32> %2 to <8 x float>
-  %a = fptosi <8 x float> %3 to <8 x i32>
-  %4 = fptosi float %b to i32
-  %5 = add i32 %4, -2
-  %6 = extractelement <8 x i32> %a, i32 %5
-  %7 = insertelement <8 x i32> poison, i32 %6, i32 7
-  %8 = sitofp <8 x i32> %7 to <8 x float>
-  store <8 x float> %8, ptr %p
-  ret void    
-}
-
-; PR18600
-define i32 @test2(i32 %i) {
-  %e = extractelement <4 x i32> bitcast (<2 x i64> <i64 1, i64 2> to <4 x i32>), i32 %i
-  ret i32 %e
-
-; CHECK-LABEL: @test2
-; CHECK: extractelement
-}
diff --git a/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll b/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll
index 32bf4da12c497..205b4b88c473a 100644
--- a/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll
+++ b/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll
@@ -1,26 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt < %s -passes=instcombine -S | FileCheck %s
 
-define void @test (float %b, ptr %p)  {
-; CHECK: extractelement
-; CHECK: fptosi
-  %1 = load <8 x float> , ptr %p
+define void @test_poison(float %b, ptr %p) {
+; CHECK-LABEL: define void @test_poison(
+; CHECK-SAME: float [[B:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[P]], align 32
+; CHECK-NEXT:    [[TMP2:%.*]] = fptosi float [[B]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], -2
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fptosi float [[TMP4]] to i32
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> poison, i32 [[TMP5]], i64 7
+; CHECK-NEXT:    [[TMP7:%.*]] = sitofp <8 x i32> [[TMP6]] to <8 x float>
+; CHECK-NEXT:    store <8 x float> [[TMP7]], ptr [[P]], align 32
+; CHECK-NEXT:    ret void
+;
+  %1 = load <8 x float>, ptr %p
   %2 = bitcast <8 x float> %1 to <8 x i32>
   %3 = bitcast <8 x i32> %2 to <8 x float>
   %a = fptosi <8 x float> %3 to <8 x i32>
   %4 = fptosi float %b to i32
   %5 = add i32 %4, -2
   %6 = extractelement <8 x i32> %a, i32 %5
-  %7 = insertelement <8 x i32> undef, i32 %6, i32 7
+  %7 = insertelement <8 x i32> poison, i32 %6, i32 7
   %8 = sitofp <8 x i32> %7 to <8 x float>
   store <8 x float> %8, ptr %p
-  ret void    
+  ret void
 }
 
 ; PR18600
-define i32 @test2(i32 %i) {
+define i32 @test_bitcast(i32 %i) {
+; CHECK-LABEL: define i32 @test_bitcast(
+; CHECK-SAME: i32 [[I:%.*]]) {
+; CHECK-NEXT:    [[E:%.*]] = extractelement <4 x i32> <i32 1, i32 0, i32 2, i32 0>, i32 [[I]]
+; CHECK-NEXT:    ret i32 [[E]]
+;
   %e = extractelement <4 x i32> bitcast (<2 x i64> <i64 1, i64 2> to <4 x i32>), i32 %i
   ret i32 %e
+}
+
+declare void @use(i32)
 
-; CHECK-LABEL: @test2
-; CHECK: extractelement
+define void @test_loop(<4 x float> %in) {
+; CHECK-LABEL: define void @test_loop(
+; CHECK-SAME: <4 x float> [[IN:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[R:%.*]] = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> [[IN]], i32 9)
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[NEXT:%.*]], %[[LATCH:.*]] ]
+; CHECK-NEXT:    [[COND:%.*]] = icmp samesign ult i32 [[I]], 4
+; CHECK-NEXT:    br i1 [[COND]], label %[[BODY:.*]], label %[[DONE:.*]]
+; CHECK:       [[BODY]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x float> [[R]], i32 [[I]]
+; CHECK-NEXT:    [[ELEM:%.*]] = fptosi float [[TMP0]] to i32
+; CHECK-NEXT:    call void @use(i32 [[ELEM]])
+; CHECK-NEXT:    br label %[[LATCH]]
+; CHECK:       [[LATCH]]:
+; CHECK-NEXT:    [[NEXT]] = add nuw nsw i32 [[I]], 1
+; CHECK-NEXT:    br label %[[LOOP]]
+; CHECK:       [[DONE]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %r = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %in, i32 9)
+  %vi = fptosi <4 x float> %r to <4 x i32>
+  br label %loop
+loop:
+  %i = phi i32 [ 0, %entry ], [ %next, %latch ]
+  %cond = icmp ult i32 %i, 4
+  br i1 %cond, label %body, label %done
+body:
+  %elem = extractelement <4 x i32> %vi, i32 %i
+  call void @use(i32 %elem)
+  br label %latch
+latch:
+  %next = add i32 %i, 1
+  br label %loop
+done:
+  ret void
 }
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll
index 9f9e3f9ffc070..479b3f8ea4128 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll
@@ -1,26 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -passes=instsimplify -S | FileCheck %s
-; RUN: opt < %s -passes=instsimplify -use-constant-int-for-fixed-length-splat -S | FileCheck %s
-
-declare i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a)
-declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a)
-declare i32 @llvm.vector.reduce.mul.v1i32(<1 x i32> %a)
-declare i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %a)
-declare i32 @llvm.vector.reduce.and.v1i32(<1 x i32> %a)
-declare i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %a)
-declare i32 @llvm.vector.reduce.or.v1i32(<1 x i32> %a)
-declare i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a)
-declare i32 @llvm.vector.reduce.xor.v1i32(<1 x i32> %a)
-declare i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %a)
-declare i32 @llvm.vector.reduce.smin.v1i32(<1 x i32> %a)
-declare i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %a)
-declare i32 @llvm.vector.reduce.smax.v1i32(<1 x i32> %a)
-declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %a)
-declare i32 @llvm.vector.reduce.umin.v1i32(<1 x i32> %a)
-declare i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %a)
-declare i32 @llvm.vector.reduce.umax.v1i32(<1 x i32> %a)
-declare i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %a)
-
+; RUN: opt < %s -passes=instsimplify -use-constant-int-for-fixed-length-splat -use-constant-int-for-scalable-splat -S | FileCheck %s
 
 define i32 @add_0() {
 ; CHECK-LABEL: @add_0(
@@ -30,6 +10,14 @@ define i32 @add_0() {
   ret i32 %x
 }
 
+define i32 @add_0_scalable_vector() {
+; CHECK-LABEL: @add_0_scalable_vector(
+; CHECK-NEXT:    ret i32 0
+;
+  %x = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> zeroinitializer)
+  ret i32 %x
+}
+
 define i32 @add_1() {
 ; CHECK-LABEL: @add_1(
 ; CHECK-NEXT:    ret i32 8
@@ -38,6 +26,15 @@ define i32 @add_1() {
   ret i32 %x
 }
 
+define i32 @add_1_scalable_vector() {
+; CHECK-LABEL: @add_1_scalable_vector(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> splat (i32 1))
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> splat (i32 1))
+  ret i32 %x
+}
+
 define i32 @add_inc() {
 ; CHECK-LABEL: @add_inc(
 ; CHECK-NEXT:    ret i32 18
@@ -63,8 +60,17 @@ define i32 @add_undef() {
   ret i32 %x
 }
 
-define i32 @add_undef1() {
-; CHECK-LABEL: @add_undef1(
+define i32 @add_undef_scalable_vector() {
+; CHECK-LABEL: @add_undef_scalable_vector(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> undef)
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> undef)
+  ret i32 %x
+}
+
+define i32 @add_undef_elt() {
+; CHECK-LABEL: @add_undef_elt(
 ; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
 ; CHECK-NEXT:    ret i32 [[X]]
 ;
@@ -80,8 +86,16 @@ define i32 @add_poison() {
   ret i32 %x
 }
 
-define i32 @add_poison1() {
-; CHECK-LABEL: @add_poison1(
+define i32 @add_poison_scalable_vector() {
+; CHECK-LABEL: @add_poison_scalable_vector(
+; CHECK-NEXT:    ret i32 poison
+;
+  %x = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> poison)
+  ret i32 %x
+}
+
+define i32 @add_poison_elt() {
+; CHECK-LABEL: @add_poison_elt(
 ; CHECK-NEXT:    ret i32 poison
 ;
   %x = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> <i32 1, i32 1, i32 poison, i32 1, i32 1, i32 42, i32 1, i32 1>)
@@ -105,6 +119,14 @@ define i32 @mul_0() {
   ret i32 %x
 }
 
+define i32 @mul_0_scalable_vector() {
+; CHECK-LABEL: @mul_0_scalable_vector(
+; CHECK-NEXT:    ret i32 0
+;
+  %x = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> zeroinitializer)
+  ret i32 %x
+}
+
 define i32 @mul_1() {
 ; CHECK-LABEL: @mul_1(
 ; CHECK-NEXT:    ret i32 1
@@ -113,6 +135,31 @@ define i32 @mul_1() {
   ret i32 %x
 }
 
+define i32 @mul_1_scalable_vector() {
+; CHECK-LABEL: @mul_1_scalable_vector(
+; CHECK-NEXT:    ret i32 1
+;
+  %x = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> splat (i32 1))
+  ret i32 %x
+}
+
+define i32 @mul_2() {
+; CHECK-LABEL: @mul_2(
+; CHECK-NEXT:    ret i32 256
+;
+  %x = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>)
+  ret i32 %x
+}
+
+define i32 @mul_2_scalable_vector() {
+; CHECK-LABEL: @mul_2_scalable_vector(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> splat (i32 2))
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> splat (i32 2))
+  ret i32 %x
+}
+
 define i32 @mul_inc() {
 ; CHECK-LABEL: @mul_inc(
 ; CHECK-NEXT:    ret i32 40320
@@ -138,8 +185,17 @@ define i32 @mul_undef() {
   ret i32 %x
 }
 
-define i32 @mul_undef1() {
-; CHECK-LABEL: @mul_undef1(
+define i32 @mul_undef_scalable_vector() {
+; CHECK-LABEL: @mul_undef_scalable_vector(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> undef)
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> undef)
+  ret i32 %x
+}
+
+define i32 @mul_undef_elt() {
+; CHECK-LABEL: @mul_undef_elt(
 ; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
 ; CHECK-NEXT:    ret i32 [[X]]
 ;
@@ -155,8 +211,16 @@ define i32 @mul_poison() {
   ret i32 %x
 }
 
-define i32 @mul_poison1() {
-; CHECK-LABEL: @mul_poison1(
+define i32 @mul_poison_scalable_vector() {
+; CHECK-LABEL: @mul_poison_scalable_vector(
+; CHECK-NEXT:    ret i32 poison
+;
+  %x = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> poison)
+  ret i32 %x
+}
+
+define i32 @mul_poison_elt() {
+; CHECK-LABEL: @mul_poison_elt(
 ; CHECK-NEXT:    ret i32 poison
 ;
   %x = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> <i32 0, i32 1, i32 poison, i32 1, i32 1, i32 1, i32 1, i32 1>)
@@ -171,6 +235,14 @@ define i32 @and_0() {
   ret i32 %x
 }
 
+define i32 @and_0_scalable_vector() {
+; CHECK-LABEL: @and_0_scalable_vector(
+; CHECK-NEXT:    ret i32 0
+;
+  %x = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> zeroinitializer)
+  ret i32 %x
+}
+
 define i32 @and_1() {
 ; CHECK-LABEL: @and_1(
 ; CHECK-NEXT:    ret i32 1
@@ -179,6 +251,14 @@ define i32 @and_1() {
   ret i32 %x
 }
 
+define i32 @and_1_scalable_vector() {
+; CHECK-LABEL: @and_1_scalable_vector(
+; CHECK-NEXT:    ret i32 1
+;
+  %x = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> splat (i32 1))
+  ret i32 %x
+}
+
 define i32 @and_inc() {
 ; CHECK-LABEL: @and_inc(
 ; CHECK-NEXT:    ret i32 0
@@ -204,8 +284,17 @@ define i32 @and_undef() {
   ret i32 %x
 }
 
-define i32 @and_undef1() {
-; CHECK-LABEL: @and_undef1(
+define i32 @and_undef_scalable_vector() {
+; CHECK-LABEL: @and_undef_scalable_vector(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> undef)
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> undef)
+  ret i32 %x
+}
+
+define i32 @and_undef_elt() {
+; CHECK-LABEL: @and_undef_elt(
 ; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
 ; CHECK-NEXT:    ret i32 [[X]]
 ;
@@ -221,8 +310,16 @@ define i32 @and_poison() {
   ret i32 %x
 }
 
-define i32 @and_poison1() {
-; CHECK-LABEL: @and_poison1(
+define i32 @and_poison_scalable_vector() {
+; CHECK-LABEL: @and_poison_scalable_vector(
+; CHECK-NEXT:    ret i32 poison
+;
+  %x = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> poison)
+  ret i32 %x
+}
+
+define i32 @and_poison_elt() {
+; CHECK-LABEL: @and_poison_elt(
 ; CHECK-NEXT:    ret i32 poison
 ;
   %x = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> <i32 -1, i32 1, i32 poison, i32 1, i32 1, i32 1, i32 1, i32 1>)
@@ -237,6 +334,14 @@ define i32 @or_0() {
   ret i32 %x
 }
 
+define i32 @or_0_scalable_vector() {
+; CHECK-LABEL: @or_0_scalable_vector(
+; CHECK-NEXT:    ret i32 0
+;
+  %x = call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> zeroinitializer)
+  ret i32 %x
+}
+
 define i32 @or_1() {
 ; CHECK-LABEL: @or_1(
 ; CHECK-NEXT:    ret i32 1
@@ -245,6 +350,14 @@ define i32 @or_1() {
   ret i32 %x
 }
 
+define i32 @or_1_scalable_vector() {
+; CHECK-LABEL: @or_1_scalable_vector(
+; CHECK-NEXT:    ret i32 1
+;
+  %x = call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> splat (i32 1))
+  ret i32 %x
+}
+
 define i32 @or_inc() {
 ; CHECK-LABEL: @or_inc(
 ; CHECK-NEXT:    ret i32 -1
@@ -270,8 +383,17 @@ define i32 @or_undef() {
   ret i32 %x
 }
 
-define i32 @or_undef1() {
-; CHECK-LABEL: @or_undef1(
+define i32 @or_undef_scalable_vector() {
+; CHECK-LABEL: @or_undef_scalable_vector(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> undef)
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.vector.reduce.or.v8i32(<vscale x 8 x i32> undef)
+  ret i32 %x
+}
+
+define i32 @or_undef_elt() {
+; CHECK-LABEL: @or_undef_elt(
 ; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
 ; CHECK-NEXT:    ret i32 [[X]]
 ;
@@ -287,8 +409,16 @@ define i32 @or_poison() {
   ret i32 %x
 }
 
-define i32 @or_poison1() {
-; CHECK-LABEL: @or_poison1(
+define i32 @or_poison_scalable_vector() {
+; CHECK-LABEL: @or_poison_scalable_vector(
+; CHECK-NEXT:    ret i32 poison
+;
+  %x = call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> poison)
+  ret i32 %x
+}
+
+define i32 @or_poison_elt() {
+; CHECK-LABEL: @or_poison_elt(
 ; CHECK-NEXT:    ret i32 poison
 ;
   %x = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> <i32 1, i32 0, i32 poison, i32 1, i32 1, i32 1, i32 1, i32 1>)
@@ -303,6 +433,14 @@ define i32 @xor_0() {
   ret i32 %x
 }
 
+define i32 @xor_0_scalable_vector() {
+; CHECK-LABEL: @xor_0_scalable_vector(
+; CHECK-NEXT:    ret i32 0
+;
+  %x = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> zeroinitializer)
+  ret i32 %x
+}
+
 define i32 @xor_1() {
 ; CHECK-LABEL: @xor_1(
 ; CHECK-NEXT:    ret i32 0
@@ -311,6 +449,23 @@ define i32 @xor_1() {
   ret i32 %x
 }
 
+define i32 @xor_1_scalable_vector() {
+; CHECK-LABEL: @xor_1_scalable_vector(
+; CHECK-NEXT:    ret i32 0
+;
+  %x = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> splat(i32 1))
+  ret i32 %x
+}
+
+define i32 @xor_1_scalable_vector_lane_count_not_known_even() {
+; CHECK-LABEL: @xor_1_scalable_vector_lane_count_not_known_even(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.xor.nxv1i32(<vscale x 1 x i32> splat (i32 1))
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 1 x i32> splat(i32 1))
+  ret i32 %x
+}
+
 define i32 @xor_inc() {
 ; CHECK-LABEL: @xor_inc(
 ; CHECK-NEXT:    ret i32 10
@@ -336,8 +491,17 @@ define i32 @xor_undef() {
   ret i32 %x
 }
 
-define i32 @xor_undef1() {
-; CHECK-LABEL: @xor_undef1(
+define i32 @xor_undef_scalable_vector() {
+; CHECK-LABEL: @xor_undef_scalable_vector(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> undef)
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> undef)
+  ret i32 %x
+}
+
+define i32 @xor_undef_elt() {
+; CHECK-LABEL: @xor_undef_elt(
 ; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
 ; CHECK-NEXT:    ret i32 [[X]]
 ;
@@ -353,8 +517,16 @@ define i32 @xor_poison() {
   ret i32 %x
 }
 
-define i32 @xor_poison1() {
-; CHECK-LABEL: @xor_poison1(
+define i32 @xor_poison_scalable_vector() {
+; CHECK-LABEL: @xor_poison_scalable_vector(
+; CHECK-NEXT:    ret i32 poison
+;
+  %x = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> poison)
+  ret i32 %x
+}
+
+define i32 @xor_poison_elt() {
+; CHECK-LABEL: @xor_poison_elt(
 ; CHECK-NEXT:    ret i32 poison
 ;
   %x = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> <i32 poison, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
@@ -369,6 +541,14 @@ define i32 @smin_0() {
   ret i32 %x
 }
 
+define i32 @smin_0_scalable_vector() {
+; CHECK-LABEL: @smin_0_scalable_vector(
+; CHECK-NEXT:    ret i32 0
+;
+  %x = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> zeroinitializer)
+  ret i32 %x
+}
+
 define i32 @smin_1() {
 ; CHECK-LABEL: @smin_1(
 ; CHECK-NEXT:    ret i32 1
@@ -377,6 +557,14 @@ define i32 @smin_1() {
   ret i32 %x
 }
 
+define i32 @smin_1_scalable_vector() {
+; CHECK-LABEL: @smin_1_scalable_vector(
+; CHECK-NEXT:    ret i32 1
+;
+  %x = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> splat(i32 1))
+  ret i32 %x
+}
+
 define i32 @smin_inc() {
 ; CHECK-LABEL: @smin_inc(
 ; CHECK-NEXT:    ret i32 -6
@@ -402,8 +590,17 @@ define i32 @smin_undef() {
   ret i32 %x
 }
 
-define i32 @smin_undef1() {
-; CHECK-LABEL: @smin_undef1(
+define i32 @smin_undef_scalable_vector() {
+; CHECK-LABEL: @smin_undef_scalable_vector(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> undef)
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> undef)
+  ret i32 %x
+}
+
+define i32 @smin_undef_elt() {
+; CHECK-LABEL: @smin_undef_elt(
 ; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
 ; CHECK-NEXT:    ret i32 [[X]]
 ;
@@ -419,8 +616,16 @@ define i32 @smin_poison() {
   ret i32 %x
 }
 
-define i32 @smin_poison1() {
-; CHECK-LABEL: @smin_poison1(
+define i32 @smin_poison_scalable_vector() {
+; CHECK-LABEL: @smin_poison_scalable_vector(
+; CHECK-NEXT:    ret i32 poison
+;
+  %x = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> poison)
+  ret i32 %x
+}
+
+define i32 @smin_poison_elt() {
+; CHECK-LABEL: @smin_poison_elt(
 ; CHECK-NEXT:    ret i32 poison
 ;
   %x = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 poison, i32 1, i32 1, i32 1>)
@@ -435,6 +640,14 @@ define i32 @smax_0() {
   ret i32 %x
 }
 
+define i32 @smax_0_scalable_vector() {
+; CHECK-LABEL: @smax_0_scalable_vector(
+; CHECK-NEXT:    ret i32 0
+;
+  %x = call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> zeroinitializer)
+  ret i32 %x
+}
+
 define i32 @smax_1() {
 ; CHECK-LABEL: @smax_1(
 ; CHECK-NEXT:    ret i32 1
@@ -443,6 +656,14 @@ define i32 @smax_1() {
   ret i32 %x
 }
 
+define i32 @smax_1_scalable_vector() {
+; CHECK-LABEL: @smax_1_scalable_vector(
+; CHECK-NEXT:    ret i32 1
+;
+  %x = call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> splat(i32 1))
+  ret i32 %x
+}
+
 define i32 @smax_inc() {
 ; CHECK-LABEL: @smax_inc(
 ; CHECK-NEXT:    ret i32 8
@@ -468,8 +689,17 @@ define i32 @smax_undef() {
   ret i32 %x
 }
 
-define i32 @smax_undef1() {
-; CHECK-LABEL: @smax_undef1(
+define i32 @smax_undef_scalable_vector() {
+; CHECK-LABEL: @smax_undef_scalable_vector(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> undef)
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> undef)
+  ret i32 %x
+}
+
+define i32 @smax_undef_elt() {
+; CHECK-LABEL: @smax_undef_elt(
 ; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
 ; CHECK-NEXT:    ret i32 [[X]]
 ;
@@ -485,8 +715,16 @@ define i32 @smax_poison() {
   ret i32 %x
 }
 
-define i32 @smax_poison1() {
-; CHECK-LABEL: @smax_poison1(
+define i32 @smax_poison_scalable_vector() {
+; CHECK-LABEL: @smax_poison_scalable_vector(
+; CHECK-NEXT:    ret i32 poison
+;
+  %x = call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> poison)
+  ret i32 %x
+}
+
+define i32 @smax_poison_elt() {
+; CHECK-LABEL: @smax_poison_elt(
 ; CHECK-NEXT:    ret i32 poison
 ;
   %x = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> <i32 1, i32 1, i32 0, i32 1, i32 1, i32 1, i32 1, i32 poison>)
@@ -501,6 +739,14 @@ define i32 @umin_0() {
   ret i32 %x
 }
 
+define i32 @umin_0_scalable_vector() {
+; CHECK-LABEL: @umin_0_scalable_vector(
+; CHECK-NEXT:    ret i32 0
+;
+  %x = call i32 @llvm.vector.reduce.umin.nxv8i32(<vscale x 8 x i32> zeroinitializer)
+  ret i32 %x
+}
+
 define i32 @umin_1() {
 ; CHECK-LABEL: @umin_1(
 ; CHECK-NEXT:    ret i32 1
@@ -509,6 +755,14 @@ define i32 @umin_1() {
   ret i32 %x
 }
 
+define i32 @umin_1_scalable_vector() {
+; CHECK-LABEL: @umin_1_scalable_vector(
+; CHECK-NEXT:    ret i32 1
+;
+  %x = call i32 @llvm.vector.reduce.umin.nxv8i32(<vscale x 8 x i32> splat (i32 1))
+  ret i32 %x
+}
+
 define i32 @umin_inc() {
 ; CHECK-LABEL: @umin_inc(
 ; CHECK-NEXT:    ret i32 1
@@ -534,8 +788,17 @@ define i32 @umin_undef() {
   ret i32 %x
 }
 
-define i32 @umin_undef1() {
-; CHECK-LABEL: @umin_undef1(
+define i32 @umin_undef_scalable_vector() {
+; CHECK-LABEL: @umin_undef_scalable_vector(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.umin.nxv8i32(<vscale x 8 x i32> undef)
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.vector.reduce.umin.nxv8i32(<vscale x 8 x i32> undef)
+  ret i32 %x
+}
+
+define i32 @umin_undef_elt() {
+; CHECK-LABEL: @umin_undef_elt(
 ; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
 ; CHECK-NEXT:    ret i32 [[X]]
 ;
@@ -551,8 +814,16 @@ define i32 @umin_poison() {
   ret i32 %x
 }
 
-define i32 @umin_poison1() {
-; CHECK-LABEL: @umin_poison1(
+define i32 @umin_poison_scalable_vector() {
+; CHECK-LABEL: @umin_poison_scalable_vector(
+; CHECK-NEXT:    ret i32 poison
+;
+  %x = call i32 @llvm.vector.reduce.umin.nxv8i32(<vscale x 8 x i32> poison)
+  ret i32 %x
+}
+
+define i32 @umin_poison_elt() {
+; CHECK-LABEL: @umin_poison_elt(
 ; CHECK-NEXT:    ret i32 poison
 ;
   %x = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> <i32 1, i32 1, i32 -1, i32 poison, i32 1, i32 1, i32 1, i32 1>)
@@ -567,6 +838,14 @@ define i32 @umax_0() {
   ret i32 %x
 }
 
+define i32 @umax_0_scalable_vector() {
+; CHECK-LABEL: @umax_0_scalable_vector(
+; CHECK-NEXT:    ret i32 0
+;
+  %x = call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> zeroinitializer)
+  ret i32 %x
+}
+
 define i32 @umax_1() {
 ; CHECK-LABEL: @umax_1(
 ; CHECK-NEXT:    ret i32 1
@@ -575,6 +854,14 @@ define i32 @umax_1() {
   ret i32 %x
 }
 
+define i32 @umax_1_scalable_vector() {
+; CHECK-LABEL: @umax_1_scalable_vector(
+; CHECK-NEXT:    ret i32 1
+;
+  %x = call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> splat(i32 1))
+  ret i32 %x
+}
+
 define i32 @umax_inc() {
 ; CHECK-LABEL: @umax_inc(
 ; CHECK-NEXT:    ret i32 -3
@@ -600,8 +887,17 @@ define i32 @umax_undef() {
   ret i32 %x
 }
 
-define i32 @umax_undef1() {
-; CHECK-LABEL: @umax_undef1(
+define i32 @umax_undef_scalable_vector() {
+; CHECK-LABEL: @umax_undef_scalable_vector(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> undef)
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> undef)
+  ret i32 %x
+}
+
+define i32 @umax_undef_elt() {
+; CHECK-LABEL: @umax_undef_elt(
 ; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
 ; CHECK-NEXT:    ret i32 [[X]]
 ;
@@ -617,8 +913,16 @@ define i32 @umax_poison() {
   ret i32 %x
 }
 
-define i32 @umax_poison1() {
-; CHECK-LABEL: @umax_poison1(
+define i32 @umax_poison_scalable_vector() {
+; CHECK-LABEL: @umax_poison_scalable_vector(
+; CHECK-NEXT:    ret i32 poison
+;
+  %x = call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> poison)
+  ret i32 %x
+}
+
+define i32 @umax_poison_elt() {
+; CHECK-LABEL: @umax_poison_elt(
 ; CHECK-NEXT:    ret i32 poison
 ;
   %x = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> <i32 1, i32 1, i32 poison, i32 1, i32 1, i32 poison, i32 1, i32 1>)
diff --git a/llvm/test/Transforms/LICM/scalar-promote.ll b/llvm/test/Transforms/LICM/scalar-promote.ll
index 3af65df55a099..e6cc457bd55b4 100644
--- a/llvm/test/Transforms/LICM/scalar-promote.ll
+++ b/llvm/test/Transforms/LICM/scalar-promote.ll
@@ -43,9 +43,9 @@ define void @test2(i32 %i) {
 ; CHECK-LABEL: define void @test2(
 ; CHECK-SAME: i32 [[I:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
-; CHECK-NEXT:    [[X1:%.*]] = getelementptr i32, ptr @X, i64 1
 ; CHECK-NEXT:    [[X2:%.*]] = getelementptr i32, ptr @X, i64 1
-; CHECK-NEXT:    [[X1_PROMOTED:%.*]] = load i32, ptr [[X1]], align 4
+; CHECK-NEXT:    [[X3:%.*]] = getelementptr i32, ptr @X, i64 1
+; CHECK-NEXT:    [[X1_PROMOTED:%.*]] = load i32, ptr [[X2]], align 4
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[A1:%.*]] = phi i32 [ [[V:%.*]], %[[LOOP]] ], [ [[X1_PROMOTED]], %[[ENTRY]] ]
@@ -53,7 +53,7 @@ define void @test2(i32 %i) {
 ; CHECK-NEXT:    br i1 false, label %[[LOOP]], label %[[EXIT:.*]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    [[V_LCSSA:%.*]] = phi i32 [ [[V]], %[[LOOP]] ]
-; CHECK-NEXT:    store i32 [[V_LCSSA]], ptr [[X1]], align 4
+; CHECK-NEXT:    store i32 [[V_LCSSA]], ptr [[X2]], align 4
 ; CHECK-NEXT:    ret void
 ;
 Entry:
diff --git a/llvm/test/Transforms/IndVarSimplify/sink-alloca.ll b/llvm/test/Transforms/LICM/sink-alloca.ll
similarity index 89%
rename from llvm/test/Transforms/IndVarSimplify/sink-alloca.ll
rename to llvm/test/Transforms/LICM/sink-alloca.ll
index 0997bf6128869..2bf9350b71ea7 100644
--- a/llvm/test/Transforms/IndVarSimplify/sink-alloca.ll
+++ b/llvm/test/Transforms/LICM/sink-alloca.ll
@@ -1,9 +1,9 @@
-; RUN: opt < %s -passes=indvars -S | FileCheck %s
+; RUN: opt < %s -passes=licm -verify-memoryssa -S | FileCheck %s
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i386-apple-darwin10.0"
 
 ; PR4775
-; Indvars shouldn't sink the alloca out of the entry block, even though
+; LICM shouldn't sink the alloca out of the entry block, even though
 ; it's not used until after the loop.
 define i32 @main() nounwind {
 ; CHECK: entry:
@@ -25,7 +25,7 @@ while.end:                                        ; preds = %while.cond
 declare i32 @bar()
 
 ; <rdar://problem/10352360>
-; Indvars shouldn't sink the first alloca between the stacksave and stackrestore
+; LICM shouldn't sink the first alloca between the stacksave and stackrestore
 ; intrinsics.
 declare ptr @a(...)
 declare ptr @llvm.stacksave() nounwind
diff --git a/llvm/test/Transforms/LICM/sink-from-preheader.ll b/llvm/test/Transforms/LICM/sink-from-preheader.ll
new file mode 100644
index 0000000000000..bbe3d3b285c15
--- /dev/null
+++ b/llvm/test/Transforms/LICM/sink-from-preheader.ll
@@ -0,0 +1,185 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=licm -verify-memoryssa -S | FileCheck %s
+
+; We perform sinking here, Changed flag should be set properly.
+define i32 @test(i32 %a, i32 %b, i32 %N) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret i32 [[ADD]]
+;
+entry:
+  %add = add i32 %a, %b
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv.next, %N
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret i32 %add
+}
+
+define i32 @test_with_unused_load(i32 %a, ptr %b, i32 %N) {
+; CHECK-LABEL: @test_with_unused_load(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[A:%.*]], [[LOAD]]
+; CHECK-NEXT:    ret i32 [[ADD]]
+;
+entry:
+  %load = load i32, ptr %b
+  %add = add i32 %a, %load
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv.next, %N
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret i32 %add
+}
+
+define i32 @test_with_unused_load_modified_store(i32 %a, ptr %b, i32 %N) {
+; CHECK-LABEL: @test_with_unused_load_modified_store(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], [[A:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[SMAX:%.*]] = phi i32 [ [[IV_NEXT]], [[LOOP]] ]
+; CHECK-NEXT:    store i32 [[SMAX]], ptr [[B]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[A]], [[LOAD]]
+; CHECK-NEXT:    ret i32 [[ADD]]
+;
+entry:
+  %load = load i32, ptr %b
+  %add = add i32 %a, %load
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.next = add i32 %iv, %a
+  store i32 %iv.next, ptr %b
+  %cmp = icmp slt i32 %iv.next, %N
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret i32 %add
+}
+
+; Volatile loads must not be sunk.
+define i32 @test_with_volatile_load_no_sink(i32 %a, ptr %b, i32 %N) {
+; CHECK-LABEL: @test_with_volatile_load_no_sink(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LD:%.*]] = load volatile i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[A:%.*]], [[LD]]
+; CHECK-NEXT:    ret i32 [[ADD]]
+;
+entry:
+  %ld = load volatile i32, ptr %b, align 4
+  %add = add i32 %a, %ld
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv.next, %N
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret i32 %add
+}
+
+; Ordered/atomic loads must not be sunk.
+define i32 @test_with_atomic_load_no_sink(i32 %a, ptr %b, i32 %N) {
+; CHECK-LABEL: @test_with_atomic_load_no_sink(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LD:%.*]] = load atomic i32, ptr [[B:%.*]] acquire, align 4
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[A:%.*]], [[LD]]
+; CHECK-NEXT:    ret i32 [[ADD]]
+;
+entry:
+  %ld = load atomic i32, ptr %b acquire, align 4
+  %add = add i32 %a, %ld
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv.next, %N
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret i32 %add
+}
+
+declare void @clobber(ptr)
+
+; Calls that may write memory in the loop should prevent sinking the load.
+define i32 @test_with_unused_load_clobbered_by_call(i32 %a, ptr %b, i32 %N) {
+; CHECK-LABEL: @test_with_unused_load_clobbered_by_call(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    call void @clobber(ptr [[B]])
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[A:%.*]], [[LD]]
+; CHECK-NEXT:    ret i32 [[ADD]]
+;
+entry:
+  %ld = load i32, ptr %b, align 4
+  %add = add i32 %a, %ld
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.next = add i32 %iv, 1
+  call void @clobber(ptr %b)
+  %cmp = icmp slt i32 %iv.next, %N
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret i32 %add
+}
diff --git a/llvm/test/Transforms/LICM/sink-trapping.ll b/llvm/test/Transforms/LICM/sink-trapping.ll
new file mode 100644
index 0000000000000..f4d260d973987
--- /dev/null
+++ b/llvm/test/Transforms/LICM/sink-trapping.ll
@@ -0,0 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -passes=licm -verify-memoryssa -S | FileCheck %s
+
+declare i1 @b()
+
+define i32 @a(i32 %x) nounwind {
+; CHECK-LABEL: define i32 @a(
+; CHECK-SAME: i32 [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[FOR_BODY_PREHEADER:.*:]]
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[CMP:%.*]] = call i1 @b()
+; CHECK-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_END_LOOPEXIT:.*]]
+; CHECK:       [[FOR_END_LOOPEXIT]]:
+; CHECK-NEXT:    [[Y:%.*]] = sdiv i32 10, [[X]]
+; CHECK-NEXT:    ret i32 [[Y]]
+;
+for.body.preheader:
+  %y = sdiv i32 10, %x
+  br label %for.body
+
+for.body:
+  %cmp = call i1 @b()
+  br i1 %cmp, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:
+  ret i32 %y
+}
diff --git a/llvm/test/Transforms/LoopDeletion/invalidate-scev-after-hoisting.ll b/llvm/test/Transforms/LoopDeletion/invalidate-scev-after-hoisting.ll
index bdd51c2b6bc53..6c19aaad03ba8 100644
--- a/llvm/test/Transforms/LoopDeletion/invalidate-scev-after-hoisting.ll
+++ b/llvm/test/Transforms/LoopDeletion/invalidate-scev-after-hoisting.ll
@@ -84,13 +84,13 @@ define i32 @scev_invalidation_after_deleting(ptr %src) {
 ; CHECK:       inner.2.preheader:
 ; CHECK-NEXT:    br label [[INNER_3_PH:%.*]]
 ; CHECK:       inner.3.ph:
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i64 0 to i32
 ; CHECK-NEXT:    br label [[INNER_3:%.*]]
 ; CHECK:       inner.3:
 ; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[SRC:%.*]], align 4
 ; CHECK-NEXT:    br i1 false, label [[OUTER_LATCH]], label [[INNER_3]]
 ; CHECK:       outer.latch:
 ; CHECK-NEXT:    [[L_LCSSA:%.*]] = phi i32 [ [[L]], [[INNER_3]] ]
-; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i64 0 to i32
 ; CHECK-NEXT:    [[OUTER_IV_NEXT]] = add nsw i32 [[L_LCSSA]], [[TRUNC]]
 ; CHECK-NEXT:    br label [[OUTER_HEADER]]
 ;
diff --git a/llvm/test/Transforms/LoopDistribute/laa-invalidation.ll b/llvm/test/Transforms/LoopDistribute/laa-invalidation.ll
index 62c5627ac2d38..4a55c0e9e11d5 100644
--- a/llvm/test/Transforms/LoopDistribute/laa-invalidation.ll
+++ b/llvm/test/Transforms/LoopDistribute/laa-invalidation.ll
@@ -4,11 +4,11 @@
 define void @test_pr50940(ptr %A, ptr %B) {
 ; CHECK-LABEL: @test_pr50940(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4
 ; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
 ; CHECK:       outer.header:
 ; CHECK-NEXT:    br i1 false, label [[OUTER_LATCH:%.*]], label [[INNER_PH:%.*]]
 ; CHECK:       inner.ph:
-; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4
 ; CHECK-NEXT:    [[GEP_A_3:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 3
 ; CHECK-NEXT:    br label [[INNER_LVER_CHECK:%.*]]
 ; CHECK:       inner.lver.check:
diff --git a/llvm/test/Transforms/LoopIdiom/cyclic-redundancy-check.ll b/llvm/test/Transforms/LoopIdiom/cyclic-redundancy-check.ll
index b2ec53ca405d4..90995a0257721 100644
--- a/llvm/test/Transforms/LoopIdiom/cyclic-redundancy-check.ll
+++ b/llvm/test/Transforms/LoopIdiom/cyclic-redundancy-check.ll
@@ -537,6 +537,52 @@ exit:                                              ; preds = %loop
   %ret = and i32 %unrelated.next, %crc.next
   ret i32 %ret
 }
+
+define i16 @not.crc.data.next.outside.user(i16 %crc.init, i16 %data.init) {
+; CHECK-LABEL: define i16 @not.crc.data.next.outside.user(
+; CHECK-SAME: i16 [[CRC_INIT:%.*]], i16 [[DATA_INIT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[TBL_LD:%.*]] = phi i16 [ [[CRC_INIT]], %[[ENTRY]] ], [ [[CRC_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[CRC_BE_SHIFT:%.*]] = phi i16 [ [[DATA_INIT]], %[[ENTRY]] ], [ [[DATA_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[CRC_NEXT3:%.*]] = xor i16 [[CRC_BE_SHIFT]], [[TBL_LD]]
+; CHECK-NEXT:    [[CRC_SHL:%.*]] = shl i16 [[TBL_LD]], 1
+; CHECK-NEXT:    [[CRC_XOR:%.*]] = xor i16 [[CRC_SHL]], 3
+; CHECK-NEXT:    [[CHECK_SB:%.*]] = icmp slt i16 [[CRC_NEXT3]], 0
+; CHECK-NEXT:    [[CRC_NEXT]] = select i1 [[CHECK_SB]], i16 [[CRC_XOR]], i16 [[CRC_SHL]]
+; CHECK-NEXT:    [[DATA_NEXT]] = shl i16 [[CRC_BE_SHIFT]], 1
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
+; CHECK-NEXT:    [[EXIT_COND:%.*]] = icmp samesign ult i32 [[IV]], 7
+; CHECK-NEXT:    br i1 [[EXIT_COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[CRC_NEXT_LCSSA:%.*]] = phi i16 [ [[CRC_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    [[DATA_NEXT_LCSSA:%.*]] = phi i16 [ [[DATA_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RET:%.*]] = xor i16 [[DATA_NEXT_LCSSA]], [[CRC_NEXT_LCSSA]]
+; CHECK-NEXT:    ret i16 [[RET]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %crc = phi i16 [ %crc.init, %entry ], [ %crc.next, %loop ]
+  %data = phi i16 [ %data.init, %entry ], [ %data.next, %loop ]
+  %xor.crc.data = xor i16 %data, %crc
+  %crc.shl = shl i16 %crc, 1
+  %crc.xor = xor i16 %crc.shl, 3
+  %check.sb = icmp slt i16 %xor.crc.data, 0
+  %crc.next = select i1 %check.sb, i16 %crc.xor, i16 %crc.shl
+  %data.next = shl i16 %data, 1
+  %iv.next = add nuw nsw i32 %iv, 1
+  %exit.cond = icmp samesign ult i32 %iv, 7
+  br i1 %exit.cond, label %loop, label %exit
+
+exit:
+  %ret = xor i16 %data.next, %crc.next
+  ret i16 %ret
+}
 ;.
 ; CHECK: attributes #[[ATTR0]] = { optsize }
 ;.
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/prefer-all.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/prefer-all.ll
index db30fd23b0c9d..1944a9c800355 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/prefer-all.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/prefer-all.ll
@@ -119,8 +119,6 @@ for.end:
 ; We can't use postindex addressing on the conditional load of qval and can't
 ; convert the loop condition to a compare with zero, so we should instead use
 ; offset addressing.
-; FIXME: Currently we don't notice the load of qval is conditional, and attempt
-; postindex addressing anyway.
 define i32 @conditional_load(ptr %p, ptr %q, ptr %n) {
 ; CHECK-LABEL: define i32 @conditional_load(
 ; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[N:%.*]]) {
@@ -128,7 +126,6 @@ define i32 @conditional_load(ptr %p, ptr %q, ptr %n) {
 ; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
 ; CHECK:       [[FOR_BODY]]:
 ; CHECK-NEXT:    [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP2:%.*]], %[[FOR_INC:.*]] ], [ [[P]], %[[ENTRY]] ]
-; CHECK-NEXT:    [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[FOR_INC]] ], [ [[Q]], %[[ENTRY]] ]
 ; CHECK-NEXT:    [[IDX:%.*]] = phi i64 [ [[IDX_NEXT:%.*]], %[[FOR_INC]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    [[RET:%.*]] = phi i32 [ [[RET_NEXT:%.*]], %[[FOR_INC]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    [[PVAL:%.*]] = load i32, ptr [[LSR_IV1]], align 4
@@ -136,6 +133,8 @@ define i32 @conditional_load(ptr %p, ptr %q, ptr %n) {
 ; CHECK-NEXT:    [[SCEVGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i64 4
 ; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label %[[FOR_INC]], label %[[IF_THEN:.*]]
 ; CHECK:       [[IF_THEN]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[IDX]], 2
+; CHECK-NEXT:    [[LSR_IV:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[QVAL:%.*]] = load i32, ptr [[LSR_IV]], align 4
 ; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[RET]], [[QVAL]]
 ; CHECK-NEXT:    br label %[[FOR_INC]]
@@ -143,7 +142,6 @@ define i32 @conditional_load(ptr %p, ptr %q, ptr %n) {
 ; CHECK-NEXT:    [[RET_NEXT]] = phi i32 [ [[ADD]], %[[IF_THEN]] ], [ [[RET]], %[[FOR_BODY]] ]
 ; CHECK-NEXT:    [[IDX_NEXT]] = add nuw nsw i64 [[IDX]], 1
 ; CHECK-NEXT:    [[NVAL:%.*]] = load volatile i64, ptr [[N]], align 8
-; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV]], i64 4
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i64 [[IDX_NEXT]], [[NVAL]]
 ; CHECK-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT:.*]]
 ; CHECK:       [[EXIT]]:
@@ -176,3 +174,141 @@ for.inc:
 exit:
   ret i32 %ret.next
 }
+
+; We can use postindex addressing for both loads here, even though the second
+; may not be executed on every loop iteration.
+define i32 @early_exit_load(ptr %p, ptr %q, ptr %n) {
+; CHECK-LABEL: define i32 @early_exit_load(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP2:%.*]], %[[FOR_INC:.*]] ], [ [[P]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[FOR_INC]] ], [ [[Q]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[RET_PHI:%.*]] = phi i32 [ [[ADD:%.*]], %[[FOR_INC]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[IDX:%.*]] = phi i64 [ [[IDX_NEXT:%.*]], %[[FOR_INC]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[PVAL:%.*]] = load i32, ptr [[LSR_IV1]], align 4
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[PVAL]], 0
+; CHECK-NEXT:    [[SCEVGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i64 4
+; CHECK-NEXT:    br i1 [[CMP1]], label %[[FOR_INC]], label %[[EXIT:.*]]
+; CHECK:       [[FOR_INC]]:
+; CHECK-NEXT:    [[QVAL:%.*]] = load i32, ptr [[LSR_IV]], align 4
+; CHECK-NEXT:    [[ADD]] = add nsw i32 [[QVAL]], [[RET_PHI]]
+; CHECK-NEXT:    [[IDX_NEXT]] = add nuw nsw i64 [[IDX]], 1
+; CHECK-NEXT:    [[NVAL:%.*]] = load volatile i64, ptr [[N]], align 8
+; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV]], i64 4
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i64 [[IDX_NEXT]], [[NVAL]]
+; CHECK-NEXT:    br i1 [[CMP2]], label %[[FOR_BODY]], label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[RET:%.*]] = phi i32 [ [[RET_PHI]], %[[FOR_BODY]] ], [ [[ADD]], %[[FOR_INC]] ]
+; CHECK-NEXT:    ret i32 [[RET]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %ret.phi = phi i32 [ %add, %for.inc ], [ 0, %entry ]
+  %idx = phi i64 [ %idx.next, %for.inc ], [ 0, %entry ]
+  %paddr = getelementptr inbounds nuw i32, ptr %p, i64 %idx
+  %pval = load i32, ptr %paddr, align 4
+  %cmp1 = icmp eq i32 %pval, 0
+  br i1 %cmp1, label %for.inc, label %exit
+
+for.inc:
+  %qaddr = getelementptr inbounds nuw i32, ptr %q, i64 %idx
+  %qval = load i32, ptr %qaddr, align 4
+  %add = add nsw i32 %qval, %ret.phi
+  %idx.next = add nuw nsw i64 %idx, 1
+  %nval = load volatile i64, ptr %n, align 8
+  %cmp2 = icmp slt i64 %idx.next, %nval
+  br i1 %cmp2, label %for.body, label %exit
+
+exit:
+  %ret = phi i32 [ %ret.phi, %for.body ], [ %add, %for.inc ]
+  ret i32 %ret
+}
+
+; The control-flow before and after the load of qval shouldn't prevent postindex
+; addressing from happening.
+; FIXME: We choose postindex addressing, but the scevgep is placed in for.inc so
+; during codegen we will fail to actually generate a postindex load.
+define void @middle_block_load(ptr %p, ptr %q, i64 %n) {
+; CHECK-LABEL: define void @middle_block_load(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[LSR_IV2:%.*]] = phi ptr [ [[SCEVGEP3:%.*]], %[[FOR_INC:.*]] ], [ [[P]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[FOR_INC]] ], [ [[Q]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], %[[FOR_INC]] ], [ [[N]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[PVAL:%.*]] = load i32, ptr [[LSR_IV2]], align 4
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[PVAL]], 0
+; CHECK-NEXT:    [[SCEVGEP3]] = getelementptr i8, ptr [[LSR_IV2]], i64 4
+; CHECK-NEXT:    br i1 [[CMP1]], label %[[IF_THEN1:.*]], label %[[IF_ELSE1:.*]]
+; CHECK:       [[IF_THEN1]]:
+; CHECK-NEXT:    tail call void @otherfn1()
+; CHECK-NEXT:    br label %[[IF_END:.*]]
+; CHECK:       [[IF_ELSE1]]:
+; CHECK-NEXT:    tail call void @otherfn2()
+; CHECK-NEXT:    br label %[[IF_END]]
+; CHECK:       [[IF_END]]:
+; CHECK-NEXT:    [[QVAL:%.*]] = load i32, ptr [[LSR_IV1]], align 4
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[QVAL]], 0
+; CHECK-NEXT:    br i1 [[CMP2]], label %[[IF_THEN2:.*]], label %[[IF_ELSE2:.*]]
+; CHECK:       [[IF_THEN2]]:
+; CHECK-NEXT:    tail call void @otherfn1()
+; CHECK-NEXT:    br label %[[FOR_INC]]
+; CHECK:       [[IF_ELSE2]]:
+; CHECK-NEXT:    tail call void @otherfn2()
+; CHECK-NEXT:    br label %[[FOR_INC]]
+; CHECK:       [[FOR_INC]]:
+; CHECK-NEXT:    [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], -1
+; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV1]], i64 4
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
+; CHECK-NEXT:    br i1 [[CMP3]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %idx = phi i64 [ %idx.next, %for.inc ], [ 0, %entry ]
+  %paddr = getelementptr inbounds nuw i32, ptr %p, i64 %idx
+  %pval = load i32, ptr %paddr, align 4
+  %cmp1 = icmp sgt i32 %pval, 0
+  br i1 %cmp1, label %if.then1, label %if.else1
+
+if.then1:
+  tail call void @otherfn1()
+  br label %if.end
+
+if.else1:
+  tail call void @otherfn2()
+  br label %if.end
+
+if.end:
+  %qaddr = getelementptr inbounds nuw i32, ptr %q, i64 %idx
+  %qval = load i32, ptr %qaddr, align 4
+  %cmp2 = icmp sgt i32 %qval, 0
+  br i1 %cmp2, label %if.then2, label %if.else2
+
+if.then2:
+  tail call void @otherfn1()
+  br label %for.inc
+
+if.else2:
+  tail call void @otherfn2()
+  br label %for.inc
+
+for.inc:
+  %idx.next = add nuw nsw i64 %idx, 1
+  %cmp3 = icmp eq i64 %idx.next, %n
+  br i1 %cmp3, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+declare dso_local void @otherfn1()
+declare dso_local void @otherfn2()
diff --git a/llvm/test/Transforms/LoopUnroll/peel-branch-weights-freq.ll b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/peel.ll
similarity index 100%
rename from llvm/test/Transforms/LoopUnroll/peel-branch-weights-freq.ll
rename to llvm/test/Transforms/LoopUnroll/branch-weights-freq/peel.ll
diff --git a/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-epilog.ll b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-epilog.ll
new file mode 100644
index 0000000000000..96b31d801c2f9
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-epilog.ll
@@ -0,0 +1,160 @@
+; Test branch weight metadata, estimated trip count metadata, and block
+; frequencies after loop unrolling with an epilogue.
+
+; ------------------------------------------------------------------------------
+; Define substitutions.
+;
+; Check original loop body frequency.
+; DEFINE: %{bf-fc} = opt %s -S -passes='print<block-freq>' 2>&1 | \
+; DEFINE:   FileCheck %s -check-prefixes
+;
+; Unroll loops and then check block frequency.  The -implicit-check-not options
+; make sure that no additional labels or @f calls show up.
+; DEFINE: %{ur-bf} = opt %s -S -passes='loop-unroll,print<block-freq>' 2>&1
+; DEFINE: %{fc} = FileCheck %s \
+; DEFINE:     -implicit-check-not='{{^( *- )?[^ ;]*:}}' \
+; DEFINE:     -implicit-check-not='call void @f' -check-prefixes
+
+; ------------------------------------------------------------------------------
+; Check various interesting unroll count values relative to the original loop's
+; estimated trip count of 11 (e.g., minimum and boundary values).
+;
+; RUN: %{bf-fc} ALL,ORIG
+; RUN: %{ur-bf} -unroll-count=2 -unroll-runtime | %{fc} ALL,UR,UR2
+; RUN: %{ur-bf} -unroll-count=4 -unroll-runtime | %{fc} ALL,UR,UR4
+; RUN: %{ur-bf} -unroll-count=10 -unroll-runtime | %{fc} ALL,UR,UR10
+; RUN: %{ur-bf} -unroll-count=11 -unroll-runtime | %{fc} ALL,UR,UR11
+; RUN: %{ur-bf} -unroll-count=12 -unroll-runtime | %{fc} ALL,UR,UR12
+
+; ------------------------------------------------------------------------------
+; Check the iteration frequencies, which, when each is multiplied by the number
+; of original loop bodies that execute within it, should sum to almost exactly
+; the original loop body frequency.
+;
+; ALL-LABEL: block-frequency-info: test
+;
+;      ORIG: - [[ENTRY:.*]]:
+;      ORIG: - [[DO_BODY:.*]]: float = 11.0,
+;      ORIG: - [[DO_END:.*]]:
+;
+;        UR: - [[ENTRY:.*]]:
+;        UR: - [[ENTRY_NEW:.*]]:
+;       UR2: - [[DO_BODY:.*]]: float = 5.2381,
+;       UR4: - [[DO_BODY:.*]]: float = 2.3702,
+;      UR10: - [[DO_BODY:.*]]: float = 0.6902,
+;      UR11: - [[DO_BODY:.*]]: float = 0.59359,
+;      UR12: - [[DO_BODY:.*]]: float = 0.5144,
+;        UR: - [[DO_END_UNR_LCSSA:.*]]:
+;        UR: - [[DO_BODY_EPIL_PREHEADER:.*]]:
+;       UR2: - [[DO_BODY_EPIL:.*]]: float = 0.52381,
+;       UR4: - [[DO_BODY_EPIL:.*]]: float = 1.5193,
+;      UR10: - [[DO_BODY_EPIL:.*]]: float = 4.098,
+;      UR11: - [[DO_BODY_EPIL:.*]]: float = 4.4705,
+;      UR12: - [[DO_BODY_EPIL:.*]]: float = 4.8272,
+;       UR4: - [[DO_END_EPILOG_LCSSA:.*]]:
+;      UR10: - [[DO_END_EPILOG_LCSSA:.*]]:
+;      UR11: - [[DO_END_EPILOG_LCSSA:.*]]:
+;      UR12: - [[DO_END_EPILOG_LCSSA:.*]]:
+;        UR: - [[DO_END:.*]]:
+
+; ------------------------------------------------------------------------------
+; Check the CFGs, including the number of original loop bodies that appear
+; within each unrolled iteration.
+;
+;      UR-LABEL: define void @test(i32 %{{.*}}) {
+;            UR: [[ENTRY]]:
+;            UR:   br i1 %{{.*}}, label %[[DO_BODY_EPIL_PREHEADER]], label %[[ENTRY_NEW]], !prof ![[#PROF_UR_GUARD:]]{{$}}
+;            UR: [[ENTRY_NEW]]:
+;            UR:   br label %[[DO_BODY]]
+;            UR: [[DO_BODY]]:
+;   UR2-COUNT-2:   call void @f
+;   UR4-COUNT-4:   call void @f
+; UR10-COUNT-10:   call void @f
+; UR11-COUNT-11:   call void @f
+; UR12-COUNT-12:   call void @f
+;            UR:   br i1 %{{.*}}, label %[[DO_END_UNR_LCSSA]], label %[[DO_BODY]], !prof ![[#PROF_UR_LATCH:]], !llvm.loop ![[#LOOP_UR_LATCH:]]{{$}}
+;            UR: [[DO_END_UNR_LCSSA]]:
+;            UR:   br i1 %{{.*}}, label %[[DO_BODY_EPIL_PREHEADER]], label %[[DO_END:.*]], !prof ![[#PROF_RM_GUARD:]]{{$}}
+;            UR: [[DO_BODY_EPIL_PREHEADER]]:
+;            UR:   br label %[[DO_BODY_EPIL]]
+;            UR: [[DO_BODY_EPIL]]:
+;            UR:   call void @f
+;           UR4:   br i1 %{{.*}}, label %[[DO_BODY_EPIL]], label %[[DO_END_EPILOG_LCSSA]], !prof ![[#PROF_RM_LATCH:]], !llvm.loop ![[#LOOP_RM_LATCH:]]{{$}}
+;          UR10:   br i1 %{{.*}}, label %[[DO_BODY_EPIL]], label %[[DO_END_EPILOG_LCSSA]], !prof ![[#PROF_RM_LATCH:]], !llvm.loop ![[#LOOP_RM_LATCH:]]{{$}}
+;          UR11:   br i1 %{{.*}}, label %[[DO_BODY_EPIL]], label %[[DO_END_EPILOG_LCSSA]], !prof ![[#PROF_RM_LATCH:]], !llvm.loop ![[#LOOP_RM_LATCH:]]{{$}}
+;          UR12:   br i1 %{{.*}}, label %[[DO_BODY_EPIL]], label %[[DO_END_EPILOG_LCSSA]], !prof ![[#PROF_RM_LATCH:]], !llvm.loop ![[#LOOP_RM_LATCH:]]{{$}}
+;           UR4: [[DO_END_EPILOG_LCSSA]]:
+;          UR10: [[DO_END_EPILOG_LCSSA]]:
+;          UR11: [[DO_END_EPILOG_LCSSA]]:
+;          UR12: [[DO_END_EPILOG_LCSSA]]:
+;            UR:   br label %[[DO_END]]
+;            UR: [[DO_END]]:
+;            UR:   ret void
+
+declare void @f(i32)
+
+define void @test(i32 %n) {
+entry:
+  br label %do.body
+
+do.body:
+  %i = phi i32 [ 0, %entry ], [ %inc, %do.body ]
+  %inc = add i32 %i, 1
+  call void @f(i32 %i)
+  %c = icmp sge i32 %inc, %n
+  br i1 %c, label %do.end, label %do.body, !prof !0
+
+do.end:
+  ret void
+}
+
+!0 = !{!"branch_weights", i32 1, i32 10}
+
+; ------------------------------------------------------------------------------
+; Check branch weight metadata and estimated trip count metadata.
+;
+;  UR2: ![[#PROF_UR_GUARD]] = !{!"branch_weights", i32 195225786, i32 1952257862}
+;  UR4: ![[#PROF_UR_GUARD]] = !{!"branch_weights", i32 534047398, i32 1613436250}
+; UR10: ![[#PROF_UR_GUARD]] = !{!"branch_weights", i32 1236740947, i32 910742701}
+; UR11: ![[#PROF_UR_GUARD]] = !{!"branch_weights", i32 1319535738, i32 827947910}
+; UR12: ![[#PROF_UR_GUARD]] = !{!"branch_weights", i32 1394803730, i32 752679918}
+;
+;  UR2: ![[#PROF_UR_LATCH]] = !{!"branch_weights", i32 372703773, i32 1774779875}
+;  UR4: ![[#PROF_UR_LATCH]] = !{!"branch_weights", i32 680723421, i32 1466760227}
+; UR10: ![[#PROF_UR_LATCH]] = !{!"branch_weights", i32 1319535738, i32 827947910}
+; UR11: ![[#PROF_UR_LATCH]] = !{!"branch_weights", i32 1394803730, i32 752679918}
+; UR12: ![[#PROF_UR_LATCH]] = !{!"branch_weights", i32 1463229177, i32 684254471}
+;
+;  UR2: ![[#LOOP_UR_LATCH]] = distinct !{![[#LOOP_UR_LATCH]], ![[#LOOP_UR_TC:]], ![[#DISABLE:]]}
+;  UR4: ![[#LOOP_UR_LATCH]] = distinct !{![[#LOOP_UR_LATCH]], ![[#LOOP_UR_TC:]], ![[#DISABLE:]]}
+; UR10: ![[#LOOP_UR_LATCH]] = distinct !{![[#LOOP_UR_LATCH]], ![[#LOOP_UR_TC:]], ![[#DISABLE:]]}
+; UR11: ![[#LOOP_UR_LATCH]] = distinct !{![[#LOOP_UR_LATCH]], ![[#LOOP_UR_TC:]], ![[#DISABLE:]]}
+; UR12: ![[#LOOP_UR_LATCH]] = distinct !{![[#LOOP_UR_LATCH]], ![[#LOOP_UR_TC:]], ![[#DISABLE:]]}
+;
+;  UR2: ![[#LOOP_UR_TC]] = !{!"llvm.loop.estimated_trip_count", i32 5}
+;  UR4: ![[#LOOP_UR_TC]] = !{!"llvm.loop.estimated_trip_count", i32 2}
+; UR10: ![[#LOOP_UR_TC]] = !{!"llvm.loop.estimated_trip_count", i32 1}
+; UR11: ![[#LOOP_UR_TC]] = !{!"llvm.loop.estimated_trip_count", i32 1}
+; UR12: ![[#LOOP_UR_TC]] = !{!"llvm.loop.estimated_trip_count", i32 0}
+;   UR: ![[#DISABLE]] = !{!"llvm.loop.unroll.disable"}
+;
+;  UR2: ![[#PROF_RM_GUARD]] = !{!"branch_weights", i32 1022611260, i32 1124872388}
+;  UR4: ![[#PROF_RM_GUARD]] = !{!"branch_weights", i32 1531603292, i32 615880356}
+; UR10: ![[#PROF_RM_GUARD]] = !{!"branch_weights", i32 1829762672, i32 317720976}
+; UR11: ![[#PROF_RM_GUARD]] = !{!"branch_weights", i32 1846907894, i32 300575754}
+; UR12: ![[#PROF_RM_GUARD]] = !{!"branch_weights", i32 1860963812, i32 286519836}
+;
+;  UR4: ![[#PROF_RM_LATCH]] = !{!"branch_weights", i32 1038564635, i32 1108919013}
+; UR10: ![[#PROF_RM_LATCH]] = !{!"branch_weights", i32 1656332913, i32 491150735}
+; UR11: ![[#PROF_RM_LATCH]] = !{!"branch_weights", i32 1693034047, i32 454449601}
+; UR12: ![[#PROF_RM_LATCH]] = !{!"branch_weights", i32 1723419551, i32 424064097}
+
+;  UR4: ![[#LOOP_RM_LATCH]] = distinct !{![[#LOOP_RM_LATCH]], ![[#LOOP_RM_TC:]], ![[#DISABLE:]]}
+; UR10: ![[#LOOP_RM_LATCH]] = distinct !{![[#LOOP_RM_LATCH]], ![[#LOOP_UR_TC:]], ![[#DISABLE:]]}
+; UR11: ![[#LOOP_RM_LATCH]] = distinct !{![[#LOOP_RM_LATCH]], ![[#LOOP_RM_TC:]], ![[#DISABLE:]]}
+; UR12: ![[#LOOP_RM_LATCH]] = distinct !{![[#LOOP_RM_LATCH]], ![[#LOOP_RM_TC:]], ![[#DISABLE:]]}
+;
+;  UR4: ![[#LOOP_RM_TC]] = !{!"llvm.loop.estimated_trip_count", i32 3}
+; For UR10, llvm.loop.estimated_trip_count is the same for both loops.
+; UR11: ![[#LOOP_RM_TC]] = !{!"llvm.loop.estimated_trip_count", i32 0}
+; UR12: ![[#LOOP_RM_TC]] = !{!"llvm.loop.estimated_trip_count", i32 11}
diff --git a/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial.ll b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial.ll
new file mode 100644
index 0000000000000..cde9d46ee8421
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial.ll
@@ -0,0 +1,68 @@
+; Test branch weight metadata, estimated trip count metadata, and block
+; frequencies after partial loop unrolling without -unroll-runtime.
+
+; RUN: opt < %s -S -passes='print<block-freq>' 2>&1 | \
+; RUN:   FileCheck -check-prefix=CHECK %s
+
+; The -implicit-check-not options make sure that no additional labels or calls
+; to @f show up.
+; RUN: opt < %s -S -passes='loop-unroll,print<block-freq>' \
+; RUN:     -unroll-count=4 2>&1 | \
+; RUN:   FileCheck %s -check-prefix=CHECK-UR \
+; RUN:       -implicit-check-not='{{^( *- )?[^ ;]*:}}' \
+; RUN:       -implicit-check-not='call void @f'
+
+; CHECK: block-frequency-info: test
+; CHECK: do.body: float = 10.0,
+
+; The sum should still be ~10.
+;
+; CHECK-UR: block-frequency-info: test
+; CHECK-UR: - [[ENTRY:.*]]:
+; CHECK-UR: - [[DO_BODY:.*]]: float = 2.9078,
+; CHECK-UR: - [[DO_BODY_1:.*]]: float = 2.617,
+; CHECK-UR: - [[DO_BODY_2:.*]]: float = 2.3553,
+; CHECK-UR: - [[DO_BODY_3:.*]]: float = 2.1198,
+; CHECK-UR: - [[DO_END:.*]]:
+
+declare void @f(i32)
+
+define void @test(i32 %n) {
+; CHECK-UR-LABEL: define void @test(i32 %{{.*}}) {
+;       CHECK-UR: [[ENTRY]]:
+;       CHECK-UR:   br label %[[DO_BODY]]
+;       CHECK-UR: [[DO_BODY]]:
+;       CHECK-UR:   call void @f
+;       CHECK-UR:   br i1 %{{.*}}, label %[[DO_END]], label %[[DO_BODY_1]], !prof ![[#PROF:]]
+;       CHECK-UR: [[DO_BODY_1]]:
+;       CHECK-UR:   call void @f
+;       CHECK-UR:   br i1 %{{.*}}, label %[[DO_END]], label %[[DO_BODY_2]], !prof ![[#PROF]]
+;       CHECK-UR: [[DO_BODY_2]]:
+;       CHECK-UR:   call void @f
+;       CHECK-UR:   br i1 %{{.*}}, label %[[DO_END]], label %[[DO_BODY_3]], !prof ![[#PROF]]
+;       CHECK-UR: [[DO_BODY_3]]:
+;       CHECK-UR:   call void @f
+;       CHECK-UR:   br i1 %{{.*}}, label %[[DO_END]], label %[[DO_BODY]], !prof ![[#PROF]], !llvm.loop ![[#LOOP_UR_LATCH:]]
+;       CHECK-UR: [[DO_END]]:
+;       CHECK-UR:   ret void
+
+entry:
+  br label %do.body
+
+do.body:
+  %i = phi i32 [ 0, %entry ], [ %inc, %do.body ]
+  %inc = add i32 %i, 1
+  call void @f(i32 %i)
+  %c = icmp sge i32 %inc, %n
+  br i1 %c, label %do.end, label %do.body, !prof !0
+
+do.end:
+  ret void
+}
+
+!0 = !{!"branch_weights", i32 1, i32 9}
+
+; CHECK-UR: ![[#PROF]] = !{!"branch_weights", i32 1, i32 9}
+; CHECK-UR: ![[#LOOP_UR_LATCH]] = distinct !{![[#LOOP_UR_LATCH]], ![[#LOOP_UR_TC:]], ![[#DISABLE:]]}
+; CHECK-UR: ![[#LOOP_UR_TC]] = !{!"llvm.loop.estimated_trip_count", i32 3}
+; CHECK-UR: ![[#DISABLE]] = !{!"llvm.loop.unroll.disable"}
diff --git a/llvm/test/Transforms/LoopUnroll/followup.ll b/llvm/test/Transforms/LoopUnroll/followup.ll
index 051e43d52b3be..9dda76e70efac 100644
--- a/llvm/test/Transforms/LoopUnroll/followup.ll
+++ b/llvm/test/Transforms/LoopUnroll/followup.ll
@@ -1,9 +1,20 @@
-; RUN: opt < %s -S -passes=loop-unroll -unroll-count=2 | FileCheck %s -check-prefixes=COUNT,COMMON
-; RUN: opt < %s -S -passes=loop-unroll -unroll-runtime=true -unroll-runtime-epilog=true  | FileCheck %s -check-prefixes=EPILOG,COMMON
-; RUN: opt < %s -S -passes=loop-unroll -unroll-runtime=true -unroll-runtime-epilog=false | FileCheck %s -check-prefixes=PROLOG,COMMON
-;
-; Check that followup-attributes are applied after LoopUnroll.
+; Check that followup attributes are applied after LoopUnroll.
 ;
+; We choose -unroll-count=3 because it produces partial unrolling of remainder
+; loops.  Complete unrolling would leave no remainder loop to which to copy
+; followup attributes.
+
+; DEFINE: %{unroll} = opt < %s -S -passes=loop-unroll -unroll-count=3
+; DEFINE: %{epilog} = %{unroll} -unroll-runtime -unroll-runtime-epilog=true
+; DEFINE: %{prolog} = %{unroll} -unroll-runtime -unroll-runtime-epilog=false
+; DEFINE: %{fc} = FileCheck %s -check-prefixes
+
+; RUN: %{unroll} | %{fc} COMMON,COUNT
+; RUN: %{epilog} | %{fc} COMMON,EPILOG,EPILOG-NO-UNROLL
+; RUN: %{prolog} | %{fc} COMMON,PROLOG,PROLOG-NO-UNROLL
+; RUN: %{epilog} -unroll-remainder | %{fc} COMMON,EPILOG,EPILOG-UNROLL
+; RUN: %{prolog} -unroll-remainder | %{fc} COMMON,PROLOG,PROLOG-UNROLL
+
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 define i32 @test(ptr nocapture %a, i32 %n) nounwind uwtable readonly {
@@ -36,15 +47,17 @@ for.end:                                          ; preds = %for.body, %entry
 ; COMMON-LABEL: @test(
 
 
-; COUNT: br i1 %exitcond.1, label %for.end.loopexit, label %for.body, !llvm.loop ![[LOOP:[0-9]+]]
+; COUNT: br i1 %exitcond.2, label %for.end.loopexit, label %for.body, !llvm.loop ![[LOOP:[0-9]+]]
 
 ; COUNT: ![[FOLLOWUP_ALL:[0-9]+]] = !{!"FollowupAll"}
 ; COUNT: ![[FOLLOWUP_UNROLLED:[0-9]+]] = !{!"FollowupUnrolled"}
 ; COUNT: ![[LOOP]] = distinct !{![[LOOP]], ![[FOLLOWUP_ALL]], ![[FOLLOWUP_UNROLLED]]}
 
 
-; EPILOG: br i1 %niter.ncmp.7, label %for.end.loopexit.unr-lcssa, label %for.body, !llvm.loop ![[LOOP_0:[0-9]+]]
-; EPILOG: br i1 %epil.iter.cmp, label %for.body.epil, label %for.end.loopexit.epilog-lcssa, !llvm.loop ![[LOOP_2:[0-9]+]]
+; EPILOG: br i1 %niter.ncmp.2, label %for.end.loopexit.unr-lcssa, label %for.body, !llvm.loop ![[LOOP_0:[0-9]+]]
+; EPILOG-NO-UNROLL: br i1 %epil.iter.cmp, label %for.body.epil, label %for.end.loopexit.epilog-lcssa, !llvm.loop ![[LOOP_2:[0-9]+]]
+; EPILOG-UNROLL: br i1 %epil.iter.cmp, label %for.body.epil.1, label %for.end.loopexit.epilog-lcssa
+; EPILOG-UNROLL: br i1 %epil.iter.cmp.1, label %for.body.epil, label %for.end.loopexit.epilog-lcssa, !llvm.loop ![[LOOP_2:[0-9]+]]
 
 ; EPILOG: ![[LOOP_0]] = distinct !{![[LOOP_0]], ![[FOLLOWUP_ALL:[0-9]+]], ![[FOLLOWUP_UNROLLED:[0-9]+]]}
 ; EPILOG: ![[FOLLOWUP_ALL]] = !{!"FollowupAll"}
@@ -53,8 +66,10 @@ for.end:                                          ; preds = %for.body, %entry
 ; EPILOG: ![[FOLLOWUP_REMAINDER]] = !{!"FollowupRemainder"}
 
 
-; PROLOG:  br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.prol.loopexit.unr-lcssa, !llvm.loop ![[LOOP_0:[0-9]+]]
-; PROLOG:  br i1 %exitcond.7, label %for.end.loopexit.unr-lcssa, label %for.body, !llvm.loop ![[LOOP_2:[0-9]+]]
+; PROLOG-UNROLL:  br i1 %prol.iter.cmp, label %for.body.prol.1, label %for.body.prol.loopexit.unr-lcssa
+; PROLOG-UNROLL:  br i1 %prol.iter.cmp.1, label %for.body.prol, label %for.body.prol.loopexit.unr-lcssa, !llvm.loop ![[LOOP_0:[0-9]+]]
+; PROLOG-NO-UNROLL:  br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.prol.loopexit.unr-lcssa, !llvm.loop ![[LOOP_0:[0-9]+]]
+; PROLOG:  br i1 %exitcond.2, label %for.end.loopexit.unr-lcssa, label %for.body, !llvm.loop ![[LOOP_2:[0-9]+]]
 
 ; PROLOG: ![[LOOP_0]] = distinct !{![[LOOP_0]], ![[FOLLOWUP_ALL:[0-9]+]], ![[FOLLOWUP_REMAINDER:[0-9]+]]}
 ; PROLOG: ![[FOLLOWUP_ALL]] = !{!"FollowupAll"}
diff --git a/llvm/test/Transforms/LoopUnroll/runtime-exit-phi-scev-invalidation.ll b/llvm/test/Transforms/LoopUnroll/runtime-exit-phi-scev-invalidation.ll
index 0c52b5a0edef8..047360178aa06 100644
--- a/llvm/test/Transforms/LoopUnroll/runtime-exit-phi-scev-invalidation.ll
+++ b/llvm/test/Transforms/LoopUnroll/runtime-exit-phi-scev-invalidation.ll
@@ -188,7 +188,7 @@ define void @pr56286(i64 %x, ptr %src, ptr %dst, ptr %ptr.src) !prof !0 {
 ; CHECK-NEXT:    [[L_1_LCSSA_UNR:%.*]] = phi i32 [ poison, [[OUTER_HEADER]] ], [ [[L_1_LCSSA_UNR_PH]], [[INNER_1_HEADER_PROL_LOOPEXIT_UNR_LCSSA]] ]
 ; CHECK-NEXT:    [[INNER_1_IV_UNR:%.*]] = phi i64 [ [[X]], [[OUTER_HEADER]] ], [ [[INNER_1_IV_UNR_PH]], [[INNER_1_HEADER_PROL_LOOPEXIT_UNR_LCSSA]] ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 7
-; CHECK-NEXT:    br i1 [[TMP4]], label [[OUTER_MIDDLE:%.*]], label [[OUTER_HEADER_NEW:%.*]], !prof [[PROF3]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[OUTER_MIDDLE:%.*]], label [[OUTER_HEADER_NEW:%.*]], !prof [[PROF6:![0-9]+]]
 ; CHECK:       outer.header.new:
 ; CHECK-NEXT:    br label [[INNER_1_HEADER:%.*]]
 ; CHECK:       inner.1.header:
@@ -232,7 +232,7 @@ define void @pr56286(i64 %x, ptr %src, ptr %dst, ptr %ptr.src) !prof !0 {
 ; CHECK-NEXT:    store i32 [[L_1_7]], ptr [[DST]], align 8
 ; CHECK-NEXT:    [[INNER_1_IV_NEXT_7]] = add i64 [[INNER_1_IV]], 8
 ; CHECK-NEXT:    [[CMP_2_7:%.*]] = icmp sgt i64 [[INNER_1_IV_NEXT_6]], 0
-; CHECK-NEXT:    br i1 [[CMP_2_7]], label [[OUTER_MIDDLE_UNR_LCSSA:%.*]], label [[INNER_1_HEADER]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[CMP_2_7]], label [[OUTER_MIDDLE_UNR_LCSSA:%.*]], label [[INNER_1_HEADER]], !prof [[PROF7:![0-9]+]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       outer.middle.unr-lcssa:
 ; CHECK-NEXT:    [[L_1_LCSSA_PH:%.*]] = phi i32 [ [[L_1_7]], [[INNER_1_LATCH_7]] ]
 ; CHECK-NEXT:    br label [[OUTER_MIDDLE]]
diff --git a/llvm/test/Transforms/LoopUnroll/runtime-loop-branchweight.ll b/llvm/test/Transforms/LoopUnroll/runtime-loop-branchweight.ll
index 26171990a2592..2f8f98d40e86f 100644
--- a/llvm/test/Transforms/LoopUnroll/runtime-loop-branchweight.ll
+++ b/llvm/test/Transforms/LoopUnroll/runtime-loop-branchweight.ll
@@ -2,12 +2,24 @@
 
 ;; Check that the remainder loop is properly assigned a branch weight for its latch branch.
 ; CHECK-LABEL: @test(
-; CHECK-LABEL: for.body:
-; CHECK: br i1 [[COND1:%.*]], label %for.end.loopexit.unr-lcssa, label %for.body, !prof ![[#PROF:]], !llvm.loop ![[#LOOP:]]
-; CHECK-LABEL: for.body.epil:
-; CHECK: br i1 [[COND2:%.*]], label  %for.body.epil, label %for.end.loopexit.epilog-lcssa, !prof ![[#PROF2:]], !llvm.loop ![[#LOOP2:]]
-; CHECK: ![[#PROF]] = !{!"branch_weights", i32 1, i32 2499}
-; CHECK: ![[#PROF2]] = !{!"branch_weights", i32 1, i32 1}
+; CHECK-LABEL: entry:
+;       CHECK: [[FOR_BODY_PREHEADER:.*]]:
+;       CHECK:   br i1 %{{.*}}, label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[FOR_BODY_PREHEADER_NEW:.*]], !prof ![[#PROF_UR_GUARD:]]
+;       CHECK: [[FOR_BODY_PREHEADER_NEW]]:
+;       CHECK:   br label %for.body
+;       CHECK: for.body:
+;       CHECK:   %add = add
+;       CHECK:   %add.1 = add
+;       CHECK:   %add.2 = add
+;       CHECK:   %add.3 = add
+;   CHECK-NOT:   %add.4 = add
+;       CHECK:   br i1 %{{.*}}, label %[[FOR_END_LOOPEXIT_UNR_LCSSA:.*]], label %for.body, !prof ![[#PROF_UR_LATCH:]], !llvm.loop ![[#LOOP_UR_LATCH:]]
+;       CHECK: [[FOR_END_LOOPEXIT_UNR_LCSSA]]:
+;       CHECK:   br i1 %{{.*}}, label %[[FOR_BODY_EPIL_PREHEADER]], label %[[FOR_END_LOOPEXIT:.*]], !prof ![[#PROF_RM_GUARD:]]
+;       CHECK: [[FOR_BODY_EPIL_PREHEADER]]:
+;       CHECK:   br label %[[FOR_BODY_EPIL:.*]]
+;       CHECK: [[FOR_BODY_EPIL]]:
+;       CHECK:   br i1 {{.*}}, label %[[FOR_BODY_EPIL]], label %[[FOR_END_LOOPEXIT_EPILOG_LCSSA:.*]], !prof ![[#PROF_RM_LATCH:]], !llvm.loop ![[#LOOP_RM_LATCH:]]
 
 define i3 @test(ptr %a, i3 %n) {
 entry:
@@ -31,3 +43,37 @@ for.end:
 }
 
 !0 = !{!"branch_weights", i32 1, i32 9999}
+
+; Original loop probability: p = 9999/(1+9999) = 0.9999
+; Original estimated trip count: (1+9999)/1 = 10000
+; Unroll count: 4
+
+; Probability of >=3 iterations after first: p^3 = 0.9970003 =~
+; 2146839468 / (644180 + 2146839468).
+; CHECK: ![[#PROF_UR_GUARD]] = !{!"branch_weights", i32 644180, i32 2146839468}
+
+; Probability of >=4 more iterations: p^4 = 0.99960006 =~
+; 2146624784 / (858864 + 2146624784).
+; CHECK: ![[#PROF_UR_LATCH]] = !{!"branch_weights", i32 858864, i32 2146624784}
+
+; 10000//4 = 2500
+; CHECK: ![[#LOOP_UR_LATCH]] = distinct !{![[#LOOP_UR_LATCH]], ![[#LOOP_UR_TC:]], ![[#DISABLE:]]}
+; CHECK: ![[#LOOP_UR_TC]] = !{!"llvm.loop.estimated_trip_count", i32 2500}
+;
+; CHECK: ![[#DISABLE]] = !{!"llvm.loop.unroll.disable"}
+
+; Probability of 1 to 3 more of 3 more remainder iterations:
+; (p-p^4)/(1-p^4) = 0.749962497 =~ 1610532724 / (1610532724 + 536950924).
+; CHECK: ![[#PROF_RM_GUARD]] = !{!"branch_weights", i32 1610532724, i32 536950924}
+
+; Frequency of first remainder iter:  r1 =                      1
+; Frequency of second remainder iter: r2 = r1*(p-p^3)/(1-p^3) = 0.666633331
+; Frequency of third remainder iter:  r3 = r2*(p-p^2)/(1-p^2) = 0.333299999
+; Solve for loop probability that produces that frequency: f = 1/(1-p') =>
+; p' = 1-1/f = 1-1/(r1+r2+r3) = 0.499983332 =~
+; 1073706403 / (1073706403 + 1073777245).
+; CHECK: ![[#PROF_RM_LATCH]] = !{!"branch_weights", i32 1073706403, i32 1073777245}
+
+; 10000%4 = 0
+; CHECK: ![[#LOOP_RM_LATCH]] = distinct !{![[#LOOP_RM_LATCH]], ![[#LOOP_RM_TC:]], ![[#DISABLE:]]}
+; CHECK: ![[#LOOP_RM_TC]] = !{!"llvm.loop.estimated_trip_count", i32 0}
diff --git a/llvm/test/Transforms/LoopUnroll/runtime-loop.ll b/llvm/test/Transforms/LoopUnroll/runtime-loop.ll
index 492de063573be..ec7aba432b484 100644
--- a/llvm/test/Transforms/LoopUnroll/runtime-loop.ll
+++ b/llvm/test/Transforms/LoopUnroll/runtime-loop.ll
@@ -295,11 +295,12 @@ exit2.loopexit:
 ; COMMON-LABEL: {{^}}!0 =
 
 ; EPILOG: [[EPILOG_PROF_0]] = !{!"branch_weights", i32 1, i32 11}
-; EPILOG: [[EPILOG_PROF_1]] = !{!"branch_weights", i32 1, i32 127}
-; EPILOG: [[EPILOG_PROF_2]] = !{!"branch_weights", i32 1, i32 7}
-; EPILOG: [[EPILOG_PROF_3]] = !{!"branch_weights", i32 3, i32 1}
+; EPILOG: [[EPILOG_PROF_1]] = !{!"branch_weights", i32 326124004, i32 1821359644}
+; EPILOG: [[EPILOG_PROF_2]] = !{!"branch_weights", i32 1856428066, i32 291055582}
+; EPILOG: [[EPILOG_PROF_3]] = !{!"branch_weights", i32 1597681585, i32 549802063}
 
-; EPILOG: [[EPILOG_LOOP]] = distinct !{[[EPILOG_LOOP]], [[EPILOG_LOOP_1:![0-9]+]]}
+; EPILOG: [[EPILOG_LOOP]] = distinct !{[[EPILOG_LOOP]], [[EPILOG_TC:![0-9]+]], [[EPILOG_LOOP_1:![0-9]+]]}
+; EPILOG: [[EPILOG_TC]] = !{!"llvm.loop.estimated_trip_count", i32 3}
 ; EPILOG: [[EPILOG_LOOP_1]] = !{!"llvm.loop.unroll.disable"}
 
 ; PROLOG: [[PROLOG_PROF_0]] = !{!"branch_weights", i32 1, i32 11}
diff --git a/llvm/test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll b/llvm/test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll
index 611ee5fb5807e..02f5bf932132e 100644
--- a/llvm/test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll
+++ b/llvm/test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll
@@ -3,14 +3,27 @@
 @known_constant = internal unnamed_addr constant [9 x i32] [i32 0, i32 -1, i32 0, i32 -1, i32 5, i32 -1, i32 0, i32 -1, i32 0], align 16
 
 ; CHECK-LABEL: @bar_prof
-; CHECK: loop:
-; CHECK:   %mul = mul
-; CHECK:   %mul.1 = mul
-; CHECK:   %mul.2 = mul
-; CHECK:   %mul.3 = mul
-; CHECK:   br i1 %niter.ncmp.7, label %loop.end.unr-lcssa, label %loop, !prof [[PROF0:![0-9]+]]
-; CHECK: loop.epil:
-; CHECK:   br i1 %epil.iter.cmp, label %loop.epil, label %loop.end.epilog-lcssa, !prof [[PROF1:![0-9]+]], !llvm.loop {{![0-9]+}}
+;       CHECK: entry:
+;       CHECK:   br i1 %{{.*}}, label %[[LOOP_EPIL_PREHEADER:.*]], label %[[ENTRY_NEW:.*]], !prof ![[#PROF_UR_GUARD:]]
+;       CHECK: [[ENTRY_NEW]]:
+;       CHECK:   br label %loop
+;       CHECK: loop:
+;       CHECK:   %mul = mul
+;       CHECK:   %mul.1 = mul
+;       CHECK:   %mul.2 = mul
+;       CHECK:   %mul.3 = mul
+;       CHECK:   %mul.4 = mul
+;       CHECK:   %mul.5 = mul
+;       CHECK:   %mul.6 = mul
+;       CHECK:   %mul.7 = mul
+;   CHECK-NOT:   %mul.8 = mul
+;       CHECK:   br i1 %{{.*}}, label %[[LOOP_END_UNR_LCSSA:.*]], label %loop, !prof ![[#PROF_UR_LATCH:]], !llvm.loop ![[#LOOP_UR_LATCH:]]
+;       CHECK: [[LOOP_END_UNR_LCSSA]]:
+;       CHECK:   br i1 %{{.*}}, label %[[LOOP_EPIL_PREHEADER]], label %loop.end, !prof ![[#PROF_RM_GUARD:]]
+;       CHECK: [[LOOP_EPIL_PREHEADER]]:
+;       CHECK:   br label %[[LOOP_EPIL:.*]]
+;       CHECK: [[LOOP_EPIL]]:
+;       CHECK:   br i1 %{{.*}}, label %[[LOOP_EPIL]], label %[[LOOP_END_EPILOG_LCSSA:.*]], !prof ![[#PROF_RM_LATCH:]], !llvm.loop ![[#LOOP_RM_LATCH:]]
 define i32 @bar_prof(ptr noalias nocapture readonly %src, i64 %c) !prof !1 {
 entry:
   br label %loop
@@ -60,5 +73,38 @@ loop.end:
 !1 = !{!"function_entry_count", i64 1}
 !2 = !{!"branch_weights", i32 1, i32 1000}
 
-; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 124}
-; CHECK: [[PROF1]] = !{!"branch_weights", i32 3, i32 1}
+; Original loop probability: p = 1000/(1+1000) = 0.999000999
+; Original estimated trip count: (1+1000)/1 = 1001
+; Unroll count: 8
+
+; Probability of >=7 iterations after first: p^7 = 0.993027916 =~
+; 2132511214 / (14972434 + 2132511214).
+; CHECK: ![[#PROF_UR_GUARD]] = !{!"branch_weights", i32 14972434, i32 2132511214}
+
+; Probability of >=8 more iterations: p^8 = 0.99203588 =~
+; 2130380833 / (17102815 + 2130380833).
+; CHECK: ![[#PROF_UR_LATCH]] = !{!"branch_weights", i32 17102815, i32 2130380833}
+
+; 1001//8 = 125
+; CHECK: ![[#LOOP_UR_LATCH]] = distinct !{![[#LOOP_UR_LATCH]], ![[#LOOP_UR_TC:]]}
+; CHECK: ![[#LOOP_UR_TC]] = !{!"llvm.loop.estimated_trip_count", i32 125}
+
+; Probability of 1 to 7 more of 7 more remainder iterations:
+; (p-p^8)/(1-p^8) = 0.874562282 =~ 1878108210 / (1878108210 + 269375438).
+; CHECK: ![[#PROF_RM_GUARD]] = !{!"branch_weights", i32 1878108210, i32 269375438}
+
+; Frequency of first remainder iter:   r1 =                      1
+; Frequency of second remainder iter:  r2 = r1*(p-p^7)/(1-p^7) = 0.856714143
+; Frequency of third remainder iter:   r3 = r2*(p-p^6)/(1-p^6) = 0.713571429
+; Frequency of fourth remainder iter:  r4 = r2*(p-p^5)/(1-p^5) = 0.570571715
+; Frequency of fifth remainder iter:   r5 = r2*(p-p^4)/(1-p^4) = 0.427714858
+; Frequency of sixth remainder iter:   r6 = r2*(p-p^3)/(1-p^3) = 0.285000715
+; Frequency of seventh remainder iter: r7 = r2*(p-p^2)/(1-p^2) = 0.142429143
+; Solve for loop probability that produces that frequency: f = 1/(1-p') =>
+; p' = 1-1/f = 1-1/(r1+r2+r3+r4+r5+r6+r7) = 0.749749875 =~
+; 1610075606 / (1610075606 + 537408042).
+; CHECK: ![[#PROF_RM_LATCH]] = !{!"branch_weights", i32 1610075606, i32 537408042}
+
+; Remainder estimated trip count: 1001%8 = 1
+; CHECK: ![[#LOOP_RM_LATCH]] = distinct !{![[#LOOP_RM_LATCH]], ![[#LOOP_RM_TC:]], ![[#DISABLE:]]}
+; CHECK: ![[#LOOP_RM_TC]] = !{!"llvm.loop.estimated_trip_count", i32 1}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
index bfee39eac0ae2..068f82c7db670 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
@@ -365,8 +365,8 @@ define void @invalid_legacy_cost(i64 %N, ptr %x) #0 {
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP6:%.*]] = alloca i8, i64 0, align 16
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x ptr> [[TMP7]], ptr [[TMP6]], i32 1
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP6]], i64 0
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT]], <2 x ptr> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr ptr, ptr [[X]], i64 [[INDEX]]
 ; CHECK-NEXT:    store <2 x ptr> [[TMP8]], ptr [[TMP9]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/licm-calls.ll b/llvm/test/Transforms/LoopVectorize/AArch64/licm-calls.ll
index ea0148952f51b..0a9494e4c7ade 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/licm-calls.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/licm-calls.ll
@@ -10,8 +10,8 @@ define void @licm_replicate_call(double %x, ptr %dst) {
 ; CHECK-NEXT:    br label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call double @llvm.pow.f64(double [[X]], double 3.000000e+00)
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP1]], i32 1
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP1]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll
index 157b78704234a..35589573eed76 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll
@@ -64,9 +64,9 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
 ; TFCOMMON-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
 ; TFCOMMON-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
 ; TFCOMMON-NEXT:    [[LD:%.*]] = load double, ptr [[P2:%.*]], align 8
-; TFCOMMON-NEXT:    [[TMP5:%.*]] = tail call double @llvm.exp.f64(double [[LD]]) #[[ATTR3:[0-9]+]]
-; TFCOMMON-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0
-; TFCOMMON-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[TMP5]], i32 1
+; TFCOMMON-NEXT:    [[TMP5:%.*]] = tail call double @llvm.exp.f64(double [[LD]]) #[[ATTR2:[0-9]+]]
+; TFCOMMON-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i64 0
+; TFCOMMON-NEXT:    [[TMP8:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
 ; TFCOMMON-NEXT:    [[TMP9:%.*]] = fcmp ogt <2 x double> [[TMP8]], zeroinitializer
 ; TFCOMMON-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP9]], <2 x double> zeroinitializer, <2 x double> splat (double 1.000000e+00)
 ; TFCOMMON-NEXT:    [[TMP16:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0
@@ -79,7 +79,7 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
 ; TFCOMMON-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 1
 ; TFCOMMON-NEXT:    br i1 [[TMP14]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE6]]
 ; TFCOMMON:       pred.store.if1:
-; TFCOMMON-NEXT:    [[TMP19:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 1
+; TFCOMMON-NEXT:    [[TMP19:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 0
 ; TFCOMMON-NEXT:    store double [[TMP19]], ptr [[P]], align 8
 ; TFCOMMON-NEXT:    br label [[PRED_STORE_CONTINUE6]]
 ; TFCOMMON:       pred.store.continue2:
@@ -105,9 +105,9 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
 ; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE9]] ]
 ; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK2:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT10:%.*]], [[PRED_STORE_CONTINUE9]] ]
 ; TFA_INTERLEAVE-NEXT:    [[TMP4:%.*]] = load double, ptr [[P2:%.*]], align 8
-; TFA_INTERLEAVE-NEXT:    [[TMP9:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR3:[0-9]+]]
-; TFA_INTERLEAVE-NEXT:    [[TMP11:%.*]] = insertelement <2 x double> poison, double [[TMP9]], i32 0
-; TFA_INTERLEAVE-NEXT:    [[TMP12:%.*]] = insertelement <2 x double> [[TMP11]], double [[TMP9]], i32 1
+; TFA_INTERLEAVE-NEXT:    [[TMP5:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR2:[0-9]+]]
+; TFA_INTERLEAVE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i64 0
+; TFA_INTERLEAVE-NEXT:    [[TMP12:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
 ; TFA_INTERLEAVE-NEXT:    [[TMP14:%.*]] = fcmp ogt <2 x double> [[TMP12]], zeroinitializer
 ; TFA_INTERLEAVE-NEXT:    [[PREDPHI3:%.*]] = select <2 x i1> [[TMP14]], <2 x double> zeroinitializer, <2 x double> splat (double 1.000000e+00)
 ; TFA_INTERLEAVE-NEXT:    [[TMP19:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0
@@ -120,7 +120,7 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
 ; TFA_INTERLEAVE-NEXT:    [[TMP29:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 1
 ; TFA_INTERLEAVE-NEXT:    br i1 [[TMP29]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]]
 ; TFA_INTERLEAVE:       pred.store.if3:
-; TFA_INTERLEAVE-NEXT:    [[TMP22:%.*]] = extractelement <2 x double> [[PREDPHI3]], i32 1
+; TFA_INTERLEAVE-NEXT:    [[TMP22:%.*]] = extractelement <2 x double> [[PREDPHI3]], i32 0
 ; TFA_INTERLEAVE-NEXT:    store double [[TMP22]], ptr [[P]], align 8
 ; TFA_INTERLEAVE-NEXT:    br label [[PRED_STORE_CONTINUE5]]
 ; TFA_INTERLEAVE:       pred.store.continue4:
@@ -134,7 +134,7 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
 ; TFA_INTERLEAVE-NEXT:    [[TMP25:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK2]], i32 1
 ; TFA_INTERLEAVE-NEXT:    br i1 [[TMP25]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9]]
 ; TFA_INTERLEAVE:       pred.store.if7:
-; TFA_INTERLEAVE-NEXT:    [[TMP34:%.*]] = extractelement <2 x double> [[PREDPHI3]], i32 1
+; TFA_INTERLEAVE-NEXT:    [[TMP34:%.*]] = extractelement <2 x double> [[PREDPHI3]], i32 0
 ; TFA_INTERLEAVE-NEXT:    store double [[TMP34]], ptr [[P]], align 8
 ; TFA_INTERLEAVE-NEXT:    br label [[PRED_STORE_CONTINUE9]]
 ; TFA_INTERLEAVE:       pred.store.continue8:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll
index 7f345133f51dd..68cfc659e1e94 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll
@@ -660,6 +660,114 @@ exit:
   ret i32 %red
 }
 
+
+define i32 @test_or_reduction_with_stride_2(i32 %scale, ptr %src) {
+; CHECK-LABEL: define i32 @test_or_reduction_with_stride_2(
+; CHECK-SAME: i32 [[SCALE:%.*]], ptr [[SRC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[SCALE]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP66:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 6
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 8
+; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 10
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 12
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 14
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 16
+; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 18
+; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 20
+; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 22
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 24
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 26
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 28
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 30
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP32:%.*]] = load i8, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = load i8, ptr [[TMP18]], align 1
+; CHECK-NEXT:    [[TMP35:%.*]] = load i8, ptr [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP36:%.*]] = load i8, ptr [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP37:%.*]] = load i8, ptr [[TMP21]], align 1
+; CHECK-NEXT:    [[TMP38:%.*]] = load i8, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP39:%.*]] = load i8, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP40:%.*]] = load i8, ptr [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP41:%.*]] = load i8, ptr [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP42:%.*]] = load i8, ptr [[TMP26]], align 1
+; CHECK-NEXT:    [[TMP43:%.*]] = load i8, ptr [[TMP27]], align 1
+; CHECK-NEXT:    [[TMP44:%.*]] = load i8, ptr [[TMP28]], align 1
+; CHECK-NEXT:    [[TMP45:%.*]] = load i8, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP46:%.*]] = load i8, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP47:%.*]] = load i8, ptr [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <16 x i8> poison, i8 [[TMP32]], i32 0
+; CHECK-NEXT:    [[TMP49:%.*]] = insertelement <16 x i8> [[TMP48]], i8 [[TMP33]], i32 1
+; CHECK-NEXT:    [[TMP50:%.*]] = insertelement <16 x i8> [[TMP49]], i8 [[TMP34]], i32 2
+; CHECK-NEXT:    [[TMP51:%.*]] = insertelement <16 x i8> [[TMP50]], i8 [[TMP35]], i32 3
+; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <16 x i8> [[TMP51]], i8 [[TMP36]], i32 4
+; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <16 x i8> [[TMP52]], i8 [[TMP37]], i32 5
+; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <16 x i8> [[TMP53]], i8 [[TMP38]], i32 6
+; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <16 x i8> [[TMP54]], i8 [[TMP39]], i32 7
+; CHECK-NEXT:    [[TMP56:%.*]] = insertelement <16 x i8> [[TMP55]], i8 [[TMP40]], i32 8
+; CHECK-NEXT:    [[TMP57:%.*]] = insertelement <16 x i8> [[TMP56]], i8 [[TMP41]], i32 9
+; CHECK-NEXT:    [[TMP58:%.*]] = insertelement <16 x i8> [[TMP57]], i8 [[TMP42]], i32 10
+; CHECK-NEXT:    [[TMP59:%.*]] = insertelement <16 x i8> [[TMP58]], i8 [[TMP43]], i32 11
+; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <16 x i8> [[TMP59]], i8 [[TMP44]], i32 12
+; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <16 x i8> [[TMP60]], i8 [[TMP45]], i32 13
+; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <16 x i8> [[TMP61]], i8 [[TMP46]], i32 14
+; CHECK-NEXT:    [[TMP63:%.*]] = insertelement <16 x i8> [[TMP62]], i8 [[TMP47]], i32 15
+; CHECK-NEXT:    [[TMP64:%.*]] = sext <16 x i8> [[TMP63]] to <16 x i32>
+; CHECK-NEXT:    [[TMP65:%.*]] = mul <16 x i32> [[BROADCAST_SPLAT]], [[TMP64]]
+; CHECK-NEXT:    [[TMP66]] = or <16 x i32> [[TMP65]], [[VEC_PHI]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT:    [[TMP67:%.*]] = icmp eq i64 [[INDEX_NEXT]], 48
+; CHECK-NEXT:    br i1 [[TMP67]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP68:%.*]] = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[TMP66]])
+; CHECK-NEXT:    br label %[[SCALAR_PH:.*]]
+; CHECK:       [[SCALAR_PH]]:
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+  %reduction = phi i32 [ %reduction.next, %loop ], [ 0, %entry ]
+  %gep = getelementptr [32 x i8], ptr %src, i64 %iv
+  %load = load i8, ptr %gep, align 1
+  %sext = sext i8 %load to i32
+  %mul = mul i32 %scale, %sext
+  %reduction.next = or i32 %mul, %reduction
+  %iv.next = add i64 %iv, 2
+  %cmp = icmp eq i64 %iv.next, 100
+  br i1 %cmp, label %exit, label %loop
+
+exit:
+  ret i32 %reduction.next
+}
+
 attributes #0 = { "target-cpu"="neoverse-512tvb" }
 
 !0 = !{!1, !2, i64 0}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
index 49f663f5703b6..62e248bed85d9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
@@ -1,12 +1,12 @@
 ; REQUIRES: asserts
-; RUN: opt -mattr=+neon,+dotprod -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization -epilogue-vectorization-force-VF=2 -disable-output %s 2>&1 | FileCheck %s
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -disable-output %s 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-none-unknown-elf"
 
 ; Tests for printing VPlans that are enabled under AArch64
 
-define i32 @print_partial_reduction(ptr %a, ptr %b) {
+define i32 @print_partial_reduction(ptr %a, ptr %b) "target-features"="+neon,+dotprod" {
 ; CHECK:      VPlan 'Initial VPlan for VF={8,16},UF>=1' {
 ; CHECK-NEXT: Live-in vp<[[VF:%.]]> = VF
 ; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF
@@ -69,60 +69,37 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
 ; CHECK: VPlan 'Final VPlan for VF={8,16},UF={1}' {
+; CHECK-NEXT: Live-in ir<1024> = vector-trip-count
 ; CHECK-NEXT: Live-in ir<1024> = original trip-count
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<entry>:
-; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.main.loop.iter.check>
+; CHECK-NEXT: Successor(s): vector.ph
 ; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<vector.main.loop.iter.check>:
-; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.ph>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<vector.ph>:
-; CHECK-NEXT:  EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<4>
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   EMIT vp<%1> = reduction-start-vector ir<0>, ir<0>, ir<4>
 ; CHECK-NEXT: Successor(s): vector.body
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.body:
-; CHECK-NEXT:   EMIT-SCALAR vp<[[EP_IV:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<%index.next>, vector.body ]
-; CHECK-NEXT:   WIDEN-REDUCTION-PHI ir<%accum> = phi vp<[[RDX_START]]>, ir<%add> (VF scaled by 1/4)
-; CHECK-NEXT:   CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[EP_IV]]>
+; CHECK-NEXT:   EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ]
+; CHECK-NEXT:   WIDEN-REDUCTION-PHI ir<%accum> = phi vp<%1>, ir<%add> (VF scaled by 1/4)
+; CHECK-NEXT:   CLONE ir<%gep.a> = getelementptr ir<%a>, vp<%index>
 ; CHECK-NEXT:   WIDEN ir<%load.a> = load ir<%gep.a>
-; CHECK-NEXT:   CLONE ir<%gep.b> = getelementptr ir<%b>, vp<[[EP_IV]]>
+; CHECK-NEXT:   CLONE ir<%gep.b> = getelementptr ir<%b>, vp<%index>
 ; CHECK-NEXT:   WIDEN ir<%load.b> = load ir<%gep.b>
 ; CHECK-NEXT:   WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32
 ; CHECK-NEXT:   WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32
 ; CHECK-NEXT:   WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a>
 ; CHECK-NEXT:   PARTIAL-REDUCE ir<%add> = add ir<%accum>, ir<%mul>
-; CHECK-NEXT:   EMIT vp<[[EP_IV_NEXT:%.+]]> = add nuw vp<[[EP_IV]]>, ir<16>
-; CHECK-NEXT:   EMIT branch-on-count vp<[[EP_IV_NEXT]]>, ir<1024>
+; CHECK-NEXT:   EMIT vp<%index.next> = add nuw vp<%index>, ir<16>
+; CHECK-NEXT:   EMIT branch-on-count vp<%index.next>, ir<1024>
 ; CHECK-NEXT: Successor(s): middle.block, vector.body
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
-; CHECK-NEXT:   EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<%accum>, ir<%add>
-; CHECK-NEXT:   EMIT branch-on-cond ir<true>
-; CHECK-NEXT: Successor(s): ir-bb<exit>, ir-bb<scalar.ph>
+; CHECK-NEXT:   EMIT vp<%3> = compute-reduction-result ir<%accum>, ir<%add>
+; CHECK-NEXT: Successor(s): ir-bb<exit>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<exit>:
-; CHECK-NEXT:   IR   %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[RED_RESULT]]> from middle.block)
-; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<scalar.ph>:
-; CHECK-NEXT:   EMIT-SCALAR vp<[[EP_RESUME:%.+]]> = phi [ ir<1024>, middle.block ], [ ir<0>, ir-bb<entry> ]
-; CHECK-NEXT:   EMIT-SCALAR vp<[[EP_MERGE:%.+]]> = phi [ vp<[[RED_RESULT]]>, middle.block ], [ ir<0>, ir-bb<entry> ]
-; CHECK-NEXT:   EMIT-SCALAR vp<%6> = resume-for-epilogue vp<%vec.epilog.resume.val>
-; CHECK-NEXT: Successor(s): ir-bb<for.body>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<for.body>:
-; CHECK-NEXT:   IR   %accum = phi i32 [ 0, %scalar.ph ], [ %add, %for.body ] (extra operand: vp<[[EP_MERGE]]> from ir-bb<scalar.ph>)
-; CHECK-NEXT:   IR   %gep.a = getelementptr i8, ptr %a, i64 %iv
-; CHECK-NEXT:   IR   %load.a = load i8, ptr %gep.a, align 1
-; CHECK-NEXT:   IR   %ext.a = zext i8 %load.a to i32
-; CHECK-NEXT:   IR   %gep.b = getelementptr i8, ptr %b, i64 %iv
-; CHECK-NEXT:   IR   %load.b = load i8, ptr %gep.b, align 1
-; CHECK-NEXT:   IR   %ext.b = zext i8 %load.b to i32
-; CHECK-NEXT:   IR   %mul = mul i32 %ext.b, %ext.a
-; CHECK-NEXT:   IR   %add = add i32 %mul, %accum
-; CHECK-NEXT:   IR   %iv.next = add i64 %iv, 1
-; CHECK-NEXT:   IR   %exitcond.not = icmp eq i64 %iv.next, 1024
+; CHECK-NEXT:   IR   %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<%3> from middle.block)
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
 entry:
@@ -141,8 +118,12 @@ for.body:                                         ; preds = %for.body, %entry
   %add = add i32 %mul, %accum
   %iv.next = add i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, 1024
-  br i1 %exitcond.not, label %exit, label %for.body
+  br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !0
 
 exit:
   ret i32 %add
 }
+
+!0 = distinct !{!0, !2, !3}
+!2 = !{!"llvm.loop.interleave.count", i32 1}
+!3 = !{!"llvm.loop.vectorize.predicate.enable", i1 false}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll b/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll
index 03087bb883464..4590dfc5326b5 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll
@@ -199,10 +199,8 @@ define float @uniform_load_replicating_select(ptr %A, ptr %B, i64 %1) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 7
 ; CHECK-NEXT:    [[TMP6:%.*]] = load float, ptr [[A]], align 4
 ; CHECK-NEXT:    [[TMP10:%.*]] = fcmp ogt float [[TMP6]], 0.000000e+00
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i1> poison, i1 [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i1> [[TMP8]], i1 [[TMP10]], i32 1
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i1> [[TMP9]], i1 [[TMP10]], i32 2
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x i1> [[TMP13]], i1 [[TMP10]], i32 3
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP10]], i64 0
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/scev-checks-unprofitable.ll b/llvm/test/Transforms/LoopVectorize/X86/scev-checks-unprofitable.ll
index 905e67b8723f9..7e6b5e932b6c6 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/scev-checks-unprofitable.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/scev-checks-unprofitable.ll
@@ -17,13 +17,12 @@ define void @value_defined_in_loop1_used_for_trip_counts(i32 %start, i1 %c, ptr
 ; CHECK-NEXT:    [[IV_1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[ZEXT]], %[[LOOP_1]] ]
 ; CHECK-NEXT:    br i1 false, label %[[LOOP_1_EXIT:.*]], label %[[LOOP_1]]
 ; CHECK:       [[LOOP_1_EXIT]]:
-; CHECK-NEXT:    [[IV_1_LCSSA2:%.*]] = phi i64 [ [[IV_1]], %[[LOOP_1]] ]
 ; CHECK-NEXT:    [[IV_1_LCSSA:%.*]] = phi i64 [ [[IV_1]], %[[LOOP_1]] ]
 ; CHECK-NEXT:    br i1 [[C]], label %[[LOOP_2_PREHEADER:.*]], label %[[LOOP_3_PREHEADER:.*]]
 ; CHECK:       [[LOOP_3_PREHEADER]]:
 ; CHECK-NEXT:    br label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
-; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[IV_1_LCSSA2]], 1
+; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[IV_1]], 1
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll b/llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll
index d4004daf8833c..8081c0e17f865 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll
@@ -64,39 +64,24 @@ exit:
 define void @uniform_load_can_fold_users(ptr noalias %src, ptr %dst, i64 %start, double %d) {
 ; CHECK-LABEL: define void @uniform_load_can_fold_users(
 ; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr [[DST:%.*]], i64 [[START:%.*]], double [[D:%.*]]) {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[START]], 1
-; CHECK-NEXT:    [[SMIN:%.*]] = call i64 @llvm.smin.i64(i64 [[START]], i64 0)
-; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 [[TMP0]], [[SMIN]]
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 2
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]]
-; CHECK:       [[VECTOR_PH]]:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 2
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[START]], [[N_VEC]]
-; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
-; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_2:%.*]] = phi i64 [ [[START]], %[[ENTRY]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = load double, ptr [[SRC]], align 8
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = fmul <2 x double> [[BROADCAST_SPLAT]], splat (double 9.000000e+00)
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul double [[TMP5]], 9.000000e+00
 ; CHECK-NEXT:    [[TMP8:%.*]] = fdiv double [[TMP7]], [[D]]
-; CHECK-NEXT:    [[TMP9:%.*]] = sub i64 [[TMP3]], 1
 ; CHECK-NEXT:    [[TMP10:%.*]] = sub i64 [[TMP4]], 1
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr double, ptr [[TMP11]], i64 [[TMP9]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr double, ptr [[TMP12]], i64 [[TMP10]]
-; CHECK-NEXT:    store double [[TMP8]], ptr [[TMP13]], align 8
 ; CHECK-NEXT:    store double [[TMP8]], ptr [[TMP14]], align 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[IV_1_NEXT]] = add i64 [[TMP4]], 1
+; CHECK-NEXT:    [[IV_2_NEXT]] = add i64 [[IV_2]], -1
+; CHECK-NEXT:    [[EC:%.*]] = icmp sgt i64 [[IV_2]], 0
+; CHECK-NEXT:    br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/create-induction-resume.ll b/llvm/test/Transforms/LoopVectorize/create-induction-resume.ll
index 62399c5d4b4ee..f9b512700f608 100644
--- a/llvm/test/Transforms/LoopVectorize/create-induction-resume.ll
+++ b/llvm/test/Transforms/LoopVectorize/create-induction-resume.ll
@@ -29,7 +29,6 @@ define void @test(i32 %arg, i32 %L1.limit, i32 %L2.switch, i1 %c, ptr %dst) {
 ; CHECK:       L1.early.exit:
 ; CHECK-NEXT:    ret void
 ; CHECK:       L1.exit:
-; CHECK-NEXT:    [[INDUCTION_IV_LCSSA1:%.*]] = phi i32 [ [[INDUCTION_IV]], [[L1_BACKEDGE]] ]
 ; CHECK-NEXT:    [[L1_EXIT_VAL:%.*]] = phi i32 [ [[L1_SUM_NEXT]], [[L1_BACKEDGE]] ]
 ; CHECK-NEXT:    br label [[L2_HEADER:%.*]]
 ; CHECK:       L2.header.loopexit:
@@ -46,11 +45,11 @@ define void @test(i32 %arg, i32 %L1.limit, i32 %L2.switch, i1 %c, ptr %dst) {
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[L1_EXIT_VAL]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[DOTSPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[INDUCTION_IV_LCSSA1]], i64 0
+; CHECK-NEXT:    [[DOTSPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[INDUCTION_IV]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT1:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[DOTSPLAT1]]
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> splat (i32 1), [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[INDUCTION_IV_LCSSA1]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[INDUCTION_IV]], 4
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/hoist-and-sink-mem-ops-with-invariant-pointers.ll b/llvm/test/Transforms/LoopVectorize/hoist-and-sink-mem-ops-with-invariant-pointers.ll
new file mode 100644
index 0000000000000..8615401af34f8
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/hoist-and-sink-mem-ops-with-invariant-pointers.ll
@@ -0,0 +1,247 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
+; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s
+
+define void @hoist_invariant_load_noalias_due_to_memchecks(ptr %dst, ptr %invariant_ptr, i32 %n) {
+; CHECK-LABEL: define void @hoist_invariant_load_noalias_due_to_memchecks(
+; CHECK-SAME: ptr [[DST:%.*]], ptr [[INVARIANT_PTR:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK:       [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]]
+; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[INVARIANT_PTR]], i64 4
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[INVARIANT_PTR]], [[SCEVGEP]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[INVARIANT_PTR]], align 4, !alias.scope [[META0:![0-9]+]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[INDEX]]
+; CHECK-NEXT:    store <4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP5]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[INV_VAL:%.*]] = load i32, ptr [[INVARIANT_PTR]], align 4
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[IV]]
+; CHECK-NEXT:    store i32 [[INV_VAL]], ptr [[GEP]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %inv_val = load i32, ptr %invariant_ptr, align 4
+  %gep = getelementptr inbounds i32, ptr %dst, i32 %iv
+  store i32 %inv_val, ptr %gep, align 4
+  %iv.next = add nuw nsw i32 %iv, 1
+  %ec = icmp eq i32 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; Test that loads with non-invariant addresses are not hoisted.
+define void @dont_hoist_variant_address(ptr %dst, ptr %src, i32 %n) {
+; CHECK-LABEL: define void @dont_hoist_variant_address(
+; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
+; CHECK-NEXT:    [[A1:%.*]] = ptrtoint ptr [[DST]] to i64
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK:       [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 [[A1]], [[SRC2]]
+; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
+; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[INDEX]]
+; CHECK-NEXT:    store <4 x i32> [[WIDE_LOAD]], ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IV]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[IV]]
+; CHECK-NEXT:    store i32 [[VAL]], ptr [[GEP_DST]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv
+  %val = load i32, ptr %gep.src, align 4
+  %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv
+  store i32 %val, ptr %gep.dst, align 4
+  %iv.next = add nuw nsw i32 %iv, 1
+  %ec = icmp eq i32 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; Test that predicated loads are not hoisted.
+define void @dont_hoist_predicated_load(ptr %dst, ptr %invariant_ptr, ptr %cond_ptr, i32 %n) {
+; CHECK-LABEL: define void @dont_hoist_predicated_load(
+; CHECK-SAME: ptr [[DST:%.*]], ptr [[INVARIANT_PTR:%.*]], ptr [[COND_PTR:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK:       [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
+; CHECK-NEXT:    [[TMP22:%.*]] = shl nuw nsw i64 [[TMP20]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP22]], 4
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]]
+; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[COND_PTR]], i64 [[TMP3]]
+; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[INVARIANT_PTR]], i64 4
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[COND_PTR]], [[SCEVGEP]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]]
+; CHECK-NEXT:    [[BOUND14:%.*]] = icmp ult ptr [[INVARIANT_PTR]], [[SCEVGEP]]
+; CHECK-NEXT:    [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
+; CHECK-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
+; CHECK-NEXT:    br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE11:.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[COND_PTR]], i32 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4, !alias.scope [[META11:![0-9]+]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
+; CHECK-NEXT:    br i1 [[TMP2]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK:       [[PRED_STORE_IF]]:
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[INVARIANT_PTR]], align 4, !alias.scope [[META14:![0-9]+]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]]
+; CHECK-NEXT:    store i32 [[TMP7]], ptr [[TMP9]], align 4, !alias.scope [[META16:![0-9]+]], !noalias [[META18:![0-9]+]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; CHECK:       [[PRED_STORE_CONTINUE]]:
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7:.*]]
+; CHECK:       [[PRED_STORE_IF6]]:
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[INVARIANT_PTR]], align 4, !alias.scope [[META14]]
+; CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP8]]
+; CHECK-NEXT:    store i32 [[TMP11]], ptr [[TMP13]], align 4, !alias.scope [[META16]], !noalias [[META18]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE7]]
+; CHECK:       [[PRED_STORE_CONTINUE7]]:
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[PRED_STORE_IF8:.*]], label %[[PRED_STORE_CONTINUE9:.*]]
+; CHECK:       [[PRED_STORE_IF8]]:
+; CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr [[INVARIANT_PTR]], align 4, !alias.scope [[META14]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP12]]
+; CHECK-NEXT:    store i32 [[TMP15]], ptr [[TMP17]], align 4, !alias.scope [[META16]], !noalias [[META18]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE9]]
+; CHECK:       [[PRED_STORE_CONTINUE9]]:
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
+; CHECK-NEXT:    br i1 [[TMP14]], label %[[PRED_STORE_IF10:.*]], label %[[PRED_STORE_CONTINUE11]]
+; CHECK:       [[PRED_STORE_IF10]]:
+; CHECK-NEXT:    [[TMP19:%.*]] = load i32, ptr [[INVARIANT_PTR]], align 4, !alias.scope [[META14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = add i32 [[INDEX]], 3
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP16]]
+; CHECK-NEXT:    store i32 [[TMP19]], ptr [[TMP21]], align 4, !alias.scope [[META16]], !noalias [[META18]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE11]]
+; CHECK:       [[PRED_STORE_CONTINUE11]]:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT:    [[GEP_COND:%.*]] = getelementptr inbounds i32, ptr [[COND_PTR]], i32 [[IV]]
+; CHECK-NEXT:    [[COND:%.*]] = load i32, ptr [[GEP_COND]], align 4
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[COND]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[LOOP_LATCH]]
+; CHECK:       [[IF_THEN]]:
+; CHECK-NEXT:    [[INV_VAL:%.*]] = load i32, ptr [[INVARIANT_PTR]], align 4
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[IV]]
+; CHECK-NEXT:    store i32 [[INV_VAL]], ptr [[GEP]], align 4
+; CHECK-NEXT:    br label %[[LOOP_LATCH]]
+; CHECK:       [[LOOP_LATCH]]:
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %gep.cond = getelementptr inbounds i32, ptr %cond_ptr, i32 %iv
+  %cond = load i32, ptr %gep.cond, align 4
+  %cmp = icmp sgt i32 %cond, 0
+  br i1 %cmp, label %if.then, label %loop.latch
+
+if.then:
+  %inv_val = load i32, ptr %invariant_ptr, align 4
+  %gep = getelementptr inbounds i32, ptr %dst, i32 %iv
+  store i32 %inv_val, ptr %gep, align 4
+  br label %loop.latch
+
+loop.latch:
+  %iv.next = add nuw nsw i32 %iv, 1
+  %ec = icmp eq i32 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/invalidate-scev-at-scope-after-vectorization.ll b/llvm/test/Transforms/LoopVectorize/invalidate-scev-at-scope-after-vectorization.ll
index 1f32f89001ee0..32de44ce8aac1 100644
--- a/llvm/test/Transforms/LoopVectorize/invalidate-scev-at-scope-after-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/invalidate-scev-at-scope-after-vectorization.ll
@@ -31,12 +31,11 @@ define void @test_invalidate_scevs_at_scope(ptr %p) {
 ; CHECK-NEXT:    [[C_1:%.*]] = icmp eq i32 [[IV_1]], 100
 ; CHECK-NEXT:    br i1 [[C_1]], label %[[EXIT_1:.*]], label %[[LOOP_1]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[EXIT_1]]:
-; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi i32 [ [[TMP4]], %[[LOOP_1]] ]
 ; CHECK-NEXT:    [[ADD_LCSSA1:%.*]] = phi i32 [ [[ADD_1]], %[[LOOP_1]] ]
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = add i32 [[DOTLCSSA]], 100
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = add i32 [[TMP4]], 100
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[ADD_LCSSA]], i32 100)
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[SMAX]], -100
-; CHECK-NEXT:    [[TMP5:%.*]] = sub i32 [[TMP3]], [[DOTLCSSA]]
+; CHECK-NEXT:    [[TMP5:%.*]] = sub i32 [[TMP3]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP5]] to i64
 ; CHECK-NEXT:    [[TMP7:%.*]] = add nuw nsw i64 [[TMP6]], 1
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP7]], 4
diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
index eea22374ade30..abed18a57b90e 100644
--- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
@@ -380,7 +380,6 @@ define void @multiple_uniform_stores(ptr nocapture %var1, ptr nocapture readonly
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP8]], 8589934588
-; CHECK-NEXT:    [[IND_END:%.*]] = add nuw nsw i64 [[N_VEC]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <4 x i32> <i32 poison, i32 0, i32 0, i32 0>, i32 [[ARRAYIDX5_PROMOTED]], i64 0
 ; CHECK-NEXT:    [[INVARIANT_GEP:%.*]] = getelementptr i32, ptr [[VAR2]], i64 [[TMP4]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -396,6 +395,7 @@ define void @multiple_uniform_stores(ptr nocapture %var1, ptr nocapture readonly
 ; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi <4 x i32> [ [[TMP17]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[IND_END:%.*]] = add nuw nsw i64 [[N_VEC]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[DOTLCSSA]])
 ; CHECK-NEXT:    store i32 [[TMP19]], ptr [[ARRAYIDX5]], align 4, !alias.scope [[META27:![0-9]+]], !noalias [[META23]]
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP8]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/nested-loops-scev-expansion.ll b/llvm/test/Transforms/LoopVectorize/nested-loops-scev-expansion.ll
index 8525b3aa5d349..3bf5c0d1d13a9 100644
--- a/llvm/test/Transforms/LoopVectorize/nested-loops-scev-expansion.ll
+++ b/llvm/test/Transforms/LoopVectorize/nested-loops-scev-expansion.ll
@@ -222,10 +222,9 @@ define void @pr52024(ptr %dst, i16 %N) {
 ; CHECK-NEXT:    [[EXITCOND_2:%.*]] = icmp eq i16 [[IV_1_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_2]], label %[[LOOP_2_PH:.*]], label %[[LOOP_1]]
 ; CHECK:       [[LOOP_2_PH]]:
-; CHECK-NEXT:    [[IV_1_LCSSA2:%.*]] = phi i16 [ [[IV_1]], %[[LOOP_1_LATCH]] ]
 ; CHECK-NEXT:    [[IV_1_NEXT_LCSSA:%.*]] = phi i16 [ [[IV_1_NEXT]], %[[LOOP_1_LATCH]] ]
 ; CHECK-NEXT:    [[IV_1_NEXT_EXT:%.*]] = sext i16 [[IV_1_NEXT_LCSSA]] to i64
-; CHECK-NEXT:    [[TMP0:%.*]] = mul i16 [[IV_1_LCSSA2]], 3
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i16 [[IV_1]], 3
 ; CHECK-NEXT:    br label %[[LOOP_2_HEADER:.*]]
 ; CHECK:       [[LOOP_2_HEADER]]:
 ; CHECK-NEXT:    [[IV_1_REM:%.*]] = urem i64 100, [[IV_1_NEXT_EXT]]
diff --git a/llvm/test/Transforms/LoopVectorize/pr45259.ll b/llvm/test/Transforms/LoopVectorize/pr45259.ll
index f33437fd8ebde..7a048a9a607ba 100644
--- a/llvm/test/Transforms/LoopVectorize/pr45259.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr45259.ll
@@ -14,12 +14,10 @@ define i8 @widget(ptr %arr, i8 %t9) {
 ; CHECK-NEXT:    [[C:%.*]] = call i1 @cond()
 ; CHECK-NEXT:    br i1 [[C]], label [[FOR_PREHEADER:%.*]], label [[BB6]]
 ; CHECK:       for.preheader:
-; CHECK-NEXT:    [[T1_0_LCSSA4:%.*]] = phi ptr [ [[T1_0]], [[BB6]] ]
 ; CHECK-NEXT:    [[T1_0_LCSSA1:%.*]] = phi ptr [ [[T1_0]], [[BB6]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[ARR1]] to i32
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 0, [[TMP0]]
-; CHECK-NEXT:    [[T1_0_LCSSA3:%.*]] = ptrtoint ptr [[T1_0_LCSSA4]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[T1_0_LCSSA3]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[T1_0_LCSSA2]] to i32
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP3]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/pr58811-scev-expansion.ll b/llvm/test/Transforms/LoopVectorize/pr58811-scev-expansion.ll
index 269c3bf73c869..879c7ae5c3c43 100644
--- a/llvm/test/Transforms/LoopVectorize/pr58811-scev-expansion.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr58811-scev-expansion.ll
@@ -19,11 +19,10 @@ define void @test1_pr58811() {
 ; CHECK-NEXT:    [[INDUCTION_IV_NEXT]] = add i32 [[INDUCTION_IV]], [[TMP1]]
 ; CHECK-NEXT:    br i1 false, label [[LOOP_1]], label [[LOOP_2_PREHEADER:%.*]]
 ; CHECK:       loop.2.preheader:
-; CHECK-NEXT:    [[INDUCTION_IV_LCSSA:%.*]] = phi i32 [ [[INDUCTION_IV]], [[LOOP_1]] ]
 ; CHECK-NEXT:    [[IV_1_LCSSA:%.*]] = phi i32 [ [[IV_1]], [[LOOP_1]] ]
 ; CHECK-NEXT:    br label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i32 196, [[INDUCTION_IV_LCSSA]]
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i32 196, [[INDUCTION_IV]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -111,8 +110,8 @@ define void @test2_pr58811() {
 ; CHECK-NEXT:    [[INDUCTION_IV_NEXT]] = add i32 [[INDUCTION_IV]], [[TMP1]]
 ; CHECK-NEXT:    br i1 false, label [[LOOP_2]], label [[LOOP_3_PREHEADER:%.*]]
 ; CHECK:       loop.3.preheader:
-; CHECK-NEXT:    [[IV_2_LCSSA:%.*]] = phi i32 [ [[IV_2]], [[LOOP_2]] ]
 ; CHECK-NEXT:    [[INDUCTION_IV_LCSSA:%.*]] = phi i32 [ [[INDUCTION_IV]], [[LOOP_2]] ]
+; CHECK-NEXT:    [[IV_2_LCSSA:%.*]] = phi i32 [ [[IV_2]], [[LOOP_2]] ]
 ; CHECK-NEXT:    br label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[IND_END:%.*]] = mul i32 196, [[INDUCTION_IV_LCSSA]]
@@ -182,12 +181,11 @@ define void @test3_pr58811() {
 ; CHECK-NEXT:    [[ADD101:%.*]] = add i32 [[REM85]], [[P_2]]
 ; CHECK-NEXT:    br i1 false, label [[LOOP_2]], label [[LOOP_3_PREHEADER:%.*]]
 ; CHECK:       loop.3.preheader:
-; CHECK-NEXT:    [[P_2_LCSSA:%.*]] = phi i32 [ [[P_2]], [[LOOP_2]] ]
 ; CHECK-NEXT:    [[ADD101_LCSSA:%.*]] = phi i32 [ [[ADD101]], [[LOOP_2]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = udiv i32 1, [[P_1]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i32 [[P_1]], [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], -1
-; CHECK-NEXT:    [[TMP3:%.*]] = sub i32 [[TMP2]], [[P_2_LCSSA]]
+; CHECK-NEXT:    [[TMP3:%.*]] = sub i32 [[TMP2]], [[P_2]]
 ; CHECK-NEXT:    br label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[IND_END:%.*]] = mul i32 196, [[TMP3]]
diff --git a/llvm/test/Transforms/LoopVectorize/pr66616.ll b/llvm/test/Transforms/LoopVectorize/pr66616.ll
index 1ef614ab32472..1e093407620d5 100644
--- a/llvm/test/Transforms/LoopVectorize/pr66616.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr66616.ll
@@ -18,10 +18,9 @@ define void @pr66616(ptr %ptr) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
 ; CHECK-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi i32 [ [[TMP0]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    br label [[LOOP_1:%.*]]
 ; CHECK:       preheader:
-; CHECK-NEXT:    [[TMP4:%.*]] = sub i32 -1, [[DOTLCSSA]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sub i32 -1, [[TMP0]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
 ; CHECK-NEXT:    [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 1
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP6]], 4
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
index 964a257ef352f..fafa82c211dc6 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
@@ -2800,6 +2800,88 @@ exit:
   ret i64 %r.0.lcssa
 }
 
+define i32 @reduction_expression_ext_mulacc_livein(ptr %a, i16 %c) {
+; CHECK-LABEL: define i32 @reduction_expression_ext_mulacc_livein(
+; CHECK-SAME: ptr [[A:%.*]], i16 [[C:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i16>
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <4 x i16> [[BROADCAST_SPLAT]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
+; CHECK-NEXT:    [[TMP5]] = add i32 [[VEC_PHI]], [[TMP4]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br label %[[FOR_EXIT:.*]]
+; CHECK:       [[FOR_EXIT]]:
+; CHECK-NEXT:    ret i32 [[TMP5]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @reduction_expression_ext_mulacc_livein(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], i16 [[C:%.*]]) {
+; CHECK-INTERLEAVED-NEXT:  [[ENTRY:.*:]]
+; CHECK-INTERLEAVED-NEXT:    br label %[[VECTOR_PH:.*]]
+; CHECK-INTERLEAVED:       [[VECTOR_PH]]:
+; CHECK-INTERLEAVED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i64 0
+; CHECK-INTERLEAVED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-INTERLEAVED-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-INTERLEAVED:       [[VECTOR_BODY]]:
+; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI1:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 4
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i16>
+; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[WIDE_LOAD2]] to <4 x i16>
+; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = mul <4 x i16> [[BROADCAST_SPLAT]], [[TMP2]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul <4 x i16> [[BROADCAST_SPLAT]], [[TMP3]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP6]])
+; CHECK-INTERLEAVED-NEXT:    [[TMP8]] = add i32 [[VEC_PHI]], [[TMP7]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP9]])
+; CHECK-INTERLEAVED-NEXT:    [[TMP11]] = add i32 [[VEC_PHI1]], [[TMP10]]
+; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
+; CHECK-INTERLEAVED:       [[MIDDLE_BLOCK]]:
+; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add i32 [[TMP11]], [[TMP8]]
+; CHECK-INTERLEAVED-NEXT:    br label %[[FOR_EXIT:.*]]
+; CHECK-INTERLEAVED:       [[FOR_EXIT]]:
+; CHECK-INTERLEAVED-NEXT:    ret i32 [[BIN_RDX]]
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %gep.a = getelementptr i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %ext.a = zext i8 %load.a to i16
+  %mul = mul i16 %c, %ext.a
+  %mul.ext = zext i16 %mul to i32
+  %add = add i32 %mul.ext, %accum
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit:                        ; preds = %for.body
+  ret i32 %add
+}
+
 declare float @llvm.fmuladd.f32(float, float, float)
 
 !6 = distinct !{!6, !7, !8}
diff --git a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll
index c270a23344f54..faca86a41b023 100644
--- a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll
+++ b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll
@@ -205,12 +205,11 @@ define void @expand_diff_scev_unknown(ptr %dst, i1 %invar.c, i32 %step) mustprog
 ; CHECK-NEXT:    [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1
 ; CHECK-NEXT:    br i1 [[INVAR_C]], label %[[LOOP_2_PREHEADER:.*]], label %[[LOOP_1]]
 ; CHECK:       [[LOOP_2_PREHEADER]]:
-; CHECK-NEXT:    [[INDVAR_LCSSA1:%.*]] = phi i32 [ [[INDVAR]], %[[LOOP_1]] ]
 ; CHECK-NEXT:    [[IV_1_LCSSA:%.*]] = phi i32 [ [[IV_1]], %[[LOOP_1]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[IV_1_LCSSA]], [[STEP]]
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP1]], i32 0)
 ; CHECK-NEXT:    [[TMP2:%.*]] = mul i32 [[STEP]], -2
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i32 [[INDVAR_LCSSA1]], -1
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i32 [[INDVAR]], -1
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[SMAX]], [[TMP4]]
 ; CHECK-NEXT:    [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP5]], i32 1)
@@ -219,7 +218,8 @@ define void @expand_diff_scev_unknown(ptr %dst, i1 %invar.c, i32 %step) mustprog
 ; CHECK-NEXT:    [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[STEP]], i32 1)
 ; CHECK-NEXT:    [[TMP8:%.*]] = udiv i32 [[TMP7]], [[UMAX]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = add i32 [[TMP6]], [[TMP8]]
-; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[INDVAR_LCSSA1]], 2
+; CHECK-NEXT:    [[TMP16:%.*]] = sub i32 2, [[STEP]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[IV_1_LCSSA]], [[TMP16]]
 ; CHECK-NEXT:    [[SMAX1:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP12]], i32 0)
 ; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP3]], -1
 ; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[SMAX1]], [[TMP14]]
diff --git a/llvm/test/Transforms/LoopVectorize/scev-exit-phi-invalidation.ll b/llvm/test/Transforms/LoopVectorize/scev-exit-phi-invalidation.ll
index c7b27040d6484..479d859a9287c 100644
--- a/llvm/test/Transforms/LoopVectorize/scev-exit-phi-invalidation.ll
+++ b/llvm/test/Transforms/LoopVectorize/scev-exit-phi-invalidation.ll
@@ -19,15 +19,14 @@ define void @test_pr63368(i1 %c, ptr %A) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100
 ; CHECK-NEXT:    br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi i32 [ [[TMP0]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    br label [[EXIT_1:%.*]]
 ; CHECK:       exit.1:
-; CHECK-NEXT:    [[SMAX1:%.*]] = call i32 @llvm.smax.i32(i32 [[DOTLCSSA]], i32 -1)
+; CHECK-NEXT:    [[SMAX1:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP0]], i32 -1)
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[SMAX1]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP2]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; CHECK:       vector.scevcheck:
-; CHECK-NEXT:    [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 poison, i32 -1)
+; CHECK-NEXT:    [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP0]], i32 -1)
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[SMAX]], 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i8 1, [[TMP4]]
@@ -61,7 +60,7 @@ define void @test_pr63368(i1 %c, ptr %A) {
 ; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i8 [[IV_2_NEXT]]
 ; CHECK-NEXT:    store i8 0, ptr [[GEP_A]], align 1
 ; CHECK-NEXT:    [[IV_2_SEXT:%.*]] = sext i8 [[IV_2]] to i32
-; CHECK-NEXT:    [[EC_2:%.*]] = icmp sge i32 [[DOTLCSSA]], [[IV_2_SEXT]]
+; CHECK-NEXT:    [[EC_2:%.*]] = icmp sge i32 [[TMP0]], [[IV_2_SEXT]]
 ; CHECK-NEXT:    br i1 [[EC_2]], label [[LOOP_2]], label [[EXIT_2]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       exit.2:
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll b/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll
index 9a699826696ec..70adac2103feb 100644
--- a/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll
+++ b/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll
@@ -84,12 +84,8 @@ define void @single_scalar_cast_stored(ptr %src, ptr %dst, i32 %n) {
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[SRC]], align 2, !alias.scope [[META4:![0-9]+]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <4 x i16> [[BROADCAST_SPLAT]], zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = and <4 x i16> [[BROADCAST_SPLAT]], splat (i16 15)
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i16 [[TMP0]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = and i16 [[TMP0]], 15
 ; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP3]], i16 0, i16 [[TMP4]]
 ; CHECK-NEXT:    store i16 [[TMP5]], ptr [[DST]], align 2, !alias.scope [[META7:![0-9]+]], !noalias [[META4]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-metadata.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-metadata.ll
new file mode 100644
index 0000000000000..857b9131a0b8c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-metadata.ll
@@ -0,0 +1,100 @@
+; REQUIRES: asserts
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -disable-output %s 2>&1 | FileCheck %s
+
+define void @test_widen_metadata(ptr noalias %A, ptr noalias %B, i32 %n) {
+; CHECK-LABEL: Checking a loop in 'test_widen_metadata'
+; CHECK:      VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK:      <x1> vector loop: {
+; CHECK:        vector.body:
+; CHECK:          WIDEN ir<%lv> = load vp<{{.*}}>
+; CHECK:          WIDEN-CAST ir<%conv> = sitofp ir<%lv> to float
+; CHECK:          WIDEN ir<%mul> = fmul ir<%conv>, ir<2.000000e+00>
+; CHECK:          WIDEN-CAST ir<%conv.back> = fptosi ir<%mul> to i32
+; CHECK:          WIDEN store vp<{{.*}}>, ir<%conv.back>
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+  %gep.A = getelementptr inbounds i32, ptr %A, i32 %i
+  %lv = load i32, ptr %gep.A, align 4, !tbaa !0, !range !6
+  %conv = sitofp i32 %lv to float, !fpmath !5
+  %mul = fmul float %conv, 2.0, !fpmath !5
+  %conv.back = fptosi float %mul to i32
+  %gep.B = getelementptr inbounds i32, ptr %B, i32 %i
+  store i32 %conv.back, ptr %gep.B, align 4, !tbaa !0
+  %i.next = add i32 %i, 1
+  %cond = icmp eq i32 %i.next, %n
+  br i1 %cond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+declare float @llvm.sqrt.f32(float)
+
+define void @test_intrinsic_with_metadata(ptr noalias %A, ptr noalias %B, i32 %n) {
+; CHECK-LABEL: Checking a loop in 'test_intrinsic_with_metadata'
+; CHECK:      VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK:      <x1> vector loop: {
+; CHECK:        vector.body:
+; CHECK:          WIDEN ir<%lv> = load vp<{{.*}}>
+; CHECK:          WIDEN-INTRINSIC ir<%sqrt> = call llvm.sqrt(ir<%lv>)
+; CHECK:          WIDEN store vp<{{.*}}>, ir<%sqrt>
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+  %gep.A = getelementptr inbounds float, ptr %A, i32 %i
+  %lv = load float, ptr %gep.A, align 4, !tbaa !0
+  %sqrt = call float @llvm.sqrt.f32(float %lv), !fpmath !5
+  %gep.B = getelementptr inbounds float, ptr %B, i32 %i
+  store float %sqrt, ptr %gep.B, align 4, !tbaa !0
+  %i.next = add i32 %i, 1
+  %cond = icmp eq i32 %i.next, %n
+  br i1 %cond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_widen_with_multiple_metadata(ptr noalias %A, ptr noalias %B, i32 %n) {
+; CHECK-LABEL: Checking a loop in 'test_widen_with_multiple_metadata'
+; CHECK:      VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK:      <x1> vector loop: {
+; CHECK:        vector.body:
+; CHECK:          WIDEN ir<%lv> = load vp<{{.*}}>
+; CHECK:          WIDEN-CAST ir<%conv> = sitofp ir<%lv> to float
+; CHECK:          WIDEN ir<%mul> = fmul ir<%conv>, ir<2.000000e+00>
+; CHECK:          WIDEN-CAST ir<%conv.back> = fptosi ir<%mul> to i32
+; CHECK:          WIDEN store vp<{{.*}}>, ir<%conv.back>
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+  %gep.A = getelementptr inbounds i32, ptr %A, i32 %i
+  %lv = load i32, ptr %gep.A, align 4, !tbaa !0, !range !6
+  %conv = sitofp i32 %lv to float
+  %mul = fmul float %conv, 2.0
+  %conv.back = fptosi float %mul to i32
+  %gep.B = getelementptr inbounds i32, ptr %B, i32 %i
+  store i32 %conv.back, ptr %gep.B, align 4, !tbaa !0
+  %i.next = add i32 %i, 1
+  %cond = icmp eq i32 %i.next, %n
+  br i1 %cond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+!0 = !{!1, !1, i64 0}
+!1 = !{!"float", !2}
+!2 = !{!"root"}
+!5 = !{float 2.500000e+00}
+!6 = !{i32 0, i32 100}
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
index 06b044872c217..ef678ff759943 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
@@ -800,3 +800,283 @@ exit:
   %r.0.lcssa = phi i64 [ %rdx.next, %loop ]
   ret i64 %r.0.lcssa
 }
+
+define i32 @print_mulacc_extended_const(ptr %start, ptr %end) {
+; CHECK-LABEL: 'print_mulacc_extended_const'
+; CHECK:       VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK-NEXT:  Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT:  Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT:  Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT:  vp<%3> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT:  ir-bb<entry>:
+; CHECK-NEXT:    EMIT vp<%3> = EXPAND SCEV (1 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64))
+; CHECK-NEXT:  Successor(s): scalar.ph, vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT:  vector.ph:
+; CHECK-NEXT:    vp<[[DER_IV:%.+]]> = DERIVED-IV ir<%start> + vp<[[VTC]]> * ir<1>
+; CHECK-NEXT:    EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<1>
+; CHECK-NEXT:  Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT:  <x1> vector loop: {
+; CHECK-NEXT:    vector.body:
+; CHECK-NEXT:      EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
+; CHECK-NEXT:      WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT:      vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
+; CHECK-NEXT:      EMIT vp<%next.gep> = ptradd ir<%start>, vp<[[STEPS]]>
+; CHECK-NEXT:      vp<[[VEC_PTR:%.+]]> = vector-pointer vp<%next.gep>
+; CHECK-NEXT:      WIDEN ir<%l> = load vp<[[VEC_PTR]]>
+; CHECK-NEXT:      EXPRESSION vp<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (mul (ir<%l> zext to i32), (ir<63> zext to i32))
+; CHECK-NEXT:      EMIT vp<[[IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT:      EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
+; CHECK-NEXT:    No successors
+; CHECK-NEXT:  }
+; CHECK-NEXT:  Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT:  middle.block:
+; CHECK-NEXT:    EMIT vp<%11> = compute-reduction-result ir<[[RDX]]>, vp<[[RDX_NEXT]]>
+; CHECK-NEXT:    EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<[[VTC]]>
+; CHECK-NEXT:    EMIT branch-on-cond vp<%cmp.n>
+entry:
+  br label %loop
+
+loop:
+  %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ]
+  %red = phi i32 [ 0, %entry ], [ %red.next, %loop ]
+  %l = load i8, ptr %ptr.iv, align 1
+  %l.ext = zext i8 %l to i32
+  %mul = mul i32 %l.ext, 63
+  %red.next = add i32 %red, %mul
+  %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1
+  %ec = icmp eq ptr %ptr.iv, %end
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret i32 %red.next
+}
+
+define i32 @print_mulacc_extended_const_lhs(ptr %start, ptr %end) {
+; CHECK-LABEL: 'print_mulacc_extended_const_lhs'
+; CHECK:       VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK-NEXT:  Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT:  Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT:  Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT:  vp<%3> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT:  ir-bb<entry>:
+; CHECK-NEXT:    EMIT vp<%3> = EXPAND SCEV (1 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64))
+; CHECK-NEXT:  Successor(s): scalar.ph, vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT:  vector.ph:
+; CHECK-NEXT:    vp<[[DER_IV:%.+]]> = DERIVED-IV ir<%start> + vp<[[VTC]]> * ir<1>
+; CHECK-NEXT:    EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<1>
+; CHECK-NEXT:  Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT:  <x1> vector loop: {
+; CHECK-NEXT:    vector.body:
+; CHECK-NEXT:      EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
+; CHECK-NEXT:      WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT:      vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
+; CHECK-NEXT:      EMIT vp<%next.gep> = ptradd ir<%start>, vp<[[STEPS]]>
+; CHECK-NEXT:      vp<[[VEC_PTR:%.+]]> = vector-pointer vp<%next.gep>
+; CHECK-NEXT:      WIDEN ir<%l> = load vp<[[VEC_PTR]]>
+; CHECK-NEXT:      WIDEN-CAST ir<%l.ext> = zext ir<%l> to i32
+; CHECK-NEXT:      EXPRESSION vp<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (mul ir<63>, ir<%l.ext>)
+; CHECK-NEXT:      EMIT vp<[[IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT:      EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
+; CHECK-NEXT:    No successors
+; CHECK-NEXT:  }
+; CHECK-NEXT:  Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT:  middle.block:
+; CHECK-NEXT:    EMIT vp<%11> = compute-reduction-result ir<[[RDX]]>, vp<[[RDX_NEXT]]>
+; CHECK-NEXT:    EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<[[VTC]]>
+; CHECK-NEXT:    EMIT branch-on-cond vp<%cmp.n>
+entry:
+  br label %loop
+
+loop:
+  %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ]
+  %red = phi i32 [ 0, %entry ], [ %red.next, %loop ]
+  %l = load i8, ptr %ptr.iv, align 1
+  %l.ext = zext i8 %l to i32
+  %mul = mul i32 63, %l.ext
+  %red.next = add i32 %red, %mul
+  %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1
+  %ec = icmp eq ptr %ptr.iv, %end
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret i32 %red.next
+}
+
+; Constants >= 128 cannot be treated as sign-extended, so the expression shouldn't extend 128
+define i32 @print_mulacc_not_extended_const(ptr %start, ptr %end) {
+; CHECK-LABEL: 'print_mulacc_not_extended_const'
+; CHECK:       VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK-NEXT:  Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT:  Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT:  Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT:  vp<%3> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT:  ir-bb<entry>:
+; CHECK-NEXT:    EMIT vp<%3> = EXPAND SCEV (1 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64))
+; CHECK-NEXT:  Successor(s): scalar.ph, vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT:  vector.ph:
+; CHECK-NEXT:    vp<[[DER_IV:%.+]]> = DERIVED-IV ir<%start> + vp<[[VTC]]> * ir<1>
+; CHECK-NEXT:    EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<1>
+; CHECK-NEXT:  Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT:  <x1> vector loop: {
+; CHECK-NEXT:    vector.body:
+; CHECK-NEXT:      EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
+; CHECK-NEXT:      WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT:      vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
+; CHECK-NEXT:      EMIT vp<%next.gep> = ptradd ir<%start>, vp<[[STEPS]]>
+; CHECK-NEXT:      vp<[[VEC_PTR:%.+]]> = vector-pointer vp<%next.gep>
+; CHECK-NEXT:      WIDEN ir<%l> = load vp<[[VEC_PTR]]>
+; CHECK-NEXT:      WIDEN-CAST ir<%l.ext> = sext ir<%l> to i32
+; CHECK-NEXT:      EXPRESSION vp<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (mul ir<%l.ext>, ir<128>)
+; CHECK-NEXT:      EMIT vp<[[IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT:      EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
+; CHECK-NEXT:    No successors
+; CHECK-NEXT:  }
+; CHECK-NEXT:  Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT:  middle.block:
+; CHECK-NEXT:    EMIT vp<%11> = compute-reduction-result ir<[[RDX:%.+]]>, vp<[[RDX_NEXT]]>
+; CHECK-NEXT:    EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<[[VTC]]>
+; CHECK-NEXT:    EMIT branch-on-cond vp<%cmp.n>
+entry:
+  br label %loop
+
+loop:
+  %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ]
+  %red = phi i32 [ 0, %entry ], [ %red.next, %loop ]
+  %l = load i8, ptr %ptr.iv, align 1
+  %l.ext = sext i8 %l to i32
+  %mul = mul i32 %l.ext, 128
+  %red.next = add i32 %red, %mul
+  %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1
+  %ec = icmp eq ptr %ptr.iv, %end
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  %red.next.lcssa = phi i32 [ %red.next, %loop ]
+  ret i32 %red.next.lcssa
+}
+
+define i64 @print_ext_mulacc_extended_const(ptr %start, ptr %end) {
+; CHECK-LABEL: 'print_ext_mulacc_extended_const'
+; CHECK:       VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK-NEXT:  Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT:  Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT:  Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT:  vp<%3> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT:  ir-bb<entry>:
+; CHECK-NEXT:    EMIT vp<%3> = EXPAND SCEV (1 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64))
+; CHECK-NEXT:  Successor(s): scalar.ph, vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT:  vector.ph:
+; CHECK-NEXT:    vp<[[DER_IV:%.+]]> = DERIVED-IV ir<%start> + vp<[[VTC]]> * ir<1>
+; CHECK-NEXT:    EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<1>
+; CHECK-NEXT:  Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT:  <x1> vector loop: {
+; CHECK-NEXT:    vector.body:
+; CHECK-NEXT:      EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
+; CHECK-NEXT:      WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT:      vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
+; CHECK-NEXT:      EMIT vp<%next.gep> = ptradd ir<%start>, vp<[[STEPS]]>
+; CHECK-NEXT:      vp<[[VEC_PTR:%.+]]> = vector-pointer vp<%next.gep>
+; CHECK-NEXT:      WIDEN ir<%l> = load vp<[[VEC_PTR]]>
+; CHECK-NEXT:      EXPRESSION vp<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (mul (ir<%l> zext to i64), (ir<63> zext to i64))
+; CHECK-NEXT:      EMIT vp<[[IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT:      EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
+; CHECK-NEXT:    No successors
+; CHECK-NEXT:  }
+; CHECK-NEXT:  Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT:  middle.block:
+; CHECK-NEXT:    EMIT vp<%11> = compute-reduction-result ir<[[RDX]]>, vp<[[RDX_NEXT]]>
+; CHECK-NEXT:    EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<[[VTC]]>
+; CHECK-NEXT:    EMIT branch-on-cond vp<%cmp.n>
+entry:
+  br label %loop
+
+loop:
+  %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ]
+  %red = phi i64 [ 0, %entry ], [ %red.next, %loop ]
+  %l = load i8, ptr %ptr.iv, align 1
+  %l.ext = zext i8 %l to i32
+  %mul = mul i32 %l.ext, 63
+  %mul.ext = zext i32 %mul to i64
+  %red.next = add i64 %red, %mul.ext
+  %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1
+  %ec = icmp eq ptr %ptr.iv, %end
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret i64 %red.next
+}
+
+; Constants >= 128 cannot be treated as sign-extended, so the expression shouldn't extend 128
+define i64 @print_ext_mulacc_not_extended_const(ptr %start, ptr %end) {
+; CHECK-LABEL: 'print_ext_mulacc_not_extended_const'
+; CHECK:       VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK-NEXT:  Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT:  Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT:  Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT:  vp<%3> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT:  ir-bb<entry>:
+; CHECK-NEXT:    EMIT vp<%3> = EXPAND SCEV (1 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64))
+; CHECK-NEXT:  Successor(s): scalar.ph, vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT:  vector.ph:
+; CHECK-NEXT:    vp<[[DER_IV:%.+]]> = DERIVED-IV ir<%start> + vp<[[VTC]]> * ir<1>
+; CHECK-NEXT:    EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<1>
+; CHECK-NEXT:  Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT:  <x1> vector loop: {
+; CHECK-NEXT:    vector.body:
+; CHECK-NEXT:      EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
+; CHECK-NEXT:      WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT:      vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
+; CHECK-NEXT:      EMIT vp<%next.gep> = ptradd ir<%start>, vp<[[STEPS]]>
+; CHECK-NEXT:      vp<[[VEC_PTR:%.+]]> = vector-pointer vp<%next.gep>
+; CHECK-NEXT:      WIDEN ir<%l> = load vp<[[VEC_PTR]]>
+; CHECK-NEXT:      WIDEN-CAST ir<%l.ext> = sext ir<%l> to i32
+; CHECK-NEXT:      WIDEN ir<%mul> = mul ir<%l.ext>, ir<128>
+; CHECK-NEXT:      EXPRESSION vp<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (ir<%mul> sext to i64)
+; CHECK-NEXT:      EMIT vp<[[IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT:      EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
+; CHECK-NEXT:    No successors
+; CHECK-NEXT:  }
+; CHECK-NEXT:  Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT:  middle.block:
+; CHECK-NEXT:    EMIT vp<%11> = compute-reduction-result ir<[[RDX]]>, vp<[[RDX_NEXT]]>
+; CHECK-NEXT:    EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<[[VTC]]>
+; CHECK-NEXT:    EMIT branch-on-cond vp<%cmp.n>
+entry:
+  br label %loop
+
+loop:
+  %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ]
+  %red = phi i64 [ 0, %entry ], [ %red.next, %loop ]
+  %l = load i8, ptr %ptr.iv, align 1
+  %l.ext = sext i8 %l to i32
+  %mul = mul i32 %l.ext, 128
+  %mul.ext = sext i32 %mul to i64
+  %red.next = add i64 %red, %mul.ext
+  %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1
+  %ec = icmp eq ptr %ptr.iv, %end
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  %red.next.lcssa = phi i64 [ %red.next, %loop ]
+  ret i64 %red.next.lcssa
+}
diff --git a/llvm/test/Transforms/MemCpyOpt/stack-move.ll b/llvm/test/Transforms/MemCpyOpt/stack-move.ll
index 940e30ec46881..0c2e05fa8fed6 100644
--- a/llvm/test/Transforms/MemCpyOpt/stack-move.ll
+++ b/llvm/test/Transforms/MemCpyOpt/stack-move.ll
@@ -1729,3 +1729,61 @@ define i32 @test_ret_only_capture() {
   %v = load i32, ptr %a
   ret i32 %v
 }
+
+declare ptr @captures_address_only(ptr captures(address))
+
+; Can transform: Only one address captured.
+define void @test_captures_address_captures_none() {
+; CHECK-LABEL: define void @test_captures_address_captures_none() {
+; CHECK-NEXT:    [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4
+; CHECK-NEXT:    store [[STRUCT_FOO]] { i32 10, i32 20, i32 30 }, ptr [[SRC]], align 4
+; CHECK-NEXT:    call void @captures_address_only(ptr [[SRC]])
+; CHECK-NEXT:    call void @use_nocapture(ptr [[SRC]])
+; CHECK-NEXT:    ret void
+;
+  %src = alloca %struct.Foo, align 4
+  %dst = alloca %struct.Foo, align 4
+  store %struct.Foo { i32 10, i32 20, i32 30 }, ptr %src
+  call void @captures_address_only(ptr %src)
+  call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dst, ptr align 4 %src, i64 12, i1 false)
+  call void @use_nocapture(ptr %dst)
+  ret void
+}
+
+; Can transform: Only one address captured.
+define void @test_captures_none_and_captures_address() {
+; CHECK-LABEL: define void @test_captures_none_and_captures_address() {
+; CHECK-NEXT:    [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4
+; CHECK-NEXT:    store [[STRUCT_FOO]] { i32 10, i32 20, i32 30 }, ptr [[SRC]], align 4
+; CHECK-NEXT:    call void @use_nocapture(ptr [[SRC]])
+; CHECK-NEXT:    call void @captures_address_only(ptr [[SRC]])
+; CHECK-NEXT:    ret void
+;
+  %src = alloca %struct.Foo, align 4
+  %dst = alloca %struct.Foo, align 4
+  store %struct.Foo { i32 10, i32 20, i32 30 }, ptr %src
+  call void @use_nocapture(ptr %src)
+  call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dst, ptr align 4 %src, i64 12, i1 false)
+  call void @captures_address_only(ptr %dst)
+  ret void
+}
+
+; Cannot transform: Both addresses captured.
+define void @test_captures_address_and_captures_address() {
+; CHECK-LABEL: define void @test_captures_address_and_captures_address() {
+; CHECK-NEXT:    [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4
+; CHECK-NEXT:    [[DST:%.*]] = alloca [[STRUCT_FOO]], align 4
+; CHECK-NEXT:    store [[STRUCT_FOO]] { i32 10, i32 20, i32 30 }, ptr [[SRC]], align 4
+; CHECK-NEXT:    call void @captures_address_only(ptr [[SRC]])
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DST]], ptr align 4 [[SRC]], i64 12, i1 false)
+; CHECK-NEXT:    call void @captures_address_only(ptr [[DST]])
+; CHECK-NEXT:    ret void
+;
+  %src = alloca %struct.Foo, align 4
+  %dst = alloca %struct.Foo, align 4
+  store %struct.Foo { i32 10, i32 20, i32 30 }, ptr %src
+  call void @captures_address_only(ptr %src)
+  call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dst, ptr align 4 %src, i64 12, i1 false)
+  call void @captures_address_only(ptr %dst)
+  ret void
+}
diff --git a/llvm/test/Transforms/PGOProfile/memprof_diff_inline.ll b/llvm/test/Transforms/PGOProfile/memprof_diff_inline.ll
new file mode 100644
index 0000000000000..5213a07d13d39
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/memprof_diff_inline.ll
@@ -0,0 +1,118 @@
+;; Tests that the compiler ignores smaller contexts that differ only in the
+;; IsInlineFrame bool. These map to the same full context id internally, as we
+;; ignore the inline frame status which may differ in feedback compiles.
+;; Presumably this happens when profiles collected from different binaries are
+;; merged. If we didn't pick the largest we would default them all to noncold.
+
+;; Avoid failures on big-endian systems that can't read the profile properly
+; REQUIRES: x86_64-linux
+
+;; Generate the profile and the IR.
+; RUN: split-file %s %t
+
+;; Generate indexed profile
+; RUN: llvm-profdata merge %t/memprof_diff_inline.yaml -o %t.memprofdata
+
+; RUN: opt < %t/memprof_diff_inline.ll -passes='memprof-use<profile-filename=%t.memprofdata>' -S -memprof-report-hinted-sizes -memprof-print-match-info 2>&1 | FileCheck %s --check-prefixes=MEMPROF
+
+; MEMPROF: MemProf notcold context with id 10194276560488437434 has total profiled size 200 is matched with 1 frames
+; MEMPROF: MemProf cold context with id 16342802530253093571 has total profiled size 10000 is matched with 1 frames
+
+;--- memprof_diff_inline.yaml
+---
+HeapProfileRecords:
+  - GUID:            _Z3foov
+    AllocSites:
+      # Small non-cold, full context id 16342802530253093571, should ignore
+      - Callstack:
+          - { Function: _Z3foov, LineOffset: 1, Column: 10, IsInlineFrame: false }
+          - { Function: _Z4foo2v, LineOffset: 1, Column: 10, IsInlineFrame: false }
+          - { Function: _Z3barv, LineOffset: 1, Column: 10, IsInlineFrame: false }
+          - { Function: main, LineOffset: 8, Column: 13, IsInlineFrame: false }
+        MemInfoBlock:
+          AllocCount:      1
+          TotalSize:       10
+          TotalLifetime:   0
+          TotalLifetimeAccessDensity: 20000
+      # Large cold, full context id 16342802530253093571, should keep
+      - Callstack:
+          - { Function: _Z3foov, LineOffset: 1, Column: 10, IsInlineFrame: false }
+          - { Function: _Z4foo2v, LineOffset: 1, Column: 10, IsInlineFrame: true }
+          - { Function: _Z3barv, LineOffset: 1, Column: 10, IsInlineFrame: false }
+          - { Function: main, LineOffset: 8, Column: 13, IsInlineFrame: false }
+        MemInfoBlock:
+          AllocCount:      1
+          TotalSize:       10000
+          TotalLifetime:   200000
+          TotalLifetimeAccessDensity: 0
+      # Small non-cold, full context id 16342802530253093571, should ignore
+      - Callstack:
+          - { Function: _Z3foov, LineOffset: 1, Column: 10, IsInlineFrame: false }
+          - { Function: _Z4foo2v, LineOffset: 1, Column: 10, IsInlineFrame: false }
+          - { Function: _Z3barv, LineOffset: 1, Column: 10, IsInlineFrame: true }
+          - { Function: main, LineOffset: 8, Column: 13, IsInlineFrame: false }
+        MemInfoBlock:
+          AllocCount:      1
+          TotalSize:       100
+          TotalLifetime:   0
+          TotalLifetimeAccessDensity: 20000
+      # Small non-cold, full context id 10194276560488437434
+      - Callstack:
+          - { Function: _Z3foov, LineOffset: 1, Column: 10, IsInlineFrame: false }
+          - { Function: _Z4foo2v, LineOffset: 1, Column: 10, IsInlineFrame: false }
+          - { Function: _Z3barv, LineOffset: 1, Column: 10, IsInlineFrame: false }
+          - { Function: main, LineOffset: 9, Column: 13, IsInlineFrame: false }
+        MemInfoBlock:
+          AllocCount:      1
+          TotalSize:       200
+          TotalLifetime:   0
+          TotalLifetimeAccessDensity: 20000
+    CallSites:       []
+...
+;--- memprof_diff_inline.ll
+; ModuleID = 'memprof_diff_inline.cc'
+source_filename = "memprof_diff_inline.cc"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%"struct.std::nothrow_t" = type { i8 }
+
+@_ZSt7nothrow = external global %"struct.std::nothrow_t", align 1
+
+define dso_local noundef ptr @_Z3foov() !dbg !10 {
+entry:
+  ; MEMPROF: call {{.*}} @_Znwm{{.*}} !memprof ![[M1:[0-9]+]], !callsite ![[C1:[0-9]+]]
+  %call = call noalias noundef align 32 ptr @_Znwm(i64 noundef 32) #6, !dbg !13
+  ret ptr %call
+}
+
+declare noundef ptr @_Znwm(i64 noundef)
+
+attributes #6 = { builtin allocsize(0) }
+
+; MEMPROF: ![[M1]] = !{![[MIB1:[0-9]+]], ![[MIB2:[0-9]+]]}
+
+; MEMPROF: ![[MIB1]] = !{![[STACK1:[0-9]+]], !"notcold", ![[CONTEXTSIZE1:[0-9]+]]}
+; MEMPROF: ![[STACK1]] = !{i64 2732490490862098848, i64 8467819354083268568, i64 9086428284934609951, i64 2061451396820446691}
+;; Full context id 10194276560488437434 == -8252467513221114182
+; MEMPROF: ![[CONTEXTSIZE1]] = !{i64 -8252467513221114182, i64 200}
+
+; MEMPROF: ![[MIB2]] = !{![[STACK2:[0-9]+]], !"cold", ![[CONTEXTSIZE2:[0-9]+]]}
+; MEMPROF: ![[STACK2]] = !{i64 2732490490862098848, i64 8467819354083268568, i64 9086428284934609951, i64 -5747251260480066785}
+;; Full context id 16342802530253093571 == -2103941543456458045
+;; We should have kept the large (cold) one.
+; MEMPROF: ![[CONTEXTSIZE2]] = !{i64 -2103941543456458045, i64 10000}
+
+; MEMPROF: ![[C1]] = !{i64 2732490490862098848}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 15.0.0 (https://github.com/llvm/llvm-project.git 6cbe6284d1f0a088b5c6482ae27b738f03d82fe7)", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
+!1 = !DIFile(filename: "memprof.cc", directory: "/usr/local/google/home/tejohnson/llvm/tmp", checksumkind: CSK_MD5, checksum: "e8c40ebe4b21776b4d60e9632cbc13c2")
+!2 = !{i32 7, !"Dwarf Version", i32 5}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!10 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 4, type: !11, scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !12)
+!11 = !DISubroutineType(types: !12)
+!12 = !{}
+!13 = !DILocation(line: 5, column: 10, scope: !10)
diff --git a/llvm/test/Transforms/PGOProfile/memprof_loop_unroll.ll b/llvm/test/Transforms/PGOProfile/memprof_loop_unroll.ll
index 2461ca32e9821..ba53c5797208c 100644
--- a/llvm/test/Transforms/PGOProfile/memprof_loop_unroll.ll
+++ b/llvm/test/Transforms/PGOProfile/memprof_loop_unroll.ll
@@ -4,24 +4,50 @@
 ;; Avoid failures on big-endian systems that can't read the profile properly
 ; REQUIRES: x86_64-linux
 
-;; TODO: Use text profile inputs once that is available for memprof.
-;; # To update the Inputs below, run Inputs/update_memprof_inputs.sh.
-;; # To generate below LLVM IR for use in matching.
-;; $ clang++ -gmlt -fdebug-info-for-profiling -S %S/Inputs/memprof_loop_unroll_b.cc -emit-llvm
+; Generate the profile and the IR.
+; RUN: split-file %s %t
+
+;; Generate indexed profile
+; RUN: llvm-profdata merge %t/memprof_loop_unroll.yaml -o %t.memprofdata
 
-; RUN: llvm-profdata merge %S/Inputs/memprof_loop_unroll.memprofraw --profiled-binary %S/Inputs/memprof_loop_unroll.exe -o %t.memprofdata
 ;; Set the minimum lifetime threshold to 0 to ensure that one context is
 ;; considered cold (the other will be notcold).
-; RUN: opt < %s -passes='memprof-use<profile-filename=%t.memprofdata>' -S -memprof-report-hinted-sizes -memprof-ave-lifetime-cold-threshold=0 2>&1 | FileCheck %s
+; RUN: opt < %t/memprof_loop_unroll.ll -passes='memprof-use<profile-filename=%t.memprofdata>' -S -memprof-report-hinted-sizes -memprof-ave-lifetime-cold-threshold=0 2>&1 | FileCheck %s
 
-;; Conservatively annotate as not cold. We get two messages as there are two
-;; unrolled copies of the allocation.
-; CHECK: MemProf hinting: Total size for full allocation context hash {{.*}} and indistinguishable alloc type notcold: 4
-; CHECK: MemProf hinting: Total size for full allocation context hash {{.*}} and indistinguishable alloc type notcold: 4
+;; Conservatively annotate as not cold.
+; CHECK: MemProf hinting: Total size for full allocation context hash {{.*}} and single alloc type notcold: 4
 ; CHECK: call {{.*}} @_Znam{{.*}} #[[ATTR:[0-9]+]]
 ; CHECK: attributes #[[ATTR]] = { builtin allocsize(0) "memprof"="notcold" }
 ; CHECK-NOT: stackIds: ()
 
+;--- memprof_loop_unroll.yaml
+---
+HeapProfileRecords:
+  - GUID:            0x7f8d88fcc70a347b
+    AllocSites:
+      - Callstack:
+          - { Function: 0x7f8d88fcc70a347b, LineOffset: 2, Column: 16, IsInlineFrame: false }
+          - { Function: 0xdb956436e78dd5fa, LineOffset: 1, Column: 5, IsInlineFrame: false }
+        MemInfoBlock:
+          AllocCount:      1
+          TotalSize:       4
+          TotalLifetime:   2
+          TotalLifetimeAccessDensity: 12500000000
+      - Callstack:
+          - { Function: 0x7f8d88fcc70a347b, LineOffset: 2, Column: 16, IsInlineFrame: false }
+          - { Function: 0xdb956436e78dd5fa, LineOffset: 1, Column: 5, IsInlineFrame: false }
+        MemInfoBlock:
+          AllocCount:      1
+          TotalSize:       4
+          TotalLifetime:   2
+          TotalLifetimeAccessDensity: 0
+  - GUID:            0xdb956436e78dd5fa
+    CallSites:
+      - Frames:
+          - { Function: 0xdb956436e78dd5fa, LineOffset: 1, Column: 5, IsInlineFrame: false }
+...
+
+;--- memprof_loop_unroll.ll
 ; ModuleID = 'memprof_loop_unroll_b.cc'
 source_filename = "memprof_loop_unroll_b.cc"
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-load-from-vector-loop.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-load-from-vector-loop.ll
new file mode 100644
index 0000000000000..a35bcf1c5a88d
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-load-from-vector-loop.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
+; RUN: opt -passes='default<O3>' -S %s | FileCheck %s
+
+target triple = "arm64-apple-macosx"
+
+%"class.dealii::VectorizedArray" = type { [4 x double] }
+
+define void @hoist_invariant_load(ptr %invariant_ptr, i64 %num_elements, ptr %array) {
+; CHECK-LABEL: define void @hoist_invariant_load(
+; CHECK-SAME: ptr readonly captures(none) [[INVARIANT_PTR:%.*]], i64 [[NUM_ELEMENTS:%.*]], ptr captures(none) [[ARRAY:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CMP1_NOT:%.*]] = icmp eq i64 [[NUM_ELEMENTS]], 0
+; CHECK-NEXT:    br i1 [[CMP1_NOT]], label %[[EXIT:.*]], label %[[LOOP_LATCH:.*]]
+; CHECK:       [[LOOP_LATCH]]:
+; CHECK-NEXT:    [[I2:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[LOOP_LATCH]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr nusw %"class.dealii::VectorizedArray", ptr [[ARRAY]], i64 [[I2]]
+; CHECK-NEXT:    [[INVARIANT_VAL:%.*]] = load double, ptr [[INVARIANT_PTR]], align 8
+; CHECK-NEXT:    [[ARRAY_VAL:%.*]] = load double, ptr [[GEP]], align 8
+; CHECK-NEXT:    [[SUM:%.*]] = fadd double [[INVARIANT_VAL]], [[ARRAY_VAL]]
+; CHECK-NEXT:    store double [[SUM]], ptr [[GEP]], align 8
+; CHECK-NEXT:    [[I_NEXT]] = add nuw i64 [[I2]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[I_NEXT]], [[NUM_ELEMENTS]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP_LATCH]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:                                      ; preds = %loop.latch, %entry
+  %i = phi i64 [ 0, %entry ], [ %i.next, %loop.latch ]
+  %cmp = icmp ult i64 %i, %num_elements
+  br i1 %cmp, label %loop.latch, label %exit
+
+loop.latch:                                       ; preds = %loop.header
+  %gep = getelementptr nusw %"class.dealii::VectorizedArray", ptr %array, i64 %i
+  %invariant_val = load double, ptr %invariant_ptr, align 8
+  %array_val = load double, ptr %gep, align 8
+  %sum = fadd double %array_val, %invariant_val
+  store double %sum, ptr %gep, align 8
+  %i.next = add i64 %i, 1
+  br label %loop.header
+
+exit:                                             ; preds = %loop.header
+  ret void
+}
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll
index 8d20a3ba8ed08..d311f547f2e51 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll
@@ -43,7 +43,6 @@ define void @s172(i32 noundef %xa, i32 noundef %xb, ptr noundef %a, ptr noundef
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[FOR_BODY_PREHEADER13]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP8]], -8
-; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[N_VEC]], [[TMP0]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -64,6 +63,7 @@ define void @s172(i32 noundef %xa, i32 noundef %xb, ptr noundef %a, ptr noundef
 ; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       middle.block:
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[N_VEC]], [[TMP0]]
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP8]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER13]]
 ; CHECK:       for.body.preheader14:
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll
index 2dceb27165c4d..f2ae327778f4a 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll
@@ -1040,7 +1040,6 @@ define void @saxpy_5(i64 %n, float %a, ptr readonly %x, ptr noalias %y) {
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[LOOP_PREHEADER11:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP3]], 9223372036854775806
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[N_VEC]], 5
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[A]], i64 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <10 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -1058,10 +1057,11 @@ define void @saxpy_5(i64 %n, float %a, ptr readonly %x, ptr noalias %y) {
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[N_VEC]], 5
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT]], label %[[LOOP_PREHEADER11]]
 ; CHECK:       [[LOOP_PREHEADER11]]:
-; CHECK-NEXT:    [[I1_PH:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ [[TMP4]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[I1_PH:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ [[TMP16]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
 ; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll
index a3b8736a06ec7..338d9259b635c 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll
@@ -9,7 +9,6 @@ define i64 @std_find_i16_constant_offset_with_assumptions(ptr %first.coerce, i16
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[FIRST_COERCE]], i64 2) ]
 ; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[FIRST_COERCE]], i64 256) ]
-; CHECK-NEXT:    [[COERCE_VAL_IP:%.*]] = getelementptr i8, ptr [[FIRST_COERCE]], i64 256
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -27,6 +26,7 @@ define i64 @std_find_i16_constant_offset_with_assumptions(ptr %first.coerce, i16
 ; CHECK-NEXT:    [[TMP4:%.*]] = or i1 [[TMP2]], [[TMP3]]
 ; CHECK-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       [[MIDDLE_SPLIT]]:
+; CHECK-NEXT:    [[COERCE_VAL_IP:%.*]] = getelementptr i8, ptr [[FIRST_COERCE]], i64 256
 ; CHECK-NEXT:    br i1 [[TMP2]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[RETURN:.*]]
 ; CHECK:       [[VECTOR_EARLY_EXIT]]:
 ; CHECK-NEXT:    [[TMP5:%.*]] = tail call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP0]], i1 true)
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
index 5127b7d37f0b4..7c349fb77be20 100644
--- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
@@ -18,22 +18,15 @@ define void @arm_mult_q15(ptr %pSrcA, ptr %pSrcB, ptr noalias %pDst, i32 %blockS
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[WHILE_BODY_PREHEADER15:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[BLOCKSIZE]], -8
-; CHECK-NEXT:    [[IND_END:%.*]] = and i32 [[BLOCKSIZE]], 7
-; CHECK-NEXT:    [[TMP0:%.*]] = shl i32 [[N_VEC]], 1
-; CHECK-NEXT:    [[IND_END7:%.*]] = getelementptr i8, ptr [[PSRCA:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i32 [[N_VEC]], 1
-; CHECK-NEXT:    [[IND_END9:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i32 [[N_VEC]], 1
-; CHECK-NEXT:    [[IND_END11:%.*]] = getelementptr i8, ptr [[PSRCB:%.*]], i32 [[TMP2]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 1
-; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRCA]], i32 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRCA:%.*]], i32 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[OFFSET_IDX13:%.*]] = shl i32 [[INDEX]], 1
-; CHECK-NEXT:    [[NEXT_GEP14:%.*]] = getelementptr i8, ptr [[PDST]], i32 [[OFFSET_IDX13]]
+; CHECK-NEXT:    [[NEXT_GEP14:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i32 [[OFFSET_IDX13]]
 ; CHECK-NEXT:    [[OFFSET_IDX15:%.*]] = shl i32 [[INDEX]], 1
-; CHECK-NEXT:    [[NEXT_GEP16:%.*]] = getelementptr i8, ptr [[PSRCB]], i32 [[OFFSET_IDX15]]
+; CHECK-NEXT:    [[NEXT_GEP16:%.*]] = getelementptr i8, ptr [[PSRCB:%.*]], i32 [[OFFSET_IDX15]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2
 ; CHECK-NEXT:    [[TMP3:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
 ; CHECK-NEXT:    [[WIDE_LOAD17:%.*]] = load <8 x i16>, ptr [[NEXT_GEP16]], align 2
@@ -47,6 +40,13 @@ define void @arm_mult_q15(ptr %pSrcA, ptr %pSrcB, ptr noalias %pDst, i32 %blockS
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
+; CHECK-NEXT:    [[IND_END:%.*]] = and i32 [[BLOCKSIZE]], 7
+; CHECK-NEXT:    [[TMP13:%.*]] = shl i32 [[N_VEC]], 1
+; CHECK-NEXT:    [[IND_END7:%.*]] = getelementptr i8, ptr [[PSRCA]], i32 [[TMP13]]
+; CHECK-NEXT:    [[TMP14:%.*]] = shl i32 [[N_VEC]], 1
+; CHECK-NEXT:    [[IND_END9:%.*]] = getelementptr i8, ptr [[PDST]], i32 [[TMP14]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shl i32 [[N_VEC]], 1
+; CHECK-NEXT:    [[IND_END11:%.*]] = getelementptr i8, ptr [[PSRCB]], i32 [[TMP12]]
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[BLOCKSIZE]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END]], label [[WHILE_BODY_PREHEADER15]]
 ; CHECK:       while.body.preheader15:
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll
index dcfebe32302be..6e95b63270e6c 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll
@@ -46,7 +46,6 @@ define dso_local void @test(ptr %start, ptr %end) #0 {
 ; AVX2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 124
 ; AVX2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[BB12_PREHEADER11:%.*]], label [[VECTOR_PH:%.*]]
 ; AVX2:       vector.ph:
-; AVX2-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[TMP3]], 24
 ; AVX2-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP3]], 9223372036854775776
 ; AVX2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; AVX2:       vector.body:
@@ -80,6 +79,7 @@ define dso_local void @test(ptr %start, ptr %end) #0 {
 ; AVX2-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; AVX2-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; AVX2:       middle.block:
+; AVX2-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[TMP3]], 24
 ; AVX2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
 ; AVX2-NEXT:    br i1 [[CMP_N]], label [[EXIT]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; AVX2:       vec.epilog.iter.check:
@@ -90,8 +90,6 @@ define dso_local void @test(ptr %start, ptr %end) #0 {
 ; AVX2:       vec.epilog.ph:
 ; AVX2-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AVX2-NEXT:    [[N_VEC10:%.*]] = and i64 [[TMP3]], 9223372036854775800
-; AVX2-NEXT:    [[TMP21:%.*]] = shl i64 [[N_VEC10]], 2
-; AVX2-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP21]]
 ; AVX2-NEXT:    br label [[BB12:%.*]]
 ; AVX2:       vec.epilog.vector.body:
 ; AVX2-NEXT:    [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[BB12_PREHEADER11]] ], [ [[INDEX_NEXT16:%.*]], [[BB12]] ]
@@ -106,6 +104,8 @@ define dso_local void @test(ptr %start, ptr %end) #0 {
 ; AVX2-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT16]], [[N_VEC10]]
 ; AVX2-NEXT:    br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[BB12]], !llvm.loop [[LOOP4:![0-9]+]]
 ; AVX2:       vec.epilog.middle.block:
+; AVX2-NEXT:    [[TMP27:%.*]] = shl i64 [[N_VEC10]], 2
+; AVX2-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP27]]
 ; AVX2-NEXT:    [[CMP_N17:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC10]]
 ; AVX2-NEXT:    br i1 [[CMP_N17]], label [[EXIT]], label [[BB12_PREHEADER1]]
 ; AVX2:       bb12.preheader:
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
index bfb8554e6243c..4562072b7b450 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
@@ -16,8 +16,8 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 {
 ; CHECK-SAME: ptr writeonly captures(none) [[X:%.*]], ptr readonly captures(none) [[Y:%.*]], double [[A:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[N]], 0
-; CHECK-NEXT:    br i1 [[CMP1]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_END:.*]]
-; CHECK:       [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:    br i1 [[CMP1]], label %[[ITER_CHECK:.*]], label %[[FOR_END:.*]]
+; CHECK:       [[ITER_CHECK]]:
 ; CHECK-NEXT:    [[X4:%.*]] = ptrtoint ptr [[X]] to i64
 ; CHECK-NEXT:    [[Y5:%.*]] = ptrtoint ptr [[Y]] to i64
 ; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
@@ -25,12 +25,11 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 [[X4]], [[Y5]]
 ; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 128
 ; CHECK-NEXT:    [[OR_COND:%.*]] = select i1 [[MIN_ITERS_CHECK]], i1 true, i1 [[DIFF_CHECK]]
-; CHECK-NEXT:    br i1 [[OR_COND]], label %[[FOR_BODY_PREHEADER9:.*]], label %[[VECTOR_PH:.*]]
-; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br i1 [[OR_COND]], label %[[FOR_BODY_PREHEADER:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
+; CHECK:       [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK6:%.*]] = icmp ult i32 [[N]], 16
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK6]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH1:.*]]
-; CHECK:       [[VECTOR_PH1]]:
-; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 12
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK6]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 2147483632
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[A]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer
@@ -40,7 +39,7 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 {
 ; CHECK-NEXT:    [[TMP4:%.*]] = fdiv fast <4 x double> splat (double 1.000000e+00), [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw double, ptr [[Y]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP5]], i64 32
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP5]], i64 64
@@ -65,13 +64,14 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 {
 ; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 12
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[FOR_END]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
 ; CHECK:       [[VEC_EPILOG_ITER_CHECK]]:
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
-; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[FOR_BODY_PREHEADER9]], label %[[VEC_EPILOG_PH]]
+; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[FOR_BODY_PREHEADER]], label %[[VEC_EPILOG_PH]], !prof [[PROF10:![0-9]+]]
 ; CHECK:       [[VEC_EPILOG_PH]]:
-; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_PH]] ]
+; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_VEC11:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 2147483644
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <4 x double> poison, double [[A]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT15:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT14]], <4 x double> poison, <4 x i32> zeroinitializer
@@ -86,12 +86,12 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 {
 ; CHECK-NEXT:    store <4 x double> [[TMP40]], ptr [[TMP41]], align 8, !tbaa [[DOUBLE_TBAA3]]
 ; CHECK-NEXT:    [[INDEX_NEXT16]] = add nuw i64 [[INDEX12]], 4
 ; CHECK-NEXT:    [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT16]], [[N_VEC11]]
-; CHECK-NEXT:    br i1 [[TMP42]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP42]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK:       [[VEC_EPILOG_MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[CMP_N17:%.*]] = icmp eq i64 [[N_VEC11]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT:    br i1 [[CMP_N17]], label %[[FOR_END]], label %[[FOR_BODY_PREHEADER9]]
-; CHECK:       [[FOR_BODY_PREHEADER9]]:
-; CHECK-NEXT:    [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[N_VEC11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br i1 [[CMP_N17]], label %[[FOR_END]], label %[[FOR_BODY_PREHEADER]]
+; CHECK:       [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:    [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, %[[ITER_CHECK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[N_VEC11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    [[TMP43:%.*]] = sub nsw i64 [[WIDE_TRIP_COUNT]], [[INDVARS_IV_PH]]
 ; CHECK-NEXT:    [[XTRAITER:%.*]] = and i64 [[TMP43]], 7
 ; CHECK-NEXT:    [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0
@@ -110,13 +110,13 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 {
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_PROL]] = add nuw nsw i64 [[INDVARS_IV_PROL]], 1
 ; CHECK-NEXT:    [[PROL_ITER_NEXT]] = add i64 [[PROL_ITER]], 1
 ; CHECK-NEXT:    [[PROL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[PROL_ITER_NEXT]], [[XTRAITER]]
-; CHECK-NEXT:    br i1 [[PROL_ITER_CMP_NOT]], label %[[FOR_BODY_PROL_LOOPEXIT]], label %[[FOR_BODY_PROL]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT:    br i1 [[PROL_ITER_CMP_NOT]], label %[[FOR_BODY_PROL_LOOPEXIT]], label %[[FOR_BODY_PROL]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       [[FOR_BODY_PROL_LOOPEXIT]]:
-; CHECK-NEXT:    [[INDVARS_IV_UNR:%.*]] = phi i64 [ [[INDVARS_IV_PH]], %[[FOR_BODY_PREHEADER9]] ], [ [[INDVARS_IV_NEXT_PROL]], %[[FOR_BODY_PROL]] ]
+; CHECK-NEXT:    [[INDVARS_IV_UNR:%.*]] = phi i64 [ [[INDVARS_IV_PH]], %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_PROL]], %[[FOR_BODY_PROL]] ]
 ; CHECK-NEXT:    [[TMP20:%.*]] = sub nsw i64 [[INDVARS_IV_PH]], [[WIDE_TRIP_COUNT]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = icmp ugt i64 [[TMP20]], -8
-; CHECK-NEXT:    br i1 [[TMP21]], label %[[FOR_END]], label %[[FOR_BODY_PREHEADER9_NEW:.*]]
-; CHECK:       [[FOR_BODY_PREHEADER9_NEW]]:
+; CHECK-NEXT:    br i1 [[TMP21]], label %[[FOR_END]], label %[[FOR_BODY_PREHEADER_NEW:.*]]
+; CHECK:       [[FOR_BODY_PREHEADER_NEW]]:
 ; CHECK-NEXT:    [[TMP22:%.*]] = fdiv fast double 1.000000e+00, [[A]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = fdiv fast double 1.000000e+00, [[A]]
 ; CHECK-NEXT:    [[TMP24:%.*]] = fdiv fast double 1.000000e+00, [[A]]
@@ -127,7 +127,7 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 {
 ; CHECK-NEXT:    [[TMP29:%.*]] = fdiv fast double 1.000000e+00, [[A]]
 ; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
 ; CHECK:       [[FOR_BODY]]:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_UNR]], %[[FOR_BODY_PREHEADER9_NEW]] ], [ [[INDVARS_IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_UNR]], %[[FOR_BODY_PREHEADER_NEW]] ], [ [[INDVARS_IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw double, ptr [[Y]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[T0:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[DOUBLE_TBAA3]]
 ; CHECK-NEXT:    [[TMP30:%.*]] = fmul fast double [[T0]], [[TMP22]]
@@ -177,7 +177,7 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 {
 ; CHECK-NEXT:    store double [[TMP37]], ptr [[ARRAYIDX2_7]], align 8, !tbaa [[DOUBLE_TBAA3]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_7]] = add nuw nsw i64 [[INDVARS_IV]], 8
 ; CHECK-NEXT:    [[EXITCOND_NOT_7:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_7]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT_7]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT_7]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       [[FOR_END]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -232,8 +232,9 @@ attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="
 ; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META8:![0-9]+]], [[META9:![0-9]+]]}
 ; CHECK: [[META8]] = !{!"llvm.loop.isvectorized", i32 1}
 ; CHECK: [[META9]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META8]], [[META9]]}
-; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META12:![0-9]+]]}
-; CHECK: [[META12]] = !{!"llvm.loop.unroll.disable"}
-; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META8]]}
+; CHECK: [[PROF10]] = !{!"branch_weights", i32 4, i32 12}
+; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META8]], [[META9]]}
+; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META13:![0-9]+]]}
+; CHECK: [[META13]] = !{!"llvm.loop.unroll.disable"}
+; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META8]]}
 ;.
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/div-like-mixed-with-undefs.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/div-like-mixed-with-undefs.ll
index d16843c81144d..6629b1219cbe8 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/div-like-mixed-with-undefs.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/div-like-mixed-with-undefs.ll
@@ -1,21 +1,21 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-100 -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
 
-define ptr @test(ptr %d) {
+define ptr @test(ptr %d, i64 %v) {
 ; CHECK-LABEL: define ptr @test(
-; CHECK-SAME: ptr [[D:%.*]]) {
+; CHECK-SAME: ptr [[D:%.*]], i64 [[V:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr null, align 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[D]], align 1
 ; CHECK-NEXT:    [[CMP4_2:%.*]] = icmp eq i8 [[TMP0]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[CMP4_2]], i64 0, i64 0
-; CHECK-NEXT:    [[TMP2:%.*]] = xor i64 0, 0
-; CHECK-NEXT:    [[TMP3:%.*]] = udiv i64 [[TMP2]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = udiv i64 1, 0
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[CMP4_2]], i64 0, i64 4
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i64 0, [[V]]
+; CHECK-NEXT:    [[TMP3:%.*]] = udiv i64 [[TMP2]], 3
+; CHECK-NEXT:    [[TMP4:%.*]] = udiv i64 1, [[V]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <6 x i64> poison, i64 [[TMP1]], i32 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <6 x i64> [[TMP5]], i64 [[TMP3]], i32 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <6 x i64> [[TMP6]], i64 [[TMP4]], i32 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <6 x i64> [[TMP7]], <6 x i64> poison, <6 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP9:%.*]] = mul <6 x i64> [[TMP8]], <i64 2, i64 6, i64 1, i64 1, i64 1, i64 0>
+; CHECK-NEXT:    [[TMP9:%.*]] = mul <6 x i64> [[TMP8]], <i64 2, i64 6, i64 4, i64 3, i64 5, i64 4>
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <6 x i64> [[TMP9]], i32 0
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[D]], i64 [[TMP10]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <6 x i64> [[TMP9]], i32 1
@@ -31,23 +31,23 @@ define ptr @test(ptr %d) {
 ; CHECK-NEXT:    ret ptr [[TMP20]]
 ;
 entry:
-  %0 = load i8, ptr null, align 1
+  %0 = load i8, ptr %d, align 1
   %cmp4.2 = icmp eq i8 %0, 0
-  %1 = select i1 %cmp4.2, i64 0, i64 0
+  %1 = select i1 %cmp4.2, i64 0, i64 4
   %2 = shl i64 %1, 1
   %3 = getelementptr i8, ptr %d, i64 %2
-  %4 = xor i64 0, 0
-  %5 = udiv i64 %4, 0
+  %4 = xor i64 0, %v
+  %5 = udiv i64 %4, 3
   %6 = mul i64 %5, 6
   %7 = getelementptr i8, ptr %d, i64 %6
-  %8 = shl i64 %1, 0
+  %8 = shl i64 %1, 2
   %scevgep42 = getelementptr i8, ptr %d, i64 %8
-  %9 = mul i64 %5, 1
+  %9 = mul i64 %5, 3
   %10 = getelementptr i8, ptr %d, i64 %9
-  %11 = udiv i64 1, 0
-  %12 = mul i64 %11, 1
+  %11 = udiv i64 1, %v
+  %12 = mul i64 %11, 5
   %13 = getelementptr i8, ptr %d, i64 %12
-  %14 = mul i64 %11, 0
+  %14 = mul i64 %11, 4
   %15 = getelementptr i8, ptr %d, i64 %14
   ret ptr %15
 }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll
new file mode 100644
index 0000000000000..959b2350d9d78
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-99999 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define float @test(i8 %0) {
+; CHECK-LABEL: define float @test(
+; CHECK-SAME: i8 [[TMP0:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> <i8 poison, i8 0>, i8 [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <2 x i8> [[TMP1]] to <2 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = mul <2 x i32> [[TMP2]], <i32 2, i32 27>
+; CHECK-NEXT:    [[TMP4:%.*]] = lshr <2 x i32> [[TMP2]], <i32 2, i32 27>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = or i32 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    switch i32 [[TMP8]], label %[[EXIT:.*]] [
+; CHECK-NEXT:      i32 0, label %[[EXIT]]
+; CHECK-NEXT:      i32 1, label %[[EXIT]]
+; CHECK-NEXT:    ]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret float 0.000000e+00
+;
+entry:
+  %1 = sext i8 0 to i32
+  %2 = lshr i32 %1, 27
+  %3 = sext i8 %0 to i32
+  %reass.add.epil = mul i32 %3, 2
+  %4 = or i32 %reass.add.epil, %2
+  switch i32 %4, label %exit [
+  i32 0, label %exit
+  i32 1, label %exit
+  ]
+
+exit:
+  ret float 0.000000e+00
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gathered-node-with-in-order-parent.ll b/llvm/test/Transforms/SLPVectorizer/X86/gathered-node-with-in-order-parent.ll
new file mode 100644
index 0000000000000..260de1cc2b76a
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gathered-node-with-in-order-parent.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-99999 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define double @test() {
+; CHECK-LABEL: define double @test() {
+; CHECK-NEXT:  [[BB:.*]]:
+; CHECK-NEXT:    br label %[[BB1:.*]]
+; CHECK:       [[BB1]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <4 x i32> [ zeroinitializer, %[[BB]] ], [ [[TMP3:%.*]], %[[BB4:.*]] ]
+; CHECK-NEXT:    br label %[[BB4]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 0, 1
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> <i32 poison, i32 0, i32 0, i32 0>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> <i32 poison, i32 0, i32 0, i32 0>, i32 [[MUL]], i32 0
+; CHECK-NEXT:    [[TMP3]] = or <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> <i32 poison, i32 0, i32 0, i32 1>, i32 [[MUL]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i32> [[TMP0]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP5]], i32 2
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[TMP6]], 0
+; CHECK-NEXT:    br i1 false, label %[[BB7:.*]], label %[[BB1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[TMP7:%.*]] = phi <4 x i32> [ [[TMP5]], %[[BB4]] ]
+; CHECK-NEXT:    ret double 0.000000e+00
+;
+bb:
+  br label %bb1
+
+bb1:
+  %phi = phi i32 [ 0, %bb ], [ 0, %bb4 ]
+  %phi2 = phi i32 [ 0, %bb ], [ 0, %bb4 ]
+  %phi3 = phi i32 [ 0, %bb ], [ %or5, %bb4 ]
+  br label %bb4
+
+bb4:
+  %or = or i32 %phi2, 0
+  %mul = mul i32 0, 1
+  %or5 = or i32 %phi3, %mul
+  %and = and i32 %or, 0
+  %or6 = or i32 %phi2, 1
+  br i1 false, label %bb7, label %bb1
+
+bb7:
+  %phi8 = phi i32 [ %phi, %bb4 ]
+  %phi9 = phi i32 [ %or, %bb4 ]
+  %phi10 = phi i32 [ %or5, %bb4 ]
+  %phi11 = phi i32 [ %or6, %bb4 ]
+  ret double 0.000000e+00
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/parent-node-schedulable-with-multi-copyables.ll b/llvm/test/Transforms/SLPVectorizer/X86/parent-node-schedulable-with-multi-copyables.ll
new file mode 100644
index 0000000000000..9e96e93a3205b
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/parent-node-schedulable-with-multi-copyables.ll
@@ -0,0 +1,170 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-100 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define i64 @test(ptr %arg1, i64 %alloca.promoted344, i8 %load.311.i, i1 %load1.i) {
+; CHECK-LABEL: define i64 @test(
+; CHECK-SAME: ptr [[ARG1:%.*]], i64 [[ALLOCA_PROMOTED344:%.*]], i8 [[LOAD_311_I:%.*]], i1 [[LOAD1_I:%.*]]) {
+; CHECK-NEXT:  [[BB:.*]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i8> <i8 0, i8 0, i8 0, i8 poison>, i8 [[LOAD_311_I]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i8> <i8 poison, i8 poison, i8 0, i8 0>, i8 [[LOAD_311_I]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[ALLOCA_PROMOTED344]], i32 0
+; CHECK-NEXT:    br label %[[BB2:.*]]
+; CHECK:       [[BB2]]:
+; CHECK-NEXT:    [[TMP3:%.*]] = phi <2 x i64> [ zeroinitializer, %[[BB]] ], [ [[TMP28:%.*]], %[[BB12_8_I:.*]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi <8 x i8> [ zeroinitializer, %[[BB]] ], [ [[TMP29:%.*]], %[[BB12_8_I]] ]
+; CHECK-NEXT:    br i1 [[LOAD1_I]], label %[[SPAM_EXIT:.*]], label %[[BB4_LR_PH_I:.*]]
+; CHECK:       [[BB4_LR_PH_I]]:
+; CHECK-NEXT:    br i1 true, label %[[BB3_I_I_PEEL:.*]], label %[[EGGS_EXIT_I_PEEL:.*]]
+; CHECK:       [[BB3_I_I_PEEL]]:
+; CHECK-NEXT:    [[TMP5:%.*]] = and <2 x i64> [[TMP3]], splat (i64 1)
+; CHECK-NEXT:    [[LOAD4_I_I_PEEL:%.*]] = load i64, ptr [[ARG1]], align 8
+; CHECK-NEXT:    [[SHL_I_I_PEEL:%.*]] = shl i64 [[LOAD4_I_I_PEEL]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[SHL_I_I_PEEL]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = or <2 x i64> [[TMP5]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <2 x i64> [[TMP5]], [[TMP7]]
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP9]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    br label %[[EGGS_EXIT_I_PEEL]]
+; CHECK:       [[EGGS_EXIT_I_PEEL]]:
+; CHECK-NEXT:    [[TMP11:%.*]] = phi <2 x i64> [ [[TMP10]], %[[BB3_I_I_PEEL]] ], [ zeroinitializer, %[[BB4_LR_PH_I]] ]
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP11]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 0>
+; CHECK-NEXT:    [[TMP13:%.*]] = trunc <4 x i64> [[TMP12]] to <4 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i64> [[TMP12]], i32 1
+; CHECK-NEXT:    br label %[[SPAM_EXIT]]
+; CHECK:       [[SPAM_EXIT]]:
+; CHECK-NEXT:    [[GETELEMENTPTR_I_I_PROMOTED346:%.*]] = phi i64 [ [[TMP14]], %[[EGGS_EXIT_I_PEEL]] ], [ 0, %[[BB2]] ]
+; CHECK-NEXT:    [[LOAD_8_I:%.*]] = phi i8 [ 0, %[[EGGS_EXIT_I_PEEL]] ], [ 1, %[[BB2]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = phi <4 x i8> [ [[TMP13]], %[[EGGS_EXIT_I_PEEL]] ], [ zeroinitializer, %[[BB2]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT:    br i1 [[LOAD1_I]], label %[[BB12_8_I]], label %[[BB12_1_THREAD_I:.*]]
+; CHECK:       [[BB12_1_THREAD_I]]:
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <8 x i8> [[TMP4]], i32 0
+; CHECK-NEXT:    [[ICMP5_3_I:%.*]] = icmp eq i8 [[TMP17]], 0
+; CHECK-NEXT:    br i1 [[ICMP5_3_I]], label %[[BB12_3_I:.*]], label %[[BB8_3_I:.*]]
+; CHECK:       [[BB8_3_I]]:
+; CHECK-NEXT:    br label %[[BB12_3_I]]
+; CHECK:       [[BB12_3_I]]:
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <8 x i8> [[TMP4]], i32 1
+; CHECK-NEXT:    [[ICMP5_4_I:%.*]] = icmp eq i8 [[TMP18]], 0
+; CHECK-NEXT:    br i1 [[ICMP5_4_I]], label %[[BB12_4_I:.*]], label %[[BB8_4_I:.*]]
+; CHECK:       [[BB8_4_I]]:
+; CHECK-NEXT:    br label %[[BB12_4_I]]
+; CHECK:       [[BB12_4_I]]:
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <8 x i8> [[TMP4]], i32 2
+; CHECK-NEXT:    [[ICMP5_5_I:%.*]] = icmp eq i8 [[TMP19]], 0
+; CHECK-NEXT:    br i1 [[ICMP5_5_I]], label %[[BB12_5_I:.*]], label %[[BB8_5_I:.*]]
+; CHECK:       [[BB8_5_I]]:
+; CHECK-NEXT:    br label %[[BB12_5_I]]
+; CHECK:       [[BB12_5_I]]:
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <8 x i8> [[TMP4]], i32 3
+; CHECK-NEXT:    [[ICMP5_7_I:%.*]] = icmp eq i8 [[TMP20]], 0
+; CHECK-NEXT:    br i1 [[ICMP5_7_I]], label %[[BB12_7_I:.*]], label %[[BB8_7_I:.*]]
+; CHECK:       [[BB8_7_I]]:
+; CHECK-NEXT:    br label %[[BB12_7_I]]
+; CHECK:       [[BB12_7_I]]:
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <8 x i8> [[TMP4]], i32 4
+; CHECK-NEXT:    [[ICMP5_8_I:%.*]] = icmp eq i8 [[TMP21]], 0
+; CHECK-NEXT:    br i1 [[ICMP5_8_I]], label %[[BB12_8_I]], label %[[BB8_8_I:.*]]
+; CHECK:       [[BB8_8_I]]:
+; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP1]], i8 [[LOAD_8_I]], i32 1
+; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <4 x i8> poison, i8 [[LOAD_8_I]], i32 0
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <8 x i8> [[TMP4]], <8 x i8> poison, <4 x i32> <i32 poison, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP23]], <4 x i8> [[TMP24]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    br label %[[BB12_8_I]]
+; CHECK:       [[BB12_8_I]]:
+; CHECK-NEXT:    [[TMP26:%.*]] = phi <4 x i8> [ [[TMP0]], %[[BB12_7_I]] ], [ [[TMP22]], %[[BB8_8_I]] ], [ [[TMP15]], %[[SPAM_EXIT]] ]
+; CHECK-NEXT:    [[TMP27:%.*]] = phi <4 x i8> [ zeroinitializer, %[[BB12_7_I]] ], [ [[TMP25]], %[[BB8_8_I]] ], [ [[TMP16]], %[[SPAM_EXIT]] ]
+; CHECK-NEXT:    [[TMP28]] = insertelement <2 x i64> [[TMP2]], i64 [[GETELEMENTPTR_I_I_PROMOTED346]], i32 1
+; CHECK-NEXT:    [[TMP29]] = shufflevector <4 x i8> [[TMP26]], <4 x i8> [[TMP27]], <8 x i32> <i32 2, i32 7, i32 5, i32 0, i32 1, i32 3, i32 4, i32 6>
+; CHECK-NEXT:    br label %[[BB2]]
+;
+bb:
+  br label %bb2
+
+bb2:
+  %getelementptr.i.i.promoted = phi i64 [ 0, %bb ], [ %getelementptr.i.i.promoted346, %bb12.8.i ]
+  %alloca.promoted = phi i64 [ 0, %bb ], [ %alloca.promoted344, %bb12.8.i ]
+  %load.8.i231 = phi i8 [ 0, %bb ], [ %load.8.i239, %bb12.8.i ]
+  %load.7.i217 = phi i8 [ 0, %bb ], [ %load.7.i225, %bb12.8.i ]
+  %load.626.i200 = phi i8 [ 0, %bb ], [ %load.626.i208, %bb12.8.i ]
+  %load.6.i183 = phi i8 [ 0, %bb ], [ %load.6.i191, %bb12.8.i ]
+  %load.5.i167 = phi i8 [ 0, %bb ], [ %load.5.i175, %bb12.8.i ]
+  %load.418.i148 = phi i8 [ 0, %bb ], [ %load.418.i156, %bb12.8.i ]
+  %load.4.i129 = phi i8 [ 0, %bb ], [ %load.4.i137, %bb12.8.i ]
+  %load.3.i111 = phi i8 [ 0, %bb ], [ %load.3.i119, %bb12.8.i ]
+  br i1 %load1.i, label %spam.exit, label %bb4.lr.ph.i
+
+bb4.lr.ph.i:
+  br i1 true, label %bb3.i.i.peel, label %eggs.exit.i.peel
+
+bb3.i.i.peel:
+  %and.i.i.peel = and i64 %alloca.promoted, 1
+  %load4.i.i.peel = load i64, ptr %arg1, align 8
+  %shl.i.i.peel = shl i64 %load4.i.i.peel, 1
+  %or.i.i.peel = or i64 %shl.i.i.peel, %and.i.i.peel
+  %and6.i.i.peel = and i64 %getelementptr.i.i.promoted, 1
+  %xor.i.i.peel = xor i64 %and6.i.i.peel, %alloca.promoted
+  br label %eggs.exit.i.peel
+
+eggs.exit.i.peel:
+  %load5.i.i93.peel = phi i64 [ %xor.i.i.peel, %bb3.i.i.peel ], [ 0, %bb4.lr.ph.i ]
+  %or.i.i91.peel = phi i64 [ %or.i.i.peel, %bb3.i.i.peel ], [ 0, %bb4.lr.ph.i ]
+  %0 = trunc i64 %or.i.i91.peel to i8
+  %1 = trunc nuw i64 %or.i.i91.peel to i8
+  %2 = trunc i64 %load5.i.i93.peel to i8
+  br label %spam.exit
+
+spam.exit:
+  %getelementptr.i.i.promoted346 = phi i64 [ %load5.i.i93.peel, %eggs.exit.i.peel ], [ 0, %bb2 ]
+  %load.834.i = phi i8 [ %2, %eggs.exit.i.peel ], [ 0, %bb2 ]
+  %load.7.i25 = phi i8 [ %1, %eggs.exit.i.peel ], [ 0, %bb2 ]
+  %load.8.i = phi i8 [ 0, %eggs.exit.i.peel ], [ 1, %bb2 ]
+  %load.6.i18 = phi i8 [ %0, %eggs.exit.i.peel ], [ 0, %bb2 ]
+  br i1 %load1.i, label %bb12.8.i, label %bb12.1.thread.i
+
+bb12.1.thread.i:
+  %icmp5.3.i = icmp eq i8 %load.3.i111, 0
+  br i1 %icmp5.3.i, label %bb12.3.i, label %bb8.3.i
+
+bb8.3.i:
+  br label %bb12.3.i
+
+bb12.3.i:
+  %icmp5.4.i = icmp eq i8 %load.4.i129, 0
+  br i1 %icmp5.4.i, label %bb12.4.i, label %bb8.4.i
+
+bb8.4.i:
+  br label %bb12.4.i
+
+bb12.4.i:
+  %icmp5.5.i = icmp eq i8 %load.5.i167, 0
+  br i1 %icmp5.5.i, label %bb12.5.i, label %bb8.5.i
+
+bb8.5.i:
+  br label %bb12.5.i
+
+bb12.5.i:
+  %icmp5.7.i = icmp eq i8 %load.7.i217, 0
+  br i1 %icmp5.7.i, label %bb12.7.i, label %bb8.7.i
+
+bb8.7.i:
+  br label %bb12.7.i
+
+bb12.7.i:
+  %icmp5.8.i = icmp eq i8 %load.8.i231, 0
+  br i1 %icmp5.8.i, label %bb12.8.i, label %bb8.8.i
+
+bb8.8.i:
+  br label %bb12.8.i
+
+bb12.8.i:
+  %load.8.i239 = phi i8 [ 0, %bb12.7.i ], [ %load.8.i, %bb8.8.i ], [ %load.834.i, %spam.exit ]
+  %load.7.i225 = phi i8 [ 0, %bb12.7.i ], [ %load.311.i, %bb8.8.i ], [ %load.7.i25, %spam.exit ]
+  %load.626.i208 = phi i8 [ 0, %bb12.7.i ], [ %load.8.i, %bb8.8.i ], [ %load.6.i18, %spam.exit ]
+  %load.6.i191 = phi i8 [ %load.311.i, %bb12.7.i ], [ 0, %bb8.8.i ], [ %load.6.i18, %spam.exit ]
+  %load.5.i175 = phi i8 [ 0, %bb12.7.i ], [ %load.6.i183, %bb8.8.i ], [ %load.6.i18, %spam.exit ]
+  %load.418.i156 = phi i8 [ 0, %bb12.7.i ], [ %load.626.i200, %bb8.8.i ], [ %load.6.i18, %spam.exit ]
+  %load.4.i137 = phi i8 [ 0, %bb12.7.i ], [ %load.418.i148, %bb8.8.i ], [ %load.6.i18, %spam.exit ]
+  %load.3.i119 = phi i8 [ 0, %bb12.7.i ], [ 0, %bb8.8.i ], [ %load.6.i18, %spam.exit ]
+  br label %bb2
+}
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-profile.ll b/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-profile.ll
new file mode 100644
index 0000000000000..9cc417f6b874e
--- /dev/null
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-profile.ll
@@ -0,0 +1,89 @@
+; RUN: split-file %s %t
+; RUN: cat %t/main.ll %t/probable-or.prof > %t/probable-or.ll
+; RUN: cat %t/main.ll %t/probable-and.prof > %t/probable-and.ll
+; RUN: opt -passes='loop(simple-loop-unswitch<nontrivial>)' -S %t/probable-or.ll -o -| FileCheck %t/probable-or.prof
+; RUN: opt -passes='loop(simple-loop-unswitch<nontrivial>)' -S %t/probable-and.ll -o -| FileCheck %t/probable-and.prof
+
+;--- main.ll
+declare i32 @a()
+declare i32 @b()
+
+define i32 @or(ptr %ptr, i1 %cond) !prof !0 {
+entry:
+  br label %loop_begin
+
+loop_begin:
+  %v1 = load i1, ptr %ptr
+  %cond_or = or i1 %v1, %cond
+  br i1 %cond_or, label %loop_a, label %loop_b, !prof !1
+
+loop_a:
+  call i32 @a()
+  br label %latch
+
+loop_b:
+  call i32 @b()
+  br label %latch
+
+latch:
+  %v2 = load i1, ptr %ptr
+  br i1 %v2, label %loop_begin, label %loop_exit, !prof !2
+
+loop_exit:
+  ret i32 0
+}
+
+define i32 @and(ptr %ptr, i1 %cond) !prof !0 {
+entry:
+  br label %loop_begin
+
+loop_begin:
+  %v1 = load i1, ptr %ptr
+  %cond_and = and i1 %v1, %cond
+  br i1 %cond_and, label %loop_a, label %loop_b, !prof !1
+
+loop_a:
+  call i32 @a()
+  br label %latch
+
+loop_b:
+  call i32 @b()
+  br label %latch
+
+latch:
+  %v2 = load i1, ptr %ptr
+  br i1 %v2, label %loop_begin, label %loop_exit, !prof !2
+
+loop_exit:
+  ret i32 0
+}
+
+;--- probable-or.prof
+!0 = !{!"function_entry_count", i32 10}
+!1 = !{!"branch_weights", i32 1, i32 1000}
+!2 = !{!"branch_weights", i32 5, i32 7}
+; CHECK-LABEL: @or
+; CHECK-LABEL: entry:
+; CHECK-NEXT:   %cond.fr = freeze i1 %cond
+; CHECK-NEXT:   br i1 %cond.fr, label %entry.split.us, label %entry.split, !prof !1
+; CHECK-LABEL: @and
+; CHECK-LABEL: entry:
+; CHECK-NEXT:   %cond.fr = freeze i1 %cond
+; CHECK-NEXT:   br i1 %cond.fr, label %entry.split, label %entry.split.us, !prof !3
+; CHECK: !1 = !{!"branch_weights", i32 1, i32 1000}
+; CHECK: !3 = !{!"unknown", !"simple-loop-unswitch"}
+
+;--- probable-and.prof
+!0 = !{!"function_entry_count", i32 10}
+!1 = !{!"branch_weights", i32 1000, i32 1}
+!2 = !{!"branch_weights", i32 5, i32 7}
+; CHECK-LABEL: @or
+; CHECK-LABEL: entry:
+; CHECK-NEXT:   %cond.fr = freeze i1 %cond
+; CHECK-NEXT:   br i1 %cond.fr, label %entry.split.us, label %entry.split, !prof !1
+; CHECK-LABEL: @and
+; CHECK-LABEL: entry:
+; CHECK-NEXT:   %cond.fr = freeze i1 %cond
+; CHECK-NEXT:   br i1 %cond.fr, label %entry.split, label %entry.split.us, !prof !3
+; CHECK: !1 = !{!"unknown", !"simple-loop-unswitch"}
+; CHECK: !3 = !{!"branch_weights", i32 1000, i32 1}
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/pr60736.ll b/llvm/test/Transforms/SimpleLoopUnswitch/pr60736.ll
index 0964c55d1dec6..3760be4b26f23 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/pr60736.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/pr60736.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
 ; RUN: opt < %s -simple-loop-unswitch-inject-invariant-conditions=true -passes='loop(simple-loop-unswitch<nontrivial>,loop-instsimplify)' -S | FileCheck %s
 
 define void @test() {
@@ -7,7 +7,7 @@ define void @test() {
 ; CHECK-NEXT:    [[TMP:%.*]] = call i1 @llvm.experimental.widenable.condition()
 ; CHECK-NEXT:    [[TMP1:%.*]] = load atomic i32, ptr addrspace(1) poison unordered, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load atomic i32, ptr addrspace(1) poison unordered, align 8
-; CHECK-NEXT:    br i1 [[TMP]], label [[BB_SPLIT:%.*]], label [[BB3_SPLIT_US:%.*]]
+; CHECK-NEXT:    br i1 [[TMP]], label [[BB_SPLIT:%.*]], label [[BB3_SPLIT_US:%.*]], !prof [[PROF0:![0-9]+]]
 ; CHECK:       bb.split:
 ; CHECK-NEXT:    br label [[BB3:%.*]]
 ; CHECK:       bb3:
@@ -19,7 +19,7 @@ define void @test() {
 ; CHECK-NEXT:    [[TMP6_US:%.*]] = phi i32 [ poison, [[BB3_SPLIT_US]] ]
 ; CHECK-NEXT:    [[TMP7_US:%.*]] = add nuw nsw i32 [[TMP6_US]], 2
 ; CHECK-NEXT:    [[TMP8_US:%.*]] = icmp ult i32 [[TMP7_US]], [[TMP2]]
-; CHECK-NEXT:    br i1 [[TMP8_US]], label [[BB9_US:%.*]], label [[BB16_SPLIT_US:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP8_US]], label [[BB9_US:%.*]], label [[BB16_SPLIT_US:%.*]], !prof [[PROF0]]
 ; CHECK:       bb9.us:
 ; CHECK-NEXT:    br label [[BB17_SPLIT_US:%.*]]
 ; CHECK:       bb16.split.us:
@@ -96,3 +96,8 @@ declare i1 @llvm.experimental.widenable.condition()
 
 !0 = !{!"branch_weights", i32 1048576, i32 1}
 
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(inaccessiblemem: readwrite) }
+;.
+; CHECK: [[PROF0]] = !{!"branch_weights", i32 1048576, i32 1}
+;.
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/simple-unswitch-profile.ll b/llvm/test/Transforms/SimpleLoopUnswitch/simple-unswitch-profile.ll
new file mode 100644
index 0000000000000..ec6baa5b3772f
--- /dev/null
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/simple-unswitch-profile.ll
@@ -0,0 +1,157 @@
+; RUN: split-file %s %t
+; RUN: cat %t/main.ll %t/probable-or.prof > %t/probable-or.ll
+; RUN: cat %t/main.ll %t/probable-and.prof > %t/probable-and.ll
+; RUN: opt -passes='loop-mssa(simple-loop-unswitch)' -S %t/probable-or.ll -o - | FileCheck %t/probable-or.prof
+; RUN: opt -passes='loop-mssa(simple-loop-unswitch)' -S %t/probable-and.ll -o - | FileCheck %t/probable-and.prof
+;
+; RUN: opt -passes='module(print<block-freq>),function(loop-mssa(simple-loop-unswitch)),module(print<block-freq>)' \
+; RUN:   %t/probable-or.ll -disable-output -simple-loop-unswitch-estimate-profile=0 2>&1 | FileCheck %t/probable-or.prof --check-prefixes=PROFILE-COM,PROFILE-REF
+
+; RUN: opt -passes='module(print<block-freq>),function(loop-mssa(simple-loop-unswitch)),module(print<block-freq>)' \
+; RUN:   %t/probable-or.ll -disable-output -simple-loop-unswitch-estimate-profile=1 2>&1 | FileCheck %t/probable-or.prof --check-prefixes=PROFILE-COM,PROFILE-CHK
+
+; RUN: opt -passes='module(print<block-freq>),function(loop-mssa(simple-loop-unswitch)),module(print<block-freq>)' \
+; RUN:   %t/probable-and.ll -disable-output -simple-loop-unswitch-estimate-profile=0 2>&1 | FileCheck %t/probable-and.prof --check-prefixes=PROFILE-COM,PROFILE-REF
+
+; RUN: opt -passes='module(print<block-freq>),function(loop-mssa(simple-loop-unswitch)),module(print<block-freq>)' \
+; RUN:   %t/probable-and.ll -disable-output -simple-loop-unswitch-estimate-profile=1 2>&1 | FileCheck %t/probable-and.prof --check-prefixes=PROFILE-COM,PROFILE-CHK
+
+;--- main.ll
+declare void @some_func() noreturn
+
+define i32 @or(i1 %cond1, i32 %var1) !prof !0 {
+entry:
+  br label %loop_begin
+
+loop_begin:
+  %var3 = phi i32 [%var1, %entry], [%var2, %do_something]
+  %cond2 = icmp eq i32 %var3, 10
+  %cond.or = or i1 %cond1, %cond2
+  br i1 %cond.or, label %loop_exit, label %do_something, !prof !1
+
+do_something:
+  %var2 = add i32 %var3, 1
+  call void @some_func() noreturn nounwind
+  br label %loop_begin
+
+loop_exit:
+  ret i32 0
+}
+
+define i32 @and(i1 %cond1, i32 %var1) !prof !0 {
+entry:
+  br label %loop_begin
+
+loop_begin:
+  %var3 = phi i32 [%var1, %entry], [%var2, %do_something]
+  %cond2 = icmp eq i32 %var3, 10
+  %cond.and = and i1 %cond1, %cond2
+  br i1 %cond.and, label %do_something, label %loop_exit, !prof !1
+
+do_something:
+  %var2 = add i32 %var3, 1
+  call void @some_func() noreturn nounwind
+  br label %loop_begin
+
+loop_exit:
+  ret i32 0
+}
+
+;--- probable-or.prof
+!0 = !{!"function_entry_count", i32 10}
+!1 = !{!"branch_weights", i32 1, i32 1000}
+; CHECK-LABEL: @or
+; CHECK-LABEL: entry:
+; CHECK-NEXT:   %cond1.fr = freeze i1 %cond1
+; CHECK-NEXT:   br i1 %cond1.fr, label %loop_exit.split, label %entry.split, !prof !1
+; CHECK-LABEL: @and
+; CHECK-LABEL: entry:
+; CHECK-NEXT:   %cond1.fr = freeze i1 %cond1
+; CHECK-NEXT:   br i1 %cond1.fr, label %entry.split, label %loop_exit.split, !prof !2
+; CHECK: !1 = !{!"branch_weights", i32 1, i32 1000}
+; CHECK: !2 = !{!"unknown", !"simple-loop-unswitch"}
+
+; PROFILE-COM: Printing analysis results of BFI for function 'or':
+; PROFILE-COM: block-frequency-info: or
+ ; PROFILE-COM: - entry: {{.*}} count = 10
+ ; PROFILE-COM: - loop_begin: {{.*}} count = 10010
+ ; PROFILE-COM: - do_something: {{.*}} count = 10000
+ ; PROFILE-COM: - loop_exit: {{.*}} count = 10
+
+; PROFILE-COM: Printing analysis results of BFI for function 'and':
+; PROFILE-COM: block-frequency-info: and
+ ; PROFILE-COM: - entry: {{.*}} count = 10
+ ; PROFILE-COM: - loop_begin: {{.*}} count = 10
+ ; PROFILE-COM: - do_something: {{.*}} count = 0
+ ; PROFILE-COM: - loop_exit: {{.*}} count = 10
+
+; PROFILE-COM: Printing analysis results of BFI for function 'or':
+; PROFILE-COM: block-frequency-info: or
+ ; PROFILE-COM: - entry: {{.*}} count = 10
+ ; PROFILE-REF: - entry.split: {{.*}} count = 5
+ ; PROFILE-CHK: - entry.split: {{.*}} count = 10
+ ; PROFILE-REF: - loop_begin: {{.*}} count = 5005
+ ; PROFILE-CHK: - loop_begin: {{.*}} count = 10000
+ ; PROFILE-REF: - do_something: {{.*}} count = 5000
+ ; PROFILE-CHK: - do_something: {{.*}} count = 9990
+ ; PROFILE-REF: - loop_exit: {{.*}} count = 5
+ ; PROFILE-CHK: - loop_exit: {{.*}} count = 10
+ ; PROFILE-COM: - loop_exit.split: {{.*}} count = 10
+
+; PROFILE-COM: Printing analysis results of BFI for function 'and':
+; PROFILE-COM: block-frequency-info: and
+ ; PROFILE-COM: - entry: {{.*}} count = 10
+ ; PROFILE-COM: - entry.split: {{.*}} count = 5
+ ; PROFILE-COM: - loop_begin: {{.*}} count = 5
+ ; PROFILE-COM: - do_something: {{.*}} count = 0
+ ; PROFILE-COM: - loop_exit: {{.*}} count = 5
+ ; PROFILE-COM: - loop_exit.split: {{.*}} count = 10
+
+;--- probable-and.prof
+!0 = !{!"function_entry_count", i32 10}
+!1 = !{!"branch_weights", i32 1000, i32 1}
+; CHECK-LABEL: @or
+; CHECK-LABEL: entry:
+; CHECK-NEXT:   %cond1.fr = freeze i1 %cond1
+; CHECK-NEXT:   br i1 %cond1.fr, label %loop_exit.split, label %entry.split, !prof !1
+; CHECK-LABEL: @and
+; CHECK-LABEL: entry:
+; CHECK-NEXT:   %cond1.fr = freeze i1 %cond1
+; CHECK-NEXT:   br i1 %cond1.fr, label %entry.split, label %loop_exit.split, !prof !2
+; CHECK: !1 = !{!"unknown", !"simple-loop-unswitch"}
+; CHECK: !2 = !{!"branch_weights", i32 1000, i32 1}
+; PROFILE-COM: Printing analysis results of BFI for function 'or':
+; PROFILE-COM: block-frequency-info: or
+ ; PROFILE-COM: - entry: {{.*}}, count = 10
+ ; PROFILE-COM: - loop_begin: {{.*}}, count = 10
+ ; PROFILE-COM: - do_something: {{.*}}, count = 0
+ ; PROFILE-COM: - loop_exit: {{.*}}, count = 10
+
+; PROFILE-COM: Printing analysis results of BFI for function 'and':
+; PROFILE-COM: block-frequency-info: and
+ ; PROFILE-COM: - entry: {{.*}} count = 10
+ ; PROFILE-COM: - loop_begin: {{.*}} count = 10010
+ ; PROFILE-COM: - do_something: {{.*}} count = 10000
+ ; PROFILE-COM: - loop_exit: {{.*}} count = 10
+
+; PROFILE-COM: Printing analysis results of BFI for function 'or':
+; PROFILE-COM: block-frequency-info: or
+ ; PROFILE-COM: - entry: {{.*}} count = 10
+ ; PROFILE-COM: - entry.split: {{.*}} count = 5
+ ; PROFILE-COM: - loop_begin: {{.*}} count = 5
+ ; PROFILE-COM: - do_something: {{.*}} count = 0
+ ; PROFILE-COM: - loop_exit: {{.*}} count = 5
+ ; PROFILE-COM: - loop_exit.split: {{.*}} count = 10
+
+; PROFILE-COM: Printing analysis results of BFI for function 'and':
+; PROFILE-COM: block-frequency-info: and
+ ; PROFILE-COM: - entry: {{.*}} count = 10
+ ; PROFILE-REF: - entry.split: {{.*}} count = 5
+ ; PROFILE-CHK: - entry.split: {{.*}} count = 10
+ ; PROFILE-REF: - loop_begin: {{.*}} count = 5005
+ ; PROFILE-CHK: - loop_begin: {{.*}} count = 10000
+ ; PROFILE-REF: - do_something: {{.*}} count = 5000
+ ; PROFILE-CHK: - do_something: {{.*}} count = 9990
+ ; PROFILE-REF: - loop_exit: {{.*}} count = 5
+ ; PROFILE-CHK: - loop_exit: {{.*}} count = 10
+ ; PROFILE-COM: - loop_exit.split: {{.*}} count = 10
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/debugloc-switch-powers-of-two.ll b/llvm/test/Transforms/SimplifyCFG/X86/debugloc-switch-powers-of-two.ll
new file mode 100644
index 0000000000000..a276067530669
--- /dev/null
+++ b/llvm/test/Transforms/SimplifyCFG/X86/debugloc-switch-powers-of-two.ll
@@ -0,0 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool ./build/bin/opt --version 6
+; RUN: opt -passes='simplifycfg<switch-to-lookup>' -simplifycfg-require-and-preserve-domtree=1 -S < %s | FileCheck %s
+;; As we replace the switch statement with a set of instructions that may more
+;; efficiently perform the conditional check, the DILocation of the switch
+;; should be propagated to all of its replacing instructions.
+
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @switch_of_powers_two_default_reachable(i32 %arg) !dbg !5 {
+; CHECK-LABEL: define i32 @switch_of_powers_two_default_reachable(
+; CHECK-SAME: i32 [[ARG:%.*]]) !dbg [[DBG5:![0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[ARG]]), !dbg [[DBG8:![0-9]+]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 1, !dbg [[DBG8]]
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[ENTRY_SPLIT:.*]], label %[[RETURN:.*]], !dbg [[DBG8]]
+; CHECK:       [[ENTRY_SPLIT]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.cttz.i32(i32 [[ARG]], i1 true), !dbg [[DBG8]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i32 [[TMP2]], 7, !dbg [[DBG8]]
+; CHECK-NEXT:    br i1 [[TMP3]], label %[[SWITCH_LOOKUP:.*]], label %[[RETURN]], !dbg [[DBG8]]
+; CHECK:       [[SWITCH_LOOKUP]]:
+; CHECK-NEXT:    [[TMP4:%.*]] = zext nneg i32 [[TMP2]] to i64, !dbg [[DBG8]]
+; CHECK-NEXT:    [[SWITCH_GEP:%.*]] = getelementptr inbounds [7 x i32], ptr @switch.table.switch_of_powers_two_default_reachable, i64 0, i64 [[TMP4]], !dbg [[DBG8]]
+; CHECK-NEXT:    [[SWITCH_LOAD:%.*]] = load i32, ptr [[SWITCH_GEP]], align 4, !dbg [[DBG8]]
+; CHECK-NEXT:    br label %[[RETURN]], !dbg [[DBG8]]
+; CHECK:       [[RETURN]]:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ 5, %[[ENTRY]] ], [ 5, %[[ENTRY_SPLIT]] ], [ [[SWITCH_LOAD]], %[[SWITCH_LOOKUP]] ]
+; CHECK-NEXT:    ret i32 [[PHI]]
+;
+entry:
+  switch i32 %arg, label %default_case [
+  i32 1, label %bb1
+  i32 8, label %bb2
+  i32 16, label %bb3
+  i32 32, label %bb4
+  i32 64, label %bb5
+  ], !dbg !8
+
+default_case:                                     ; preds = %entry
+  br label %return
+
+bb1:                                              ; preds = %entry
+  br label %return
+
+bb2:                                              ; preds = %entry
+  br label %return
+
+bb3:                                              ; preds = %entry
+  br label %return
+
+bb4:                                              ; preds = %entry
+  br label %return
+
+bb5:                                              ; preds = %entry
+  br label %return
+
+return:                                           ; preds = %bb5, %bb4, %bb3, %bb2, %bb1, %default_case
+  %phi = phi i32 [ 3, %bb1 ], [ 2, %bb2 ], [ 1, %bb3 ], [ 0, %bb4 ], [ 42, %bb5 ], [ 5, %default_case ]
+  ret i32 %phi
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.debugify = !{!2, !3}
+!llvm.module.flags = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!1 = !DIFile(filename: "debugloc-switch-powers-of-two.ll", directory: "/")
+!2 = !{i32 9}
+!3 = !{i32 1}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = distinct !DISubprogram(name: "switch_of_powers_two_default_reachable", linkageName: "switch_of_powers_two_default_reachable", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !7)
+!6 = !DISubroutineType(types: !7)
+!7 = !{}
+!8 = !DILocation(line: 1, column: 1, scope: !5)
+;.
+; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: [[META1:![0-9]+]], producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+; CHECK: [[META1]] = !DIFile(filename: "{{.*}}debugloc-switch-powers-of-two.ll", directory: {{.*}})
+; CHECK: [[DBG5]] = distinct !DISubprogram(name: "switch_of_powers_two_default_reachable", linkageName: "switch_of_powers_two_default_reachable", scope: null, file: [[META1]], line: 1, type: [[META6:![0-9]+]], scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META7:![0-9]+]])
+; CHECK: [[META6]] = !DISubroutineType(types: [[META7]])
+; CHECK: [[META7]] = !{}
+; CHECK: [[DBG8]] = !DILocation(line: 1, column: 1, scope: [[DBG5]])
+;.
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/switch-of-powers-of-two.ll b/llvm/test/Transforms/SimplifyCFG/X86/switch-of-powers-of-two.ll
index aa95b3fd235e5..d818335f075e5 100644
--- a/llvm/test/Transforms/SimplifyCFG/X86/switch-of-powers-of-two.ll
+++ b/llvm/test/Transforms/SimplifyCFG/X86/switch-of-powers-of-two.ll
@@ -1,8 +1,13 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
 ; RUN: opt -passes='simplifycfg<switch-to-lookup>' -simplifycfg-require-and-preserve-domtree=1 -S < %s | FileCheck %s
 
 target triple = "x86_64-unknown-linux-gnu"
 
+;.
+; CHECK: @switch.table.switch_of_powers_two = private unnamed_addr constant [7 x i32] [i32 3, i32 poison, i32 poison, i32 2, i32 1, i32 0, i32 42], align 4
+; CHECK: @switch.table.switch_of_powers_two_default_reachable = private unnamed_addr constant [7 x i32] [i32 3, i32 5, i32 5, i32 2, i32 1, i32 0, i32 42], align 4
+; CHECK: @switch.table.switch_of_powers_two_default_reachable_multipreds = private unnamed_addr constant [7 x i32] [i32 3, i32 poison, i32 poison, i32 2, i32 1, i32 0, i32 42], align 4
+;.
 define i32 @switch_of_powers_two(i32 %arg) {
 ; CHECK-LABEL: define i32 @switch_of_powers_two(
 ; CHECK-SAME: i32 [[ARG:%.*]]) {
@@ -35,17 +40,17 @@ return:
   ret i32 %phi
 }
 
-define i32 @switch_of_powers_two_default_reachable(i32 %arg) {
+define i32 @switch_of_powers_two_default_reachable(i32 %arg) !prof !0 {
 ; CHECK-LABEL: define i32 @switch_of_powers_two_default_reachable(
-; CHECK-SAME: i32 [[ARG:%.*]]) {
+; CHECK-SAME: i32 [[ARG:%.*]]) !prof [[PROF0:![0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[ARG]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 1
-; CHECK-NEXT:    br i1 [[TMP1]], label %[[ENTRY_SPLIT:.*]], label %[[RETURN:.*]]
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[ENTRY_SPLIT:.*]], label %[[RETURN:.*]], !prof [[PROF1:![0-9]+]]
 ; CHECK:       [[ENTRY_SPLIT]]:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.cttz.i32(i32 [[ARG]], i1 true)
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i32 [[TMP2]], 7
-; CHECK-NEXT:    br i1 [[TMP3]], label %[[SWITCH_LOOKUP:.*]], label %[[RETURN]]
+; CHECK-NEXT:    br i1 [[TMP3]], label %[[SWITCH_LOOKUP:.*]], label %[[RETURN]], !prof [[PROF2:![0-9]+]]
 ; CHECK:       [[SWITCH_LOOKUP]]:
 ; CHECK-NEXT:    [[TMP4:%.*]] = zext nneg i32 [[TMP2]] to i64
 ; CHECK-NEXT:    [[SWITCH_GEP:%.*]] = getelementptr inbounds [7 x i32], ptr @switch.table.switch_of_powers_two_default_reachable, i64 0, i64 [[TMP4]]
@@ -62,7 +67,7 @@ entry:
   i32 16, label %bb3
   i32 32, label %bb4
   i32 64, label %bb5
-  ]
+  ], !prof !1
 
 default_case: br label %return
 bb1: br label %return
@@ -128,3 +133,13 @@ return:
   %phi = phi i32 [ 3, %bb1 ], [ 2, %bb2 ], [ 1, %bb3 ], [ 0, %bb4 ], [ 42, %bb5 ], [ %pn, %default_case ]
   ret i32 %phi
 }
+
+!0 = !{!"function_entry_count", i32 10}
+!1 = !{!"branch_weights", i32 10, i32 5, i32 7, i32 11, i32 13, i32 17}
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+;.
+; CHECK: [[PROF0]] = !{!"function_entry_count", i32 10}
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 58, i32 5}
+; CHECK: [[PROF2]] = !{!"branch_weights", i32 56, i32 5}
+;.
diff --git a/llvm/test/Transforms/SimplifyCFG/hoist-common-code.ll b/llvm/test/Transforms/SimplifyCFG/hoist-common-code.ll
index 8ce94d1cf5b4e..98c0599ab209c 100644
--- a/llvm/test/Transforms/SimplifyCFG/hoist-common-code.ll
+++ b/llvm/test/Transforms/SimplifyCFG/hoist-common-code.ll
@@ -486,3 +486,119 @@ else:
   call void @bar()
   ret float %op2
 }
+
+define void @test_switch_with_unreachable_block_as_default(i1 %c, i32 %x, ptr %ptr) {
+; CHECK-LABEL: @test_switch_with_unreachable_block_as_default(
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[SW1:%.*]], label [[SW2:%.*]]
+; CHECK:       sw1:
+; CHECK-NEXT:    switch i32 [[X:%.*]], label [[UNREACHABLE:%.*]] [
+; CHECK-NEXT:      i32 1, label [[COMMON_RET:%.*]]
+; CHECK-NEXT:      i32 2, label [[BAR:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       sw2:
+; CHECK-NEXT:    store i64 42, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT:    br label [[COMMON_RET]]
+; CHECK:       common.ret:
+; CHECK-NEXT:    ret void
+; CHECK:       unreachable:
+; CHECK-NEXT:    unreachable
+; CHECK:       bar:
+; CHECK-NEXT:    call void @bar()
+; CHECK-NEXT:    br label [[COMMON_RET]]
+;
+  br i1 %c, label %sw1, label %sw2
+
+sw1:
+  ; This switch only exists to have an %unreachable block with multiple predecessors.
+  switch i32 %x, label %unreachable [
+  i32 1, label %foo
+  i32 2, label %bar
+  ]
+
+sw2:
+  switch i32 %x, label %unreachable [
+  i32 1, label %bb1
+  i32 2, label %bb2
+  i32 3, label %bb3
+  ]
+
+bb1:
+  store i64 42, ptr %ptr
+  ret void
+
+bb2:
+  store i64 42, ptr %ptr
+  ret void
+
+bb3:
+  store i64 42, ptr %ptr
+  ret void
+
+unreachable:
+  unreachable
+
+foo:
+  ret void
+
+bar:
+  call void @bar()
+  ret void
+}
+
+define void @test_switch_with_unreachable_block_as_case(i1 %c, i32 %x, ptr %ptr) {
+; CHECK-LABEL: @test_switch_with_unreachable_block_as_case(
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[SW1:%.*]], label [[SW2:%.*]]
+; CHECK:       sw1:
+; CHECK-NEXT:    switch i32 [[X:%.*]], label [[UNREACHABLE:%.*]] [
+; CHECK-NEXT:      i32 1, label [[COMMON_RET:%.*]]
+; CHECK-NEXT:      i32 2, label [[BAR:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       sw2:
+; CHECK-NEXT:    store i64 42, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT:    br label [[COMMON_RET]]
+; CHECK:       common.ret:
+; CHECK-NEXT:    ret void
+; CHECK:       unreachable:
+; CHECK-NEXT:    unreachable
+; CHECK:       bar:
+; CHECK-NEXT:    call void @bar()
+; CHECK-NEXT:    br label [[COMMON_RET]]
+;
+  br i1 %c, label %sw1, label %sw2
+
+sw1:
+  ; This switch only exists to have an %unreachable block with multiple predecessors.
+  switch i32 %x, label %unreachable [
+  i32 1, label %foo
+  i32 2, label %bar
+  ]
+
+sw2:
+  switch i32 %x, label %bb3 [
+  i32 1, label %bb1
+  i32 2, label %bb2
+  i32 3, label %unreachable
+  ]
+
+bb1:
+  store i64 42, ptr %ptr
+  ret void
+
+bb2:
+  store i64 42, ptr %ptr
+  ret void
+
+bb3:
+  store i64 42, ptr %ptr
+  ret void
+
+unreachable:
+  unreachable
+
+foo:
+  ret void
+
+bar:
+  call void @bar()
+  ret void
+}
diff --git a/llvm/test/Transforms/SimplifyCFG/pr165088.ll b/llvm/test/Transforms/SimplifyCFG/pr165088.ll
new file mode 100644
index 0000000000000..4514a1927b586
--- /dev/null
+++ b/llvm/test/Transforms/SimplifyCFG/pr165088.ll
@@ -0,0 +1,186 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -passes="simplifycfg<switch-range-to-icmp>" < %s | FileCheck %s
+
+; Avoid getting stuck in the cycle pr165088_cycle_[1-4].
+
+define void @pr165088_cycle_1(i8 %x) {
+; CHECK-LABEL: define void @pr165088_cycle_1(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ult i8 [[X]], 2
+; CHECK-NEXT:    br i1 [[TMP0]], label %[[BLOCK2:.*]], label %[[BLOCK3:.*]]
+; CHECK:       [[BLOCK1:.*]]:
+; CHECK-NEXT:    [[COND2:%.*]] = icmp ugt i8 [[X]], 1
+; CHECK-NEXT:    br i1 [[COND2]], label %[[BLOCK3]], label %[[BLOCK2]]
+; CHECK:       [[BLOCK2]]:
+; CHECK-NEXT:    br label %[[BLOCK3]]
+; CHECK:       [[BLOCK3]]:
+; CHECK-NEXT:    [[COND3:%.*]] = icmp eq i8 [[X]], 0
+; CHECK-NEXT:    br i1 [[COND3]], label %[[EXIT:.*]], label %[[BLOCK1]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %switch = icmp uge i8 %x, 2
+  %cond1 = icmp ugt i8 %x, 1
+  %or.cond = and i1 %switch, %cond1
+  br i1 %or.cond, label %block3, label %block2
+
+block1:
+  %cond2 = icmp ugt i8 %x, 1
+  br i1 %cond2, label %block3, label %block2
+
+block2:
+  br label %block3
+
+block3:
+  %cond3 = icmp eq i8 %x, 0
+  br i1 %cond3, label %exit, label %block1
+
+exit:
+  ret void
+}
+
+define void @pr165088_cycle_2(i8 %x) {
+; CHECK-LABEL: define void @pr165088_cycle_2(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[SWITCH:%.*]] = icmp ult i8 [[X]], 2
+; CHECK-NEXT:    br i1 [[SWITCH]], label %[[BLOCK2:.*]], label %[[BLOCK3:.*]]
+; CHECK:       [[BLOCK1:.*]]:
+; CHECK-NEXT:    [[COND2:%.*]] = icmp ugt i8 [[X]], 1
+; CHECK-NEXT:    br i1 [[COND2]], label %[[BLOCK3]], label %[[BLOCK2]]
+; CHECK:       [[BLOCK2]]:
+; CHECK-NEXT:    br label %[[BLOCK3]]
+; CHECK:       [[BLOCK3]]:
+; CHECK-NEXT:    [[COND3:%.*]] = icmp eq i8 [[X]], 0
+; CHECK-NEXT:    br i1 [[COND3]], label %[[EXIT:.*]], label %[[BLOCK1]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  switch i8 %x, label %block3 [
+  i8 1, label %block2
+  i8 0, label %block2
+  ]
+
+block1:                                              ; preds = %block3
+  %cond2 = icmp ugt i8 %x, 1
+  br i1 %cond2, label %block3, label %block2
+
+block2:                                              ; preds = %entry, %entry, %block1
+  br label %block3
+
+block3:                                              ; preds = %entry, %block2, %block1
+  %cond3 = icmp eq i8 %x, 0
+  br i1 %cond3, label %exit, label %block1
+
+exit:                                             ; preds = %block3
+  ret void
+}
+
+define void @pr165088_cycle_3(i8 %x) {
+; CHECK-LABEL: define void @pr165088_cycle_3(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br label %[[BLOCK3:.*]]
+; CHECK:       [[BLOCK3]]:
+; CHECK-NEXT:    [[COND3:%.*]] = icmp eq i8 [[X]], 0
+; CHECK-NEXT:    br i1 [[COND3]], label %[[EXIT:.*]], label %[[BLOCK3]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  switch i8 %x, label %block1 [
+  i8 1, label %block2
+  i8 0, label %block2
+  ]
+
+block1:                                              ; preds = %entry, %block3
+  %cond2 = icmp ugt i8 %x, 1
+  br i1 %cond2, label %block3, label %block2
+
+block2:                                              ; preds = %entry, %entry, %block1
+  br label %block3
+
+block3:                                              ; preds = %block2, %block1
+  %cond3 = icmp eq i8 %x, 0
+  br i1 %cond3, label %exit, label %block1
+
+exit:                                             ; preds = %block3
+  ret void
+}
+
+define void @pr165088_cycle_4(i8 %x) {
+; CHECK-LABEL: define void @pr165088_cycle_4(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ult i8 [[X]], 2
+; CHECK-NEXT:    br i1 [[TMP0]], label %[[BLOCK2:.*]], label %[[BLOCK3:.*]]
+; CHECK:       [[BLOCK1:.*]]:
+; CHECK-NEXT:    [[COND2_OLD:%.*]] = icmp ugt i8 [[X]], 1
+; CHECK-NEXT:    br i1 [[COND2_OLD]], label %[[BLOCK3]], label %[[BLOCK2]]
+; CHECK:       [[BLOCK2]]:
+; CHECK-NEXT:    br label %[[BLOCK3]]
+; CHECK:       [[BLOCK3]]:
+; CHECK-NEXT:    [[COND3:%.*]] = icmp eq i8 [[X]], 0
+; CHECK-NEXT:    br i1 [[COND3]], label %[[EXIT:.*]], label %[[BLOCK1]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %switch = icmp ult i8 %x, 2
+  br i1 %switch, label %block2, label %block1
+
+block1:                                              ; preds = %entry, %block3
+  %cond2 = icmp ugt i8 %x, 1
+  br i1 %cond2, label %block3, label %block2
+
+block2:                                              ; preds = %entry, %block1
+  br label %block3
+
+block3:                                              ; preds = %block2, %block1
+  %cond3 = icmp eq i8 %x, 0
+  br i1 %cond3, label %exit, label %block1
+
+exit:                                             ; preds = %block3
+  ret void
+}
+
+define void @pr165088_original(i8 %x) {
+; CHECK-LABEL: define void @pr165088_original(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ult i8 [[X]], 2
+; CHECK-NEXT:    br i1 [[TMP0]], label %[[BLOCK2:.*]], label %[[BLOCK3:.*]]
+; CHECK:       [[BLOCK1:.*]]:
+; CHECK-NEXT:    [[COND3_OLD_OLD:%.*]] = icmp ugt i8 [[X]], 1
+; CHECK-NEXT:    br i1 [[COND3_OLD_OLD]], label %[[BLOCK3]], label %[[BLOCK2]]
+; CHECK:       [[BLOCK2]]:
+; CHECK-NEXT:    br label %[[BLOCK3]]
+; CHECK:       [[BLOCK3]]:
+; CHECK-NEXT:    [[COND4:%.*]] = icmp eq i8 [[X]], 0
+; CHECK-NEXT:    br i1 [[COND4]], label %[[EXIT:.*]], label %[[BLOCK1]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cond = icmp ne i8 %x, 0
+  %cond3 = icmp ne i8 %x, 0
+  %or.cond = and i1 %cond, %cond3
+  br i1 %or.cond, label %block3, label %block2
+
+block1:                                              ; preds = %block3
+  %cond3.old = icmp ugt i8 %x, 1
+  br i1 %cond3.old, label %block3, label %block2
+
+block2:                                              ; preds = %block1, %entry
+  br label %block3
+
+block3:                                              ; preds = %block2, %block1, %entry
+  %cond4 = icmp eq i8 %x, 0
+  br i1 %cond4, label %exit, label %block1
+
+exit:                                             ; preds = %block3
+  ret void
+}
diff --git a/llvm/test/Transforms/SimplifyCFG/pr165301.ll b/llvm/test/Transforms/SimplifyCFG/pr165301.ll
new file mode 100644
index 0000000000000..1df655250f57e
--- /dev/null
+++ b/llvm/test/Transforms/SimplifyCFG/pr165301.ll
@@ -0,0 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6
+; RUN: opt -S -passes="simplifycfg<switch-range-to-icmp>" < %s | FileCheck %s
+
+; Make sure there's no use after free when removing incoming values from PHI nodes
+
+define i32 @pr165301(i1 %cond) !prof !0 {
+; CHECK-LABEL: define i32 @pr165301(
+; CHECK-SAME: i1 [[COND:%.*]]) !prof [[PROF0:![0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br label %[[SWITCHBB:.*]]
+; CHECK:       [[SWITCHBB]]:
+; CHECK-NEXT:    br label %[[SWITCHBB]]
+;
+entry:
+  br label %switchbb
+
+switchbb:
+  switch i1 %cond, label %default [
+  i1 false, label %switchbb
+  i1 true, label %switchbb
+  ], !prof !1
+
+default:
+  %phi.lcssa = phi i32 [ 0, %switchbb ]
+  ret i32 %phi.lcssa
+}
+!0 = !{!"function_entry_count", i32 10}
+!1 = !{!"branch_weights", i32 2, i32 3, i32 5}
+;.
+; CHECK: [[PROF0]] = !{!"function_entry_count", i32 10}
+;.
diff --git a/llvm/test/Transforms/UnifyLoopExits/basic.ll b/llvm/test/Transforms/UnifyLoopExits/basic.ll
index ccd15d4e6b943..d04d142f196d3 100644
--- a/llvm/test/Transforms/UnifyLoopExits/basic.ll
+++ b/llvm/test/Transforms/UnifyLoopExits/basic.ll
@@ -18,12 +18,12 @@ define void @loop_1(i1 %PredEntry, i1 %PredB, i1 %PredC, i1 %PredD) {
 ; CHECK:       F:
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       G:
-; CHECK-NEXT:    br label [[F:%.*]]
+; CHECK-NEXT:    br label [[Y:%.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ; CHECK:       loop.exit.guard:
-; CHECK-NEXT:    [[GUARD_E:%.*]] = phi i1 [ true, [[B]] ], [ false, [[C]] ], [ false, [[D]] ]
-; CHECK-NEXT:    br i1 [[GUARD_E]], label [[E:%.*]], label [[F]]
+; CHECK-NEXT:    [[GUARD_X:%.*]] = phi i1 [ true, [[B]] ], [ false, [[C]] ], [ false, [[D]] ]
+; CHECK-NEXT:    br i1 [[GUARD_X]], label [[X:%.*]], label [[Y]]
 ;
 entry:
   br i1 %PredEntry, label %A, label %G
@@ -53,6 +53,67 @@ exit:
   ret void
 }
 
+define void @loop_1_callbr(i1 %PredEntry, i1 %PredB, i1 %PredC, i1 %PredD) {
+; CHECK-LABEL: @loop_1_callbr(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[PREDENTRY:%.*]], label [[A:%.*]], label [[G:%.*]]
+; CHECK:       A:
+; CHECK-NEXT:    br label [[B:%.*]]
+; CHECK:       B:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PREDB:%.*]])
+; CHECK-NEXT:            to label [[C:%.*]] [label %B.target.E]
+; CHECK:       C:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PREDC:%.*]])
+; CHECK-NEXT:            to label [[D:%.*]] [label %C.target.F]
+; CHECK:       D:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PREDD:%.*]])
+; CHECK-NEXT:            to label [[A]] [label %D.target.F]
+; CHECK:       E:
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       F:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       G:
+; CHECK-NEXT:    br label [[Y:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+; CHECK:       B.target.E:
+; CHECK-NEXT:    br label [[LOOP_EXIT_GUARD:%.*]]
+; CHECK:       C.target.F:
+; CHECK-NEXT:    br label [[LOOP_EXIT_GUARD]]
+; CHECK:       D.target.F:
+; CHECK-NEXT:    br label [[LOOP_EXIT_GUARD]]
+; CHECK:       loop.exit.guard:
+; CHECK-NEXT:    [[GUARD_X:%.*]] = phi i1 [ true, [[B_TARGET_E:%.*]] ], [ false, [[C_TARGET_F:%.*]] ], [ false, [[D_TARGET_F:%.*]] ]
+; CHECK-NEXT:    br i1 [[GUARD_X]], label [[X:%.*]], label [[Y]]
+;
+entry:
+  br i1 %PredEntry, label %A, label %G
+
+A:
+  br label %B
+
+B:
+  callbr void asm "", "r,!i"(i1 %PredB) to label %C [label %E]
+
+C:
+  callbr void asm "", "r,!i"(i1 %PredC) to label %D [label %F]
+
+D:
+  callbr void asm "", "r,!i"(i1 %PredD) to label %A [label %F]
+
+E:
+  br label %exit
+
+F:
+  br label %exit
+
+G:
+  br label %F
+
+exit:
+  ret void
+}
+
 define void @loop_2(i1 %PredA, i1 %PredB, i1 %PredC) {
 ; CHECK-LABEL: @loop_2(
 ; CHECK-NEXT:  entry:
@@ -107,3 +168,67 @@ Z:
 exit:
   ret void
 }
+
+define void @loop_2_callbr(i1 %PredA, i1 %PredB, i1 %PredC) {
+; CHECK-LABEL: @loop_2_callbr(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[A:%.*]]
+; CHECK:       A:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PREDA:%.*]])
+; CHECK-NEXT:            to label [[B:%.*]] [label %A.target.X]
+; CHECK:       B:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PREDB:%.*]])
+; CHECK-NEXT:            to label [[C:%.*]] [label %B.target.Y]
+; CHECK:       C:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PREDC:%.*]])
+; CHECK-NEXT:            to label [[D:%.*]] [label %C.target.Z]
+; CHECK:       D:
+; CHECK-NEXT:    br label [[A]]
+; CHECK:       X:
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       Y:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       Z:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+; CHECK:       A.target.X:
+; CHECK-NEXT:    br label [[LOOP_EXIT_GUARD:%.*]]
+; CHECK:       B.target.Y:
+; CHECK-NEXT:    br label [[LOOP_EXIT_GUARD]]
+; CHECK:       C.target.Z:
+; CHECK-NEXT:    br label [[LOOP_EXIT_GUARD]]
+; CHECK:       loop.exit.guard:
+; CHECK-NEXT:    [[GUARD_X:%.*]] = phi i1 [ true, [[A_TARGET_X:%.*]] ], [ false, [[B_TARGET_Y:%.*]] ], [ false, [[C_TARGET_Z:%.*]] ]
+; CHECK-NEXT:    [[GUARD_Y:%.*]] = phi i1 [ false, [[A_TARGET_X]] ], [ true, [[B_TARGET_Y]] ], [ false, [[C_TARGET_Z]] ]
+; CHECK-NEXT:    br i1 [[GUARD_X]], label [[X:%.*]], label [[LOOP_EXIT_GUARD1:%.*]]
+; CHECK:       loop.exit.guard1:
+; CHECK-NEXT:    br i1 [[GUARD_Y]], label [[Y:%.*]], label [[Z:%.*]]
+;
+entry:
+  br label %A
+
+A:
+  callbr void asm "", "r,!i"(i1 %PredA) to label %B [label %X]
+
+B:
+  callbr void asm "", "r,!i"(i1 %PredB) to label %C [label %Y]
+
+C:
+  callbr void asm "", "r,!i"(i1 %PredC) to label %D [label %Z]
+
+D:
+  br label %A
+
+X:
+  br label %exit
+
+Y:
+  br label %exit
+
+Z:
+  br label %exit
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/UnifyLoopExits/integer_guards.ll b/llvm/test/Transforms/UnifyLoopExits/integer_guards.ll
index f55639ff2db37..be982d5d043f9 100644
--- a/llvm/test/Transforms/UnifyLoopExits/integer_guards.ll
+++ b/llvm/test/Transforms/UnifyLoopExits/integer_guards.ll
@@ -71,6 +71,85 @@ E:
   ret void
 }
 
+define void @loop_two_exits_callbr(i1 %PredEntry, i1 %PredA) {
+; CHECK-LABEL: @loop_two_exits_callbr(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[PREDENTRY:%.*]], label [[A:%.*]], label [[E:%.*]]
+; CHECK:       A:
+; CHECK-NEXT:    [[INC1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC2:%.*]], [[C:%.*]] ]
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PREDA:%.*]])
+; CHECK-NEXT:            to label [[A_TARGET_B:%.*]] [label %C]
+; CHECK:       B:
+; CHECK-NEXT:    tail call fastcc void @check(i32 1) #[[ATTR0]]
+; CHECK-NEXT:    br label [[D:%.*]]
+; CHECK:       C:
+; CHECK-NEXT:    [[INC2]] = add i32 [[INC1]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[INC2]], 10
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[CMP]])
+; CHECK-NEXT:            to label [[A]] [label %C.target.E]
+; CHECK:       D:
+; CHECK-NEXT:    unreachable
+; CHECK:       E:
+; CHECK-NEXT:    ret void
+; CHECK:       A.target.B:
+; CHECK-NEXT:    br label [[LOOP_EXIT_GUARD:%.*]]
+; CHECK:       C.target.E:
+; CHECK-NEXT:    br label [[LOOP_EXIT_GUARD]]
+; CHECK:       loop.exit.guard:
+; CHECK-NEXT:    [[MERGED_BB_IDX:%.*]] = phi i32 [ 0, [[A_TARGET_B]] ], [ 1, [[C_TARGET_E:%.*]] ]
+; CHECK-NEXT:    [[B_PREDICATE:%.*]] = icmp eq i32 [[MERGED_BB_IDX]], 0
+; CHECK-NEXT:    br i1 [[B_PREDICATE]], label [[B:%.*]], label [[E]]
+;
+; BOOLEAN-LABEL: @loop_two_exits_callbr(
+; BOOLEAN-NEXT:  entry:
+; BOOLEAN-NEXT:    br i1 [[PREDENTRY:%.*]], label [[A:%.*]], label [[E:%.*]]
+; BOOLEAN:       A:
+; BOOLEAN-NEXT:    [[INC1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC2:%.*]], [[C:%.*]] ]
+; BOOLEAN-NEXT:    callbr void asm "", "r,!i"(i1 [[PREDA:%.*]])
+; BOOLEAN-NEXT:            to label [[A_TARGET_B:%.*]] [label %C]
+; BOOLEAN:       B:
+; BOOLEAN-NEXT:    tail call fastcc void @check(i32 1) #[[ATTR0]]
+; BOOLEAN-NEXT:    br label [[D:%.*]]
+; BOOLEAN:       C:
+; BOOLEAN-NEXT:    [[INC2]] = add i32 [[INC1]], 1
+; BOOLEAN-NEXT:    [[CMP:%.*]] = icmp ult i32 [[INC2]], 10
+; BOOLEAN-NEXT:    callbr void asm "", "r,!i"(i1 [[CMP]])
+; BOOLEAN-NEXT:            to label [[A]] [label %C.target.E]
+; BOOLEAN:       D:
+; BOOLEAN-NEXT:    unreachable
+; BOOLEAN:       E:
+; BOOLEAN-NEXT:    ret void
+; BOOLEAN:       A.target.B:
+; BOOLEAN-NEXT:    br label [[LOOP_EXIT_GUARD:%.*]]
+; BOOLEAN:       C.target.E:
+; BOOLEAN-NEXT:    br label [[LOOP_EXIT_GUARD]]
+; BOOLEAN:       loop.exit.guard:
+; BOOLEAN-NEXT:    [[GUARD_B:%.*]] = phi i1 [ true, [[A_TARGET_B]] ], [ false, [[C_TARGET_E:%.*]] ]
+; BOOLEAN-NEXT:    br i1 [[GUARD_B]], label [[B:%.*]], label [[E]]
+;
+entry:
+  br i1 %PredEntry, label %A, label %E
+
+A:
+  %inc1 = phi i32 [ 0, %entry ], [ %inc2, %C ]
+  callbr void asm "", "r,!i"(i1 %PredA) to label %B [label %C]
+
+B:
+  tail call fastcc void @check(i32 1) #0
+  br label %D
+
+C:
+  %inc2 = add i32 %inc1, 1
+  %cmp = icmp ult i32 %inc2, 10
+  callbr void asm "","r,!i"(i1 %cmp) to label %A [label %E]
+
+D:
+  unreachable
+
+E:
+  ret void
+}
+
 ; The loop exit blocks appear in an inner loop.
 
 define void @inner_loop(i1 %PredEntry, i1 %PredA, i1 %PredB) {
@@ -196,6 +275,164 @@ I:
   ret void
 }
 
+define void @inner_loop_callbr(i1 %PredEntry, i1 %PredA, i1 %PredB) {
+; CHECK-LABEL: @inner_loop_callbr(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[PREDENTRY:%.*]], label [[A:%.*]], label [[I:%.*]]
+; CHECK:       A:
+; CHECK-NEXT:    [[OUTER1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OUTER2:%.*]], [[G:%.*]] ]
+; CHECK-NEXT:    br label [[B:%.*]]
+; CHECK:       B:
+; CHECK-NEXT:    [[INNER1:%.*]] = phi i32 [ 0, [[A]] ], [ [[INNER2:%.*]], [[F:%.*]] ]
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PREDA:%.*]])
+; CHECK-NEXT:            to label [[D:%.*]] [label %B.target.B.target.C]
+; CHECK:       C:
+; CHECK-NEXT:    tail call fastcc void @check(i32 1) #[[ATTR0]]
+; CHECK-NEXT:    br label [[H:%.*]]
+; CHECK:       D:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PREDB:%.*]])
+; CHECK-NEXT:            to label [[D_TARGET_D_TARGET_E:%.*]] [label %F]
+; CHECK:       E:
+; CHECK-NEXT:    tail call fastcc void @check(i32 2) #[[ATTR0]]
+; CHECK-NEXT:    br label [[H]]
+; CHECK:       F:
+; CHECK-NEXT:    [[INNER2]] = add i32 [[INNER1]], 1
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i32 [[INNER2]], 20
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[CMP1]])
+; CHECK-NEXT:            to label [[B]] [label %F.target.G]
+; CHECK:       G:
+; CHECK-NEXT:    [[OUTER2]] = add i32 [[OUTER1]], 1
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[OUTER2]], 10
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[CMP2]])
+; CHECK-NEXT:            to label [[A]] [label %G.target.I]
+; CHECK:       H:
+; CHECK-NEXT:    unreachable
+; CHECK:       I:
+; CHECK-NEXT:    ret void
+; CHECK:       B.target.C:
+; CHECK-NEXT:    br label [[LOOP_EXIT_GUARD:%.*]]
+; CHECK:       D.target.E:
+; CHECK-NEXT:    br label [[LOOP_EXIT_GUARD]]
+; CHECK:       G.target.I:
+; CHECK-NEXT:    br label [[LOOP_EXIT_GUARD]]
+; CHECK:       loop.exit.guard:
+; CHECK-NEXT:    [[MERGED_BB_IDX:%.*]] = phi i32 [ 0, [[B_TARGET_C:%.*]] ], [ 1, [[D_TARGET_E:%.*]] ], [ 2, [[G_TARGET_I:%.*]] ]
+; CHECK-NEXT:    [[C_PREDICATE:%.*]] = icmp eq i32 [[MERGED_BB_IDX]], 0
+; CHECK-NEXT:    br i1 [[C_PREDICATE]], label [[C:%.*]], label [[LOOP_EXIT_GUARD1:%.*]]
+; CHECK:       loop.exit.guard1:
+; CHECK-NEXT:    [[E_PREDICATE:%.*]] = icmp eq i32 [[MERGED_BB_IDX]], 1
+; CHECK-NEXT:    br i1 [[E_PREDICATE]], label [[E:%.*]], label [[I]]
+; CHECK:       B.target.B.target.C:
+; CHECK-NEXT:    br label [[LOOP_EXIT_GUARD2:%.*]]
+; CHECK:       D.target.D.target.E:
+; CHECK-NEXT:    br label [[LOOP_EXIT_GUARD2]]
+; CHECK:       F.target.G:
+; CHECK-NEXT:    br label [[LOOP_EXIT_GUARD2]]
+; CHECK:       loop.exit.guard2:
+; CHECK-NEXT:    [[MERGED_BB_IDX4:%.*]] = phi i32 [ 0, [[B_TARGET_B_TARGET_C:%.*]] ], [ 1, [[D_TARGET_D_TARGET_E]] ], [ 2, [[F_TARGET_G:%.*]] ]
+; CHECK-NEXT:    [[B_TARGET_C_PREDICATE:%.*]] = icmp eq i32 [[MERGED_BB_IDX4]], 0
+; CHECK-NEXT:    br i1 [[B_TARGET_C_PREDICATE]], label [[B_TARGET_C]], label [[LOOP_EXIT_GUARD3:%.*]]
+; CHECK:       loop.exit.guard3:
+; CHECK-NEXT:    [[D_TARGET_E_PREDICATE:%.*]] = icmp eq i32 [[MERGED_BB_IDX4]], 1
+; CHECK-NEXT:    br i1 [[D_TARGET_E_PREDICATE]], label [[D_TARGET_E]], label [[G]]
+;
+; BOOLEAN-LABEL: @inner_loop_callbr(
+; BOOLEAN-NEXT:  entry:
+; BOOLEAN-NEXT:    br i1 [[PREDENTRY:%.*]], label [[A:%.*]], label [[I:%.*]]
+; BOOLEAN:       A:
+; BOOLEAN-NEXT:    [[OUTER1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OUTER2:%.*]], [[G:%.*]] ]
+; BOOLEAN-NEXT:    br label [[B:%.*]]
+; BOOLEAN:       B:
+; BOOLEAN-NEXT:    [[INNER1:%.*]] = phi i32 [ 0, [[A]] ], [ [[INNER2:%.*]], [[F:%.*]] ]
+; BOOLEAN-NEXT:    callbr void asm "", "r,!i"(i1 [[PREDA:%.*]])
+; BOOLEAN-NEXT:            to label [[D:%.*]] [label %B.target.B.target.C]
+; BOOLEAN:       C:
+; BOOLEAN-NEXT:    tail call fastcc void @check(i32 1) #[[ATTR0]]
+; BOOLEAN-NEXT:    br label [[H:%.*]]
+; BOOLEAN:       D:
+; BOOLEAN-NEXT:    callbr void asm "", "r,!i"(i1 [[PREDB:%.*]])
+; BOOLEAN-NEXT:            to label [[D_TARGET_D_TARGET_E:%.*]] [label %F]
+; BOOLEAN:       E:
+; BOOLEAN-NEXT:    tail call fastcc void @check(i32 2) #[[ATTR0]]
+; BOOLEAN-NEXT:    br label [[H]]
+; BOOLEAN:       F:
+; BOOLEAN-NEXT:    [[INNER2]] = add i32 [[INNER1]], 1
+; BOOLEAN-NEXT:    [[CMP1:%.*]] = icmp ult i32 [[INNER2]], 20
+; BOOLEAN-NEXT:    callbr void asm "", "r,!i"(i1 [[CMP1]])
+; BOOLEAN-NEXT:            to label [[B]] [label %F.target.G]
+; BOOLEAN:       G:
+; BOOLEAN-NEXT:    [[OUTER2]] = add i32 [[OUTER1]], 1
+; BOOLEAN-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[OUTER2]], 10
+; BOOLEAN-NEXT:    callbr void asm "", "r,!i"(i1 [[CMP2]])
+; BOOLEAN-NEXT:            to label [[A]] [label %G.target.I]
+; BOOLEAN:       H:
+; BOOLEAN-NEXT:    unreachable
+; BOOLEAN:       I:
+; BOOLEAN-NEXT:    ret void
+; BOOLEAN:       B.target.C:
+; BOOLEAN-NEXT:    br label [[LOOP_EXIT_GUARD:%.*]]
+; BOOLEAN:       D.target.E:
+; BOOLEAN-NEXT:    br label [[LOOP_EXIT_GUARD]]
+; BOOLEAN:       G.target.I:
+; BOOLEAN-NEXT:    br label [[LOOP_EXIT_GUARD]]
+; BOOLEAN:       loop.exit.guard:
+; BOOLEAN-NEXT:    [[GUARD_C:%.*]] = phi i1 [ true, [[B_TARGET_C:%.*]] ], [ false, [[D_TARGET_E:%.*]] ], [ false, [[G_TARGET_I:%.*]] ]
+; BOOLEAN-NEXT:    [[GUARD_E:%.*]] = phi i1 [ false, [[B_TARGET_C]] ], [ true, [[D_TARGET_E]] ], [ false, [[G_TARGET_I]] ]
+; BOOLEAN-NEXT:    br i1 [[GUARD_C]], label [[C:%.*]], label [[LOOP_EXIT_GUARD1:%.*]]
+; BOOLEAN:       loop.exit.guard1:
+; BOOLEAN-NEXT:    br i1 [[GUARD_E]], label [[E:%.*]], label [[I]]
+; BOOLEAN:       B.target.B.target.C:
+; BOOLEAN-NEXT:    br label [[LOOP_EXIT_GUARD2:%.*]]
+; BOOLEAN:       D.target.D.target.E:
+; BOOLEAN-NEXT:    br label [[LOOP_EXIT_GUARD2]]
+; BOOLEAN:       F.target.G:
+; BOOLEAN-NEXT:    br label [[LOOP_EXIT_GUARD2]]
+; BOOLEAN:       loop.exit.guard2:
+; BOOLEAN-NEXT:    [[GUARD_B_TARGET_C:%.*]] = phi i1 [ true, [[B_TARGET_B_TARGET_C:%.*]] ], [ false, [[D_TARGET_D_TARGET_E]] ], [ false, [[F_TARGET_G:%.*]] ]
+; BOOLEAN-NEXT:    [[GUARD_D_TARGET_E:%.*]] = phi i1 [ false, [[B_TARGET_B_TARGET_C]] ], [ true, [[D_TARGET_D_TARGET_E]] ], [ false, [[F_TARGET_G]] ]
+; BOOLEAN-NEXT:    br i1 [[GUARD_B_TARGET_C]], label [[B_TARGET_C]], label [[LOOP_EXIT_GUARD3:%.*]]
+; BOOLEAN:       loop.exit.guard3:
+; BOOLEAN-NEXT:    br i1 [[GUARD_D_TARGET_E]], label [[D_TARGET_E]], label [[G]]
+;
+entry:
+  br i1 %PredEntry, label %A, label %I
+
+A:
+  %outer1 = phi i32 [ 0, %entry ], [ %outer2, %G ]
+  br label %B
+
+B:
+  %inner1 = phi i32 [ 0, %A ], [ %inner2, %F ]
+  callbr void asm "", "r,!i"(i1 %PredA) to label %D [label %C]
+
+C:
+  tail call fastcc void @check(i32 1) #0
+  br label %H
+
+D:
+  callbr void asm "", "r,!i"(i1 %PredB) to label %E [label %F]
+
+E:
+  tail call fastcc void @check(i32 2) #0
+  br label %H
+
+F:
+  %inner2 = add i32 %inner1, 1
+  %cmp1 = icmp ult i32 %inner2, 20
+  callbr void asm "", "r,!i"(i1 %cmp1) to label %B [label %G]
+
+G:
+  %outer2 = add i32 %outer1, 1
+  %cmp2 = icmp ult i32 %outer2, 10
+  callbr void asm "", "r,!i"(i1 %cmp2) to label %A [label %I]
+
+H:
+  unreachable
+
+I:
+  ret void
+}
+
 ; A loop with more exit blocks.
 
 define void @loop_five_exits(i1 %PredEntry, i1 %PredA, i1 %PredB, i1 %PredC, i1 %PredD) {
@@ -341,6 +578,179 @@ L:
   ret void
 }
 
+define void @loop_five_exits_callbr(i1 %PredEntry, i1 %PredA, i1 %PredB, i1 %PredC, i1 %PredD) {
+; CHECK-LABEL: @loop_five_exits_callbr(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[PREDENTRY:%.*]], label [[A:%.*]], label [[L:%.*]]
+; CHECK:       A:
+; CHECK-NEXT:    [[INC1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC2:%.*]], [[I:%.*]] ]
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PREDA:%.*]])
+; CHECK-NEXT:            to label [[A_TARGET_B:%.*]] [label %C]
+; CHECK:       B:
+; CHECK-NEXT:    tail call fastcc void @check(i32 1) #[[ATTR0]]
+; CHECK-NEXT:    br label [[J:%.*]]
+; CHECK:       C:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PREDB:%.*]])
+; CHECK-NEXT:            to label [[C_TARGET_D:%.*]] [label %E]
+; CHECK:       D:
+; CHECK-NEXT:    tail call fastcc void @check(i32 2) #[[ATTR0]]
+; CHECK-NEXT:    br label [[J]]
+; CHECK:       E:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PREDC:%.*]])
+; CHECK-NEXT:            to label [[E_TARGET_F:%.*]] [label %G]
+; CHECK:       F:
+; CHECK-NEXT:    tail call fastcc void @check(i32 3) #[[ATTR0]]
+; CHECK-NEXT:    br label [[K:%.*]]
+; CHECK:       G:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PREDD:%.*]])
+; CHECK-NEXT:            to label [[G_TARGET_H:%.*]] [label %I]
+; CHECK:       H:
+; CHECK-NEXT:    tail call fastcc void @check(i32 4) #[[ATTR0]]
+; CHECK-NEXT:    br label [[K]]
+; CHECK:       I:
+; CHECK-NEXT:    [[INC2]] = add i32 [[INC1]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[INC2]], 10
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[CMP]])
+; CHECK-NEXT:            to label [[A]] [label %I.target.L]
+; CHECK:       J:
+; CHECK-NEXT:    br label [[L]]
+; CHECK:       K:
+; CHECK-NEXT:    br label [[L]]
+; CHECK:       L:
+; CHECK-NEXT:    ret void
+; CHECK:       A.target.B:
+; CHECK-NEXT:    br label [[LOOP_EXIT_GUARD:%.*]]
+; CHECK:       C.target.D:
+; CHECK-NEXT:    br label [[LOOP_EXIT_GUARD]]
+; CHECK:       E.target.F:
+; CHECK-NEXT:    br label [[LOOP_EXIT_GUARD]]
+; CHECK:       G.target.H:
+; CHECK-NEXT:    br label [[LOOP_EXIT_GUARD]]
+; CHECK:       I.target.L:
+; CHECK-NEXT:    br label [[LOOP_EXIT_GUARD]]
+; CHECK:       loop.exit.guard:
+; CHECK-NEXT:    [[MERGED_BB_IDX:%.*]] = phi i32 [ 0, [[A_TARGET_B]] ], [ 1, [[C_TARGET_D]] ], [ 2, [[E_TARGET_F]] ], [ 3, [[G_TARGET_H]] ], [ 4, [[I_TARGET_L:%.*]] ]
+; CHECK-NEXT:    [[B_PREDICATE:%.*]] = icmp eq i32 [[MERGED_BB_IDX]], 0
+; CHECK-NEXT:    br i1 [[B_PREDICATE]], label [[B:%.*]], label [[LOOP_EXIT_GUARD1:%.*]]
+; CHECK:       loop.exit.guard1:
+; CHECK-NEXT:    [[D_PREDICATE:%.*]] = icmp eq i32 [[MERGED_BB_IDX]], 1
+; CHECK-NEXT:    br i1 [[D_PREDICATE]], label [[D:%.*]], label [[LOOP_EXIT_GUARD2:%.*]]
+; CHECK:       loop.exit.guard2:
+; CHECK-NEXT:    [[F_PREDICATE:%.*]] = icmp eq i32 [[MERGED_BB_IDX]], 2
+; CHECK-NEXT:    br i1 [[F_PREDICATE]], label [[F:%.*]], label [[LOOP_EXIT_GUARD3:%.*]]
+; CHECK:       loop.exit.guard3:
+; CHECK-NEXT:    [[H_PREDICATE:%.*]] = icmp eq i32 [[MERGED_BB_IDX]], 3
+; CHECK-NEXT:    br i1 [[H_PREDICATE]], label [[H:%.*]], label [[L]]
+;
+; BOOLEAN-LABEL: @loop_five_exits_callbr(
+; BOOLEAN-NEXT:  entry:
+; BOOLEAN-NEXT:    br i1 [[PREDENTRY:%.*]], label [[A:%.*]], label [[L:%.*]]
+; BOOLEAN:       A:
+; BOOLEAN-NEXT:    [[INC1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC2:%.*]], [[I:%.*]] ]
+; BOOLEAN-NEXT:    callbr void asm "", "r,!i"(i1 [[PREDA:%.*]])
+; BOOLEAN-NEXT:            to label [[A_TARGET_B:%.*]] [label %C]
+; BOOLEAN:       B:
+; BOOLEAN-NEXT:    tail call fastcc void @check(i32 1) #[[ATTR0]]
+; BOOLEAN-NEXT:    br label [[J:%.*]]
+; BOOLEAN:       C:
+; BOOLEAN-NEXT:    callbr void asm "", "r,!i"(i1 [[PREDB:%.*]])
+; BOOLEAN-NEXT:            to label [[C_TARGET_D:%.*]] [label %E]
+; BOOLEAN:       D:
+; BOOLEAN-NEXT:    tail call fastcc void @check(i32 2) #[[ATTR0]]
+; BOOLEAN-NEXT:    br label [[J]]
+; BOOLEAN:       E:
+; BOOLEAN-NEXT:    callbr void asm "", "r,!i"(i1 [[PREDC:%.*]])
+; BOOLEAN-NEXT:            to label [[E_TARGET_F:%.*]] [label %G]
+; BOOLEAN:       F:
+; BOOLEAN-NEXT:    tail call fastcc void @check(i32 3) #[[ATTR0]]
+; BOOLEAN-NEXT:    br label [[K:%.*]]
+; BOOLEAN:       G:
+; BOOLEAN-NEXT:    callbr void asm "", "r,!i"(i1 [[PREDD:%.*]])
+; BOOLEAN-NEXT:            to label [[G_TARGET_H:%.*]] [label %I]
+; BOOLEAN:       H:
+; BOOLEAN-NEXT:    tail call fastcc void @check(i32 4) #[[ATTR0]]
+; BOOLEAN-NEXT:    br label [[K]]
+; BOOLEAN:       I:
+; BOOLEAN-NEXT:    [[INC2]] = add i32 [[INC1]], 1
+; BOOLEAN-NEXT:    [[CMP:%.*]] = icmp ult i32 [[INC2]], 10
+; BOOLEAN-NEXT:    callbr void asm "", "r,!i"(i1 [[CMP]])
+; BOOLEAN-NEXT:            to label [[A]] [label %I.target.L]
+; BOOLEAN:       J:
+; BOOLEAN-NEXT:    br label [[L]]
+; BOOLEAN:       K:
+; BOOLEAN-NEXT:    br label [[L]]
+; BOOLEAN:       L:
+; BOOLEAN-NEXT:    ret void
+; BOOLEAN:       A.target.B:
+; BOOLEAN-NEXT:    br label [[LOOP_EXIT_GUARD:%.*]]
+; BOOLEAN:       C.target.D:
+; BOOLEAN-NEXT:    br label [[LOOP_EXIT_GUARD]]
+; BOOLEAN:       E.target.F:
+; BOOLEAN-NEXT:    br label [[LOOP_EXIT_GUARD]]
+; BOOLEAN:       G.target.H:
+; BOOLEAN-NEXT:    br label [[LOOP_EXIT_GUARD]]
+; BOOLEAN:       I.target.L:
+; BOOLEAN-NEXT:    br label [[LOOP_EXIT_GUARD]]
+; BOOLEAN:       loop.exit.guard:
+; BOOLEAN-NEXT:    [[GUARD_B:%.*]] = phi i1 [ true, [[A_TARGET_B]] ], [ false, [[C_TARGET_D]] ], [ false, [[E_TARGET_F]] ], [ false, [[G_TARGET_H]] ], [ false, [[I_TARGET_L:%.*]] ]
+; BOOLEAN-NEXT:    [[GUARD_D:%.*]] = phi i1 [ false, [[A_TARGET_B]] ], [ true, [[C_TARGET_D]] ], [ false, [[E_TARGET_F]] ], [ false, [[G_TARGET_H]] ], [ false, [[I_TARGET_L]] ]
+; BOOLEAN-NEXT:    [[GUARD_F:%.*]] = phi i1 [ false, [[A_TARGET_B]] ], [ false, [[C_TARGET_D]] ], [ true, [[E_TARGET_F]] ], [ false, [[G_TARGET_H]] ], [ false, [[I_TARGET_L]] ]
+; BOOLEAN-NEXT:    [[GUARD_H:%.*]] = phi i1 [ false, [[A_TARGET_B]] ], [ false, [[C_TARGET_D]] ], [ false, [[E_TARGET_F]] ], [ true, [[G_TARGET_H]] ], [ false, [[I_TARGET_L]] ]
+; BOOLEAN-NEXT:    br i1 [[GUARD_B]], label [[B:%.*]], label [[LOOP_EXIT_GUARD1:%.*]]
+; BOOLEAN:       loop.exit.guard1:
+; BOOLEAN-NEXT:    br i1 [[GUARD_D]], label [[D:%.*]], label [[LOOP_EXIT_GUARD2:%.*]]
+; BOOLEAN:       loop.exit.guard2:
+; BOOLEAN-NEXT:    br i1 [[GUARD_F]], label [[F:%.*]], label [[LOOP_EXIT_GUARD3:%.*]]
+; BOOLEAN:       loop.exit.guard3:
+; BOOLEAN-NEXT:    br i1 [[GUARD_H]], label [[H:%.*]], label [[L]]
+;
+entry:
+  br i1 %PredEntry, label %A, label %L
+
+A:
+  %inc1 = phi i32 [ 0, %entry ], [ %inc2, %I ]
+  callbr void asm "", "r,!i"(i1 %PredA) to label %B [label %C]
+
+B:
+  tail call fastcc void @check(i32 1) #0
+  br label %J
+
+C:
+  callbr void asm "", "r,!i"(i1 %PredB) to label %D [label %E]
+
+D:
+  tail call fastcc void @check(i32 2) #0
+  br label %J
+
+E:
+  callbr void asm "", "r,!i"(i1 %PredC) to label %F [label %G]
+
+F:
+  tail call fastcc void @check(i32 3) #0
+  br label %K
+
+G:
+  callbr void asm "", "r,!i"(i1 %PredD) to label %H [label %I]
+
+H:
+  tail call fastcc void @check(i32 4) #0
+  br label %K
+
+I:
+  %inc2 = add i32 %inc1, 1
+  %cmp = icmp ult i32 %inc2, 10
+  callbr void asm "", "r,!i"(i1 %cmp) to label %A [label %L]
+
+J:
+  br label %L
+
+K:
+  br label %L
+
+L:
+  ret void
+}
+
 
 declare void @check(i32 noundef %i) #0
 
diff --git a/llvm/test/Transforms/UnifyLoopExits/nested.ll b/llvm/test/Transforms/UnifyLoopExits/nested.ll
index 8fae2c4349a7b..2ec576a2efa89 100644
--- a/llvm/test/Transforms/UnifyLoopExits/nested.ll
+++ b/llvm/test/Transforms/UnifyLoopExits/nested.ll
@@ -78,3 +78,145 @@ exit:
   %exit.phi = phi i32 [%A4.phi, %A5], [%Z, %C]
   ret void
 }
+
+define void @nested_callbr(i1 %PredB3, i1 %PredB4, i1 %PredA4, i1 %PredA3, i32 %X, i32 %Y, i32 %Z) {
+; CHECK-LABEL: @nested_callbr(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[A1:%.*]]
+; CHECK:       A1:
+; CHECK-NEXT:    br label [[B1:%.*]]
+; CHECK:       B1:
+; CHECK-NEXT:    br label [[B2:%.*]]
+; CHECK:       B2:
+; CHECK-NEXT:    [[X_INC:%.*]] = add i32 [[X:%.*]], 1
+; CHECK-NEXT:    br label [[B3:%.*]]
+; CHECK:       B3:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PREDB3:%.*]])
+; CHECK-NEXT:            to label [[B4:%.*]] [label %B3.target.A3]
+; CHECK:       B4:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PREDB4:%.*]])
+; CHECK-NEXT:            to label [[B1]] [label %B4.target.A2]
+; CHECK:       A2:
+; CHECK-NEXT:    br label [[A4:%.*]]
+; CHECK:       A3:
+; CHECK-NEXT:    br label [[A4]]
+; CHECK:       A4:
+; CHECK-NEXT:    [[A4_PHI:%.*]] = phi i32 [ [[Y:%.*]], [[A3:%.*]] ], [ [[X_INC_MOVED:%.*]], [[A2:%.*]] ]
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PREDA4:%.*]])
+; CHECK-NEXT:            to label [[A4_TARGET_C:%.*]] [label %A5]
+; CHECK:       A5:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[PREDA3:%.*]])
+; CHECK-NEXT:            to label [[A5_TARGET_EXIT:%.*]] [label %A1]
+; CHECK:       C:
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[EXIT_PHI:%.*]] = phi i32 [ [[Z:%.*]], [[C:%.*]] ], [ [[EXIT_PHI_MOVED:%.*]], [[LOOP_EXIT_GUARD:%.*]] ]
+; CHECK-NEXT:    ret void
+; CHECK:       A4.target.C:
+; CHECK-NEXT:    br label [[LOOP_EXIT_GUARD]]
+; CHECK:       A5.target.exit:
+; CHECK-NEXT:    br label [[LOOP_EXIT_GUARD]]
+; CHECK:       loop.exit.guard:
+; CHECK-NEXT:    [[EXIT_PHI_MOVED]] = phi i32 [ poison, [[A4_TARGET_C]] ], [ [[A4_PHI]], [[A5_TARGET_EXIT]] ]
+; CHECK-NEXT:    [[GUARD_C:%.*]] = phi i1 [ true, [[A4_TARGET_C]] ], [ false, [[A5_TARGET_EXIT]] ]
+; CHECK-NEXT:    br i1 [[GUARD_C]], label [[C]], label [[EXIT]]
+; CHECK:       B3.target.A3:
+; CHECK-NEXT:    br label [[LOOP_EXIT_GUARD1:%.*]]
+; CHECK:       B4.target.A2:
+; CHECK-NEXT:    br label [[LOOP_EXIT_GUARD1]]
+; CHECK:       loop.exit.guard1:
+; CHECK-NEXT:    [[X_INC_MOVED]] = phi i32 [ [[X_INC]], [[B3_TARGET_A3:%.*]] ], [ [[X_INC]], [[B4_TARGET_A2:%.*]] ]
+; CHECK-NEXT:    [[GUARD_A3:%.*]] = phi i1 [ true, [[B3_TARGET_A3]] ], [ false, [[B4_TARGET_A2]] ]
+; CHECK-NEXT:    br i1 [[GUARD_A3]], label [[A3]], label [[A2]]
+;
+entry:
+  br label %A1
+
+A1:
+  br label %B1
+
+B1:
+  br label %B2
+
+B2:
+  %X.inc = add i32 %X, 1
+  br label %B3
+
+B3:
+  callbr void asm "", "r,!i"(i1 %PredB3) to label %B4 [label %A3]
+
+B4:
+  callbr void asm "", "r,!i"(i1 %PredB4) to label %B1 [label %A2]
+
+A2:
+  br label %A4
+
+A3:
+  br label %A4
+
+A4:
+  %A4.phi = phi i32 [%Y, %A3], [%X.inc, %A2]
+  callbr void asm "", "r,!i"(i1 %PredA4) to label %C [label %A5]
+
+A5:
+  callbr void asm "", "r,!i"(i1 %PredA3) to label %exit [label %A1]
+
+C:
+  br label %exit
+
+exit:
+  %exit.phi = phi i32 [%A4.phi, %A5], [%Z, %C]
+  ret void
+}
+
+; Here, the newly created target loop that connects b to r1 needs to be part of
+; the parent loop (the outer loop b participates in). Otherwise, it will be
+; regarded as an additional loop entry point to this outer loop.
+define void @nested_callbr_multiple_exits() {
+; CHECK-LABEL: @nested_callbr_multiple_exits(
+; CHECK-NEXT:    br label [[A:%.*]]
+; CHECK:       a:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[B:%.*]] []
+; CHECK:       b:
+; CHECK-NEXT:    callbr void asm "", "!i"()
+; CHECK-NEXT:            to label [[C:%.*]] [label %b.target.b.target.r1]
+; CHECK:       c:
+; CHECK-NEXT:    callbr void asm "", "!i"()
+; CHECK-NEXT:            to label [[C_TARGET_E:%.*]] [label %b]
+; CHECK:       e:
+; CHECK-NEXT:    callbr void asm "", "!i"()
+; CHECK-NEXT:            to label [[A]] [label %e.target.r2]
+; CHECK:       r1:
+; CHECK-NEXT:    ret void
+; CHECK:       r2:
+; CHECK-NEXT:    ret void
+; CHECK:       b.target.r1:
+; CHECK-NEXT:    br label [[LOOP_EXIT_GUARD:%.*]]
+; CHECK:       e.target.r2:
+; CHECK-NEXT:    br label [[LOOP_EXIT_GUARD]]
+; CHECK:       loop.exit.guard:
+; CHECK-NEXT:    [[GUARD_R1:%.*]] = phi i1 [ true, [[B_TARGET_R1:%.*]] ], [ false, [[E_TARGET_R2:%.*]] ]
+; CHECK-NEXT:    br i1 [[GUARD_R1]], label [[R1:%.*]], label [[R2:%.*]]
+; CHECK:       b.target.b.target.r1:
+; CHECK-NEXT:    br label [[LOOP_EXIT_GUARD1:%.*]]
+; CHECK:       c.target.e:
+; CHECK-NEXT:    br label [[LOOP_EXIT_GUARD1]]
+; CHECK:       loop.exit.guard1:
+; CHECK-NEXT:    [[GUARD_B_TARGET_R1:%.*]] = phi i1 [ true, [[B_TARGET_B_TARGET_R1:%.*]] ], [ false, [[C_TARGET_E]] ]
+; CHECK-NEXT:    br i1 [[GUARD_B_TARGET_R1]], label [[B_TARGET_R1]], label [[E:%.*]]
+;
+  br label %a
+a:
+  callbr void asm "", ""() to label %b []
+b:
+  callbr void asm "", "!i"() to label %c [label %r1]
+c:
+  callbr void asm "", "!i"() to label %e [label %b]
+e:
+  callbr void asm "", "!i"() to label %a [label %r2]
+r1:
+  ret void
+r2:
+  ret void
+}
diff --git a/llvm/test/Transforms/UnifyLoopExits/restore-ssa.ll b/llvm/test/Transforms/UnifyLoopExits/restore-ssa.ll
index 3e68df3e79260..ffe8026a535c0 100644
--- a/llvm/test/Transforms/UnifyLoopExits/restore-ssa.ll
+++ b/llvm/test/Transforms/UnifyLoopExits/restore-ssa.ll
@@ -57,6 +57,60 @@ return:
   ret i32 %phi
 }
 
+define i32 @exiting-used-in-exit_callbr(ptr %arg1, ptr %arg2) local_unnamed_addr align 2 {
+; CHECK-LABEL: @exiting-used-in-exit_callbr(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[A:%.*]] []
+; CHECK:       A:
+; CHECK-NEXT:    [[MYTMP42:%.*]] = load i32, ptr [[ARG1:%.*]], align 4
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[MYTMP42]], 0
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[CMP1]])
+; CHECK-NEXT:            to label [[B:%.*]] [label %A.target.return]
+; CHECK:       B:
+; CHECK-NEXT:    [[MYTMP41:%.*]] = load i32, ptr [[ARG2:%.*]], align 4
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[MYTMP41]], 0
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[CMP]])
+; CHECK-NEXT:            to label [[A]] [label %B.target.C]
+; CHECK:       C:
+; CHECK-NEXT:    [[INC:%.*]] = add i32 [[MYTMP41_MOVED:%.*]], 1
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[RETURN:%.*]] []
+; CHECK:       return:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ [[INC]], [[C:%.*]] ], [ [[PHI_MOVED:%.*]], [[LOOP_EXIT_GUARD:%.*]] ]
+; CHECK-NEXT:    ret i32 [[PHI]]
+; CHECK:       A.target.return:
+; CHECK-NEXT:    br label [[LOOP_EXIT_GUARD]]
+; CHECK:       B.target.C:
+; CHECK-NEXT:    br label [[LOOP_EXIT_GUARD]]
+; CHECK:       loop.exit.guard:
+; CHECK-NEXT:    [[MYTMP41_MOVED]] = phi i32 [ poison, [[A_TARGET_RETURN:%.*]] ], [ [[MYTMP41]], [[B_TARGET_C:%.*]] ]
+; CHECK-NEXT:    [[PHI_MOVED]] = phi i32 [ [[MYTMP42]], [[A_TARGET_RETURN]] ], [ poison, [[B_TARGET_C]] ]
+; CHECK-NEXT:    [[GUARD_RETURN:%.*]] = phi i1 [ true, [[A_TARGET_RETURN]] ], [ false, [[B_TARGET_C]] ]
+; CHECK-NEXT:    br i1 [[GUARD_RETURN]], label [[RETURN]], label [[C]]
+;
+entry:
+  callbr void asm "", ""() to label %A []
+
+A:
+  %mytmp42 = load i32, ptr %arg1, align 4
+  %cmp1 = icmp slt i32 %mytmp42, 0
+  callbr void asm "", "r,!i"(i1 %cmp1) to label %B [label %return]
+
+B:
+  %mytmp41 = load i32, ptr %arg2, align 4
+  %cmp = icmp slt i32 %mytmp41, 0
+  callbr void asm "", "r,!i"(i1 %cmp) to label %A [label %C]
+
+C:
+  %inc = add i32 %mytmp41, 1
+  callbr void asm "", ""() to label %return []
+
+return:
+  %phi = phi i32 [ %inc, %C ], [ %mytmp42, %A ]
+  ret i32 %phi
+}
+
 ; Loop consists of A, B and C:
 ; - A is the header
 ; - A and C are exiting blocks
@@ -112,6 +166,63 @@ return:
   ret i32 0
 }
 
+define i32 @internal-used-in-exit_callbr(ptr %arg1, ptr %arg2) local_unnamed_addr align 2 {
+; CHECK-LABEL: @internal-used-in-exit_callbr(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[MYTMP42:%.*]] = load i32, ptr [[ARG1:%.*]], align 4
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[A:%.*]] []
+; CHECK:       A:
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[MYTMP42]], 0
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[CMP1]])
+; CHECK-NEXT:            to label [[B:%.*]] [label %A.target.return]
+; CHECK:       B:
+; CHECK-NEXT:    [[MYTMP41:%.*]] = load i32, ptr [[ARG2:%.*]], align 4
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[C:%.*]] []
+; CHECK:       C:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[MYTMP42]], 0
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[CMP]])
+; CHECK-NEXT:            to label [[A]] [label %C.target.D]
+; CHECK:       D:
+; CHECK-NEXT:    [[INC:%.*]] = add i32 [[MYTMP41_MOVED:%.*]], 1
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[RETURN:%.*]] []
+; CHECK:       return:
+; CHECK-NEXT:    ret i32 0
+; CHECK:       A.target.return:
+; CHECK-NEXT:    br label [[LOOP_EXIT_GUARD:%.*]]
+; CHECK:       C.target.D:
+; CHECK-NEXT:    br label [[LOOP_EXIT_GUARD]]
+; CHECK:       loop.exit.guard:
+; CHECK-NEXT:    [[MYTMP41_MOVED]] = phi i32 [ poison, [[A_TARGET_RETURN:%.*]] ], [ [[MYTMP41]], [[C_TARGET_D:%.*]] ]
+; CHECK-NEXT:    [[GUARD_RETURN:%.*]] = phi i1 [ true, [[A_TARGET_RETURN]] ], [ false, [[C_TARGET_D]] ]
+; CHECK-NEXT:    br i1 [[GUARD_RETURN]], label [[RETURN]], label [[D:%.*]]
+;
+entry:
+  %mytmp42 = load i32, ptr %arg1, align 4
+  callbr void asm "", ""() to label %A []
+
+A:
+  %cmp1 = icmp slt i32 %mytmp42, 0
+  callbr void asm "", "r,!i"(i1 %cmp1) to label %B [label %return]
+
+B:
+  %mytmp41 = load i32, ptr %arg2, align 4
+  callbr void asm "", ""() to label %C []
+
+C:
+  %cmp = icmp slt i32 %mytmp42, 0
+  callbr void asm "", "r,!i"(i1 %cmp) to label %A [label %D]
+
+D:
+  %inc = add i32 %mytmp41, 1
+  callbr void asm "", ""() to label %return []
+
+return:
+  ret i32 0
+}
+
 ; Loop consists of A, B and C:
 ; - A is the header
 ; - A and C are exiting blocks
@@ -172,6 +283,68 @@ return:
   ret i32 %phi
 }
 
+define i32 @mixed-use-in-exit_callbr(ptr %arg1, ptr %arg2) local_unnamed_addr align 2 {
+; CHECK-LABEL: @mixed-use-in-exit_callbr(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[MYTMP42:%.*]] = load i32, ptr [[ARG1:%.*]], align 4
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[MYTMP42]], 0
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[CMP2]])
+; CHECK-NEXT:            to label [[A:%.*]] [label %return]
+; CHECK:       A:
+; CHECK-NEXT:    [[MYTMP43:%.*]] = add i32 [[MYTMP42]], 1
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[MYTMP42]], 0
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[CMP1]])
+; CHECK-NEXT:            to label [[B:%.*]] [label %A.target.return]
+; CHECK:       B:
+; CHECK-NEXT:    [[MYTMP41:%.*]] = load i32, ptr [[ARG2:%.*]], align 4
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[C:%.*]] []
+; CHECK:       C:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[MYTMP42]], 0
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[CMP]])
+; CHECK-NEXT:            to label [[A]] [label %C.target.D]
+; CHECK:       D:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[RETURN:%.*]] []
+; CHECK:       return:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ [[MYTMP41_MOVED:%.*]], [[D:%.*]] ], [ [[MYTMP42]], [[ENTRY:%.*]] ], [ [[PHI_MOVED:%.*]], [[LOOP_EXIT_GUARD:%.*]] ]
+; CHECK-NEXT:    ret i32 [[PHI]]
+; CHECK:       A.target.return:
+; CHECK-NEXT:    br label [[LOOP_EXIT_GUARD]]
+; CHECK:       C.target.D:
+; CHECK-NEXT:    br label [[LOOP_EXIT_GUARD]]
+; CHECK:       loop.exit.guard:
+; CHECK-NEXT:    [[MYTMP41_MOVED]] = phi i32 [ poison, [[A_TARGET_RETURN:%.*]] ], [ [[MYTMP41]], [[C_TARGET_D:%.*]] ]
+; CHECK-NEXT:    [[PHI_MOVED]] = phi i32 [ [[MYTMP43]], [[A_TARGET_RETURN]] ], [ poison, [[C_TARGET_D]] ]
+; CHECK-NEXT:    [[GUARD_RETURN:%.*]] = phi i1 [ true, [[A_TARGET_RETURN]] ], [ false, [[C_TARGET_D]] ]
+; CHECK-NEXT:    br i1 [[GUARD_RETURN]], label [[RETURN]], label [[D]]
+;
+entry:
+  %mytmp42 = load i32, ptr %arg1, align 4
+  %cmp2 = icmp slt i32 %mytmp42, 0
+  callbr void asm "", "r,!i"(i1 %cmp2) to label %A [label %return]
+
+A:
+  %mytmp43 = add i32 %mytmp42, 1
+  %cmp1 = icmp slt i32 %mytmp42, 0
+  callbr void asm "", "r,!i"(i1 %cmp1) to label %B [label %return]
+
+B:
+  %mytmp41 = load i32, ptr %arg2, align 4
+  callbr void asm "", ""() to label %C []
+
+C:
+  %cmp = icmp slt i32 %mytmp42, 0
+  callbr void asm "", "r,!i"(i1 %cmp) to label %A [label %D]
+
+D:
+  callbr void asm "", ""() to label %return []
+
+return:
+  %phi = phi i32 [ %mytmp41, %D ], [ %mytmp43, %A ], [%mytmp42, %entry]
+  ret i32 %phi
+}
+
 ; Loop consists of A, B and C:
 ; - A is the header
 ; - A and C are exiting blocks
@@ -236,3 +409,66 @@ return:
   %phi = phi i32 [ %mytmp41, %D ], [ %mytmp42, %E ]
   ret i32 %phi
 }
+
+define i32 @phi-via-external-block_callbr(ptr %arg1, ptr %arg2) local_unnamed_addr align 2 {
+; CHECK-LABEL: @phi-via-external-block_callbr(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[MYTMP42:%.*]] = load i32, ptr [[ARG1:%.*]], align 4
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[A:%.*]] []
+; CHECK:       A:
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[MYTMP42]], 0
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[CMP1]])
+; CHECK-NEXT:            to label [[B:%.*]] [label %A.target.E]
+; CHECK:       B:
+; CHECK-NEXT:    [[MYTMP41:%.*]] = load i32, ptr [[ARG2:%.*]], align 4
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[C:%.*]] []
+; CHECK:       C:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[MYTMP42]], 0
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[CMP]])
+; CHECK-NEXT:            to label [[A]] [label %C.target.D]
+; CHECK:       D:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[RETURN:%.*]] []
+; CHECK:       E:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label [[RETURN]] []
+; CHECK:       return:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ [[MYTMP41_MOVED:%.*]], [[D:%.*]] ], [ [[MYTMP42]], [[E:%.*]] ]
+; CHECK-NEXT:    ret i32 [[PHI]]
+; CHECK:       A.target.E:
+; CHECK-NEXT:    br label [[LOOP_EXIT_GUARD:%.*]]
+; CHECK:       C.target.D:
+; CHECK-NEXT:    br label [[LOOP_EXIT_GUARD]]
+; CHECK:       loop.exit.guard:
+; CHECK-NEXT:    [[MYTMP41_MOVED]] = phi i32 [ poison, [[A_TARGET_E:%.*]] ], [ [[MYTMP41]], [[C_TARGET_D:%.*]] ]
+; CHECK-NEXT:    [[GUARD_E:%.*]] = phi i1 [ true, [[A_TARGET_E]] ], [ false, [[C_TARGET_D]] ]
+; CHECK-NEXT:    br i1 [[GUARD_E]], label [[E]], label [[D]]
+;
+entry:
+  %mytmp42 = load i32, ptr %arg1, align 4
+  callbr void asm "", ""() to label %A []
+
+A:
+  %cmp1 = icmp slt i32 %mytmp42, 0
+  callbr void asm "", "r,!i"(i1 %cmp1) to label %B [label %E]
+
+B:
+  %mytmp41 = load i32, ptr %arg2, align 4
+  callbr void asm "", ""() to label %C []
+
+C:
+  %cmp = icmp slt i32 %mytmp42, 0
+  callbr void asm "", "r,!i"(i1 %cmp) to label %A [label %D]
+
+D:
+  callbr void asm "", ""() to label %return []
+
+E:
+  callbr void asm "", ""() to label %return []
+
+return:
+  %phi = phi i32 [ %mytmp41, %D ], [ %mytmp42, %E ]
+  ret i32 %phi
+}
diff --git a/llvm/test/Transforms/UnifyLoopExits/undef-phis.ll b/llvm/test/Transforms/UnifyLoopExits/undef-phis.ll
index 05f50fcc37d6e..e65e2549a21c8 100644
--- a/llvm/test/Transforms/UnifyLoopExits/undef-phis.ll
+++ b/llvm/test/Transforms/UnifyLoopExits/undef-phis.ll
@@ -56,3 +56,71 @@ mbb5291:                                           ; preds = %mbb4321
   store volatile [2 x i32] %i5293, ptr addrspace(5) null, align 4
   ret void
 }
+
+define fastcc void @undef_phi_callbr(i64 %i5247, i1 %i4530, i1 %i4936.not) {
+; CHECK-LABEL: define fastcc void @undef_phi_callbr(
+; CHECK-SAME: i64 [[I5247:%.*]], i1 [[I4530:%.*]], i1 [[I4936_NOT:%.*]]) {
+; CHECK-NEXT:  [[MBB:.*:]]
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label %[[MBB3932:.*]] []
+; CHECK:       [[MBB3932]]:
+; CHECK-NEXT:    callbr void asm "", ""()
+; CHECK-NEXT:            to label %[[MBB4454:.*]] []
+; CHECK:       [[MBB4321:.*]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[I5247]] to i32
+; CHECK-NEXT:    [[I5290:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[I5290]])
+; CHECK-NEXT:            to label %[[MBB3932]] [label %mbb4321.target.mbb5291]
+; CHECK:       [[MBB4454]]:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[I4530]])
+; CHECK-NEXT:            to label %[[MBB4535:.*]] [label %mbb4454.target.mbb4454.target.mbb4531]
+; CHECK:       [[MBB4531:.*]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[MBB4535]]:
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i1 [[I4936_NOT]])
+; CHECK-NEXT:            to label %[[MBB4535_TARGET_MBB4321:.*]] [label %mbb4454]
+; CHECK:       [[MBB5291:.*]]:
+; CHECK-NEXT:    [[I5293:%.*]] = insertvalue [2 x i32] zeroinitializer, i32 [[DOTMOVED:%.*]], 1
+; CHECK-NEXT:    store volatile [2 x i32] [[I5293]], ptr addrspace(5) null, align 4
+; CHECK-NEXT:    ret void
+; CHECK:       [[MBB4454_TARGET_MBB4531:.*]]:
+; CHECK-NEXT:    br label %[[LOOP_EXIT_GUARD:.*]]
+; CHECK:       [[MBB4321_TARGET_MBB5291:.*]]:
+; CHECK-NEXT:    br label %[[LOOP_EXIT_GUARD]]
+; CHECK:       [[LOOP_EXIT_GUARD]]:
+; CHECK-NEXT:    [[DOTMOVED]] = phi i32 [ poison, %[[MBB4454_TARGET_MBB4531]] ], [ [[TMP0]], %[[MBB4321_TARGET_MBB5291]] ]
+; CHECK-NEXT:    [[GUARD_MBB4531:%.*]] = phi i1 [ true, %[[MBB4454_TARGET_MBB4531]] ], [ false, %[[MBB4321_TARGET_MBB5291]] ]
+; CHECK-NEXT:    br i1 [[GUARD_MBB4531]], label %[[MBB4531]], label %[[MBB5291]]
+; CHECK:       [[MBB4454_TARGET_MBB4454_TARGET_MBB4531:.*]]:
+; CHECK-NEXT:    br label %[[LOOP_EXIT_GUARD1:.*]]
+; CHECK:       [[MBB4535_TARGET_MBB4321]]:
+; CHECK-NEXT:    br label %[[LOOP_EXIT_GUARD1]]
+; CHECK:       [[LOOP_EXIT_GUARD1]]:
+; CHECK-NEXT:    [[GUARD_MBB4454_TARGET_MBB4531:%.*]] = phi i1 [ true, %[[MBB4454_TARGET_MBB4454_TARGET_MBB4531]] ], [ false, %[[MBB4535_TARGET_MBB4321]] ]
+; CHECK-NEXT:    br i1 [[GUARD_MBB4454_TARGET_MBB4531]], label %[[MBB4454_TARGET_MBB4531]], label %[[MBB4321]]
+;
+mbb:
+  callbr void asm "", ""() to label %mbb3932 []
+
+mbb3932:                                           ; preds = %mbb4321, %mbb
+  callbr void asm "", ""() to label %mbb4454 []
+
+mbb4321:                                           ; preds = %mbb4535
+  %0 = trunc i64 %i5247 to i32
+  %i5290 = icmp eq i32 %0, 0
+  callbr void asm "", "r,!i"(i1 %i5290) to label %mbb3932 [label %mbb5291]
+
+mbb4454:                                           ; preds = %mbb4535, %mbb3932
+  callbr void asm "", "r,!i"(i1 %i4530) to label %mbb4535 [label %mbb4531]
+
+mbb4531:                                           ; preds = %mbb4454
+  ret void
+
+mbb4535:                                           ; preds = %mbb4454
+  callbr void asm "", "r,!i"(i1 %i4936.not) to label %mbb4321 [label %mbb4454]
+
+mbb5291:                                           ; preds = %mbb4321
+  %i5293 = insertvalue [2 x i32] zeroinitializer, i32 %0, 1
+  store volatile [2 x i32] %i5293, ptr addrspace(5) null, align 4
+  ret void
+}
diff --git a/llvm/test/Transforms/Util/PredicateInfo/branch-on-same-cond.ll b/llvm/test/Transforms/Util/PredicateInfo/branch-on-same-cond.ll
index 0be13ee76bece..f024106b7299a 100644
--- a/llvm/test/Transforms/Util/PredicateInfo/branch-on-same-cond.ll
+++ b/llvm/test/Transforms/Util/PredicateInfo/branch-on-same-cond.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-inst-comments
 ; RUN: opt -S -passes=print-predicateinfo < %s 2>&1 >/dev/null | FileCheck %s
 
 ; FIXME:  RenamedOp should be %cmp or %x in all cases here,
@@ -9,25 +9,25 @@ define i32 @test(i32 %x) {
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
-; CHECK: RenamedOp: [[CMP]]
-; CHECK:         [[CMP_0:%.*]] = bitcast i1 [[CMP]] to i1
-; CHECK: RenamedOp: [[X]]
-; CHECK:         [[X_0:%.*]] = bitcast i32 [[X]] to i32
-; CHECK-NEXT:    br i1 [[CMP]], label [[BB2:%.*]], label [[EXIT1:%.*]]
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[BB1]],label [[BB2:%.*]]], RenamedOp: [[CMP]] }
+; CHECK-NEXT:    [[CMP_0:%.*]] = bitcast i1 [[CMP]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[BB1]],label [[BB2]]], RenamedOp: [[X]] }
+; CHECK-NEXT:    [[X_0:%.*]] = bitcast i32 [[X]] to i32
+; CHECK-NEXT:    br i1 [[CMP]], label [[BB2]], label [[EXIT1:%.*]]
 ; CHECK:       bb2:
-; CHECK: RenamedOp: [[CMP_0]]
-; CHECK:         [[CMP_0_1:%.*]] = bitcast i1 [[CMP_0]] to i1
-; CHECK: RenamedOp: [[X]]
-; CHECK:         [[X_0_1:%.*]] = bitcast i32 [[X_0]] to i32
-; CHECK: RenamedOp: [[X_0]]
-; CHECK:         [[X_0_4:%.*]] = bitcast i32 [[X_0]] to i32
-; CHECK-NEXT:    br i1 [[CMP_0]], label [[BB3:%.*]], label [[EXIT2:%.*]]
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[BB2]],label [[BB3:%.*]]], RenamedOp: [[CMP_0]] }
+; CHECK-NEXT:    [[CMP_0_1:%.*]] = bitcast i1 [[CMP_0]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[BB2]],label [[BB3]]], RenamedOp: [[X]] }
+; CHECK-NEXT:    [[X_0_1:%.*]] = bitcast i32 [[X_0]] to i32
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[BB2]],label [[EXIT2:%.*]]], RenamedOp: [[X_0]] }
+; CHECK-NEXT:    [[X_0_4:%.*]] = bitcast i32 [[X_0]] to i32
+; CHECK-NEXT:    br i1 [[CMP_0]], label [[BB3]], label [[EXIT2]]
 ; CHECK:       bb3:
-; CHECK: RenamedOp: [[X]]
-; CHECK:         [[X_0_1_2:%.*]] = bitcast i32 [[X_0_1]] to i32
-; CHECK: RenamedOp: [[X_0_1]]
-; CHECK:         [[X_0_1_3:%.*]] = bitcast i32 [[X_0_1]] to i32
-; CHECK-NEXT:    br i1 [[CMP_0_1]], label [[EXIT3:%.*]], label [[EXIT4:%.*]]
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[BB3]],label [[EXIT3:%.*]]], RenamedOp: [[X]] }
+; CHECK-NEXT:    [[X_0_1_2:%.*]] = bitcast i32 [[X_0_1]] to i32
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[BB3]],label [[EXIT4:%.*]]], RenamedOp: [[X_0_1]] }
+; CHECK-NEXT:    [[X_0_1_3:%.*]] = bitcast i32 [[X_0_1]] to i32
+; CHECK-NEXT:    br i1 [[CMP_0_1]], label [[EXIT3]], label [[EXIT4]]
 ; CHECK:       exit1:
 ; CHECK-NEXT:    ret i32 0
 ; CHECK:       exit2:
diff --git a/llvm/test/Transforms/Util/PredicateInfo/condprop.ll b/llvm/test/Transforms/Util/PredicateInfo/condprop.ll
index 256d0d908ec1e..42e8ccb760b3f 100644
--- a/llvm/test/Transforms/Util/PredicateInfo/condprop.ll
+++ b/llvm/test/Transforms/Util/PredicateInfo/condprop.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-inst-comments
 ; RUN: opt -passes=print-predicateinfo -disable-output < %s 2>&1 | FileCheck %s
 
 @a = external global i32		; <ptr> [#uses=7]
@@ -98,12 +98,17 @@ define void @test3(i32 %x, i32 %y) {
 ; CHECK-NEXT:    [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0
 ; CHECK-NEXT:    [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0
 ; CHECK-NEXT:    [[Z:%.*]] = and i1 [[XZ]], [[YZ]]
-; CHECK:         [[Z_0:%.*]] = bitcast i1 [[Z]] to i1
-; CHECK:         [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1
-; CHECK:         [[X_0:%.*]] = bitcast i32 [[X]] to i32
-; CHECK:         [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1
-; CHECK:         [[Y_0:%.*]] = bitcast i32 [[Y]] to i32
-; CHECK-NEXT:    br i1 [[Z]], label [[BOTH_ZERO:%.*]], label [[NOPE:%.*]]
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[Z]] = and i1 [[XZ]], [[YZ]] Edge: [label [[TMP0:%.*]],label [[NOPE:%.*]]], RenamedOp: [[Z]] }
+; CHECK-NEXT:    [[Z_0:%.*]] = bitcast i1 [[Z]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH_ZERO:%.*]]], RenamedOp: [[XZ]] }
+; CHECK-NEXT:    [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH_ZERO]]], RenamedOp: [[X]] }
+; CHECK-NEXT:    [[X_0:%.*]] = bitcast i32 [[X]] to i32
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[BOTH_ZERO]]], RenamedOp: [[YZ]] }
+; CHECK-NEXT:    [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[BOTH_ZERO]]], RenamedOp: [[Y]] }
+; CHECK-NEXT:    [[Y_0:%.*]] = bitcast i32 [[Y]] to i32
+; CHECK-NEXT:    br i1 [[Z]], label [[BOTH_ZERO]], label [[NOPE]]
 ; CHECK:       both_zero:
 ; CHECK-NEXT:    call void @foo(i1 [[XZ_0]])
 ; CHECK-NEXT:    call void @foo(i1 [[YZ_0]])
@@ -133,10 +138,11 @@ define void @test4(i1 %b, i32 %x) {
 ; CHECK-LABEL: @test4(
 ; CHECK-NEXT:    br i1 [[B:%.*]], label [[SW:%.*]], label [[CASE3:%.*]]
 ; CHECK:       sw:
-; CHECK:         [[X_0:%.*]] = bitcast i32 [[X:%.*]] to i32
+; CHECK-NEXT:  ; switch predicate info { CaseValue: i32 1 Edge: [label [[SW]],label [[CASE1:%.*]]], RenamedOp: [[X:%.*]] }
+; CHECK-NEXT:    [[X_0:%.*]] = bitcast i32 [[X]] to i32
 ; CHECK-NEXT:    switch i32 [[X]], label [[DEFAULT:%.*]] [
 ; CHECK-NEXT:      i32 0, label [[CASE0:%.*]]
-; CHECK-NEXT:      i32 1, label [[CASE1:%.*]]
+; CHECK-NEXT:      i32 1, label [[CASE1]]
 ; CHECK-NEXT:      i32 2, label [[CASE0]]
 ; CHECK-NEXT:      i32 3, label [[CASE3]]
 ; CHECK-NEXT:      i32 4, label [[DEFAULT]]
@@ -180,11 +186,15 @@ case3:
 define i1 @test5(i32 %x, i32 %y) {
 ; CHECK-LABEL: @test5(
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], [[Y:%.*]]
-; CHECK:         [[X_0:%.*]] = bitcast i32 [[X]] to i32
-; CHECK:         [[X_1:%.*]] = bitcast i32 [[X]] to i32
-; CHECK:         [[Y_0:%.*]] = bitcast i32 [[Y]] to i32
-; CHECK:         [[Y_1:%.*]] = bitcast i32 [[Y]] to i32
-; CHECK-NEXT:    br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]]
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[X]], [[Y]] Edge: [label [[TMP0:%.*]],label [[SAME:%.*]]], RenamedOp: [[X]] }
+; CHECK-NEXT:    [[X_0:%.*]] = bitcast i32 [[X]] to i32
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp eq i32 [[X]], [[Y]] Edge: [label [[TMP0]],label [[DIFFERENT:%.*]]], RenamedOp: [[X]] }
+; CHECK-NEXT:    [[X_1:%.*]] = bitcast i32 [[X]] to i32
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[X]], [[Y]] Edge: [label [[TMP0]],label [[SAME]]], RenamedOp: [[Y]] }
+; CHECK-NEXT:    [[Y_0:%.*]] = bitcast i32 [[Y]] to i32
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp eq i32 [[X]], [[Y]] Edge: [label [[TMP0]],label [[DIFFERENT]]], RenamedOp: [[Y]] }
+; CHECK-NEXT:    [[Y_1:%.*]] = bitcast i32 [[Y]] to i32
+; CHECK-NEXT:    br i1 [[CMP]], label [[SAME]], label [[DIFFERENT]]
 ; CHECK:       same:
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i32 [[X_0]], [[Y_0]]
 ; CHECK-NEXT:    ret i1 [[CMP2]]
@@ -253,11 +263,15 @@ different:
 define i1 @test7(i32 %x, i32 %y) {
 ; CHECK-LABEL: @test7(
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[X:%.*]], [[Y:%.*]]
-; CHECK:         [[X_0:%.*]] = bitcast i32 [[X]] to i32
-; CHECK:         [[X_1:%.*]] = bitcast i32 [[X]] to i32
-; CHECK:         [[Y_0:%.*]] = bitcast i32 [[Y]] to i32
-; CHECK:         [[Y_1:%.*]] = bitcast i32 [[Y]] to i32
-; CHECK-NEXT:    br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]]
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp sgt i32 [[X]], [[Y]] Edge: [label [[TMP0:%.*]],label [[SAME:%.*]]], RenamedOp: [[X]] }
+; CHECK-NEXT:    [[X_0:%.*]] = bitcast i32 [[X]] to i32
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp sgt i32 [[X]], [[Y]] Edge: [label [[TMP0]],label [[DIFFERENT:%.*]]], RenamedOp: [[X]] }
+; CHECK-NEXT:    [[X_1:%.*]] = bitcast i32 [[X]] to i32
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp sgt i32 [[X]], [[Y]] Edge: [label [[TMP0]],label [[SAME]]], RenamedOp: [[Y]] }
+; CHECK-NEXT:    [[Y_0:%.*]] = bitcast i32 [[Y]] to i32
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp sgt i32 [[X]], [[Y]] Edge: [label [[TMP0]],label [[DIFFERENT]]], RenamedOp: [[Y]] }
+; CHECK-NEXT:    [[Y_1:%.*]] = bitcast i32 [[Y]] to i32
+; CHECK-NEXT:    br i1 [[CMP]], label [[SAME]], label [[DIFFERENT]]
 ; CHECK:       same:
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[X_0]], [[Y_0]]
 ; CHECK-NEXT:    ret i1 [[CMP2]]
@@ -280,11 +294,15 @@ different:
 define i1 @test7_fp(float %x, float %y) {
 ; CHECK-LABEL: @test7_fp(
 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt float [[X:%.*]], [[Y:%.*]]
-; CHECK:         [[X_0:%.*]] = bitcast float [[X]] to float
-; CHECK:         [[X_1:%.*]] = bitcast float [[X]] to float
-; CHECK:         [[Y_0:%.*]] = bitcast float [[Y]] to float
-; CHECK:         [[Y_1:%.*]] = bitcast float [[Y]] to float
-; CHECK-NEXT:    br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]]
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = fcmp ogt float [[X]], [[Y]] Edge: [label [[TMP0:%.*]],label [[SAME:%.*]]], RenamedOp: [[X]] }
+; CHECK-NEXT:    [[X_0:%.*]] = bitcast float [[X]] to float
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = fcmp ogt float [[X]], [[Y]] Edge: [label [[TMP0]],label [[DIFFERENT:%.*]]], RenamedOp: [[X]] }
+; CHECK-NEXT:    [[X_1:%.*]] = bitcast float [[X]] to float
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = fcmp ogt float [[X]], [[Y]] Edge: [label [[TMP0]],label [[SAME]]], RenamedOp: [[Y]] }
+; CHECK-NEXT:    [[Y_0:%.*]] = bitcast float [[Y]] to float
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = fcmp ogt float [[X]], [[Y]] Edge: [label [[TMP0]],label [[DIFFERENT]]], RenamedOp: [[Y]] }
+; CHECK-NEXT:    [[Y_1:%.*]] = bitcast float [[Y]] to float
+; CHECK-NEXT:    br i1 [[CMP]], label [[SAME]], label [[DIFFERENT]]
 ; CHECK:       same:
 ; CHECK-NEXT:    [[CMP2:%.*]] = fcmp ule float [[X_0]], [[Y_0]]
 ; CHECK-NEXT:    ret i1 [[CMP2]]
@@ -353,9 +371,11 @@ different:
 define i32 @test9(i32 %i, i32 %j) {
 ; CHECK-LABEL: @test9(
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[I:%.*]], [[J:%.*]]
-; CHECK:         [[I_0:%.*]] = bitcast i32 [[I]] to i32
-; CHECK:         [[J_0:%.*]] = bitcast i32 [[J]] to i32
-; CHECK-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[RET:%.*]]
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[I]], [[J]] Edge: [label [[TMP0:%.*]],label [[COND_TRUE:%.*]]], RenamedOp: [[I]] }
+; CHECK-NEXT:    [[I_0:%.*]] = bitcast i32 [[I]] to i32
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[I]], [[J]] Edge: [label [[TMP0]],label [[COND_TRUE]]], RenamedOp: [[J]] }
+; CHECK-NEXT:    [[J_0:%.*]] = bitcast i32 [[J]] to i32
+; CHECK-NEXT:    br i1 [[CMP]], label [[COND_TRUE]], label [[RET:%.*]]
 ; CHECK:       cond_true:
 ; CHECK-NEXT:    [[DIFF:%.*]] = sub i32 [[I_0]], [[J_0]]
 ; CHECK-NEXT:    ret i32 [[DIFF]]
@@ -376,9 +396,11 @@ ret:
 define i32 @test10(i32 %j, i32 %i) {
 ; CHECK-LABEL: @test10(
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[I:%.*]], [[J:%.*]]
-; CHECK:         [[I_0:%.*]] = bitcast i32 [[I]] to i32
-; CHECK:         [[J_0:%.*]] = bitcast i32 [[J]] to i32
-; CHECK-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[RET:%.*]]
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[I]], [[J]] Edge: [label [[TMP0:%.*]],label [[COND_TRUE:%.*]]], RenamedOp: [[I]] }
+; CHECK-NEXT:    [[I_0:%.*]] = bitcast i32 [[I]] to i32
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[I]], [[J]] Edge: [label [[TMP0]],label [[COND_TRUE]]], RenamedOp: [[J]] }
+; CHECK-NEXT:    [[J_0:%.*]] = bitcast i32 [[J]] to i32
+; CHECK-NEXT:    br i1 [[CMP]], label [[COND_TRUE]], label [[RET:%.*]]
 ; CHECK:       cond_true:
 ; CHECK-NEXT:    [[DIFF:%.*]] = sub i32 [[I_0]], [[J_0]]
 ; CHECK-NEXT:    ret i32 [[DIFF]]
@@ -403,15 +425,18 @@ define i32 @test11(i32 %x) {
 ; CHECK-NEXT:    [[V0:%.*]] = call i32 @yogibar()
 ; CHECK-NEXT:    [[V1:%.*]] = call i32 @yogibar()
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[V0]], [[V1]]
-; CHECK:         [[V0_0:%.*]] = bitcast i32 [[V0]] to i32
-; CHECK:         [[V1_0:%.*]] = bitcast i32 [[V1]] to i32
-; CHECK-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[NEXT:%.*]]
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp eq i32 [[V0]], [[V1]] Edge: [label [[TMP0:%.*]],label [[NEXT:%.*]]], RenamedOp: [[V0]] }
+; CHECK-NEXT:    [[V0_0:%.*]] = bitcast i32 [[V0]] to i32
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[V0]], [[V1]] Edge: [label [[TMP0]],label [[COND_TRUE:%.*]]], RenamedOp: [[V1]] }
+; CHECK-NEXT:    [[V1_0:%.*]] = bitcast i32 [[V1]] to i32
+; CHECK-NEXT:    br i1 [[CMP]], label [[COND_TRUE]], label [[NEXT]]
 ; CHECK:       cond_true:
 ; CHECK-NEXT:    ret i32 [[V1_0]]
 ; CHECK:       next:
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[X:%.*]], [[V0_0]]
-; CHECK:         [[V0_0_1:%.*]] = bitcast i32 [[V0_0]] to i32
-; CHECK-NEXT:    br i1 [[CMP2]], label [[COND_TRUE2:%.*]], label [[NEXT2:%.*]]
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[CMP2]] = icmp eq i32 [[X]], [[V0_0]] Edge: [label [[NEXT]],label [[COND_TRUE2:%.*]]], RenamedOp: [[V0_0]] }
+; CHECK-NEXT:    [[V0_0_1:%.*]] = bitcast i32 [[V0_0]] to i32
+; CHECK-NEXT:    br i1 [[CMP2]], label [[COND_TRUE2]], label [[NEXT2:%.*]]
 ; CHECK:       cond_true2:
 ; CHECK-NEXT:    ret i32 [[V0_0_1]]
 ; CHECK:       next2:
@@ -439,9 +464,11 @@ next2:
 define i32 @test12(i32 %x) {
 ; CHECK-LABEL: @test12(
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
-; CHECK:         [[X_0:%.*]] = bitcast i32 [[X]] to i32
-; CHECK:         [[X_1:%.*]] = bitcast i32 [[X]] to i32
-; CHECK-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0:%.*]],label [[COND_TRUE:%.*]]], RenamedOp: [[X]] }
+; CHECK-NEXT:    [[X_0:%.*]] = bitcast i32 [[X]] to i32
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[COND_FALSE:%.*]]], RenamedOp: [[X]] }
+; CHECK-NEXT:    [[X_1:%.*]] = bitcast i32 [[X]] to i32
+; CHECK-NEXT:    br i1 [[CMP]], label [[COND_TRUE]], label [[COND_FALSE]]
 ; CHECK:       cond_true:
 ; CHECK-NEXT:    br label [[RET:%.*]]
 ; CHECK:       cond_false:
diff --git a/llvm/test/Transforms/Util/PredicateInfo/diamond.ll b/llvm/test/Transforms/Util/PredicateInfo/diamond.ll
index ac2c9a1026e76..06c02d699c511 100644
--- a/llvm/test/Transforms/Util/PredicateInfo/diamond.ll
+++ b/llvm/test/Transforms/Util/PredicateInfo/diamond.ll
@@ -1,16 +1,18 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=print-predicateinfo < %s 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-inst-comments
+; RUN: opt -passes=print-predicateinfo -disable-output < %s 2>&1 | FileCheck %s
 define i1 @f(i32 %x, i1 %y) {
 ; CHECK-LABEL: @f(
 ; CHECK-NEXT:    br i1 [[Y:%.*]], label [[BB0:%.*]], label [[BB1:%.*]]
 ; CHECK:       bb0:
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp sge i32 [[X:%.*]], 0
-; CHECK:         [[X_0:%.*]] = bitcast i32 [[X]] to i32
-; CHECK-NEXT:    br i1 [[CMP]], label [[BB2:%.*]], label [[BB3:%.*]]
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp sge i32 [[X]], 0 Edge: [label [[BB0]],label [[BB2:%.*]]], RenamedOp: [[X]] }
+; CHECK-NEXT:    [[X_0:%.*]] = bitcast i32 [[X]] to i32
+; CHECK-NEXT:    br i1 [[CMP]], label [[BB2]], label [[BB3:%.*]]
 ; CHECK:       bb1:
 ; CHECK-NEXT:    [[X2:%.*]] = add nuw nsw i32 [[X]], 1
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp sge i32 [[X2]], 2
-; CHECK:         [[X2_0:%.*]] = bitcast i32 [[X2]] to i32
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[CMP2]] = icmp sge i32 [[X2]], 2 Edge: [label [[BB1]],label [[BB2]]], RenamedOp: [[X2]] }
+; CHECK-NEXT:    [[X2_0:%.*]] = bitcast i32 [[X2]] to i32
 ; CHECK-NEXT:    br i1 [[CMP2]], label [[BB2]], label [[BB3]]
 ; CHECK:       bb2:
 ; CHECK-NEXT:    [[X3:%.*]] = phi i32 [ [[X_0]], [[BB0]] ], [ [[X2_0]], [[BB1]] ]
@@ -38,12 +40,14 @@ define i1 @g(i32 %x, i1 %y) {
 ; CHECK-NEXT:    br i1 [[Y:%.*]], label [[BB0:%.*]], label [[BB1:%.*]]
 ; CHECK:       bb0:
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp sge i32 [[X:%.*]], 0
-; CHECK:         [[X_0:%.*]] = bitcast i32 [[X]] to i32
-; CHECK-NEXT:    br i1 [[CMP]], label [[BB3:%.*]], label [[BB2:%.*]]
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp sge i32 [[X]], 0 Edge: [label [[BB0]],label [[BB2:%.*]]], RenamedOp: [[X]] }
+; CHECK-NEXT:    [[X_0:%.*]] = bitcast i32 [[X]] to i32
+; CHECK-NEXT:    br i1 [[CMP]], label [[BB3:%.*]], label [[BB2]]
 ; CHECK:       bb1:
 ; CHECK-NEXT:    [[X2:%.*]] = add nuw nsw i32 [[X]], 1
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp sge i32 [[X2]], 2
-; CHECK:         [[X2_0:%.*]] = bitcast i32 [[X2]] to i32
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[CMP2]] = icmp sge i32 [[X2]], 2 Edge: [label [[BB1]],label [[BB2]]], RenamedOp: [[X2]] }
+; CHECK-NEXT:    [[X2_0:%.*]] = bitcast i32 [[X2]] to i32
 ; CHECK-NEXT:    br i1 [[CMP2]], label [[BB3]], label [[BB2]]
 ; CHECK:       bb2:
 ; CHECK-NEXT:    [[X3:%.*]] = phi i32 [ [[X_0]], [[BB0]] ], [ [[X2_0]], [[BB1]] ]
diff --git a/llvm/test/Transforms/Util/PredicateInfo/edge.ll b/llvm/test/Transforms/Util/PredicateInfo/edge.ll
index ef757f323921a..913832696215e 100644
--- a/llvm/test/Transforms/Util/PredicateInfo/edge.ll
+++ b/llvm/test/Transforms/Util/PredicateInfo/edge.ll
@@ -1,16 +1,17 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=print-predicateinfo < %s 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-inst-comments
+; RUN: opt -passes=print-predicateinfo -disable-output < %s 2>&1 | FileCheck %s
 
 define i32 @f1(i32 %x) {
 ; CHECK-LABEL: @f1(
 ; CHECK-NEXT:  bb0:
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
-; CHECK:         [[X_0:%.*]] = bitcast i32 [[X]] to i32
-; CHECK-NEXT:    br i1 [[CMP]], label [[BB2:%.*]], label [[BB1:%.*]]
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[BB0:%.*]],label [[BB2:%.*]]], RenamedOp: [[X]] }
+; CHECK-NEXT:    [[X_0:%.*]] = bitcast i32 [[X]] to i32
+; CHECK-NEXT:    br i1 [[CMP]], label [[BB2]], label [[BB1:%.*]]
 ; CHECK:       bb1:
 ; CHECK-NEXT:    br label [[BB2]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[X_0]], [[BB0:%.*]] ], [ 0, [[BB1]] ]
+; CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[X_0]], [[BB0]] ], [ 0, [[BB1]] ]
 ; CHECK-NEXT:    [[FOO:%.*]] = add i32 [[COND]], [[X]]
 ; CHECK-NEXT:    ret i32 [[FOO]]
 ;
@@ -29,12 +30,13 @@ define i32 @f2(i32 %x) {
 ; CHECK-LABEL: @f2(
 ; CHECK-NEXT:  bb0:
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[X:%.*]], 0
-; CHECK:         [[X_0:%.*]] = bitcast i32 [[X]] to i32
-; CHECK-NEXT:    br i1 [[CMP]], label [[BB1:%.*]], label [[BB2:%.*]]
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp ne i32 [[X]], 0 Edge: [label [[BB0:%.*]],label [[BB2:%.*]]], RenamedOp: [[X]] }
+; CHECK-NEXT:    [[X_0:%.*]] = bitcast i32 [[X]] to i32
+; CHECK-NEXT:    br i1 [[CMP]], label [[BB1:%.*]], label [[BB2]]
 ; CHECK:       bb1:
 ; CHECK-NEXT:    br label [[BB2]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[X_0]], [[BB0:%.*]] ], [ 0, [[BB1]] ]
+; CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[X_0]], [[BB0]] ], [ 0, [[BB1]] ]
 ; CHECK-NEXT:    [[FOO:%.*]] = add i32 [[COND]], [[X]]
 ; CHECK-NEXT:    ret i32 [[FOO]]
 ;
@@ -52,14 +54,15 @@ bb2:
 define i32 @f3(i32 %x) {
 ; CHECK-LABEL: @f3(
 ; CHECK-NEXT:  bb0:
-; CHECK:         [[X_0:%.*]] = bitcast i32 [[X:%.*]] to i32
+; CHECK-NEXT:  ; switch predicate info { CaseValue: i32 0 Edge: [label [[BB0:%.*]],label [[BB2:%.*]]], RenamedOp: [[X:%.*]] }
+; CHECK-NEXT:    [[X_0:%.*]] = bitcast i32 [[X]] to i32
 ; CHECK-NEXT:    switch i32 [[X]], label [[BB1:%.*]] [
-; CHECK-NEXT:    i32 0, label [[BB2:%.*]]
+; CHECK-NEXT:      i32 0, label [[BB2]]
 ; CHECK-NEXT:    ]
 ; CHECK:       bb1:
 ; CHECK-NEXT:    br label [[BB2]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[X_0]], [[BB0:%.*]] ], [ 0, [[BB1]] ]
+; CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[X_0]], [[BB0]] ], [ 0, [[BB1]] ]
 ; CHECK-NEXT:    [[FOO:%.*]] = add i32 [[COND]], [[X]]
 ; CHECK-NEXT:    ret i32 [[FOO]]
 ;
@@ -78,13 +81,14 @@ define double @fcmp_oeq_not_zero(double %x, double %y) {
 ; CHECK-LABEL: @fcmp_oeq_not_zero(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq double [[Y:%.*]], 2.000000e+00
-; CHECK:         [[Y_0:%.*]] = bitcast double [[Y]] to double
-; CHECK-NEXT:    br i1 [[CMP]], label [[IF:%.*]], label [[RETURN:%.*]]
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = fcmp oeq double [[Y]], 2.000000e+00 Edge: [label [[ENTRY:%.*]],label [[IF:%.*]]], RenamedOp: [[Y]] }
+; CHECK-NEXT:    [[Y_0:%.*]] = bitcast double [[Y]] to double
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF]], label [[RETURN:%.*]]
 ; CHECK:       if:
 ; CHECK-NEXT:    [[DIV:%.*]] = fdiv double [[X:%.*]], [[Y_0]]
 ; CHECK-NEXT:    br label [[RETURN]]
 ; CHECK:       return:
-; CHECK-NEXT:    [[RETVAL:%.*]] = phi double [ [[DIV]], [[IF]] ], [ [[X]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi double [ [[DIV]], [[IF]] ], [ [[X]], [[ENTRY]] ]
 ; CHECK-NEXT:    ret double [[RETVAL]]
 ;
 entry:
@@ -105,13 +109,14 @@ define double @fcmp_une_not_zero(double %x, double %y) {
 ; CHECK-LABEL: @fcmp_une_not_zero(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp une double [[Y:%.*]], 2.000000e+00
-; CHECK:         [[Y_0:%.*]] = bitcast double [[Y]] to double
-; CHECK-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE:%.*]]
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = fcmp une double [[Y]], 2.000000e+00 Edge: [label [[ENTRY:%.*]],label [[ELSE:%.*]]], RenamedOp: [[Y]] }
+; CHECK-NEXT:    [[Y_0:%.*]] = bitcast double [[Y]] to double
+; CHECK-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE]]
 ; CHECK:       else:
 ; CHECK-NEXT:    [[DIV:%.*]] = fdiv double [[X:%.*]], [[Y_0]]
 ; CHECK-NEXT:    br label [[RETURN]]
 ; CHECK:       return:
-; CHECK-NEXT:    [[RETVAL:%.*]] = phi double [ [[DIV]], [[ELSE]] ], [ [[X]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi double [ [[DIV]], [[ELSE]] ], [ [[X]], [[ENTRY]] ]
 ; CHECK-NEXT:    ret double [[RETVAL]]
 ;
 entry:
@@ -132,13 +137,14 @@ define double @fcmp_oeq_zero(double %x, double %y) {
 ; CHECK-LABEL: @fcmp_oeq_zero(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq double [[Y:%.*]], 0.000000e+00
-; CHECK:         [[Y_0:%.*]] = bitcast double [[Y]] to double
-; CHECK-NEXT:    br i1 [[CMP]], label [[IF:%.*]], label [[RETURN:%.*]]
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = fcmp oeq double [[Y]], 0.000000e+00 Edge: [label [[ENTRY:%.*]],label [[IF:%.*]]], RenamedOp: [[Y]] }
+; CHECK-NEXT:    [[Y_0:%.*]] = bitcast double [[Y]] to double
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF]], label [[RETURN:%.*]]
 ; CHECK:       if:
 ; CHECK-NEXT:    [[DIV:%.*]] = fdiv double [[X:%.*]], [[Y_0]]
 ; CHECK-NEXT:    br label [[RETURN]]
 ; CHECK:       return:
-; CHECK-NEXT:    [[RETVAL:%.*]] = phi double [ [[DIV]], [[IF]] ], [ [[X]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi double [ [[DIV]], [[IF]] ], [ [[X]], [[ENTRY]] ]
 ; CHECK-NEXT:    ret double [[RETVAL]]
 ;
 entry:
@@ -159,13 +165,14 @@ define double @fcmp_une_zero(double %x, double %y) {
 ; CHECK-LABEL: @fcmp_une_zero(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp une double [[Y:%.*]], -0.000000e+00
-; CHECK:         [[Y_0:%.*]] = bitcast double [[Y]] to double
-; CHECK-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE:%.*]]
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = fcmp une double [[Y]], -0.000000e+00 Edge: [label [[ENTRY:%.*]],label [[ELSE:%.*]]], RenamedOp: [[Y]] }
+; CHECK-NEXT:    [[Y_0:%.*]] = bitcast double [[Y]] to double
+; CHECK-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE]]
 ; CHECK:       else:
 ; CHECK-NEXT:    [[DIV:%.*]] = fdiv double [[X:%.*]], [[Y_0]]
 ; CHECK-NEXT:    br label [[RETURN]]
 ; CHECK:       return:
-; CHECK-NEXT:    [[RETVAL:%.*]] = phi double [ [[DIV]], [[ELSE]] ], [ [[X]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi double [ [[DIV]], [[ELSE]] ], [ [[X]], [[ENTRY]] ]
 ; CHECK-NEXT:    ret double [[RETVAL]]
 ;
 entry:
@@ -188,13 +195,14 @@ define double @fcmp_oeq_maybe_zero(double %x, double %y, double %z1, double %z2)
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[Z:%.*]] = fadd double [[Z1:%.*]], [[Z2:%.*]]
 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq double [[Y:%.*]], [[Z]]
-; CHECK:         [[Z_0:%.*]] = bitcast double [[Z]] to double
-; CHECK-NEXT:    br i1 [[CMP]], label [[IF:%.*]], label [[RETURN:%.*]]
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = fcmp oeq double [[Y]], [[Z]] Edge: [label [[ENTRY:%.*]],label [[IF:%.*]]], RenamedOp: [[Z]] }
+; CHECK-NEXT:    [[Z_0:%.*]] = bitcast double [[Z]] to double
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF]], label [[RETURN:%.*]]
 ; CHECK:       if:
 ; CHECK-NEXT:    [[DIV:%.*]] = fdiv double [[X:%.*]], [[Z_0]]
 ; CHECK-NEXT:    br label [[RETURN]]
 ; CHECK:       return:
-; CHECK-NEXT:    [[RETVAL:%.*]] = phi double [ [[DIV]], [[IF]] ], [ [[X]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi double [ [[DIV]], [[IF]] ], [ [[X]], [[ENTRY]] ]
 ; CHECK-NEXT:    ret double [[RETVAL]]
 ;
 entry:
@@ -217,13 +225,14 @@ define double @fcmp_une_maybe_zero(double %x, double %y, double %z1, double %z2)
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[Z:%.*]] = fadd double [[Z1:%.*]], [[Z2:%.*]]
 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp une double [[Y:%.*]], [[Z]]
-; CHECK:         [[Z_0:%.*]] = bitcast double [[Z]] to double
-; CHECK-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE:%.*]]
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = fcmp une double [[Y]], [[Z]] Edge: [label [[ENTRY:%.*]],label [[ELSE:%.*]]], RenamedOp: [[Z]] }
+; CHECK-NEXT:    [[Z_0:%.*]] = bitcast double [[Z]] to double
+; CHECK-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE]]
 ; CHECK:       else:
 ; CHECK-NEXT:    [[DIV:%.*]] = fdiv double [[X:%.*]], [[Z_0]]
 ; CHECK-NEXT:    br label [[RETURN]]
 ; CHECK:       return:
-; CHECK-NEXT:    [[RETVAL:%.*]] = phi double [ [[DIV]], [[ELSE]] ], [ [[X]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi double [ [[DIV]], [[ELSE]] ], [ [[X]], [[ENTRY]] ]
 ; CHECK-NEXT:    ret double [[RETVAL]]
 ;
 entry:
diff --git a/llvm/test/Transforms/Util/PredicateInfo/pr33456.ll b/llvm/test/Transforms/Util/PredicateInfo/pr33456.ll
index 36eaf6e66578d..4762d376ef5aa 100644
--- a/llvm/test/Transforms/Util/PredicateInfo/pr33456.ll
+++ b/llvm/test/Transforms/Util/PredicateInfo/pr33456.ll
@@ -1,5 +1,5 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=print-predicateinfo < %s 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-inst-comments
+; RUN: opt -passes=print-predicateinfo -disable-output < %s 2>&1 | FileCheck %s
 ; Don't insert predicate info for conditions with a single target.
 @a = global i32 1, align 4
 @d = common global i32 0, align 4
@@ -12,22 +12,27 @@ define i32 @main() {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr @d, align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0
 ; CHECK-NEXT:    br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP13:%.*]]
-; CHECK:         [[TMP4:%.*]] = load i32, ptr @a, align 4
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr @a, align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr @c, align 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp slt i32 [[TMP5]], 1
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
-; CHECK:         [[TMP8:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[TMP4]], 0
 ; CHECK-NEXT:    br i1 [[TMP8]], label [[TMP9]], label [[TMP9]]
-; CHECK:         [[DOT0:%.*]] = phi i32 [ [[TMP4]], [[TMP7]] ], [ [[TMP4]], [[TMP7]] ], [ [[DOT1:%.*]], [[TMP13]] ], [ [[TMP4]], [[TMP3]] ]
+; CHECK:       9:
+; CHECK-NEXT:    [[DOT0:%.*]] = phi i32 [ [[TMP4]], [[TMP7]] ], [ [[TMP4]], [[TMP7]] ], [ [[DOT1:%.*]], [[TMP13]] ], [ [[TMP4]], [[TMP3]] ]
 ; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr @b, align 4
 ; CHECK-NEXT:    [[TMP11:%.*]] = sdiv i32 [[TMP10]], [[DOT0]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[TMP11]], 0
 ; CHECK-NEXT:    br i1 [[TMP12]], label [[TMP13]], label [[TMP13]]
-; CHECK:         [[DOT1]] = phi i32 [ [[DOT0]], [[TMP9]] ], [ [[DOT0]], [[TMP9]] ], [ undef, [[TMP0:%.*]] ]
+; CHECK:       13:
+; CHECK-NEXT:    [[DOT1]] = phi i32 [ [[DOT0]], [[TMP9]] ], [ [[DOT0]], [[TMP9]] ], [ undef, [[TMP0:%.*]] ]
 ; CHECK-NEXT:    [[TMP14:%.*]] = load i32, ptr @e, align 4
 ; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[TMP14]], 0
 ; CHECK-NEXT:    br i1 [[TMP15]], label [[TMP16:%.*]], label [[TMP9]]
-; CHECK:         ret i32 0
+; CHECK:       16:
+; CHECK-NEXT:    ret i32 0
 ;
   %1 = load i32, ptr @d, align 4
   %2 = icmp eq i32 %1, 0
diff --git a/llvm/test/Transforms/Util/PredicateInfo/pr33457.ll b/llvm/test/Transforms/Util/PredicateInfo/pr33457.ll
index bc1d39f371515..e4fd4cc6dd8a2 100644
--- a/llvm/test/Transforms/Util/PredicateInfo/pr33457.ll
+++ b/llvm/test/Transforms/Util/PredicateInfo/pr33457.ll
@@ -1,5 +1,5 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=print-predicateinfo < %s 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-inst-comments
+; RUN: opt -passes=print-predicateinfo -disable-output < %s 2>&1 | FileCheck %s
 ; Don't insert predicate info for conditions with a single target.
 @a = global i32 6, align 4
 @c = global i32 -1, align 4
@@ -13,26 +13,32 @@ define i32 @main() {
 ; CHECK-LABEL: @main(
 ; CHECK-NEXT:    store i32 6, ptr @e, align 4
 ; CHECK-NEXT:    br label [[TMP1:%.*]]
-; CHECK:         [[TMP2:%.*]] = load i32, ptr @d, align 4
+; CHECK:       1:
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @d, align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = sext i32 [[TMP2]] to i64
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [6 x i32], ptr @b, i64 0, i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 (ptr, ...) @printf(ptr @.str, i32 [[TMP5]])
 ; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr @a, align 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 0
-; CHECK-NEXT:    br i1 [[TMP8]], label %thread-pre-split, label [[TMP9:%.*]]
-; CHECK:         [[TMP10:%.*]] = load i32, ptr @e, align 4
+; CHECK-NEXT:    br i1 [[TMP8]], label [[THREAD_PRE_SPLIT:%.*]], label [[TMP9:%.*]]
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr @e, align 4
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 0
 ; CHECK-NEXT:    br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP12]]
 ; CHECK:       thread-pre-split:
 ; CHECK-NEXT:    [[DOTPR:%.*]] = load i32, ptr @e, align 4
 ; CHECK-NEXT:    br label [[TMP12]]
-; CHECK:         [[TMP13:%.*]] = phi i32 [ [[DOTPR]], %thread-pre-split ], [ [[TMP10]], [[TMP9]] ], [ [[TMP10]], [[TMP9]] ]
+; CHECK:       12:
+; CHECK-NEXT:    [[TMP13:%.*]] = phi i32 [ [[DOTPR]], [[THREAD_PRE_SPLIT]] ], [ [[TMP10]], [[TMP9]] ], [ [[TMP10]], [[TMP9]] ]
 ; CHECK-NEXT:    [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
 ; CHECK-NEXT:    br i1 [[TMP14]], label [[TMP15:%.*]], label [[TMP15]]
-; CHECK:         br i1 [[TMP14]], label [[TMP16:%.*]], label [[TMP17:%.*]]
-; CHECK:         br label [[TMP17]]
-; CHECK:         [[DOT0:%.*]] = phi i32 [ 1, [[TMP16]] ], [ -1, [[TMP15]] ]
+; CHECK:       15:
+; CHECK-NEXT:    br i1 [[TMP14]], label [[TMP16:%.*]], label [[TMP17:%.*]]
+; CHECK:       16:
+; CHECK-NEXT:    br label [[TMP17]]
+; CHECK:       17:
+; CHECK-NEXT:    [[DOT0:%.*]] = phi i32 [ 1, [[TMP16]] ], [ -1, [[TMP15]] ]
 ; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[DOT0]], 8693
 ; CHECK-NEXT:    [[TMP19:%.*]] = load i32, ptr @c, align 4
 ; CHECK-NEXT:    [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]]
@@ -40,7 +46,8 @@ define i32 @main() {
 ; CHECK-NEXT:    store i32 [[TMP21]], ptr @d, align 4
 ; CHECK-NEXT:    [[TMP22:%.*]] = icmp slt i32 [[TMP20]], -2
 ; CHECK-NEXT:    br i1 [[TMP22]], label [[TMP1]], label [[TMP23:%.*]]
-; CHECK:         ret i32 0
+; CHECK:       23:
+; CHECK-NEXT:    ret i32 0
 ;
   store i32 6, ptr @e, align 4
   br label %1
diff --git a/llvm/test/Transforms/Util/PredicateInfo/testandor.ll b/llvm/test/Transforms/Util/PredicateInfo/testandor.ll
index cc1dc4e6989a1..d29aadd54128c 100644
--- a/llvm/test/Transforms/Util/PredicateInfo/testandor.ll
+++ b/llvm/test/Transforms/Util/PredicateInfo/testandor.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-inst-comments
 ; RUN: opt -passes=print-predicateinfo -disable-output < %s 2>&1 | FileCheck %s
 
 declare void @foo(i1)
@@ -10,12 +10,17 @@ define void @test_or(i32 %x, i32 %y) {
 ; CHECK-NEXT:    [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0
 ; CHECK-NEXT:    [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0
 ; CHECK-NEXT:    [[Z:%.*]] = or i1 [[XZ]], [[YZ]]
-; CHECK:         [[Z_0:%.*]] = bitcast i1 [[Z]] to i1
-; CHECK:         [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1
-; CHECK:         [[X_0:%.*]] = bitcast i32 [[X]] to i32
-; CHECK:         [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1
-; CHECK:         [[Y_0:%.*]] = bitcast i32 [[Y]] to i32
-; CHECK-NEXT:    br i1 [[Z]], label [[ONEOF:%.*]], label [[NEITHER:%.*]]
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[Z]] = or i1 [[XZ]], [[YZ]] Edge: [label [[TMP0:%.*]],label [[NEITHER:%.*]]], RenamedOp: [[Z]] }
+; CHECK-NEXT:    [[Z_0:%.*]] = bitcast i1 [[Z]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[NEITHER]]], RenamedOp: [[XZ]] }
+; CHECK-NEXT:    [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[NEITHER]]], RenamedOp: [[X]] }
+; CHECK-NEXT:    [[X_0:%.*]] = bitcast i32 [[X]] to i32
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[NEITHER]]], RenamedOp: [[YZ]] }
+; CHECK-NEXT:    [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[NEITHER]]], RenamedOp: [[Y]] }
+; CHECK-NEXT:    [[Y_0:%.*]] = bitcast i32 [[Y]] to i32
+; CHECK-NEXT:    br i1 [[Z]], label [[ONEOF:%.*]], label [[NEITHER]]
 ; CHECK:       oneof:
 ; CHECK-NEXT:    call void @foo(i1 [[XZ]])
 ; CHECK-NEXT:    call void @foo(i1 [[YZ]])
@@ -55,12 +60,17 @@ define void @test_or_logical(i32 %x, i32 %y) {
 ; CHECK-NEXT:    [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0
 ; CHECK-NEXT:    [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0
 ; CHECK-NEXT:    [[Z:%.*]] = select i1 [[XZ]], i1 true, i1 [[YZ]]
-; CHECK:         [[Z_0:%.*]] = bitcast i1 [[Z]] to i1
-; CHECK:         [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1
-; CHECK:         [[X_0:%.*]] = bitcast i32 [[X]] to i32
-; CHECK:         [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1
-; CHECK:         [[Y_0:%.*]] = bitcast i32 [[Y]] to i32
-; CHECK-NEXT:    br i1 [[Z]], label [[ONEOF:%.*]], label [[NEITHER:%.*]]
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[Z]] = select i1 [[XZ]], i1 true, i1 [[YZ]] Edge: [label [[TMP0:%.*]],label [[NEITHER:%.*]]], RenamedOp: [[Z]] }
+; CHECK-NEXT:    [[Z_0:%.*]] = bitcast i1 [[Z]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[NEITHER]]], RenamedOp: [[XZ]] }
+; CHECK-NEXT:    [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[NEITHER]]], RenamedOp: [[X]] }
+; CHECK-NEXT:    [[X_0:%.*]] = bitcast i32 [[X]] to i32
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[NEITHER]]], RenamedOp: [[YZ]] }
+; CHECK-NEXT:    [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[NEITHER]]], RenamedOp: [[Y]] }
+; CHECK-NEXT:    [[Y_0:%.*]] = bitcast i32 [[Y]] to i32
+; CHECK-NEXT:    br i1 [[Z]], label [[ONEOF:%.*]], label [[NEITHER]]
 ; CHECK:       oneof:
 ; CHECK-NEXT:    call void @foo(i1 [[XZ]])
 ; CHECK-NEXT:    call void @foo(i1 [[YZ]])
@@ -100,12 +110,17 @@ define void @test_and(i32 %x, i32 %y) {
 ; CHECK-NEXT:    [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0
 ; CHECK-NEXT:    [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0
 ; CHECK-NEXT:    [[Z:%.*]] = and i1 [[XZ]], [[YZ]]
-; CHECK:         [[Z_0:%.*]] = bitcast i1 [[Z]] to i1
-; CHECK:         [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1
-; CHECK:         [[X_0:%.*]] = bitcast i32 [[X]] to i32
-; CHECK:         [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1
-; CHECK:         [[Y_0:%.*]] = bitcast i32 [[Y]] to i32
-; CHECK-NEXT:    br i1 [[Z]], label [[BOTH:%.*]], label [[NOPE:%.*]]
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[Z]] = and i1 [[XZ]], [[YZ]] Edge: [label [[TMP0:%.*]],label [[NOPE:%.*]]], RenamedOp: [[Z]] }
+; CHECK-NEXT:    [[Z_0:%.*]] = bitcast i1 [[Z]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH:%.*]]], RenamedOp: [[XZ]] }
+; CHECK-NEXT:    [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[X]] }
+; CHECK-NEXT:    [[X_0:%.*]] = bitcast i32 [[X]] to i32
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[YZ]] }
+; CHECK-NEXT:    [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[Y]] }
+; CHECK-NEXT:    [[Y_0:%.*]] = bitcast i32 [[Y]] to i32
+; CHECK-NEXT:    br i1 [[Z]], label [[BOTH]], label [[NOPE]]
 ; CHECK:       both:
 ; CHECK-NEXT:    call void @foo(i1 [[XZ_0]])
 ; CHECK-NEXT:    call void @foo(i1 [[YZ_0]])
@@ -145,12 +160,17 @@ define void @test_and_logical(i32 %x, i32 %y) {
 ; CHECK-NEXT:    [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0
 ; CHECK-NEXT:    [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0
 ; CHECK-NEXT:    [[Z:%.*]] = select i1 [[XZ]], i1 [[YZ]], i1 false
-; CHECK:         [[Z_0:%.*]] = bitcast i1 [[Z]] to i1
-; CHECK:         [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1
-; CHECK:         [[X_0:%.*]] = bitcast i32 [[X]] to i32
-; CHECK:         [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1
-; CHECK:         [[Y_0:%.*]] = bitcast i32 [[Y]] to i32
-; CHECK-NEXT:    br i1 [[Z]], label [[BOTH:%.*]], label [[NOPE:%.*]]
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[Z]] = select i1 [[XZ]], i1 [[YZ]], i1 false Edge: [label [[TMP0:%.*]],label [[NOPE:%.*]]], RenamedOp: [[Z]] }
+; CHECK-NEXT:    [[Z_0:%.*]] = bitcast i1 [[Z]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH:%.*]]], RenamedOp: [[XZ]] }
+; CHECK-NEXT:    [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[X]] }
+; CHECK-NEXT:    [[X_0:%.*]] = bitcast i32 [[X]] to i32
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[YZ]] }
+; CHECK-NEXT:    [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[Y]] }
+; CHECK-NEXT:    [[Y_0:%.*]] = bitcast i32 [[Y]] to i32
+; CHECK-NEXT:    br i1 [[Z]], label [[BOTH]], label [[NOPE]]
 ; CHECK:       both:
 ; CHECK-NEXT:    call void @foo(i1 [[XZ_0]])
 ; CHECK-NEXT:    call void @foo(i1 [[YZ_0]])
@@ -190,12 +210,17 @@ define void @testandsame(i32 %x, i32 %y) {
 ; CHECK-NEXT:    [[XGT:%.*]] = icmp sgt i32 [[X:%.*]], 0
 ; CHECK-NEXT:    [[XLT:%.*]] = icmp slt i32 [[X]], 100
 ; CHECK-NEXT:    [[Z:%.*]] = and i1 [[XGT]], [[XLT]]
-; CHECK:         [[Z_0:%.*]] = bitcast i1 [[Z]] to i1
-; CHECK:         [[XGT_0:%.*]] = bitcast i1 [[XGT]] to i1
-; CHECK:         [[X_0:%.*]] = bitcast i32 [[X]] to i32
-; CHECK:         [[X_0_1:%.*]] = bitcast i32 [[X_0]] to i32
-; CHECK:         [[XLT_0:%.*]] = bitcast i1 [[XLT]] to i1
-; CHECK-NEXT:    br i1 [[Z]], label [[BOTH:%.*]], label [[NOPE:%.*]]
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[Z]] = and i1 [[XGT]], [[XLT]] Edge: [label [[TMP0:%.*]],label [[NOPE:%.*]]], RenamedOp: [[Z]] }
+; CHECK-NEXT:    [[Z_0:%.*]] = bitcast i1 [[Z]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[XGT]] = icmp sgt i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH:%.*]]], RenamedOp: [[XGT]] }
+; CHECK-NEXT:    [[XGT_0:%.*]] = bitcast i1 [[XGT]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[XGT]] = icmp sgt i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[X]] }
+; CHECK-NEXT:    [[X_0:%.*]] = bitcast i32 [[X]] to i32
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[XLT]] = icmp slt i32 [[X]], 100 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[X]] }
+; CHECK-NEXT:    [[X_0_1:%.*]] = bitcast i32 [[X_0]] to i32
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[XLT]] = icmp slt i32 [[X]], 100 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[XLT]] }
+; CHECK-NEXT:    [[XLT_0:%.*]] = bitcast i1 [[XLT]] to i1
+; CHECK-NEXT:    br i1 [[Z]], label [[BOTH]], label [[NOPE]]
 ; CHECK:       both:
 ; CHECK-NEXT:    call void @foo(i1 [[XGT_0]])
 ; CHECK-NEXT:    call void @foo(i1 [[XLT_0]])
@@ -229,17 +254,27 @@ define void @testandassume(i32 %x, i32 %y) {
 ; CHECK-NEXT:    [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0
 ; CHECK-NEXT:    [[Z:%.*]] = and i1 [[XZ]], [[YZ]]
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[Z]])
-; CHECK:         [[TMP1:%.*]] = bitcast i32 [[Y]] to i32
-; CHECK:         [[TMP2:%.*]] = bitcast i1 [[YZ]] to i1
-; CHECK:         [[TMP3:%.*]] = bitcast i32 [[X]] to i32
-; CHECK:         [[TMP4:%.*]] = bitcast i1 [[XZ]] to i1
-; CHECK:         [[TMP5:%.*]] = bitcast i1 [[Z]] to i1
-; CHECK:         [[DOT0:%.*]] = bitcast i1 [[TMP5]] to i1
-; CHECK:         [[DOT01:%.*]] = bitcast i1 [[TMP4]] to i1
-; CHECK:         [[DOT02:%.*]] = bitcast i32 [[TMP3]] to i32
-; CHECK:         [[DOT03:%.*]] = bitcast i1 [[TMP2]] to i1
-; CHECK:         [[DOT04:%.*]] = bitcast i32 [[TMP1]] to i32
-; CHECK-NEXT:    br i1 [[TMP5]], label [[BOTH:%.*]], label [[NOPE:%.*]]
+; CHECK-NEXT:  ; assume predicate info { Comparison: [[YZ]] = icmp eq i32 [[Y]], 0, RenamedOp: [[Y]] }
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[Y]] to i32
+; CHECK-NEXT:  ; assume predicate info { Comparison: [[YZ]] = icmp eq i32 [[Y]], 0, RenamedOp: [[YZ]] }
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i1 [[YZ]] to i1
+; CHECK-NEXT:  ; assume predicate info { Comparison: [[XZ]] = icmp eq i32 [[X]], 0, RenamedOp: [[X]] }
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 [[X]] to i32
+; CHECK-NEXT:  ; assume predicate info { Comparison: [[XZ]] = icmp eq i32 [[X]], 0, RenamedOp: [[XZ]] }
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i1 [[XZ]] to i1
+; CHECK-NEXT:  ; assume predicate info { Comparison: [[Z]] = and i1 [[XZ]], [[YZ]], RenamedOp: [[Z]] }
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i1 [[Z]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[Z]] = and i1 [[XZ]], [[YZ]] Edge: [label [[TMP0:%.*]],label [[NOPE:%.*]]], RenamedOp: [[TMP5]] }
+; CHECK-NEXT:    [[DOT0:%.*]] = bitcast i1 [[TMP5]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH:%.*]]], RenamedOp: [[XZ]] }
+; CHECK-NEXT:    [[DOT01:%.*]] = bitcast i1 [[TMP4]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[X]] }
+; CHECK-NEXT:    [[DOT02:%.*]] = bitcast i32 [[TMP3]] to i32
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[YZ]] }
+; CHECK-NEXT:    [[DOT03:%.*]] = bitcast i1 [[TMP2]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[Y]] }
+; CHECK-NEXT:    [[DOT04:%.*]] = bitcast i32 [[TMP1]] to i32
+; CHECK-NEXT:    br i1 [[TMP5]], label [[BOTH]], label [[NOPE]]
 ; CHECK:       both:
 ; CHECK-NEXT:    call void @foo(i1 [[DOT01]])
 ; CHECK-NEXT:    call void @foo(i1 [[DOT03]])
@@ -274,9 +309,11 @@ define void @testorassume(i32 %x, i32 %y) {
 ; CHECK-NEXT:    [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0
 ; CHECK-NEXT:    [[Z:%.*]] = or i1 [[XZ]], [[YZ]]
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[Z]])
-; CHECK:         [[TMP1:%.*]] = bitcast i1 [[Z]] to i1
-; CHECK:         [[DOT0:%.*]] = bitcast i1 [[TMP1]] to i1
-; CHECK-NEXT:    br i1 [[TMP1]], label [[BOTH:%.*]], label [[NOPE:%.*]]
+; CHECK-NEXT:  ; assume predicate info { Comparison: [[Z]] = or i1 [[XZ]], [[YZ]], RenamedOp: [[Z]] }
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i1 [[Z]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[Z]] = or i1 [[XZ]], [[YZ]] Edge: [label [[TMP0:%.*]],label [[NOPE:%.*]]], RenamedOp: [[TMP1]] }
+; CHECK-NEXT:    [[DOT0:%.*]] = bitcast i1 [[TMP1]] to i1
+; CHECK-NEXT:    br i1 [[TMP1]], label [[BOTH:%.*]], label [[NOPE]]
 ; CHECK:       both:
 ; CHECK-NEXT:    call void @foo(i1 [[XZ]])
 ; CHECK-NEXT:    call void @foo(i1 [[YZ]])
@@ -307,12 +344,17 @@ define void @test_and_one_unknown_cond(i32 %x, i1 %c1) {
 ; CHECK-LABEL: @test_and_one_unknown_cond(
 ; CHECK-NEXT:    [[C2:%.*]] = icmp eq i32 [[X:%.*]], 0
 ; CHECK-NEXT:    [[A:%.*]] = and i1 [[C1:%.*]], [[C2]]
-; CHECK:         [[A_0:%.*]] = bitcast i1 [[A]] to i1
-; CHECK:         [[A_1:%.*]] = bitcast i1 [[A]] to i1
-; CHECK:         [[C1_0:%.*]] = bitcast i1 [[C1]] to i1
-; CHECK:         [[C2_0:%.*]] = bitcast i1 [[C2]] to i1
-; CHECK:         [[X_0:%.*]] = bitcast i32 [[X]] to i32
-; CHECK-NEXT:    br i1 [[A]], label [[BOTH:%.*]], label [[NOPE:%.*]]
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[A]] = and i1 [[C1]], [[C2]] Edge: [label [[TMP0:%.*]],label [[BOTH:%.*]]], RenamedOp: [[A]] }
+; CHECK-NEXT:    [[A_0:%.*]] = bitcast i1 [[A]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[A]] = and i1 [[C1]], [[C2]] Edge: [label [[TMP0]],label [[NOPE:%.*]]], RenamedOp: [[A]] }
+; CHECK-NEXT:    [[A_1:%.*]] = bitcast i1 [[A]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison:i1 [[C1]] Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[C1]] }
+; CHECK-NEXT:    [[C1_0:%.*]] = bitcast i1 [[C1]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[C2]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[C2]] }
+; CHECK-NEXT:    [[C2_0:%.*]] = bitcast i1 [[C2]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[C2]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[X]] }
+; CHECK-NEXT:    [[X_0:%.*]] = bitcast i32 [[X]] to i32
+; CHECK-NEXT:    br i1 [[A]], label [[BOTH]], label [[NOPE]]
 ; CHECK:       both:
 ; CHECK-NEXT:    call void @bar(i32 [[X_0]])
 ; CHECK-NEXT:    call void @foo(i1 [[C1_0]])
@@ -349,12 +391,17 @@ define void @test_or_one_unknown_cond(i32 %x, i1 %c1) {
 ; CHECK-LABEL: @test_or_one_unknown_cond(
 ; CHECK-NEXT:    [[C2:%.*]] = icmp eq i32 [[X:%.*]], 0
 ; CHECK-NEXT:    [[A:%.*]] = or i1 [[C1:%.*]], [[C2]]
-; CHECK:         [[A_0:%.*]] = bitcast i1 [[A]] to i1
-; CHECK:         [[A_1:%.*]] = bitcast i1 [[A]] to i1
-; CHECK:         [[C1_0:%.*]] = bitcast i1 [[C1]] to i1
-; CHECK:         [[C2_0:%.*]] = bitcast i1 [[C2]] to i1
-; CHECK:         [[X_0:%.*]] = bitcast i32 [[X]] to i32
-; CHECK-NEXT:    br i1 [[A]], label [[NOPE:%.*]], label [[BOTH_INVERTED:%.*]]
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[A]] = or i1 [[C1]], [[C2]] Edge: [label [[TMP0:%.*]],label [[NOPE:%.*]]], RenamedOp: [[A]] }
+; CHECK-NEXT:    [[A_0:%.*]] = bitcast i1 [[A]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[A]] = or i1 [[C1]], [[C2]] Edge: [label [[TMP0]],label [[BOTH_INVERTED:%.*]]], RenamedOp: [[A]] }
+; CHECK-NEXT:    [[A_1:%.*]] = bitcast i1 [[A]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison:i1 [[C1]] Edge: [label [[TMP0]],label [[BOTH_INVERTED]]], RenamedOp: [[C1]] }
+; CHECK-NEXT:    [[C1_0:%.*]] = bitcast i1 [[C1]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[C2]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH_INVERTED]]], RenamedOp: [[C2]] }
+; CHECK-NEXT:    [[C2_0:%.*]] = bitcast i1 [[C2]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[C2]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH_INVERTED]]], RenamedOp: [[X]] }
+; CHECK-NEXT:    [[X_0:%.*]] = bitcast i32 [[X]] to i32
+; CHECK-NEXT:    br i1 [[A]], label [[NOPE]], label [[BOTH_INVERTED]]
 ; CHECK:       both_inverted:
 ; CHECK-NEXT:    call void @bar(i32 [[X_0]])
 ; CHECK-NEXT:    call void @foo(i1 [[C1_0]])
@@ -391,13 +438,19 @@ define void @test_and_chain(i1 %a, i1 %b, i1 %c) {
 ; CHECK-LABEL: @test_and_chain(
 ; CHECK-NEXT:    [[AND1:%.*]] = and i1 [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[AND2:%.*]] = and i1 [[AND1]], [[C:%.*]]
-; CHECK:         [[AND2_0:%.*]] = bitcast i1 [[AND2]] to i1
-; CHECK:         [[AND2_1:%.*]] = bitcast i1 [[AND2]] to i1
-; CHECK:         [[AND1_0:%.*]] = bitcast i1 [[AND1]] to i1
-; CHECK:         [[A_0:%.*]] = bitcast i1 [[A]] to i1
-; CHECK:         [[B_0:%.*]] = bitcast i1 [[B]] to i1
-; CHECK:         [[C_0:%.*]] = bitcast i1 [[C]] to i1
-; CHECK-NEXT:    br i1 [[AND2]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[AND2]] = and i1 [[AND1]], [[C]] Edge: [label [[TMP0:%.*]],label [[IF:%.*]]], RenamedOp: [[AND2]] }
+; CHECK-NEXT:    [[AND2_0:%.*]] = bitcast i1 [[AND2]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[AND2]] = and i1 [[AND1]], [[C]] Edge: [label [[TMP0]],label [[ELSE:%.*]]], RenamedOp: [[AND2]] }
+; CHECK-NEXT:    [[AND2_1:%.*]] = bitcast i1 [[AND2]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[AND1]] = and i1 [[A]], [[B]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[AND1]] }
+; CHECK-NEXT:    [[AND1_0:%.*]] = bitcast i1 [[AND1]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison:i1 [[A]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A]] }
+; CHECK-NEXT:    [[A_0:%.*]] = bitcast i1 [[A]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison:i1 [[B]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[B]] }
+; CHECK-NEXT:    [[B_0:%.*]] = bitcast i1 [[B]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison:i1 [[C]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[C]] }
+; CHECK-NEXT:    [[C_0:%.*]] = bitcast i1 [[C]] to i1
+; CHECK-NEXT:    br i1 [[AND2]], label [[IF]], label [[ELSE]]
 ; CHECK:       if:
 ; CHECK-NEXT:    call void @foo(i1 [[A_0]])
 ; CHECK-NEXT:    call void @foo(i1 [[B_0]])
@@ -438,13 +491,19 @@ define void @test_or_chain(i1 %a, i1 %b, i1 %c) {
 ; CHECK-LABEL: @test_or_chain(
 ; CHECK-NEXT:    [[OR1:%.*]] = or i1 [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[OR2:%.*]] = or i1 [[OR1]], [[C:%.*]]
-; CHECK:         [[OR2_0:%.*]] = bitcast i1 [[OR2]] to i1
-; CHECK:         [[OR2_1:%.*]] = bitcast i1 [[OR2]] to i1
-; CHECK:         [[OR1_0:%.*]] = bitcast i1 [[OR1]] to i1
-; CHECK:         [[A_0:%.*]] = bitcast i1 [[A]] to i1
-; CHECK:         [[B_0:%.*]] = bitcast i1 [[B]] to i1
-; CHECK:         [[C_0:%.*]] = bitcast i1 [[C]] to i1
-; CHECK-NEXT:    br i1 [[OR2]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[OR2]] = or i1 [[OR1]], [[C]] Edge: [label [[TMP0:%.*]],label [[IF:%.*]]], RenamedOp: [[OR2]] }
+; CHECK-NEXT:    [[OR2_0:%.*]] = bitcast i1 [[OR2]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[OR2]] = or i1 [[OR1]], [[C]] Edge: [label [[TMP0]],label [[ELSE:%.*]]], RenamedOp: [[OR2]] }
+; CHECK-NEXT:    [[OR2_1:%.*]] = bitcast i1 [[OR2]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[OR1]] = or i1 [[A]], [[B]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[OR1]] }
+; CHECK-NEXT:    [[OR1_0:%.*]] = bitcast i1 [[OR1]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison:i1 [[A]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[A]] }
+; CHECK-NEXT:    [[A_0:%.*]] = bitcast i1 [[A]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison:i1 [[B]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[B]] }
+; CHECK-NEXT:    [[B_0:%.*]] = bitcast i1 [[B]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison:i1 [[C]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[C]] }
+; CHECK-NEXT:    [[C_0:%.*]] = bitcast i1 [[C]] to i1
+; CHECK-NEXT:    br i1 [[OR2]], label [[IF]], label [[ELSE]]
 ; CHECK:       if:
 ; CHECK-NEXT:    call void @foo(i1 [[A]])
 ; CHECK-NEXT:    call void @foo(i1 [[B]])
@@ -485,11 +544,15 @@ define void @test_and_or_mixed(i1 %a, i1 %b, i1 %c) {
 ; CHECK-LABEL: @test_and_or_mixed(
 ; CHECK-NEXT:    [[OR:%.*]] = or i1 [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[AND:%.*]] = and i1 [[OR]], [[C:%.*]]
-; CHECK:         [[AND_0:%.*]] = bitcast i1 [[AND]] to i1
-; CHECK:         [[AND_1:%.*]] = bitcast i1 [[AND]] to i1
-; CHECK:         [[OR_0:%.*]] = bitcast i1 [[OR]] to i1
-; CHECK:         [[C_0:%.*]] = bitcast i1 [[C]] to i1
-; CHECK-NEXT:    br i1 [[AND]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[AND]] = and i1 [[OR]], [[C]] Edge: [label [[TMP0:%.*]],label [[IF:%.*]]], RenamedOp: [[AND]] }
+; CHECK-NEXT:    [[AND_0:%.*]] = bitcast i1 [[AND]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[AND]] = and i1 [[OR]], [[C]] Edge: [label [[TMP0]],label [[ELSE:%.*]]], RenamedOp: [[AND]] }
+; CHECK-NEXT:    [[AND_1:%.*]] = bitcast i1 [[AND]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[OR]] = or i1 [[A]], [[B]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[OR]] }
+; CHECK-NEXT:    [[OR_0:%.*]] = bitcast i1 [[OR]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison:i1 [[C]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[C]] }
+; CHECK-NEXT:    [[C_0:%.*]] = bitcast i1 [[C]] to i1
+; CHECK-NEXT:    br i1 [[AND]], label [[IF]], label [[ELSE]]
 ; CHECK:       if:
 ; CHECK-NEXT:    call void @foo(i1 [[A]])
 ; CHECK-NEXT:    call void @foo(i1 [[B]])
@@ -542,16 +605,25 @@ define void @test_deep_and_chain(i1 %a1) {
 ; CHECK-NEXT:    [[A13:%.*]] = and i1 [[A12]], true
 ; CHECK-NEXT:    [[A14:%.*]] = and i1 [[A13]], true
 ; CHECK-NEXT:    [[A15:%.*]] = and i1 [[A14]], true
-; CHECK:         [[A15_0:%.*]] = bitcast i1 [[A15]] to i1
-; CHECK:         [[A15_1:%.*]] = bitcast i1 [[A15]] to i1
-; CHECK:         [[A14_0:%.*]] = bitcast i1 [[A14]] to i1
-; CHECK:         [[A13_0:%.*]] = bitcast i1 [[A13]] to i1
-; CHECK:         [[A12_0:%.*]] = bitcast i1 [[A12]] to i1
-; CHECK:         [[A11_0:%.*]] = bitcast i1 [[A11]] to i1
-; CHECK:         [[A10_0:%.*]] = bitcast i1 [[A10]] to i1
-; CHECK:         [[A9_0:%.*]] = bitcast i1 [[A9]] to i1
-; CHECK:         [[A8_0:%.*]] = bitcast i1 [[A8]] to i1
-; CHECK-NEXT:    br i1 [[A15]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[A15]] = and i1 [[A14]], true Edge: [label [[TMP0:%.*]],label [[IF:%.*]]], RenamedOp: [[A15]] }
+; CHECK-NEXT:    [[A15_0:%.*]] = bitcast i1 [[A15]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[A15]] = and i1 [[A14]], true Edge: [label [[TMP0]],label [[ELSE:%.*]]], RenamedOp: [[A15]] }
+; CHECK-NEXT:    [[A15_1:%.*]] = bitcast i1 [[A15]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[A14]] = and i1 [[A13]], true Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A14]] }
+; CHECK-NEXT:    [[A14_0:%.*]] = bitcast i1 [[A14]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[A13]] = and i1 [[A12]], true Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A13]] }
+; CHECK-NEXT:    [[A13_0:%.*]] = bitcast i1 [[A13]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[A12]] = and i1 [[A11]], true Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A12]] }
+; CHECK-NEXT:    [[A12_0:%.*]] = bitcast i1 [[A12]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[A11]] = and i1 [[A10]], true Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A11]] }
+; CHECK-NEXT:    [[A11_0:%.*]] = bitcast i1 [[A11]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[A10]] = and i1 [[A9]], true Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A10]] }
+; CHECK-NEXT:    [[A10_0:%.*]] = bitcast i1 [[A10]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[A9]] = and i1 [[A8]], true Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A9]] }
+; CHECK-NEXT:    [[A9_0:%.*]] = bitcast i1 [[A9]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[A8]] = and i1 [[A7]], true Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A8]] }
+; CHECK-NEXT:    [[A8_0:%.*]] = bitcast i1 [[A8]] to i1
+; CHECK-NEXT:    br i1 [[A15]], label [[IF]], label [[ELSE]]
 ; CHECK:       if:
 ; CHECK-NEXT:    call void @foo(i1 [[A1]])
 ; CHECK-NEXT:    call void @foo(i1 [[A2]])
@@ -656,16 +728,25 @@ define void @test_deep_and_tree(i1 %a1) {
 ; CHECK-NEXT:    [[A13:%.*]] = and i1 [[A12]], [[A12]]
 ; CHECK-NEXT:    [[A14:%.*]] = and i1 [[A13]], [[A13]]
 ; CHECK-NEXT:    [[A15:%.*]] = and i1 [[A14]], [[A14]]
-; CHECK:         [[A15_0:%.*]] = bitcast i1 [[A15]] to i1
-; CHECK:         [[A15_1:%.*]] = bitcast i1 [[A15]] to i1
-; CHECK:         [[A14_0:%.*]] = bitcast i1 [[A14]] to i1
-; CHECK:         [[A13_0:%.*]] = bitcast i1 [[A13]] to i1
-; CHECK:         [[A12_0:%.*]] = bitcast i1 [[A12]] to i1
-; CHECK:         [[A11_0:%.*]] = bitcast i1 [[A11]] to i1
-; CHECK:         [[A10_0:%.*]] = bitcast i1 [[A10]] to i1
-; CHECK:         [[A9_0:%.*]] = bitcast i1 [[A9]] to i1
-; CHECK:         [[A8_0:%.*]] = bitcast i1 [[A8]] to i1
-; CHECK-NEXT:    br i1 [[A15]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[A15]] = and i1 [[A14]], [[A14]] Edge: [label [[TMP0:%.*]],label [[IF:%.*]]], RenamedOp: [[A15]] }
+; CHECK-NEXT:    [[A15_0:%.*]] = bitcast i1 [[A15]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[A15]] = and i1 [[A14]], [[A14]] Edge: [label [[TMP0]],label [[ELSE:%.*]]], RenamedOp: [[A15]] }
+; CHECK-NEXT:    [[A15_1:%.*]] = bitcast i1 [[A15]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[A14]] = and i1 [[A13]], [[A13]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A14]] }
+; CHECK-NEXT:    [[A14_0:%.*]] = bitcast i1 [[A14]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[A13]] = and i1 [[A12]], [[A12]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A13]] }
+; CHECK-NEXT:    [[A13_0:%.*]] = bitcast i1 [[A13]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[A12]] = and i1 [[A11]], [[A11]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A12]] }
+; CHECK-NEXT:    [[A12_0:%.*]] = bitcast i1 [[A12]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[A11]] = and i1 [[A10]], [[A10]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A11]] }
+; CHECK-NEXT:    [[A11_0:%.*]] = bitcast i1 [[A11]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[A10]] = and i1 [[A9]], [[A9]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A10]] }
+; CHECK-NEXT:    [[A10_0:%.*]] = bitcast i1 [[A10]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[A9]] = and i1 [[A8]], [[A8]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A9]] }
+; CHECK-NEXT:    [[A9_0:%.*]] = bitcast i1 [[A9]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[A8]] = and i1 [[A7]], [[A7]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A8]] }
+; CHECK-NEXT:    [[A8_0:%.*]] = bitcast i1 [[A8]] to i1
+; CHECK-NEXT:    br i1 [[A15]], label [[IF]], label [[ELSE]]
 ; CHECK:       if:
 ; CHECK-NEXT:    call void @foo(i1 [[A1]])
 ; CHECK-NEXT:    call void @foo(i1 [[A2]])
@@ -770,16 +851,25 @@ define void @test_deep_or_tree(i1 %a1) {
 ; CHECK-NEXT:    [[A13:%.*]] = or i1 [[A12]], [[A12]]
 ; CHECK-NEXT:    [[A14:%.*]] = or i1 [[A13]], [[A13]]
 ; CHECK-NEXT:    [[A15:%.*]] = or i1 [[A14]], [[A14]]
-; CHECK:         [[A15_0:%.*]] = bitcast i1 [[A15]] to i1
-; CHECK:         [[A15_1:%.*]] = bitcast i1 [[A15]] to i1
-; CHECK:         [[A14_0:%.*]] = bitcast i1 [[A14]] to i1
-; CHECK:         [[A13_0:%.*]] = bitcast i1 [[A13]] to i1
-; CHECK:         [[A12_0:%.*]] = bitcast i1 [[A12]] to i1
-; CHECK:         [[A11_0:%.*]] = bitcast i1 [[A11]] to i1
-; CHECK:         [[A10_0:%.*]] = bitcast i1 [[A10]] to i1
-; CHECK:         [[A9_0:%.*]] = bitcast i1 [[A9]] to i1
-; CHECK:         [[A8_0:%.*]] = bitcast i1 [[A8]] to i1
-; CHECK-NEXT:    br i1 [[A15]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[A15]] = or i1 [[A14]], [[A14]] Edge: [label [[TMP0:%.*]],label [[IF:%.*]]], RenamedOp: [[A15]] }
+; CHECK-NEXT:    [[A15_0:%.*]] = bitcast i1 [[A15]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[A15]] = or i1 [[A14]], [[A14]] Edge: [label [[TMP0]],label [[ELSE:%.*]]], RenamedOp: [[A15]] }
+; CHECK-NEXT:    [[A15_1:%.*]] = bitcast i1 [[A15]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[A14]] = or i1 [[A13]], [[A13]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[A14]] }
+; CHECK-NEXT:    [[A14_0:%.*]] = bitcast i1 [[A14]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[A13]] = or i1 [[A12]], [[A12]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[A13]] }
+; CHECK-NEXT:    [[A13_0:%.*]] = bitcast i1 [[A13]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[A12]] = or i1 [[A11]], [[A11]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[A12]] }
+; CHECK-NEXT:    [[A12_0:%.*]] = bitcast i1 [[A12]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[A11]] = or i1 [[A10]], [[A10]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[A11]] }
+; CHECK-NEXT:    [[A11_0:%.*]] = bitcast i1 [[A11]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[A10]] = or i1 [[A9]], [[A9]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[A10]] }
+; CHECK-NEXT:    [[A10_0:%.*]] = bitcast i1 [[A10]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[A9]] = or i1 [[A8]], [[A8]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[A9]] }
+; CHECK-NEXT:    [[A9_0:%.*]] = bitcast i1 [[A9]] to i1
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 0 Comparison: [[A8]] = or i1 [[A7]], [[A7]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[A8]] }
+; CHECK-NEXT:    [[A8_0:%.*]] = bitcast i1 [[A8]] to i1
+; CHECK-NEXT:    br i1 [[A15]], label [[IF]], label [[ELSE]]
 ; CHECK:       if:
 ; CHECK-NEXT:    call void @foo(i1 [[A1]])
 ; CHECK-NEXT:    call void @foo(i1 [[A2]])
@@ -873,11 +963,16 @@ define void @test_assume_and_chain(i1 %a, i1 %b, i1 %c) {
 ; CHECK-NEXT:    [[AND1:%.*]] = and i1 [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[AND2:%.*]] = and i1 [[AND1]], [[C:%.*]]
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[AND2]])
-; CHECK:         [[TMP1:%.*]] = bitcast i1 [[C]] to i1
-; CHECK:         [[TMP2:%.*]] = bitcast i1 [[B]] to i1
-; CHECK:         [[TMP3:%.*]] = bitcast i1 [[A]] to i1
-; CHECK:         [[TMP4:%.*]] = bitcast i1 [[AND1]] to i1
-; CHECK:         [[TMP5:%.*]] = bitcast i1 [[AND2]] to i1
+; CHECK-NEXT:  ; assume predicate info { Comparison:i1 [[C]], RenamedOp: [[C]] }
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i1 [[C]] to i1
+; CHECK-NEXT:  ; assume predicate info { Comparison:i1 [[B]], RenamedOp: [[B]] }
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i1 [[B]] to i1
+; CHECK-NEXT:  ; assume predicate info { Comparison:i1 [[A]], RenamedOp: [[A]] }
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i1 [[A]] to i1
+; CHECK-NEXT:  ; assume predicate info { Comparison: [[AND1]] = and i1 [[A]], [[B]], RenamedOp: [[AND1]] }
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i1 [[AND1]] to i1
+; CHECK-NEXT:  ; assume predicate info { Comparison: [[AND2]] = and i1 [[AND1]], [[C]], RenamedOp: [[AND2]] }
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i1 [[AND2]] to i1
 ; CHECK-NEXT:    call void @foo(i1 [[TMP3]])
 ; CHECK-NEXT:    call void @foo(i1 [[TMP2]])
 ; CHECK-NEXT:    call void @foo(i1 [[TMP1]])
@@ -901,7 +996,8 @@ define void @test_assume_or_chain(i1 %a, i1 %b, i1 %c) {
 ; CHECK-NEXT:    [[OR1:%.*]] = or i1 [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[OR2:%.*]] = or i1 [[OR1]], [[C:%.*]]
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OR2]])
-; CHECK:         [[TMP1:%.*]] = bitcast i1 [[OR2]] to i1
+; CHECK-NEXT:  ; assume predicate info { Comparison: [[OR2]] = or i1 [[OR1]], [[C]], RenamedOp: [[OR2]] }
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i1 [[OR2]] to i1
 ; CHECK-NEXT:    call void @foo(i1 [[A]])
 ; CHECK-NEXT:    call void @foo(i1 [[B]])
 ; CHECK-NEXT:    call void @foo(i1 [[C]])
@@ -937,14 +1033,22 @@ define void @test_assume_deep_and_tree(i1 %a1) {
 ; CHECK-NEXT:    [[A14:%.*]] = and i1 [[A13]], [[A13]]
 ; CHECK-NEXT:    [[A15:%.*]] = and i1 [[A14]], [[A14]]
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[A15]])
-; CHECK:         [[TMP1:%.*]] = bitcast i1 [[A8]] to i1
-; CHECK:         [[TMP2:%.*]] = bitcast i1 [[A9]] to i1
-; CHECK:         [[TMP3:%.*]] = bitcast i1 [[A10]] to i1
-; CHECK:         [[TMP4:%.*]] = bitcast i1 [[A11]] to i1
-; CHECK:         [[TMP5:%.*]] = bitcast i1 [[A12]] to i1
-; CHECK:         [[TMP6:%.*]] = bitcast i1 [[A13]] to i1
-; CHECK:         [[TMP7:%.*]] = bitcast i1 [[A14]] to i1
-; CHECK:         [[TMP8:%.*]] = bitcast i1 [[A15]] to i1
+; CHECK-NEXT:  ; assume predicate info { Comparison: [[A8]] = and i1 [[A7]], [[A7]], RenamedOp: [[A8]] }
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i1 [[A8]] to i1
+; CHECK-NEXT:  ; assume predicate info { Comparison: [[A9]] = and i1 [[A8]], [[A8]], RenamedOp: [[A9]] }
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i1 [[A9]] to i1
+; CHECK-NEXT:  ; assume predicate info { Comparison: [[A10]] = and i1 [[A9]], [[A9]], RenamedOp: [[A10]] }
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i1 [[A10]] to i1
+; CHECK-NEXT:  ; assume predicate info { Comparison: [[A11]] = and i1 [[A10]], [[A10]], RenamedOp: [[A11]] }
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i1 [[A11]] to i1
+; CHECK-NEXT:  ; assume predicate info { Comparison: [[A12]] = and i1 [[A11]], [[A11]], RenamedOp: [[A12]] }
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i1 [[A12]] to i1
+; CHECK-NEXT:  ; assume predicate info { Comparison: [[A13]] = and i1 [[A12]], [[A12]], RenamedOp: [[A13]] }
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i1 [[A13]] to i1
+; CHECK-NEXT:  ; assume predicate info { Comparison: [[A14]] = and i1 [[A13]], [[A13]], RenamedOp: [[A14]] }
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i1 [[A14]] to i1
+; CHECK-NEXT:  ; assume predicate info { Comparison: [[A15]] = and i1 [[A14]], [[A14]], RenamedOp: [[A15]] }
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i1 [[A15]] to i1
 ; CHECK-NEXT:    call void @foo(i1 [[A1]])
 ; CHECK-NEXT:    call void @foo(i1 [[A2]])
 ; CHECK-NEXT:    call void @foo(i1 [[A3]])
@@ -1001,13 +1105,15 @@ define i32 @test_and_with_phinode(i32 %x) {
 ; CHECK-NEXT:    [[XGE1:%.*]] = icmp uge i32 [[X:%.*]], 1
 ; CHECK-NEXT:    [[XLT2:%.*]] = icmp ult i32 [[X]], 2
 ; CHECK-NEXT:    [[AND:%.*]] = and i1 [[XGE1]], [[XLT2]]
-; CHECK:         [[X_0_1:%.*]] = bitcast i32 [[X]] to i32
-; CHECK:         [[X_0_2:%.*]] = bitcast i32 [[X_0_1]] to i32
-; CHECK-NEXT:    br i1 [[AND]], label [[PHI:%.*]], label [[NOPE:%.*]]
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[XGE1]] = icmp uge i32 [[X]], 1 Edge: [label [[ENTRY:%.*]],label [[PHI:%.*]]], RenamedOp: [[X]] }
+; CHECK-NEXT:    [[X_0_1:%.*]] = bitcast i32 [[X]] to i32
+; CHECK-NEXT:  ; branch predicate info { TrueEdge: 1 Comparison: [[XLT2]] = icmp ult i32 [[X]], 2 Edge: [label [[ENTRY]],label [[PHI]]], RenamedOp: [[X]] }
+; CHECK-NEXT:    [[X_0_2:%.*]] = bitcast i32 [[X_0_1]] to i32
+; CHECK-NEXT:    br i1 [[AND]], label [[PHI]], label [[NOPE:%.*]]
 ; CHECK:       nope:
 ; CHECK-NEXT:    br label [[PHI]]
 ; CHECK:       phi:
-; CHECK-NEXT:    [[RES:%.*]] = phi i32 [ [[X_0_2]], [[ENTRY:%.*]] ], [ 1, [[NOPE]] ]
+; CHECK-NEXT:    [[RES:%.*]] = phi i32 [ [[X_0_2]], [[ENTRY]] ], [ 1, [[NOPE]] ]
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
 entry:
diff --git a/llvm/test/Transforms/Util/PredicateInfo/unnamed-types.ll b/llvm/test/Transforms/Util/PredicateInfo/unnamed-types.ll
index d9f6aed7d01c8..faf4bec61c935 100644
--- a/llvm/test/Transforms/Util/PredicateInfo/unnamed-types.ll
+++ b/llvm/test/Transforms/Util/PredicateInfo/unnamed-types.ll
@@ -6,13 +6,11 @@
 ; Check we can use ssa.copy with unnamed types.
 
 ; CHECK-LABEL: bb:
-; CHECK: Has predicate info
 ; CHECK: branch predicate info { TrueEdge: 1 Comparison:  %cmp1 = icmp ne ptr %arg, null Edge: [label %bb,label %bb1], RenamedOp: %arg }
 ; CHECK-NEXT:  %arg.0 = bitcast ptr %arg to ptr
 
 ; CHECK-LABEL: bb1:
-; CHECK: Has predicate info
-; CHECK-NEXT: branch predicate info { TrueEdge: 0 Comparison:  %cmp2 = icmp ne ptr null, %tmp Edge: [label %bb1,label %bb3], RenamedOp: %tmp }
+; CHECK: branch predicate info { TrueEdge: 0 Comparison:  %cmp2 = icmp ne ptr null, %tmp Edge: [label %bb1,label %bb3], RenamedOp: %tmp }
 ; CHECK-NEXT: %tmp.0 = bitcast ptr %tmp to ptr
 
 define void @f0(ptr %arg, ptr %tmp) {
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index 781240aac94b6..11a5a5785a6ec 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -753,10 +753,17 @@ def host_unwind_supports_jit():
     config.available_features.add("unix-sockets")
 
 # .debug_frame is not emitted for targeting Windows x64, aarch64/arm64, AIX, or Apple Silicon Mac.
-if not re.match(
-    r"^(x86_64|aarch64|arm64|powerpc|powerpc64).*-(windows-cygnus|windows-gnu|windows-msvc|aix)",
-    config.target_triple,
-) and not re.match(r"^arm64(e)?-apple-(macos|darwin)", config.target_triple):
+if (
+    not re.match(
+        r"^(x86_64|aarch64|arm64|powerpc|powerpc64).*-(windows-cygnus|windows-gnu|windows-msvc|aix)",
+        config.target_triple,
+    )
+    and not re.match(
+        r"^arm64(e)?-apple-(macos|darwin)",
+        config.target_triple,
+    )
+    and not re.match(r".*-zos.*", config.target_triple)
+):
     config.available_features.add("debug_frame")
 
 if config.enable_backtrace:
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/check_empty.ll b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/check_empty.ll
new file mode 100644
index 0000000000000..bfd216d1ced49
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/check_empty.ll
@@ -0,0 +1,29 @@
+; RUN: opt < %s -S | FileCheck %s
+
+; Test whether UTC checks empty lines instead of skipping them.
+define i32 @test(i32 %x) {
+entry:
+  br label %block1
+
+block1:
+  %cmp = icmp eq i32 %x, 0
+  br i1 %cmp, label %block2, label %exit1
+
+block2:
+  br i1 %cmp, label %block3, label %exit2
+
+block3:
+  br i1 %cmp, label %exit3, label %exit4
+
+exit1:
+  ret i32 0
+
+exit2:
+  ret i32 %x
+
+exit3:
+  ret i32 %x
+
+exit4:
+  ret i32 %x
+}
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/check_empty.ll.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/check_empty.ll.expected
new file mode 100644
index 0000000000000..c5f822d10181a
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/check_empty.ll.expected
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 7
+; RUN: opt < %s -S | FileCheck %s
+
+; Test whether UTC checks empty lines instead of skipping them.
+define i32 @test(i32 %x) {
+; CHECK-LABEL: define i32 @test(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br label %[[BLOCK1:.*]]
+; CHECK-EMPTY:
+; CHECK-NEXT:  [[BLOCK1]]:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label %[[BLOCK2:.*]], label %[[EXIT1:.*]]
+; CHECK-EMPTY:
+; CHECK-NEXT:  [[BLOCK2]]:
+; CHECK-NEXT:    br i1 [[CMP]], label %[[BLOCK3:.*]], label %[[EXIT2:.*]]
+; CHECK-EMPTY:
+; CHECK-NEXT:  [[BLOCK3]]:
+; CHECK-NEXT:    br i1 [[CMP]], label %[[EXIT3:.*]], label %[[EXIT4:.*]]
+; CHECK-EMPTY:
+; CHECK-NEXT:  [[EXIT1]]:
+; CHECK-NEXT:    ret i32 0
+; CHECK-EMPTY:
+; CHECK-NEXT:  [[EXIT2]]:
+; CHECK-NEXT:    ret i32 [[X]]
+; CHECK-EMPTY:
+; CHECK-NEXT:  [[EXIT3]]:
+; CHECK-NEXT:    ret i32 [[X]]
+; CHECK-EMPTY:
+; CHECK-NEXT:  [[EXIT4]]:
+; CHECK-NEXT:    ret i32 [[X]]
+;
+entry:
+  br label %block1
+
+block1:
+  %cmp = icmp eq i32 %x, 0
+  br i1 %cmp, label %block2, label %exit1
+
+block2:
+  br i1 %cmp, label %block3, label %exit2
+
+block3:
+  br i1 %cmp, label %exit3, label %exit4
+
+exit1:
+  ret i32 0
+
+exit2:
+  ret i32 %x
+
+exit3:
+  ret i32 %x
+
+exit4:
+  ret i32 %x
+}
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll
new file mode 100644
index 0000000000000..a804225a380c8
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll
@@ -0,0 +1,54 @@
+; RUN: opt < %s -S | FileCheck %s
+
+; Test whether the UTC format the switch-cases correctly, which requires TWO extra spaces.
+
+define i8 @testi8(i8 %x) {
+  switch i8 %x, label %default [
+    i8 0, label %case1
+    i8 1, label %case2
+    i8 2, label %case3
+    i8 3, label %case3
+  ]
+default:
+  ret i8 0
+case1:
+  ret i8 1
+case2:
+  ret i8 2
+case3:
+  ret i8 3
+}
+
+define i32 @testi32(i32 %x) {
+  switch i32 %x, label %default [
+    i32 0, label %case1
+    i32 1, label %case2
+    i32 2, label %case3
+    i32 3, label %case3
+  ]
+default:
+  ret i32 0
+case1:
+  ret i32 1
+case2:
+  ret i32 2
+case3:
+  ret i32 3
+}
+
+define i128 @testi128(i128 %x) {
+  switch i128 %x, label %default [
+    i128 0, label %case1
+    i128 1, label %case2
+    i128 2, label %case3
+    i128 3, label %case3
+  ]
+default:
+  ret i128 0
+case1:
+  ret i128 1
+case2:
+  ret i128 2
+case3:
+  ret i128 3
+}
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll.expected
new file mode 100644
index 0000000000000..8cab0bbf304f3
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll.expected
@@ -0,0 +1,118 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 7
+; RUN: opt < %s -S | FileCheck %s
+
+; Test whether the UTC format the switch-cases correctly, which requires TWO extra spaces.
+
+define i8 @testi8(i8 %x) {
+; CHECK-LABEL: define i8 @testi8(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    switch i8 [[X]], label %[[DEFAULT:.*]] [
+; CHECK-NEXT:      i8 0, label %[[CASE1:.*]]
+; CHECK-NEXT:      i8 1, label %[[CASE2:.*]]
+; CHECK-NEXT:      i8 2, label %[[CASE3:.*]]
+; CHECK-NEXT:      i8 3, label %[[CASE3]]
+; CHECK-NEXT:    ]
+; CHECK-EMPTY:
+; CHECK-NEXT:  [[DEFAULT]]:
+; CHECK-NEXT:    ret i8 0
+; CHECK-EMPTY:
+; CHECK-NEXT:  [[CASE1]]:
+; CHECK-NEXT:    ret i8 1
+; CHECK-EMPTY:
+; CHECK-NEXT:  [[CASE2]]:
+; CHECK-NEXT:    ret i8 2
+; CHECK-EMPTY:
+; CHECK-NEXT:  [[CASE3]]:
+; CHECK-NEXT:    ret i8 3
+;
+  switch i8 %x, label %default [
+    i8 0, label %case1
+    i8 1, label %case2
+    i8 2, label %case3
+    i8 3, label %case3
+  ]
+default:
+  ret i8 0
+case1:
+  ret i8 1
+case2:
+  ret i8 2
+case3:
+  ret i8 3
+}
+
+define i32 @testi32(i32 %x) {
+; CHECK-LABEL: define i32 @testi32(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    switch i32 [[X]], label %[[DEFAULT:.*]] [
+; CHECK-NEXT:      i32 0, label %[[CASE1:.*]]
+; CHECK-NEXT:      i32 1, label %[[CASE2:.*]]
+; CHECK-NEXT:      i32 2, label %[[CASE3:.*]]
+; CHECK-NEXT:      i32 3, label %[[CASE3]]
+; CHECK-NEXT:    ]
+; CHECK-EMPTY:
+; CHECK-NEXT:  [[DEFAULT]]:
+; CHECK-NEXT:    ret i32 0
+; CHECK-EMPTY:
+; CHECK-NEXT:  [[CASE1]]:
+; CHECK-NEXT:    ret i32 1
+; CHECK-EMPTY:
+; CHECK-NEXT:  [[CASE2]]:
+; CHECK-NEXT:    ret i32 2
+; CHECK-EMPTY:
+; CHECK-NEXT:  [[CASE3]]:
+; CHECK-NEXT:    ret i32 3
+;
+  switch i32 %x, label %default [
+    i32 0, label %case1
+    i32 1, label %case2
+    i32 2, label %case3
+    i32 3, label %case3
+  ]
+default:
+  ret i32 0
+case1:
+  ret i32 1
+case2:
+  ret i32 2
+case3:
+  ret i32 3
+}
+
+define i128 @testi128(i128 %x) {
+; CHECK-LABEL: define i128 @testi128(
+; CHECK-SAME: i128 [[X:%.*]]) {
+; CHECK-NEXT:    switch i128 [[X]], label %[[DEFAULT:.*]] [
+; CHECK-NEXT:      i128 0, label %[[CASE1:.*]]
+; CHECK-NEXT:      i128 1, label %[[CASE2:.*]]
+; CHECK-NEXT:      i128 2, label %[[CASE3:.*]]
+; CHECK-NEXT:      i128 3, label %[[CASE3]]
+; CHECK-NEXT:    ]
+; CHECK-EMPTY:
+; CHECK-NEXT:  [[DEFAULT]]:
+; CHECK-NEXT:    ret i128 0
+; CHECK-EMPTY:
+; CHECK-NEXT:  [[CASE1]]:
+; CHECK-NEXT:    ret i128 1
+; CHECK-EMPTY:
+; CHECK-NEXT:  [[CASE2]]:
+; CHECK-NEXT:    ret i128 2
+; CHECK-EMPTY:
+; CHECK-NEXT:  [[CASE3]]:
+; CHECK-NEXT:    ret i128 3
+;
+  switch i128 %x, label %default [
+    i128 0, label %case1
+    i128 1, label %case2
+    i128 2, label %case3
+    i128 3, label %case3
+  ]
+default:
+  ret i128 0
+case1:
+  ret i128 1
+case2:
+  ret i128 2
+case3:
+  ret i128 3
+}
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/check_empty.test b/llvm/test/tools/UpdateTestChecks/update_test_checks/check_empty.test
new file mode 100644
index 0000000000000..670bda27bb369
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/check_empty.test
@@ -0,0 +1,3 @@
+## test whether the UTC generates CHECK-EMPTY for blank lines
+# RUN: cp -f %S/Inputs/check_empty.ll %t.ll && %update_test_checks %t.ll --version 7
+# RUN: diff -u %t.ll %S/Inputs/check_empty.ll.expected
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/switch_case.test b/llvm/test/tools/UpdateTestChecks/update_test_checks/switch_case.test
new file mode 100644
index 0000000000000..891dbe06bbf59
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/switch_case.test
@@ -0,0 +1,3 @@
+## switch_case test checking that update_test_checks.py works correctly
+# RUN: cp -f %S/Inputs/switch_case.ll %t.ll && %update_test_checks %t.ll --version 7
+# RUN: diff -u %t.ll %S/Inputs/switch_case.ll.expected
diff --git a/llvm/test/tools/dxil-dis/di-subprogram.ll b/llvm/test/tools/dxil-dis/di-subprogram.ll
index 8255d396dd55d..912421fb28ae5 100644
--- a/llvm/test/tools/dxil-dis/di-subprogram.ll
+++ b/llvm/test/tools/dxil-dis/di-subprogram.ll
@@ -3,8 +3,6 @@ target triple = "dxil-unknown-shadermodel6.7-library"
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4}
-!llvm.used = !{!5}
-!llvm.lines = !{!13, !14, !15, !16}
 
 ; CHECK: !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "Some Compiler", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2)
 !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "Some Compiler", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
@@ -16,38 +14,3 @@ target triple = "dxil-unknown-shadermodel6.7-library"
 !3 = !{i32 2, !"Dwarf Version", i32 4}
 ; CHECK: !4 = !{i32 2, !"Debug Info Version", i32 3}
 !4 = !{i32 2, !"Debug Info Version", i32 3}
-
-; CHECK: !5 = distinct !DISubprogram(name: "fma", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, function: !0, variables: !9)
-!5 = distinct !DISubprogram(name: "fma", scope: !1, file: !1, line: 1, type: !6, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !9)
-
-; CHECK: !6 = !DISubroutineType(types: !7)
-!6 = !DISubroutineType(types: !7)
-
-; CHECK: !7 = !{!8, !8, !8, !8}
-!7 = !{!8, !8, !8, !8}
-
-; CHECK: !8 = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float)
-!8 = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float)
-
-; CHECK: !9 = !{!10, !11, !12}
-!9 = !{!10, !11, !12}
-
-; CHECK: !10 = !DILocalVariable(tag: DW_TAG_variable, name: "x", arg: 1, scope: !5, file: !1, line: 1, type: !8)
-!10 = !DILocalVariable(name: "x", arg: 1, scope: !5, file: !1, line: 1, type: !8)
-
-; CHECK: !11 = !DILocalVariable(tag: DW_TAG_variable, name: "y", arg: 2, scope: !5, file: !1, line: 1, type: !8)
-!11 = !DILocalVariable(name: "y", arg: 2, scope: !5, file: !1, line: 1, type: !8)
-
-; CHECK: !12 = !DILocalVariable(tag: DW_TAG_variable, name: "z", arg: 3, scope: !5, file: !1, line: 1, type: !8)
-!12 = !DILocalVariable(name: "z", arg: 3, scope: !5, file: !1, line: 1, type: !8)
-
-
-; CHECK: !13 = !DILocation(line: 0, scope: !5)
-; CHECK: !14 = !DILocation(line: 2, column: 12, scope: !5)
-; CHECK: !15 = !DILocation(line: 2, column: 16, scope: !5)
-; CHECK: !16 = !DILocation(line: 2, column: 3, scope: !5)
-
-!13 = !DILocation(line: 0, scope: !5)
-!14 = !DILocation(line: 2, column: 12, scope: !5)
-!15 = !DILocation(line: 2, column: 16, scope: !5)
-!16 = !DILocation(line: 2, column: 3, scope: !5)
diff --git a/llvm/test/tools/dxil-dis/di-subrotine.ll b/llvm/test/tools/dxil-dis/di-subrotine.ll
deleted file mode 100644
index 285e319b74056..0000000000000
--- a/llvm/test/tools/dxil-dis/di-subrotine.ll
+++ /dev/null
@@ -1,12 +0,0 @@
-; RUN: llc --filetype=obj %s -o - | dxil-dis -o - | FileCheck %s
-target triple = "dxil-unknown-shadermodel6.7-library"
-
-!llvm.used = !{!0}
-
-!0 = !DISubroutineType(types: !1)
-!1 = !{!2, !2, !2, !2}
-!2 = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float)
-
-; CHECK: !0 = !DISubroutineType(types: !1)
-; CHECK: !1 = !{!2, !2, !2, !2}
-; CHECK: !2 = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float)
diff --git a/llvm/test/tools/dxil-dis/md-manystrings.ll b/llvm/test/tools/dxil-dis/md-manystrings.ll
index 938e2dd5114da..a7dd595f09d94 100644
--- a/llvm/test/tools/dxil-dis/md-manystrings.ll
+++ b/llvm/test/tools/dxil-dis/md-manystrings.ll
@@ -4,7 +4,7 @@
 
 target triple = "dxil-unknown-shadermodel6.7-library"
 
-!llvm.too_many_strings = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31}
+!llvm.ident = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31}
 
 !0 = !{!"String 0"}
 !1 = !{!"String 1"}
@@ -39,7 +39,7 @@ target triple = "dxil-unknown-shadermodel6.7-library"
 !30 = !{!"String 30"}
 !31 = !{!"String 31"}
 
-; CHECK: !llvm.too_many_strings = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31}
+; CHECK: !llvm.ident = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31}
 ; CHECK: !0 = !{!"String 0"}
 ; CHECK: !1 = !{!"String 1"}
 ; CHECK: !2 = !{!"String 2"}
diff --git a/llvm/test/tools/llvm-config/paths.test b/llvm/test/tools/llvm-config/paths.test
index 419f155ae1f83..61d86f7eb0ba1 100644
--- a/llvm/test/tools/llvm-config/paths.test
+++ b/llvm/test/tools/llvm-config/paths.test
@@ -4,18 +4,34 @@ RUN: llvm-config --bindir 2>&1 | FileCheck --check-prefix=CHECK-BINDIR %s
 CHECK-BINDIR: {{.*}}{{/|\\}}bin
 CHECK-BINDIR-NOT: error:
 CHECK-BINDIR-NOT: warning
+RUN: llvm-config --bindir --quote-paths 2>&1 | FileCheck --check-prefix=CHECK-BINDIR2 %s
+CHECK-BINDIR2: {{.*}}{{/|\\\\}}bin
+CHECK-BINDIR2-NOT: error:
+CHECK-BINDIR2-NOT: warning
 
 RUN: llvm-config --includedir 2>&1 | FileCheck --check-prefix=CHECK-INCLUDEDIR %s
 CHECK-INCLUDEDIR: {{.*}}{{/|\\}}include
 CHECK-INCLUDEDIR-NOT: error:
 CHECK-INCLUDEDIR-NOT: warning
+RUN: llvm-config --includedir --quote-paths 2>&1 | FileCheck --check-prefix=CHECK-INCLUDEDIR2 %s
+CHECK-INCLUDEDIR2: {{.*}}{{/|\\\\}}include
+CHECK-INCLUDEDIR2-NOT: error:
+CHECK-INCLUDEDIR2-NOT: warning
 
 RUN: llvm-config --libdir 2>&1 | FileCheck --check-prefix=CHECK-LIBDIR %s
 CHECK-LIBDIR: {{.*}}{{/|\\}}lib{{.*}}
 CHECK-LIBDIR-NOT: error:
 CHECK-LIBDIR-NOT: warning
+RUN: llvm-config --libdir --quote-paths 2>&1 | FileCheck --check-prefix=CHECK-LIBDIR2 %s
+CHECK-LIBDIR2: {{.*}}{{/|\\\\}}lib{{.*}}
+CHECK-LIBDIR2-NOT: error:
+CHECK-LIBDIR2-NOT: warning
 
 RUN: llvm-config --cmakedir 2>&1 | FileCheck --check-prefix=CHECK-CMAKEDIR %s
 CHECK-CMAKEDIR: {{.*}}{{/|\\}}cmake{{/|\\}}llvm
 CHECK-CMAKEDIR-NOT: error:
 CHECK-CMAKEDIR-NOT: warning
+RUN: llvm-config --cmakedir --quote-paths 2>&1 | FileCheck --check-prefix=CHECK-CMAKEDIR2 %s
+CHECK-CMAKEDIR2: {{.*}}{{/|\\\\}}cmake{{/|\\\\}}llvm
+CHECK-CMAKEDIR2-NOT: error:
+CHECK-CMAKEDIR2-NOT: warning
diff --git a/llvm/test/tools/llvm-dwarfdump/AArch64/DW_AT_APPLE_property.s b/llvm/test/tools/llvm-dwarfdump/AArch64/DW_AT_APPLE_property.s
new file mode 100644
index 0000000000000..6c38791b0a083
--- /dev/null
+++ b/llvm/test/tools/llvm-dwarfdump/AArch64/DW_AT_APPLE_property.s
@@ -0,0 +1,126 @@
+# Checks that we correctly display the DW_AT_APPLE_property_name of a
+# referenced DW_TAG_APPLE_property.
+#
+# RUN: llvm-mc -triple=aarch64--darwin -filetype=obj -o %t.o < %s
+# RUN: not llvm-dwarfdump %t.o 2> %t.errs.txt | FileCheck %s
+# RUN: FileCheck %s --check-prefix=ERRORS < %t.errs.txt 
+
+# CHECK: 0x[[PROP_REF:[0-9a-f]+]]: DW_TAG_APPLE_property
+# CHECK-NEXT: DW_AT_APPLE_property_name ("autoSynthProp")
+#
+# CHECK: 0x[[NO_NAME_PROP:[0-9a-f]+]]: DW_TAG_APPLE_property
+# CHECK-NOT: DW_AT_APPLE_property_name
+#
+# CHECK: 0x[[INVALID_STRP:[0-9a-f]+]]: DW_TAG_APPLE_property
+# CHECK-NEXT: DW_AT_APPLE_property_name
+#
+# CHECK: DW_TAG_member
+# CHECK:   DW_AT_APPLE_property  (0x[[PROP_REF]] "autoSynthProp")
+# CHECK:   DW_AT_APPLE_property  (0x[[NO_NAME_PROP]] "")
+# CHECK:   DW_AT_APPLE_property  (0x{{.*}})
+# CHECK:   DW_AT_APPLE_property  (0x{{.*}})
+# CHECK:   DW_AT_APPLE_property  (0x[[INVALID_STRP]])
+
+# ERRORS: error: decoding DW_AT_APPLE_property_name: not referencing a DW_TAG_APPLE_property
+# ERRORS: error: decoding DW_AT_APPLE_property_name: invalid DIE
+# ERRORS: error: decoding DW_AT_APPLE_property_name: DW_FORM_strp offset 102 is beyond .debug_str bounds
+
+	.section	__DWARF,__debug_abbrev,regular,debug
+Lsection_abbrev:
+	.byte	1                               ; Abbreviation Code
+	.byte	17                              ; DW_TAG_compile_unit
+	.byte	1                               ; DW_CHILDREN_yes
+	.byte	114                             ; DW_AT_str_offsets_base
+	.byte	23                              ; DW_FORM_sec_offset
+	.byte	0                               ; EOM(1)
+	.byte	0                               ; EOM(2)
+	.byte	2                               ; Abbreviation Code
+	.byte	19                              ; DW_TAG_structure_type
+	.byte	1                               ; DW_CHILDREN_yes
+	.byte	3                               ; DW_AT_name
+	.byte	37                              ; DW_FORM_strx1
+	.byte	0                               ; EOM(1)
+	.byte	0                               ; EOM(2)
+	.byte	3                               ; Abbreviation Code
+	.ascii	"\200\204\001"                  ; DW_TAG_APPLE_property
+	.byte	0                               ; DW_CHILDREN_no
+	.ascii	"\350\177"                      ; DW_AT_APPLE_property_name
+	.byte	37                              ; DW_FORM_strx1
+	.byte	0                               ; EOM(1)
+	.byte	0                               ; EOM(2)
+	.byte	4                               ; Abbreviation Code
+	.ascii	"\200\204\001"                  ; DW_TAG_APPLE_property
+	.byte	0                               ; DW_CHILDREN_no
+	.byte	0                               ; EOM(1)
+	.byte	0                               ; EOM(2)
+	.byte	5                               ; Abbreviation Code
+	.ascii	"\200\204\001"                  ; DW_TAG_APPLE_property
+	.byte	0                               ; DW_CHILDREN_no
+	.ascii	"\350\177"                      ; DW_AT_APPLE_property_name
+	.byte	14                              ; DW_FORM_strp
+	.byte	0                               ; EOM(1)
+	.byte	0                               ; EOM(2)
+	.byte	6                               ; Abbreviation Code
+	.byte	13                              ; DW_TAG_member
+	.byte	0                               ; DW_CHILDREN_no
+	.byte	3                               ; DW_AT_name
+	.byte	37                              ; DW_FORM_strx1
+	.ascii	"\355\177"                      ; DW_AT_APPLE_property
+	.byte	19                              ; DW_FORM_ref4
+	.ascii	"\355\177"                      ; DW_AT_APPLE_property
+	.byte	19                              ; DW_FORM_ref4
+	.ascii	"\355\177"                      ; DW_AT_APPLE_property
+	.byte	19                              ; DW_FORM_ref4
+	.ascii	"\355\177"                      ; DW_AT_APPLE_property
+	.byte	19                              ; DW_FORM_ref4
+	.ascii	"\355\177"                      ; DW_AT_APPLE_property
+	.byte	19                              ; DW_FORM_ref4
+	.byte	0                               ; EOM(1)
+	.byte	0                               ; EOM(2)
+	.byte	0                               ; EOM(3)
+	.section	__DWARF,__debug_info,regular,debug
+Lsection_info:
+Lcu_begin0:
+Lset0 = Ldebug_info_end0-Ldebug_info_start0 ; Length of Unit
+	.long	Lset0
+Ldebug_info_start0:
+	.short	5                               ; DWARF version number
+	.byte	1                               ; DWARF Unit Type
+	.byte	8                               ; Address Size (in bytes)
+Lset1 = Lsection_abbrev-Lsection_abbrev ; Offset Into Abbrev. Section
+	.long	Lset1
+	.byte	1                               ; Abbrev [1] DW_TAG_compile_unit
+Lset2 = Lstr_offsets_base0-Lsection_str_off ; DW_AT_str_offsets_base
+	.long	Lset2
+	.byte	2                               ; Abbrev [2] DW_TAG_structure_type
+	.byte	2                               ; DW_AT_name
+	.byte	3                               ; Abbrev [3] DW_TAG_APPLE_property
+	.byte	0                               ; DW_AT_APPLE_property_name
+	.byte	4                               ; Abbrev [4] DW_TAG_APPLE_property
+	.byte	5                               ; Abbrev [5] DW_TAG_APPLE_property
+	.long	102                             ; DW_AT_APPLE_property_name
+	.byte	6                               ; Abbrev [6] DW_TAG_member
+	.byte	1                               ; DW_AT_name
+	.long	19                              ; DW_AT_APPLE_property
+	.long	21                              ; DW_AT_APPLE_property
+	.long	17                              ; DW_AT_APPLE_property
+	.long	0                               ; DW_AT_APPLE_property
+	.long	22                              ; DW_AT_APPLE_property
+	.byte	0                               ; End Of Children Mark
+	.byte	0                               ; End Of Children Mark
+Ldebug_info_end0:
+	.section	__DWARF,__debug_str_offs,regular,debug
+Lsection_str_off:
+	.long	16                              ; Length of String Offsets Set
+	.short	5
+	.short	0
+Lstr_offsets_base0:
+	.section	__DWARF,__debug_str,regular,debug
+Linfo_string:
+	.asciz	"autoSynthProp"                 ; string offset=0
+	.asciz	"_var"                          ; string offset=14
+	.asciz	"Foo"                           ; string offset=19
+	.section	__DWARF,__debug_str_offs,regular,debug
+	.long	0
+	.long	14
+	.long	19
diff --git a/llvm/test/tools/llvm-dwarfdump/X86/type_units_split_dwp_v4.s b/llvm/test/tools/llvm-dwarfdump/X86/type_units_split_dwp_v4.s
index becd9d1b55693..519edf043be5d 100644
--- a/llvm/test/tools/llvm-dwarfdump/X86/type_units_split_dwp_v4.s
+++ b/llvm/test/tools/llvm-dwarfdump/X86/type_units_split_dwp_v4.s
@@ -1,6 +1,12 @@
 ## This test uses TU index for type parsing in dwp and makes sure the DWARF4 type is
 ## successfully retrieved.
 
+## cd to a unique dir so we can refer to the file as just "test.dwo" in the
+## assembly test input below.
+# RUN: rm -rf %t
+# RUN: mkdir %t
+# RUN: cd %t
+
 # RUN: llvm-mc %s --split-dwarf-file=test.dwo -filetype obj -triple x86_64 -o test.o
 # RUN: llvm-dwp -e test.o -o test.dwp
 # RUN: llvm-dwarfdump test.dwp | FileCheck %s
diff --git a/llvm/test/tools/llvm-ir2vec/output/reference_triplets.txt b/llvm/test/tools/llvm-ir2vec/output/reference_triplets.txt
index dfbac4ce0c4d3..141a56ad10903 100644
--- a/llvm/test/tools/llvm-ir2vec/output/reference_triplets.txt
+++ b/llvm/test/tools/llvm-ir2vec/output/reference_triplets.txt
@@ -1,33 +1,33 @@
 MAX_RELATION=4
-187	7072	1
-187	6968	2
+187	7051	1
+187	6948	2
 187	187	0
-187	7072	1
-187	6969	2
+187	7051	1
+187	6949	2
 187	10	0
-10	7072	1
-10	7072	2
-10	7072	3
-10	6961	4
+10	7051	1
+10	7051	2
+10	7051	3
+10	6941	4
 10	187	0
-187	6952	1
-187	7072	2
-187	1555	0
-1555	6882	1
-1555	6952	2
-187	7072	1
-187	6968	2
+187	6932	1
+187	7051	2
+187	1543	0
+1543	6862	1
+1543	6932	2
+187	7051	1
+187	6948	2
 187	187	0
-187	7072	1
-187	6969	2
+187	7051	1
+187	6949	2
 187	601	0
-601	7072	1
-601	7072	2
-601	7072	3
-601	6961	4
+601	7051	1
+601	7051	2
+601	7051	3
+601	6941	4
 601	187	0
-187	6952	1
-187	7072	2
-187	1555	0
-1555	6882	1
-1555	6952	2
+187	6932	1
+187	7051	2
+187	1543	0
+1543	6862	1
+1543	6932	2
diff --git a/llvm/test/tools/llvm-ir2vec/output/reference_x86_entities.txt b/llvm/test/tools/llvm-ir2vec/output/reference_x86_entities.txt
index dc436d123fd35..dbbbbc746a769 100644
--- a/llvm/test/tools/llvm-ir2vec/output/reference_x86_entities.txt
+++ b/llvm/test/tools/llvm-ir2vec/output/reference_x86_entities.txt
@@ -1,4 +1,4 @@
-7173
+7151
 AAA	0
 AAD	1
 AADD	2
@@ -1440,5735 +1440,5713 @@ PSUBWrm	1437
 PSUBWrr	1438
 PSWAPDrm	1439
 PSWAPDrr	1440
-PT	1441
-PTCMMIMFP	1442
-PTCMMRLFP	1443
-PTCONJTCMMIMFP	1444
-PTCONJTFP	1445
-PTCVTROWD	1446
-PTCVTROWPS	1447
-PTDPBF	1448
-PTDPBHF	1449
-PTDPBSSD	1450
-PTDPBSSDV	1451
-PTDPBSUD	1452
-PTDPBSUDV	1453
-PTDPBUSD	1454
-PTDPBUSDV	1455
-PTDPBUUD	1456
-PTDPBUUDV	1457
-PTDPFP	1458
-PTDPHBF	1459
-PTDPHF	1460
-PTESTrm	1461
-PTESTrr	1462
-PTILELOADD	1463
-PTILELOADDRS	1464
-PTILELOADDRST	1465
-PTILELOADDRSV	1466
-PTILELOADDT	1467
-PTILELOADDV	1468
-PTILEMOVROWrre	1469
-PTILEMOVROWrreV	1470
-PTILEMOVROWrri	1471
-PTILEMOVROWrriV	1472
-PTILEPAIRLOAD	1473
-PTILEPAIRSTORE	1474
-PTILESTORED	1475
-PTILESTOREDV	1476
-PTILEZERO	1477
-PTILEZEROV	1478
-PTMMULTF	1479
-PTTCMMIMFP	1480
-PTTCMMRLFP	1481
-PTTDPBF	1482
-PTTDPFP	1483
-PTTMMULTF	1484
-PTTRANSPOSED	1485
-PTTRANSPOSEDV	1486
-PTWRITE	1487
-PTWRITEm	1488
-PTWRITEr	1489
-PUNPCKHBWrm	1490
-PUNPCKHBWrr	1491
-PUNPCKHDQrm	1492
-PUNPCKHDQrr	1493
-PUNPCKHQDQrm	1494
-PUNPCKHQDQrr	1495
-PUNPCKHWDrm	1496
-PUNPCKHWDrr	1497
-PUNPCKLBWrm	1498
-PUNPCKLBWrr	1499
-PUNPCKLDQrm	1500
-PUNPCKLDQrr	1501
-PUNPCKLQDQrm	1502
-PUNPCKLQDQrr	1503
-PUNPCKLWDrm	1504
-PUNPCKLWDrr	1505
-PUSH	1506
-PUSHA	1507
-PUSHCS	1508
-PUSHDS	1509
-PUSHES	1510
-PUSHF	1511
-PUSHFS	1512
-PUSHGS	1513
-PUSHP	1514
-PUSHSS	1515
-PVALIDATE	1516
-PXORrm	1517
-PXORrr	1518
-RCL	1519
-RCPPSm	1520
-RCPPSr	1521
-RCPSSm	1522
-RCPSSm_Int	1523
-RCPSSr	1524
-RCPSSr_Int	1525
-RCR	1526
-RDFLAGS	1527
-RDFSBASE	1528
-RDGSBASE	1529
-RDMSR	1530
-RDMSRLIST	1531
-RDMSRri	1532
-RDMSRri_EVEX	1533
-RDPID	1534
-RDPKRUr	1535
-RDPMC	1536
-RDPRU	1537
-RDRAND	1538
-RDSEED	1539
-RDSSPD	1540
-RDSSPQ	1541
-RDTSC	1542
-RDTSCP	1543
-REG_SEQUENCE	1544
-REPNE_PREFIX	1545
-REP_MOVSB	1546
-REP_MOVSD	1547
-REP_MOVSQ	1548
-REP_MOVSW	1549
-REP_PREFIX	1550
-REP_STOSB	1551
-REP_STOSD	1552
-REP_STOSQ	1553
-REP_STOSW	1554
-RET	1555
-RETI	1556
-REX	1557
-RMPADJUST	1558
-RMPQUERY	1559
-RMPUPDATE	1560
-ROL	1561
-ROR	1562
-RORX	1563
-ROUNDPDmi	1564
-ROUNDPDri	1565
-ROUNDPSmi	1566
-ROUNDPSri	1567
-ROUNDSDmi	1568
-ROUNDSDmi_Int	1569
-ROUNDSDri	1570
-ROUNDSDri_Int	1571
-ROUNDSSmi	1572
-ROUNDSSmi_Int	1573
-ROUNDSSri	1574
-ROUNDSSri_Int	1575
-RSM	1576
-RSQRTPSm	1577
-RSQRTPSr	1578
-RSQRTSSm	1579
-RSQRTSSm_Int	1580
-RSQRTSSr	1581
-RSQRTSSr_Int	1582
-RSTORSSP	1583
-SAHF	1584
-SALC	1585
-SAR	1586
-SARX	1587
-SAVEPREVSSP	1588
-SBB	1589
-SCASB	1590
-SCASL	1591
-SCASQ	1592
-SCASW	1593
-SEAMCALL	1594
-SEAMOPS	1595
-SEAMRET	1596
-SEG_ALLOCA	1597
-SEH_BeginEpilogue	1598
-SEH_EndEpilogue	1599
-SEH_EndPrologue	1600
-SEH_PushFrame	1601
-SEH_PushReg	1602
-SEH_SaveReg	1603
-SEH_SaveXMM	1604
-SEH_SetFrame	1605
-SEH_StackAlign	1606
-SEH_StackAlloc	1607
-SEH_UnwindV	1608
-SEH_UnwindVersion	1609
-SENDUIPI	1610
-SERIALIZE	1611
-SETB_C	1612
-SETCCm	1613
-SETCCm_EVEX	1614
-SETCCr	1615
-SETCCr_EVEX	1616
-SETSSBSY	1617
-SETZUCCm	1618
-SETZUCCr	1619
-SFENCE	1620
-SGDT	1621
-SHA	1622
-SHL	1623
-SHLD	1624
-SHLDROT	1625
-SHLX	1626
-SHR	1627
-SHRD	1628
-SHRDROT	1629
-SHRX	1630
-SHUFPDrmi	1631
-SHUFPDrri	1632
-SHUFPSrmi	1633
-SHUFPSrri	1634
-SIDT	1635
-SKINIT	1636
-SLDT	1637
-SLWPCB	1638
-SMSW	1639
-SQRTPDm	1640
-SQRTPDr	1641
-SQRTPSm	1642
-SQRTPSr	1643
-SQRTSDm	1644
-SQRTSDm_Int	1645
-SQRTSDr	1646
-SQRTSDr_Int	1647
-SQRTSSm	1648
-SQRTSSm_Int	1649
-SQRTSSr	1650
-SQRTSSr_Int	1651
-SQRT_F	1652
-SQRT_Fp	1653
-SS_PREFIX	1654
-STAC	1655
-STACKALLOC_W_PROBING	1656
-STACKMAP	1657
-STATEPOINT	1658
-STC	1659
-STD	1660
-STGI	1661
-STI	1662
-STMXCSR	1663
-STOSB	1664
-STOSL	1665
-STOSQ	1666
-STOSW	1667
-STR	1668
-STRm	1669
-STTILECFG	1670
-STTILECFG_EVEX	1671
-STUI	1672
-ST_F	1673
-ST_FP	1674
-ST_FPrr	1675
-ST_Fp	1676
-ST_FpP	1677
-ST_Frr	1678
-SUB	1679
-SUBPDrm	1680
-SUBPDrr	1681
-SUBPSrm	1682
-SUBPSrr	1683
-SUBREG_TO_REG	1684
-SUBR_F	1685
-SUBR_FI	1686
-SUBR_FPrST	1687
-SUBR_FST	1688
-SUBR_Fp	1689
-SUBR_FpI	1690
-SUBR_FrST	1691
-SUBSDrm	1692
-SUBSDrm_Int	1693
-SUBSDrr	1694
-SUBSDrr_Int	1695
-SUBSSrm	1696
-SUBSSrm_Int	1697
-SUBSSrr	1698
-SUBSSrr_Int	1699
-SUB_F	1700
-SUB_FI	1701
-SUB_FPrST	1702
-SUB_FST	1703
-SUB_Fp	1704
-SUB_FpI	1705
-SUB_FrST	1706
-SWAPGS	1707
-SYSCALL	1708
-SYSENTER	1709
-SYSEXIT	1710
-SYSRET	1711
-T	1712
-TAILJMPd	1713
-TAILJMPd_CC	1714
-TAILJMPm	1715
-TAILJMPr	1716
-TCMMIMFP	1717
-TCMMRLFP	1718
-TCONJTCMMIMFP	1719
-TCONJTFP	1720
-TCRETURN_HIPE	1721
-TCRETURN_WIN	1722
-TCRETURN_WINmi	1723
-TCRETURNdi	1724
-TCRETURNdicc	1725
-TCRETURNmi	1726
-TCRETURNri	1727
-TCVTROWD	1728
-TCVTROWPS	1729
-TDCALL	1730
-TDPBF	1731
-TDPBHF	1732
-TDPBSSD	1733
-TDPBSUD	1734
-TDPBUSD	1735
-TDPBUUD	1736
-TDPFP	1737
-TDPHBF	1738
-TDPHF	1739
-TEST	1740
-TESTUI	1741
-TILELOADD	1742
-TILELOADDRS	1743
-TILELOADDRST	1744
-TILELOADDRS_EVEX	1745
-TILELOADDT	1746
-TILELOADD_EVEX	1747
-TILEMOVROWrre	1748
-TILEMOVROWrri	1749
-TILERELEASE	1750
-TILESTORED	1751
-TILESTORED_EVEX	1752
-TILEZERO	1753
-TLBSYNC	1754
-TLSCall	1755
-TLS_addr	1756
-TLS_addrX	1757
-TLS_base_addr	1758
-TLS_base_addrX	1759
-TLS_desc	1760
-TMMULTF	1761
-TPAUSE	1762
-TRAP	1763
-TST_F	1764
-TST_Fp	1765
-TTCMMIMFP	1766
-TTCMMRLFP	1767
-TTDPBF	1768
-TTDPFP	1769
-TTMMULTF	1770
-TTRANSPOSED	1771
-TZCNT	1772
-TZMSK	1773
-UBSAN_UD	1774
-UCOMISDrm	1775
-UCOMISDrm_Int	1776
-UCOMISDrr	1777
-UCOMISDrr_Int	1778
-UCOMISSrm	1779
-UCOMISSrm_Int	1780
-UCOMISSrr	1781
-UCOMISSrr_Int	1782
-UCOM_FIPr	1783
-UCOM_FIr	1784
-UCOM_FPPr	1785
-UCOM_FPr	1786
-UCOM_FpIr	1787
-UCOM_Fpr	1788
-UCOM_Fr	1789
-UD	1790
-UIRET	1791
-UMONITOR	1792
-UMWAIT	1793
-UNPCKHPDrm	1794
-UNPCKHPDrr	1795
-UNPCKHPSrm	1796
-UNPCKHPSrr	1797
-UNPCKLPDrm	1798
-UNPCKLPDrr	1799
-UNPCKLPSrm	1800
-UNPCKLPSrr	1801
-URDMSRri	1802
-URDMSRri_EVEX	1803
-URDMSRrr	1804
-URDMSRrr_EVEX	1805
-UWRMSRir	1806
-UWRMSRir_EVEX	1807
-UWRMSRrr	1808
-UWRMSRrr_EVEX	1809
-V	1810
-VAARG	1811
-VAARG_X	1812
-VADDBF	1813
-VADDPDYrm	1814
-VADDPDYrr	1815
-VADDPDZ	1816
-VADDPDZrm	1817
-VADDPDZrmb	1818
-VADDPDZrmbk	1819
-VADDPDZrmbkz	1820
-VADDPDZrmk	1821
-VADDPDZrmkz	1822
-VADDPDZrr	1823
-VADDPDZrrb	1824
-VADDPDZrrbk	1825
-VADDPDZrrbkz	1826
-VADDPDZrrk	1827
-VADDPDZrrkz	1828
-VADDPDrm	1829
-VADDPDrr	1830
-VADDPHZ	1831
-VADDPHZrm	1832
-VADDPHZrmb	1833
-VADDPHZrmbk	1834
-VADDPHZrmbkz	1835
-VADDPHZrmk	1836
-VADDPHZrmkz	1837
-VADDPHZrr	1838
-VADDPHZrrb	1839
-VADDPHZrrbk	1840
-VADDPHZrrbkz	1841
-VADDPHZrrk	1842
-VADDPHZrrkz	1843
-VADDPSYrm	1844
-VADDPSYrr	1845
-VADDPSZ	1846
-VADDPSZrm	1847
-VADDPSZrmb	1848
-VADDPSZrmbk	1849
-VADDPSZrmbkz	1850
-VADDPSZrmk	1851
-VADDPSZrmkz	1852
-VADDPSZrr	1853
-VADDPSZrrb	1854
-VADDPSZrrbk	1855
-VADDPSZrrbkz	1856
-VADDPSZrrk	1857
-VADDPSZrrkz	1858
-VADDPSrm	1859
-VADDPSrr	1860
-VADDSDZrm	1861
-VADDSDZrm_Int	1862
-VADDSDZrmk_Int	1863
-VADDSDZrmkz_Int	1864
-VADDSDZrr	1865
-VADDSDZrr_Int	1866
-VADDSDZrrb_Int	1867
-VADDSDZrrbk_Int	1868
-VADDSDZrrbkz_Int	1869
-VADDSDZrrk_Int	1870
-VADDSDZrrkz_Int	1871
-VADDSDrm	1872
-VADDSDrm_Int	1873
-VADDSDrr	1874
-VADDSDrr_Int	1875
-VADDSHZrm	1876
-VADDSHZrm_Int	1877
-VADDSHZrmk_Int	1878
-VADDSHZrmkz_Int	1879
-VADDSHZrr	1880
-VADDSHZrr_Int	1881
-VADDSHZrrb_Int	1882
-VADDSHZrrbk_Int	1883
-VADDSHZrrbkz_Int	1884
-VADDSHZrrk_Int	1885
-VADDSHZrrkz_Int	1886
-VADDSSZrm	1887
-VADDSSZrm_Int	1888
-VADDSSZrmk_Int	1889
-VADDSSZrmkz_Int	1890
-VADDSSZrr	1891
-VADDSSZrr_Int	1892
-VADDSSZrrb_Int	1893
-VADDSSZrrbk_Int	1894
-VADDSSZrrbkz_Int	1895
-VADDSSZrrk_Int	1896
-VADDSSZrrkz_Int	1897
-VADDSSrm	1898
-VADDSSrm_Int	1899
-VADDSSrr	1900
-VADDSSrr_Int	1901
-VADDSUBPDYrm	1902
-VADDSUBPDYrr	1903
-VADDSUBPDrm	1904
-VADDSUBPDrr	1905
-VADDSUBPSYrm	1906
-VADDSUBPSYrr	1907
-VADDSUBPSrm	1908
-VADDSUBPSrr	1909
-VAESDECLASTYrm	1910
-VAESDECLASTYrr	1911
-VAESDECLASTZ	1912
-VAESDECLASTZrm	1913
-VAESDECLASTZrr	1914
-VAESDECLASTrm	1915
-VAESDECLASTrr	1916
-VAESDECYrm	1917
-VAESDECYrr	1918
-VAESDECZ	1919
-VAESDECZrm	1920
-VAESDECZrr	1921
-VAESDECrm	1922
-VAESDECrr	1923
-VAESENCLASTYrm	1924
-VAESENCLASTYrr	1925
-VAESENCLASTZ	1926
-VAESENCLASTZrm	1927
-VAESENCLASTZrr	1928
-VAESENCLASTrm	1929
-VAESENCLASTrr	1930
-VAESENCYrm	1931
-VAESENCYrr	1932
-VAESENCZ	1933
-VAESENCZrm	1934
-VAESENCZrr	1935
-VAESENCrm	1936
-VAESENCrr	1937
-VAESIMCrm	1938
-VAESIMCrr	1939
-VAESKEYGENASSISTrmi	1940
-VAESKEYGENASSISTrri	1941
-VALIGNDZ	1942
-VALIGNDZrmbi	1943
-VALIGNDZrmbik	1944
-VALIGNDZrmbikz	1945
-VALIGNDZrmi	1946
-VALIGNDZrmik	1947
-VALIGNDZrmikz	1948
-VALIGNDZrri	1949
-VALIGNDZrrik	1950
-VALIGNDZrrikz	1951
-VALIGNQZ	1952
-VALIGNQZrmbi	1953
-VALIGNQZrmbik	1954
-VALIGNQZrmbikz	1955
-VALIGNQZrmi	1956
-VALIGNQZrmik	1957
-VALIGNQZrmikz	1958
-VALIGNQZrri	1959
-VALIGNQZrrik	1960
-VALIGNQZrrikz	1961
-VANDNPDYrm	1962
-VANDNPDYrr	1963
-VANDNPDZ	1964
-VANDNPDZrm	1965
-VANDNPDZrmb	1966
-VANDNPDZrmbk	1967
-VANDNPDZrmbkz	1968
-VANDNPDZrmk	1969
-VANDNPDZrmkz	1970
-VANDNPDZrr	1971
-VANDNPDZrrk	1972
-VANDNPDZrrkz	1973
-VANDNPDrm	1974
-VANDNPDrr	1975
-VANDNPSYrm	1976
-VANDNPSYrr	1977
-VANDNPSZ	1978
-VANDNPSZrm	1979
-VANDNPSZrmb	1980
-VANDNPSZrmbk	1981
-VANDNPSZrmbkz	1982
-VANDNPSZrmk	1983
-VANDNPSZrmkz	1984
-VANDNPSZrr	1985
-VANDNPSZrrk	1986
-VANDNPSZrrkz	1987
-VANDNPSrm	1988
-VANDNPSrr	1989
-VANDPDYrm	1990
-VANDPDYrr	1991
-VANDPDZ	1992
-VANDPDZrm	1993
-VANDPDZrmb	1994
-VANDPDZrmbk	1995
-VANDPDZrmbkz	1996
-VANDPDZrmk	1997
-VANDPDZrmkz	1998
-VANDPDZrr	1999
-VANDPDZrrk	2000
-VANDPDZrrkz	2001
-VANDPDrm	2002
-VANDPDrr	2003
-VANDPSYrm	2004
-VANDPSYrr	2005
-VANDPSZ	2006
-VANDPSZrm	2007
-VANDPSZrmb	2008
-VANDPSZrmbk	2009
-VANDPSZrmbkz	2010
-VANDPSZrmk	2011
-VANDPSZrmkz	2012
-VANDPSZrr	2013
-VANDPSZrrk	2014
-VANDPSZrrkz	2015
-VANDPSrm	2016
-VANDPSrr	2017
-VASTART_SAVE_XMM_REGS	2018
-VBCSTNEBF	2019
-VBCSTNESH	2020
-VBLENDMPDZ	2021
-VBLENDMPDZrm	2022
-VBLENDMPDZrmb	2023
-VBLENDMPDZrmbk	2024
-VBLENDMPDZrmbkz	2025
-VBLENDMPDZrmk	2026
-VBLENDMPDZrmkz	2027
-VBLENDMPDZrr	2028
-VBLENDMPDZrrk	2029
-VBLENDMPDZrrkz	2030
-VBLENDMPSZ	2031
-VBLENDMPSZrm	2032
-VBLENDMPSZrmb	2033
-VBLENDMPSZrmbk	2034
-VBLENDMPSZrmbkz	2035
-VBLENDMPSZrmk	2036
-VBLENDMPSZrmkz	2037
-VBLENDMPSZrr	2038
-VBLENDMPSZrrk	2039
-VBLENDMPSZrrkz	2040
-VBLENDPDYrmi	2041
-VBLENDPDYrri	2042
-VBLENDPDrmi	2043
-VBLENDPDrri	2044
-VBLENDPSYrmi	2045
-VBLENDPSYrri	2046
-VBLENDPSrmi	2047
-VBLENDPSrri	2048
-VBLENDVPDYrmr	2049
-VBLENDVPDYrrr	2050
-VBLENDVPDrmr	2051
-VBLENDVPDrrr	2052
-VBLENDVPSYrmr	2053
-VBLENDVPSYrrr	2054
-VBLENDVPSrmr	2055
-VBLENDVPSrrr	2056
-VBROADCASTF	2057
-VBROADCASTI	2058
-VBROADCASTSDYrm	2059
-VBROADCASTSDYrr	2060
-VBROADCASTSDZ	2061
-VBROADCASTSDZrm	2062
-VBROADCASTSDZrmk	2063
-VBROADCASTSDZrmkz	2064
-VBROADCASTSDZrr	2065
-VBROADCASTSDZrrk	2066
-VBROADCASTSDZrrkz	2067
-VBROADCASTSSYrm	2068
-VBROADCASTSSYrr	2069
-VBROADCASTSSZ	2070
-VBROADCASTSSZrm	2071
-VBROADCASTSSZrmk	2072
-VBROADCASTSSZrmkz	2073
-VBROADCASTSSZrr	2074
-VBROADCASTSSZrrk	2075
-VBROADCASTSSZrrkz	2076
-VBROADCASTSSrm	2077
-VBROADCASTSSrr	2078
-VCMPBF	2079
-VCMPPDYrmi	2080
-VCMPPDYrri	2081
-VCMPPDZ	2082
-VCMPPDZrmbi	2083
-VCMPPDZrmbik	2084
-VCMPPDZrmi	2085
-VCMPPDZrmik	2086
-VCMPPDZrri	2087
-VCMPPDZrrib	2088
-VCMPPDZrribk	2089
-VCMPPDZrrik	2090
-VCMPPDrmi	2091
-VCMPPDrri	2092
-VCMPPHZ	2093
-VCMPPHZrmbi	2094
-VCMPPHZrmbik	2095
-VCMPPHZrmi	2096
-VCMPPHZrmik	2097
-VCMPPHZrri	2098
-VCMPPHZrrib	2099
-VCMPPHZrribk	2100
-VCMPPHZrrik	2101
-VCMPPSYrmi	2102
-VCMPPSYrri	2103
-VCMPPSZ	2104
-VCMPPSZrmbi	2105
-VCMPPSZrmbik	2106
-VCMPPSZrmi	2107
-VCMPPSZrmik	2108
-VCMPPSZrri	2109
-VCMPPSZrrib	2110
-VCMPPSZrribk	2111
-VCMPPSZrrik	2112
-VCMPPSrmi	2113
-VCMPPSrri	2114
-VCMPSDZrmi	2115
-VCMPSDZrmi_Int	2116
-VCMPSDZrmik_Int	2117
-VCMPSDZrri	2118
-VCMPSDZrri_Int	2119
-VCMPSDZrrib_Int	2120
-VCMPSDZrribk_Int	2121
-VCMPSDZrrik_Int	2122
-VCMPSDrmi	2123
-VCMPSDrmi_Int	2124
-VCMPSDrri	2125
-VCMPSDrri_Int	2126
-VCMPSHZrmi	2127
-VCMPSHZrmi_Int	2128
-VCMPSHZrmik_Int	2129
-VCMPSHZrri	2130
-VCMPSHZrri_Int	2131
-VCMPSHZrrib_Int	2132
-VCMPSHZrribk_Int	2133
-VCMPSHZrrik_Int	2134
-VCMPSSZrmi	2135
-VCMPSSZrmi_Int	2136
-VCMPSSZrmik_Int	2137
-VCMPSSZrri	2138
-VCMPSSZrri_Int	2139
-VCMPSSZrrib_Int	2140
-VCMPSSZrribk_Int	2141
-VCMPSSZrrik_Int	2142
-VCMPSSrmi	2143
-VCMPSSrmi_Int	2144
-VCMPSSrri	2145
-VCMPSSrri_Int	2146
-VCOMISBF	2147
-VCOMISDZrm	2148
-VCOMISDZrm_Int	2149
-VCOMISDZrr	2150
-VCOMISDZrr_Int	2151
-VCOMISDZrrb	2152
-VCOMISDrm	2153
-VCOMISDrm_Int	2154
-VCOMISDrr	2155
-VCOMISDrr_Int	2156
-VCOMISHZrm	2157
-VCOMISHZrm_Int	2158
-VCOMISHZrr	2159
-VCOMISHZrr_Int	2160
-VCOMISHZrrb	2161
-VCOMISSZrm	2162
-VCOMISSZrm_Int	2163
-VCOMISSZrr	2164
-VCOMISSZrr_Int	2165
-VCOMISSZrrb	2166
-VCOMISSrm	2167
-VCOMISSrm_Int	2168
-VCOMISSrr	2169
-VCOMISSrr_Int	2170
-VCOMPRESSPDZ	2171
-VCOMPRESSPDZmr	2172
-VCOMPRESSPDZmrk	2173
-VCOMPRESSPDZrr	2174
-VCOMPRESSPDZrrk	2175
-VCOMPRESSPDZrrkz	2176
-VCOMPRESSPSZ	2177
-VCOMPRESSPSZmr	2178
-VCOMPRESSPSZmrk	2179
-VCOMPRESSPSZrr	2180
-VCOMPRESSPSZrrk	2181
-VCOMPRESSPSZrrkz	2182
-VCOMXSDZrm_Int	2183
-VCOMXSDZrr_Int	2184
-VCOMXSDZrrb_Int	2185
-VCOMXSHZrm_Int	2186
-VCOMXSHZrr_Int	2187
-VCOMXSHZrrb_Int	2188
-VCOMXSSZrm_Int	2189
-VCOMXSSZrr_Int	2190
-VCOMXSSZrrb_Int	2191
-VCVT	2192
-VCVTBF	2193
-VCVTBIASPH	2194
-VCVTDQ	2195
-VCVTHF	2196
-VCVTNE	2197
-VCVTNEEBF	2198
-VCVTNEEPH	2199
-VCVTNEOBF	2200
-VCVTNEOPH	2201
-VCVTNEPS	2202
-VCVTPD	2203
-VCVTPH	2204
-VCVTPS	2205
-VCVTQQ	2206
-VCVTSD	2207
-VCVTSH	2208
-VCVTSI	2209
-VCVTSS	2210
-VCVTTBF	2211
-VCVTTPD	2212
-VCVTTPH	2213
-VCVTTPS	2214
-VCVTTSD	2215
-VCVTTSH	2216
-VCVTTSS	2217
-VCVTUDQ	2218
-VCVTUQQ	2219
-VCVTUSI	2220
-VCVTUW	2221
-VCVTW	2222
-VDBPSADBWZ	2223
-VDBPSADBWZrmi	2224
-VDBPSADBWZrmik	2225
-VDBPSADBWZrmikz	2226
-VDBPSADBWZrri	2227
-VDBPSADBWZrrik	2228
-VDBPSADBWZrrikz	2229
-VDIVBF	2230
-VDIVPDYrm	2231
-VDIVPDYrr	2232
-VDIVPDZ	2233
-VDIVPDZrm	2234
-VDIVPDZrmb	2235
-VDIVPDZrmbk	2236
-VDIVPDZrmbkz	2237
-VDIVPDZrmk	2238
-VDIVPDZrmkz	2239
-VDIVPDZrr	2240
-VDIVPDZrrb	2241
-VDIVPDZrrbk	2242
-VDIVPDZrrbkz	2243
-VDIVPDZrrk	2244
-VDIVPDZrrkz	2245
-VDIVPDrm	2246
-VDIVPDrr	2247
-VDIVPHZ	2248
-VDIVPHZrm	2249
-VDIVPHZrmb	2250
-VDIVPHZrmbk	2251
-VDIVPHZrmbkz	2252
-VDIVPHZrmk	2253
-VDIVPHZrmkz	2254
-VDIVPHZrr	2255
-VDIVPHZrrb	2256
-VDIVPHZrrbk	2257
-VDIVPHZrrbkz	2258
-VDIVPHZrrk	2259
-VDIVPHZrrkz	2260
-VDIVPSYrm	2261
-VDIVPSYrr	2262
-VDIVPSZ	2263
-VDIVPSZrm	2264
-VDIVPSZrmb	2265
-VDIVPSZrmbk	2266
-VDIVPSZrmbkz	2267
-VDIVPSZrmk	2268
-VDIVPSZrmkz	2269
-VDIVPSZrr	2270
-VDIVPSZrrb	2271
-VDIVPSZrrbk	2272
-VDIVPSZrrbkz	2273
-VDIVPSZrrk	2274
-VDIVPSZrrkz	2275
-VDIVPSrm	2276
-VDIVPSrr	2277
-VDIVSDZrm	2278
-VDIVSDZrm_Int	2279
-VDIVSDZrmk_Int	2280
-VDIVSDZrmkz_Int	2281
-VDIVSDZrr	2282
-VDIVSDZrr_Int	2283
-VDIVSDZrrb_Int	2284
-VDIVSDZrrbk_Int	2285
-VDIVSDZrrbkz_Int	2286
-VDIVSDZrrk_Int	2287
-VDIVSDZrrkz_Int	2288
-VDIVSDrm	2289
-VDIVSDrm_Int	2290
-VDIVSDrr	2291
-VDIVSDrr_Int	2292
-VDIVSHZrm	2293
-VDIVSHZrm_Int	2294
-VDIVSHZrmk_Int	2295
-VDIVSHZrmkz_Int	2296
-VDIVSHZrr	2297
-VDIVSHZrr_Int	2298
-VDIVSHZrrb_Int	2299
-VDIVSHZrrbk_Int	2300
-VDIVSHZrrbkz_Int	2301
-VDIVSHZrrk_Int	2302
-VDIVSHZrrkz_Int	2303
-VDIVSSZrm	2304
-VDIVSSZrm_Int	2305
-VDIVSSZrmk_Int	2306
-VDIVSSZrmkz_Int	2307
-VDIVSSZrr	2308
-VDIVSSZrr_Int	2309
-VDIVSSZrrb_Int	2310
-VDIVSSZrrbk_Int	2311
-VDIVSSZrrbkz_Int	2312
-VDIVSSZrrk_Int	2313
-VDIVSSZrrkz_Int	2314
-VDIVSSrm	2315
-VDIVSSrm_Int	2316
-VDIVSSrr	2317
-VDIVSSrr_Int	2318
-VDPBF	2319
-VDPPDrmi	2320
-VDPPDrri	2321
-VDPPHPSZ	2322
-VDPPHPSZm	2323
-VDPPHPSZmb	2324
-VDPPHPSZmbk	2325
-VDPPHPSZmbkz	2326
-VDPPHPSZmk	2327
-VDPPHPSZmkz	2328
-VDPPHPSZr	2329
-VDPPHPSZrk	2330
-VDPPHPSZrkz	2331
-VDPPSYrmi	2332
-VDPPSYrri	2333
-VDPPSrmi	2334
-VDPPSrri	2335
-VERRm	2336
-VERRr	2337
-VERWm	2338
-VERWr	2339
-VEXP	2340
-VEXPANDPDZ	2341
-VEXPANDPDZrm	2342
-VEXPANDPDZrmk	2343
-VEXPANDPDZrmkz	2344
-VEXPANDPDZrr	2345
-VEXPANDPDZrrk	2346
-VEXPANDPDZrrkz	2347
-VEXPANDPSZ	2348
-VEXPANDPSZrm	2349
-VEXPANDPSZrmk	2350
-VEXPANDPSZrmkz	2351
-VEXPANDPSZrr	2352
-VEXPANDPSZrrk	2353
-VEXPANDPSZrrkz	2354
-VEXTRACTF	2355
-VEXTRACTI	2356
-VEXTRACTPSZmri	2357
-VEXTRACTPSZrri	2358
-VEXTRACTPSmri	2359
-VEXTRACTPSrri	2360
-VFCMADDCPHZ	2361
-VFCMADDCPHZm	2362
-VFCMADDCPHZmb	2363
-VFCMADDCPHZmbk	2364
-VFCMADDCPHZmbkz	2365
-VFCMADDCPHZmk	2366
-VFCMADDCPHZmkz	2367
-VFCMADDCPHZr	2368
-VFCMADDCPHZrb	2369
-VFCMADDCPHZrbk	2370
-VFCMADDCPHZrbkz	2371
-VFCMADDCPHZrk	2372
-VFCMADDCPHZrkz	2373
-VFCMADDCSHZm	2374
-VFCMADDCSHZmk	2375
-VFCMADDCSHZmkz	2376
-VFCMADDCSHZr	2377
-VFCMADDCSHZrb	2378
-VFCMADDCSHZrbk	2379
-VFCMADDCSHZrbkz	2380
-VFCMADDCSHZrk	2381
-VFCMADDCSHZrkz	2382
-VFCMULCPHZ	2383
-VFCMULCPHZrm	2384
-VFCMULCPHZrmb	2385
-VFCMULCPHZrmbk	2386
-VFCMULCPHZrmbkz	2387
-VFCMULCPHZrmk	2388
-VFCMULCPHZrmkz	2389
-VFCMULCPHZrr	2390
-VFCMULCPHZrrb	2391
-VFCMULCPHZrrbk	2392
-VFCMULCPHZrrbkz	2393
-VFCMULCPHZrrk	2394
-VFCMULCPHZrrkz	2395
-VFCMULCSHZrm	2396
-VFCMULCSHZrmk	2397
-VFCMULCSHZrmkz	2398
-VFCMULCSHZrr	2399
-VFCMULCSHZrrb	2400
-VFCMULCSHZrrbk	2401
-VFCMULCSHZrrbkz	2402
-VFCMULCSHZrrk	2403
-VFCMULCSHZrrkz	2404
-VFIXUPIMMPDZ	2405
-VFIXUPIMMPDZrmbi	2406
-VFIXUPIMMPDZrmbik	2407
-VFIXUPIMMPDZrmbikz	2408
-VFIXUPIMMPDZrmi	2409
-VFIXUPIMMPDZrmik	2410
-VFIXUPIMMPDZrmikz	2411
-VFIXUPIMMPDZrri	2412
-VFIXUPIMMPDZrrib	2413
-VFIXUPIMMPDZrribk	2414
-VFIXUPIMMPDZrribkz	2415
-VFIXUPIMMPDZrrik	2416
-VFIXUPIMMPDZrrikz	2417
-VFIXUPIMMPSZ	2418
-VFIXUPIMMPSZrmbi	2419
-VFIXUPIMMPSZrmbik	2420
-VFIXUPIMMPSZrmbikz	2421
-VFIXUPIMMPSZrmi	2422
-VFIXUPIMMPSZrmik	2423
-VFIXUPIMMPSZrmikz	2424
-VFIXUPIMMPSZrri	2425
-VFIXUPIMMPSZrrib	2426
-VFIXUPIMMPSZrribk	2427
-VFIXUPIMMPSZrribkz	2428
-VFIXUPIMMPSZrrik	2429
-VFIXUPIMMPSZrrikz	2430
-VFIXUPIMMSDZrmi	2431
-VFIXUPIMMSDZrmik	2432
-VFIXUPIMMSDZrmikz	2433
-VFIXUPIMMSDZrri	2434
-VFIXUPIMMSDZrrib	2435
-VFIXUPIMMSDZrribk	2436
-VFIXUPIMMSDZrribkz	2437
-VFIXUPIMMSDZrrik	2438
-VFIXUPIMMSDZrrikz	2439
-VFIXUPIMMSSZrmi	2440
-VFIXUPIMMSSZrmik	2441
-VFIXUPIMMSSZrmikz	2442
-VFIXUPIMMSSZrri	2443
-VFIXUPIMMSSZrrib	2444
-VFIXUPIMMSSZrribk	2445
-VFIXUPIMMSSZrribkz	2446
-VFIXUPIMMSSZrrik	2447
-VFIXUPIMMSSZrrikz	2448
-VFMADD	2449
-VFMADDCPHZ	2450
-VFMADDCPHZm	2451
-VFMADDCPHZmb	2452
-VFMADDCPHZmbk	2453
-VFMADDCPHZmbkz	2454
-VFMADDCPHZmk	2455
-VFMADDCPHZmkz	2456
-VFMADDCPHZr	2457
-VFMADDCPHZrb	2458
-VFMADDCPHZrbk	2459
-VFMADDCPHZrbkz	2460
-VFMADDCPHZrk	2461
-VFMADDCPHZrkz	2462
-VFMADDCSHZm	2463
-VFMADDCSHZmk	2464
-VFMADDCSHZmkz	2465
-VFMADDCSHZr	2466
-VFMADDCSHZrb	2467
-VFMADDCSHZrbk	2468
-VFMADDCSHZrbkz	2469
-VFMADDCSHZrk	2470
-VFMADDCSHZrkz	2471
-VFMADDPD	2472
-VFMADDPS	2473
-VFMADDSD	2474
-VFMADDSS	2475
-VFMADDSUB	2476
-VFMADDSUBPD	2477
-VFMADDSUBPS	2478
-VFMSUB	2479
-VFMSUBADD	2480
-VFMSUBADDPD	2481
-VFMSUBADDPS	2482
-VFMSUBPD	2483
-VFMSUBPS	2484
-VFMSUBSD	2485
-VFMSUBSS	2486
-VFMULCPHZ	2487
-VFMULCPHZrm	2488
-VFMULCPHZrmb	2489
-VFMULCPHZrmbk	2490
-VFMULCPHZrmbkz	2491
-VFMULCPHZrmk	2492
-VFMULCPHZrmkz	2493
-VFMULCPHZrr	2494
-VFMULCPHZrrb	2495
-VFMULCPHZrrbk	2496
-VFMULCPHZrrbkz	2497
-VFMULCPHZrrk	2498
-VFMULCPHZrrkz	2499
-VFMULCSHZrm	2500
-VFMULCSHZrmk	2501
-VFMULCSHZrmkz	2502
-VFMULCSHZrr	2503
-VFMULCSHZrrb	2504
-VFMULCSHZrrbk	2505
-VFMULCSHZrrbkz	2506
-VFMULCSHZrrk	2507
-VFMULCSHZrrkz	2508
-VFNMADD	2509
-VFNMADDPD	2510
-VFNMADDPS	2511
-VFNMADDSD	2512
-VFNMADDSS	2513
-VFNMSUB	2514
-VFNMSUBPD	2515
-VFNMSUBPS	2516
-VFNMSUBSD	2517
-VFNMSUBSS	2518
-VFPCLASSBF	2519
-VFPCLASSPDZ	2520
-VFPCLASSPDZmbi	2521
-VFPCLASSPDZmbik	2522
-VFPCLASSPDZmi	2523
-VFPCLASSPDZmik	2524
-VFPCLASSPDZri	2525
-VFPCLASSPDZrik	2526
-VFPCLASSPHZ	2527
-VFPCLASSPHZmbi	2528
-VFPCLASSPHZmbik	2529
-VFPCLASSPHZmi	2530
-VFPCLASSPHZmik	2531
-VFPCLASSPHZri	2532
-VFPCLASSPHZrik	2533
-VFPCLASSPSZ	2534
-VFPCLASSPSZmbi	2535
-VFPCLASSPSZmbik	2536
-VFPCLASSPSZmi	2537
-VFPCLASSPSZmik	2538
-VFPCLASSPSZri	2539
-VFPCLASSPSZrik	2540
-VFPCLASSSDZmi	2541
-VFPCLASSSDZmik	2542
-VFPCLASSSDZri	2543
-VFPCLASSSDZrik	2544
-VFPCLASSSHZmi	2545
-VFPCLASSSHZmik	2546
-VFPCLASSSHZri	2547
-VFPCLASSSHZrik	2548
-VFPCLASSSSZmi	2549
-VFPCLASSSSZmik	2550
-VFPCLASSSSZri	2551
-VFPCLASSSSZrik	2552
-VFRCZPDYrm	2553
-VFRCZPDYrr	2554
-VFRCZPDrm	2555
-VFRCZPDrr	2556
-VFRCZPSYrm	2557
-VFRCZPSYrr	2558
-VFRCZPSrm	2559
-VFRCZPSrr	2560
-VFRCZSDrm	2561
-VFRCZSDrr	2562
-VFRCZSSrm	2563
-VFRCZSSrr	2564
-VGATHERDPDYrm	2565
-VGATHERDPDZ	2566
-VGATHERDPDZrm	2567
-VGATHERDPDrm	2568
-VGATHERDPSYrm	2569
-VGATHERDPSZ	2570
-VGATHERDPSZrm	2571
-VGATHERDPSrm	2572
-VGATHERPF	2573
-VGATHERQPDYrm	2574
-VGATHERQPDZ	2575
-VGATHERQPDZrm	2576
-VGATHERQPDrm	2577
-VGATHERQPSYrm	2578
-VGATHERQPSZ	2579
-VGATHERQPSZrm	2580
-VGATHERQPSrm	2581
-VGETEXPBF	2582
-VGETEXPPDZ	2583
-VGETEXPPDZm	2584
-VGETEXPPDZmb	2585
-VGETEXPPDZmbk	2586
-VGETEXPPDZmbkz	2587
-VGETEXPPDZmk	2588
-VGETEXPPDZmkz	2589
-VGETEXPPDZr	2590
-VGETEXPPDZrb	2591
-VGETEXPPDZrbk	2592
-VGETEXPPDZrbkz	2593
-VGETEXPPDZrk	2594
-VGETEXPPDZrkz	2595
-VGETEXPPHZ	2596
-VGETEXPPHZm	2597
-VGETEXPPHZmb	2598
-VGETEXPPHZmbk	2599
-VGETEXPPHZmbkz	2600
-VGETEXPPHZmk	2601
-VGETEXPPHZmkz	2602
-VGETEXPPHZr	2603
-VGETEXPPHZrb	2604
-VGETEXPPHZrbk	2605
-VGETEXPPHZrbkz	2606
-VGETEXPPHZrk	2607
-VGETEXPPHZrkz	2608
-VGETEXPPSZ	2609
-VGETEXPPSZm	2610
-VGETEXPPSZmb	2611
-VGETEXPPSZmbk	2612
-VGETEXPPSZmbkz	2613
-VGETEXPPSZmk	2614
-VGETEXPPSZmkz	2615
-VGETEXPPSZr	2616
-VGETEXPPSZrb	2617
-VGETEXPPSZrbk	2618
-VGETEXPPSZrbkz	2619
-VGETEXPPSZrk	2620
-VGETEXPPSZrkz	2621
-VGETEXPSDZm	2622
-VGETEXPSDZmk	2623
-VGETEXPSDZmkz	2624
-VGETEXPSDZr	2625
-VGETEXPSDZrb	2626
-VGETEXPSDZrbk	2627
-VGETEXPSDZrbkz	2628
-VGETEXPSDZrk	2629
-VGETEXPSDZrkz	2630
-VGETEXPSHZm	2631
-VGETEXPSHZmk	2632
-VGETEXPSHZmkz	2633
-VGETEXPSHZr	2634
-VGETEXPSHZrb	2635
-VGETEXPSHZrbk	2636
-VGETEXPSHZrbkz	2637
-VGETEXPSHZrk	2638
-VGETEXPSHZrkz	2639
-VGETEXPSSZm	2640
-VGETEXPSSZmk	2641
-VGETEXPSSZmkz	2642
-VGETEXPSSZr	2643
-VGETEXPSSZrb	2644
-VGETEXPSSZrbk	2645
-VGETEXPSSZrbkz	2646
-VGETEXPSSZrk	2647
-VGETEXPSSZrkz	2648
-VGETMANTBF	2649
-VGETMANTPDZ	2650
-VGETMANTPDZrmbi	2651
-VGETMANTPDZrmbik	2652
-VGETMANTPDZrmbikz	2653
-VGETMANTPDZrmi	2654
-VGETMANTPDZrmik	2655
-VGETMANTPDZrmikz	2656
-VGETMANTPDZrri	2657
-VGETMANTPDZrrib	2658
-VGETMANTPDZrribk	2659
-VGETMANTPDZrribkz	2660
-VGETMANTPDZrrik	2661
-VGETMANTPDZrrikz	2662
-VGETMANTPHZ	2663
-VGETMANTPHZrmbi	2664
-VGETMANTPHZrmbik	2665
-VGETMANTPHZrmbikz	2666
-VGETMANTPHZrmi	2667
-VGETMANTPHZrmik	2668
-VGETMANTPHZrmikz	2669
-VGETMANTPHZrri	2670
-VGETMANTPHZrrib	2671
-VGETMANTPHZrribk	2672
-VGETMANTPHZrribkz	2673
-VGETMANTPHZrrik	2674
-VGETMANTPHZrrikz	2675
-VGETMANTPSZ	2676
-VGETMANTPSZrmbi	2677
-VGETMANTPSZrmbik	2678
-VGETMANTPSZrmbikz	2679
-VGETMANTPSZrmi	2680
-VGETMANTPSZrmik	2681
-VGETMANTPSZrmikz	2682
-VGETMANTPSZrri	2683
-VGETMANTPSZrrib	2684
-VGETMANTPSZrribk	2685
-VGETMANTPSZrribkz	2686
-VGETMANTPSZrrik	2687
-VGETMANTPSZrrikz	2688
-VGETMANTSDZrmi	2689
-VGETMANTSDZrmik	2690
-VGETMANTSDZrmikz	2691
-VGETMANTSDZrri	2692
-VGETMANTSDZrrib	2693
-VGETMANTSDZrribk	2694
-VGETMANTSDZrribkz	2695
-VGETMANTSDZrrik	2696
-VGETMANTSDZrrikz	2697
-VGETMANTSHZrmi	2698
-VGETMANTSHZrmik	2699
-VGETMANTSHZrmikz	2700
-VGETMANTSHZrri	2701
-VGETMANTSHZrrib	2702
-VGETMANTSHZrribk	2703
-VGETMANTSHZrribkz	2704
-VGETMANTSHZrrik	2705
-VGETMANTSHZrrikz	2706
-VGETMANTSSZrmi	2707
-VGETMANTSSZrmik	2708
-VGETMANTSSZrmikz	2709
-VGETMANTSSZrri	2710
-VGETMANTSSZrrib	2711
-VGETMANTSSZrribk	2712
-VGETMANTSSZrribkz	2713
-VGETMANTSSZrrik	2714
-VGETMANTSSZrrikz	2715
-VGF	2716
-VHADDPDYrm	2717
-VHADDPDYrr	2718
-VHADDPDrm	2719
-VHADDPDrr	2720
-VHADDPSYrm	2721
-VHADDPSYrr	2722
-VHADDPSrm	2723
-VHADDPSrr	2724
-VHSUBPDYrm	2725
-VHSUBPDYrr	2726
-VHSUBPDrm	2727
-VHSUBPDrr	2728
-VHSUBPSYrm	2729
-VHSUBPSYrr	2730
-VHSUBPSrm	2731
-VHSUBPSrr	2732
-VINSERTF	2733
-VINSERTI	2734
-VINSERTPSZrmi	2735
-VINSERTPSZrri	2736
-VINSERTPSrmi	2737
-VINSERTPSrri	2738
-VLDDQUYrm	2739
-VLDDQUrm	2740
-VLDMXCSR	2741
-VMASKMOVDQU	2742
-VMASKMOVPDYmr	2743
-VMASKMOVPDYrm	2744
-VMASKMOVPDmr	2745
-VMASKMOVPDrm	2746
-VMASKMOVPSYmr	2747
-VMASKMOVPSYrm	2748
-VMASKMOVPSmr	2749
-VMASKMOVPSrm	2750
-VMAXBF	2751
-VMAXCPDYrm	2752
-VMAXCPDYrr	2753
-VMAXCPDZ	2754
-VMAXCPDZrm	2755
-VMAXCPDZrmb	2756
-VMAXCPDZrmbk	2757
-VMAXCPDZrmbkz	2758
-VMAXCPDZrmk	2759
-VMAXCPDZrmkz	2760
-VMAXCPDZrr	2761
-VMAXCPDZrrk	2762
-VMAXCPDZrrkz	2763
-VMAXCPDrm	2764
-VMAXCPDrr	2765
-VMAXCPHZ	2766
-VMAXCPHZrm	2767
-VMAXCPHZrmb	2768
-VMAXCPHZrmbk	2769
-VMAXCPHZrmbkz	2770
-VMAXCPHZrmk	2771
-VMAXCPHZrmkz	2772
-VMAXCPHZrr	2773
-VMAXCPHZrrk	2774
-VMAXCPHZrrkz	2775
-VMAXCPSYrm	2776
-VMAXCPSYrr	2777
-VMAXCPSZ	2778
-VMAXCPSZrm	2779
-VMAXCPSZrmb	2780
-VMAXCPSZrmbk	2781
-VMAXCPSZrmbkz	2782
-VMAXCPSZrmk	2783
-VMAXCPSZrmkz	2784
-VMAXCPSZrr	2785
-VMAXCPSZrrk	2786
-VMAXCPSZrrkz	2787
-VMAXCPSrm	2788
-VMAXCPSrr	2789
-VMAXCSDZrm	2790
-VMAXCSDZrr	2791
-VMAXCSDrm	2792
-VMAXCSDrr	2793
-VMAXCSHZrm	2794
-VMAXCSHZrr	2795
-VMAXCSSZrm	2796
-VMAXCSSZrr	2797
-VMAXCSSrm	2798
-VMAXCSSrr	2799
-VMAXPDYrm	2800
-VMAXPDYrr	2801
-VMAXPDZ	2802
-VMAXPDZrm	2803
-VMAXPDZrmb	2804
-VMAXPDZrmbk	2805
-VMAXPDZrmbkz	2806
-VMAXPDZrmk	2807
-VMAXPDZrmkz	2808
-VMAXPDZrr	2809
-VMAXPDZrrb	2810
-VMAXPDZrrbk	2811
-VMAXPDZrrbkz	2812
-VMAXPDZrrk	2813
-VMAXPDZrrkz	2814
-VMAXPDrm	2815
-VMAXPDrr	2816
-VMAXPHZ	2817
-VMAXPHZrm	2818
-VMAXPHZrmb	2819
-VMAXPHZrmbk	2820
-VMAXPHZrmbkz	2821
-VMAXPHZrmk	2822
-VMAXPHZrmkz	2823
-VMAXPHZrr	2824
-VMAXPHZrrb	2825
-VMAXPHZrrbk	2826
-VMAXPHZrrbkz	2827
-VMAXPHZrrk	2828
-VMAXPHZrrkz	2829
-VMAXPSYrm	2830
-VMAXPSYrr	2831
-VMAXPSZ	2832
-VMAXPSZrm	2833
-VMAXPSZrmb	2834
-VMAXPSZrmbk	2835
-VMAXPSZrmbkz	2836
-VMAXPSZrmk	2837
-VMAXPSZrmkz	2838
-VMAXPSZrr	2839
-VMAXPSZrrb	2840
-VMAXPSZrrbk	2841
-VMAXPSZrrbkz	2842
-VMAXPSZrrk	2843
-VMAXPSZrrkz	2844
-VMAXPSrm	2845
-VMAXPSrr	2846
-VMAXSDZrm	2847
-VMAXSDZrm_Int	2848
-VMAXSDZrmk_Int	2849
-VMAXSDZrmkz_Int	2850
-VMAXSDZrr	2851
-VMAXSDZrr_Int	2852
-VMAXSDZrrb_Int	2853
-VMAXSDZrrbk_Int	2854
-VMAXSDZrrbkz_Int	2855
-VMAXSDZrrk_Int	2856
-VMAXSDZrrkz_Int	2857
-VMAXSDrm	2858
-VMAXSDrm_Int	2859
-VMAXSDrr	2860
-VMAXSDrr_Int	2861
-VMAXSHZrm	2862
-VMAXSHZrm_Int	2863
-VMAXSHZrmk_Int	2864
-VMAXSHZrmkz_Int	2865
-VMAXSHZrr	2866
-VMAXSHZrr_Int	2867
-VMAXSHZrrb_Int	2868
-VMAXSHZrrbk_Int	2869
-VMAXSHZrrbkz_Int	2870
-VMAXSHZrrk_Int	2871
-VMAXSHZrrkz_Int	2872
-VMAXSSZrm	2873
-VMAXSSZrm_Int	2874
-VMAXSSZrmk_Int	2875
-VMAXSSZrmkz_Int	2876
-VMAXSSZrr	2877
-VMAXSSZrr_Int	2878
-VMAXSSZrrb_Int	2879
-VMAXSSZrrbk_Int	2880
-VMAXSSZrrbkz_Int	2881
-VMAXSSZrrk_Int	2882
-VMAXSSZrrkz_Int	2883
-VMAXSSrm	2884
-VMAXSSrm_Int	2885
-VMAXSSrr	2886
-VMAXSSrr_Int	2887
-VMCALL	2888
-VMCLEARm	2889
-VMFUNC	2890
-VMINBF	2891
-VMINCPDYrm	2892
-VMINCPDYrr	2893
-VMINCPDZ	2894
-VMINCPDZrm	2895
-VMINCPDZrmb	2896
-VMINCPDZrmbk	2897
-VMINCPDZrmbkz	2898
-VMINCPDZrmk	2899
-VMINCPDZrmkz	2900
-VMINCPDZrr	2901
-VMINCPDZrrk	2902
-VMINCPDZrrkz	2903
-VMINCPDrm	2904
-VMINCPDrr	2905
-VMINCPHZ	2906
-VMINCPHZrm	2907
-VMINCPHZrmb	2908
-VMINCPHZrmbk	2909
-VMINCPHZrmbkz	2910
-VMINCPHZrmk	2911
-VMINCPHZrmkz	2912
-VMINCPHZrr	2913
-VMINCPHZrrk	2914
-VMINCPHZrrkz	2915
-VMINCPSYrm	2916
-VMINCPSYrr	2917
-VMINCPSZ	2918
-VMINCPSZrm	2919
-VMINCPSZrmb	2920
-VMINCPSZrmbk	2921
-VMINCPSZrmbkz	2922
-VMINCPSZrmk	2923
-VMINCPSZrmkz	2924
-VMINCPSZrr	2925
-VMINCPSZrrk	2926
-VMINCPSZrrkz	2927
-VMINCPSrm	2928
-VMINCPSrr	2929
-VMINCSDZrm	2930
-VMINCSDZrr	2931
-VMINCSDrm	2932
-VMINCSDrr	2933
-VMINCSHZrm	2934
-VMINCSHZrr	2935
-VMINCSSZrm	2936
-VMINCSSZrr	2937
-VMINCSSrm	2938
-VMINCSSrr	2939
-VMINMAXBF	2940
-VMINMAXPDZ	2941
-VMINMAXPDZrmbi	2942
-VMINMAXPDZrmbik	2943
-VMINMAXPDZrmbikz	2944
-VMINMAXPDZrmi	2945
-VMINMAXPDZrmik	2946
-VMINMAXPDZrmikz	2947
-VMINMAXPDZrri	2948
-VMINMAXPDZrrib	2949
-VMINMAXPDZrribk	2950
-VMINMAXPDZrribkz	2951
-VMINMAXPDZrrik	2952
-VMINMAXPDZrrikz	2953
-VMINMAXPHZ	2954
-VMINMAXPHZrmbi	2955
-VMINMAXPHZrmbik	2956
-VMINMAXPHZrmbikz	2957
-VMINMAXPHZrmi	2958
-VMINMAXPHZrmik	2959
-VMINMAXPHZrmikz	2960
-VMINMAXPHZrri	2961
-VMINMAXPHZrrib	2962
-VMINMAXPHZrribk	2963
-VMINMAXPHZrribkz	2964
-VMINMAXPHZrrik	2965
-VMINMAXPHZrrikz	2966
-VMINMAXPSZ	2967
-VMINMAXPSZrmbi	2968
-VMINMAXPSZrmbik	2969
-VMINMAXPSZrmbikz	2970
-VMINMAXPSZrmi	2971
-VMINMAXPSZrmik	2972
-VMINMAXPSZrmikz	2973
-VMINMAXPSZrri	2974
-VMINMAXPSZrrib	2975
-VMINMAXPSZrribk	2976
-VMINMAXPSZrribkz	2977
-VMINMAXPSZrrik	2978
-VMINMAXPSZrrikz	2979
-VMINMAXSDrmi	2980
-VMINMAXSDrmi_Int	2981
-VMINMAXSDrmik_Int	2982
-VMINMAXSDrmikz_Int	2983
-VMINMAXSDrri	2984
-VMINMAXSDrri_Int	2985
-VMINMAXSDrrib_Int	2986
-VMINMAXSDrribk_Int	2987
-VMINMAXSDrribkz_Int	2988
-VMINMAXSDrrik_Int	2989
-VMINMAXSDrrikz_Int	2990
-VMINMAXSHrmi	2991
-VMINMAXSHrmi_Int	2992
-VMINMAXSHrmik_Int	2993
-VMINMAXSHrmikz_Int	2994
-VMINMAXSHrri	2995
-VMINMAXSHrri_Int	2996
-VMINMAXSHrrib_Int	2997
-VMINMAXSHrribk_Int	2998
-VMINMAXSHrribkz_Int	2999
-VMINMAXSHrrik_Int	3000
-VMINMAXSHrrikz_Int	3001
-VMINMAXSSrmi	3002
-VMINMAXSSrmi_Int	3003
-VMINMAXSSrmik_Int	3004
-VMINMAXSSrmikz_Int	3005
-VMINMAXSSrri	3006
-VMINMAXSSrri_Int	3007
-VMINMAXSSrrib_Int	3008
-VMINMAXSSrribk_Int	3009
-VMINMAXSSrribkz_Int	3010
-VMINMAXSSrrik_Int	3011
-VMINMAXSSrrikz_Int	3012
-VMINPDYrm	3013
-VMINPDYrr	3014
-VMINPDZ	3015
-VMINPDZrm	3016
-VMINPDZrmb	3017
-VMINPDZrmbk	3018
-VMINPDZrmbkz	3019
-VMINPDZrmk	3020
-VMINPDZrmkz	3021
-VMINPDZrr	3022
-VMINPDZrrb	3023
-VMINPDZrrbk	3024
-VMINPDZrrbkz	3025
-VMINPDZrrk	3026
-VMINPDZrrkz	3027
-VMINPDrm	3028
-VMINPDrr	3029
-VMINPHZ	3030
-VMINPHZrm	3031
-VMINPHZrmb	3032
-VMINPHZrmbk	3033
-VMINPHZrmbkz	3034
-VMINPHZrmk	3035
-VMINPHZrmkz	3036
-VMINPHZrr	3037
-VMINPHZrrb	3038
-VMINPHZrrbk	3039
-VMINPHZrrbkz	3040
-VMINPHZrrk	3041
-VMINPHZrrkz	3042
-VMINPSYrm	3043
-VMINPSYrr	3044
-VMINPSZ	3045
-VMINPSZrm	3046
-VMINPSZrmb	3047
-VMINPSZrmbk	3048
-VMINPSZrmbkz	3049
-VMINPSZrmk	3050
-VMINPSZrmkz	3051
-VMINPSZrr	3052
-VMINPSZrrb	3053
-VMINPSZrrbk	3054
-VMINPSZrrbkz	3055
-VMINPSZrrk	3056
-VMINPSZrrkz	3057
-VMINPSrm	3058
-VMINPSrr	3059
-VMINSDZrm	3060
-VMINSDZrm_Int	3061
-VMINSDZrmk_Int	3062
-VMINSDZrmkz_Int	3063
-VMINSDZrr	3064
-VMINSDZrr_Int	3065
-VMINSDZrrb_Int	3066
-VMINSDZrrbk_Int	3067
-VMINSDZrrbkz_Int	3068
-VMINSDZrrk_Int	3069
-VMINSDZrrkz_Int	3070
-VMINSDrm	3071
-VMINSDrm_Int	3072
-VMINSDrr	3073
-VMINSDrr_Int	3074
-VMINSHZrm	3075
-VMINSHZrm_Int	3076
-VMINSHZrmk_Int	3077
-VMINSHZrmkz_Int	3078
-VMINSHZrr	3079
-VMINSHZrr_Int	3080
-VMINSHZrrb_Int	3081
-VMINSHZrrbk_Int	3082
-VMINSHZrrbkz_Int	3083
-VMINSHZrrk_Int	3084
-VMINSHZrrkz_Int	3085
-VMINSSZrm	3086
-VMINSSZrm_Int	3087
-VMINSSZrmk_Int	3088
-VMINSSZrmkz_Int	3089
-VMINSSZrr	3090
-VMINSSZrr_Int	3091
-VMINSSZrrb_Int	3092
-VMINSSZrrbk_Int	3093
-VMINSSZrrbkz_Int	3094
-VMINSSZrrk_Int	3095
-VMINSSZrrkz_Int	3096
-VMINSSrm	3097
-VMINSSrm_Int	3098
-VMINSSrr	3099
-VMINSSrr_Int	3100
-VMLAUNCH	3101
-VMLOAD	3102
-VMMCALL	3103
-VMOV	3104
-VMOVAPDYmr	3105
-VMOVAPDYrm	3106
-VMOVAPDYrr	3107
-VMOVAPDYrr_REV	3108
-VMOVAPDZ	3109
-VMOVAPDZmr	3110
-VMOVAPDZmrk	3111
-VMOVAPDZrm	3112
-VMOVAPDZrmk	3113
-VMOVAPDZrmkz	3114
-VMOVAPDZrr	3115
-VMOVAPDZrr_REV	3116
-VMOVAPDZrrk	3117
-VMOVAPDZrrk_REV	3118
-VMOVAPDZrrkz	3119
-VMOVAPDZrrkz_REV	3120
-VMOVAPDmr	3121
-VMOVAPDrm	3122
-VMOVAPDrr	3123
-VMOVAPDrr_REV	3124
-VMOVAPSYmr	3125
-VMOVAPSYrm	3126
-VMOVAPSYrr	3127
-VMOVAPSYrr_REV	3128
-VMOVAPSZ	3129
-VMOVAPSZmr	3130
-VMOVAPSZmrk	3131
-VMOVAPSZrm	3132
-VMOVAPSZrmk	3133
-VMOVAPSZrmkz	3134
-VMOVAPSZrr	3135
-VMOVAPSZrr_REV	3136
-VMOVAPSZrrk	3137
-VMOVAPSZrrk_REV	3138
-VMOVAPSZrrkz	3139
-VMOVAPSZrrkz_REV	3140
-VMOVAPSmr	3141
-VMOVAPSrm	3142
-VMOVAPSrr	3143
-VMOVAPSrr_REV	3144
-VMOVDDUPYrm	3145
-VMOVDDUPYrr	3146
-VMOVDDUPZ	3147
-VMOVDDUPZrm	3148
-VMOVDDUPZrmk	3149
-VMOVDDUPZrmkz	3150
-VMOVDDUPZrr	3151
-VMOVDDUPZrrk	3152
-VMOVDDUPZrrkz	3153
-VMOVDDUPrm	3154
-VMOVDDUPrr	3155
-VMOVDI	3156
-VMOVDQA	3157
-VMOVDQAYmr	3158
-VMOVDQAYrm	3159
-VMOVDQAYrr	3160
-VMOVDQAYrr_REV	3161
-VMOVDQAmr	3162
-VMOVDQArm	3163
-VMOVDQArr	3164
-VMOVDQArr_REV	3165
-VMOVDQU	3166
-VMOVDQUYmr	3167
-VMOVDQUYrm	3168
-VMOVDQUYrr	3169
-VMOVDQUYrr_REV	3170
-VMOVDQUmr	3171
-VMOVDQUrm	3172
-VMOVDQUrr	3173
-VMOVDQUrr_REV	3174
-VMOVHLPSZrr	3175
-VMOVHLPSrr	3176
-VMOVHPDZ	3177
-VMOVHPDmr	3178
-VMOVHPDrm	3179
-VMOVHPSZ	3180
-VMOVHPSmr	3181
-VMOVHPSrm	3182
-VMOVLHPSZrr	3183
-VMOVLHPSrr	3184
-VMOVLPDZ	3185
-VMOVLPDmr	3186
-VMOVLPDrm	3187
-VMOVLPSZ	3188
-VMOVLPSmr	3189
-VMOVLPSrm	3190
-VMOVMSKPDYrr	3191
-VMOVMSKPDrr	3192
-VMOVMSKPSYrr	3193
-VMOVMSKPSrr	3194
-VMOVNTDQAYrm	3195
-VMOVNTDQAZ	3196
-VMOVNTDQAZrm	3197
-VMOVNTDQArm	3198
-VMOVNTDQYmr	3199
-VMOVNTDQZ	3200
-VMOVNTDQZmr	3201
-VMOVNTDQmr	3202
-VMOVNTPDYmr	3203
-VMOVNTPDZ	3204
-VMOVNTPDZmr	3205
-VMOVNTPDmr	3206
-VMOVNTPSYmr	3207
-VMOVNTPSZ	3208
-VMOVNTPSZmr	3209
-VMOVNTPSmr	3210
-VMOVPDI	3211
-VMOVPQI	3212
-VMOVPQIto	3213
-VMOVQI	3214
-VMOVRSBZ	3215
-VMOVRSBZm	3216
-VMOVRSBZmk	3217
-VMOVRSBZmkz	3218
-VMOVRSDZ	3219
-VMOVRSDZm	3220
-VMOVRSDZmk	3221
-VMOVRSDZmkz	3222
-VMOVRSQZ	3223
-VMOVRSQZm	3224
-VMOVRSQZmk	3225
-VMOVRSQZmkz	3226
-VMOVRSWZ	3227
-VMOVRSWZm	3228
-VMOVRSWZmk	3229
-VMOVRSWZmkz	3230
-VMOVSDZmr	3231
-VMOVSDZmrk	3232
-VMOVSDZrm	3233
-VMOVSDZrm_alt	3234
-VMOVSDZrmk	3235
-VMOVSDZrmkz	3236
-VMOVSDZrr	3237
-VMOVSDZrr_REV	3238
-VMOVSDZrrk	3239
-VMOVSDZrrk_REV	3240
-VMOVSDZrrkz	3241
-VMOVSDZrrkz_REV	3242
-VMOVSDmr	3243
-VMOVSDrm	3244
-VMOVSDrm_alt	3245
-VMOVSDrr	3246
-VMOVSDrr_REV	3247
-VMOVSDto	3248
-VMOVSH	3249
-VMOVSHDUPYrm	3250
-VMOVSHDUPYrr	3251
-VMOVSHDUPZ	3252
-VMOVSHDUPZrm	3253
-VMOVSHDUPZrmk	3254
-VMOVSHDUPZrmkz	3255
-VMOVSHDUPZrr	3256
-VMOVSHDUPZrrk	3257
-VMOVSHDUPZrrkz	3258
-VMOVSHDUPrm	3259
-VMOVSHDUPrr	3260
-VMOVSHZmr	3261
-VMOVSHZmrk	3262
-VMOVSHZrm	3263
-VMOVSHZrm_alt	3264
-VMOVSHZrmk	3265
-VMOVSHZrmkz	3266
-VMOVSHZrr	3267
-VMOVSHZrr_REV	3268
-VMOVSHZrrk	3269
-VMOVSHZrrk_REV	3270
-VMOVSHZrrkz	3271
-VMOVSHZrrkz_REV	3272
-VMOVSHtoW	3273
-VMOVSLDUPYrm	3274
-VMOVSLDUPYrr	3275
-VMOVSLDUPZ	3276
-VMOVSLDUPZrm	3277
-VMOVSLDUPZrmk	3278
-VMOVSLDUPZrmkz	3279
-VMOVSLDUPZrr	3280
-VMOVSLDUPZrrk	3281
-VMOVSLDUPZrrkz	3282
-VMOVSLDUPrm	3283
-VMOVSLDUPrr	3284
-VMOVSS	3285
-VMOVSSZmr	3286
-VMOVSSZmrk	3287
-VMOVSSZrm	3288
-VMOVSSZrm_alt	3289
-VMOVSSZrmk	3290
-VMOVSSZrmkz	3291
-VMOVSSZrr	3292
-VMOVSSZrr_REV	3293
-VMOVSSZrrk	3294
-VMOVSSZrrk_REV	3295
-VMOVSSZrrkz	3296
-VMOVSSZrrkz_REV	3297
-VMOVSSmr	3298
-VMOVSSrm	3299
-VMOVSSrm_alt	3300
-VMOVSSrr	3301
-VMOVSSrr_REV	3302
-VMOVUPDYmr	3303
-VMOVUPDYrm	3304
-VMOVUPDYrr	3305
-VMOVUPDYrr_REV	3306
-VMOVUPDZ	3307
-VMOVUPDZmr	3308
-VMOVUPDZmrk	3309
-VMOVUPDZrm	3310
-VMOVUPDZrmk	3311
-VMOVUPDZrmkz	3312
-VMOVUPDZrr	3313
-VMOVUPDZrr_REV	3314
-VMOVUPDZrrk	3315
-VMOVUPDZrrk_REV	3316
-VMOVUPDZrrkz	3317
-VMOVUPDZrrkz_REV	3318
-VMOVUPDmr	3319
-VMOVUPDrm	3320
-VMOVUPDrr	3321
-VMOVUPDrr_REV	3322
-VMOVUPSYmr	3323
-VMOVUPSYrm	3324
-VMOVUPSYrr	3325
-VMOVUPSYrr_REV	3326
-VMOVUPSZ	3327
-VMOVUPSZmr	3328
-VMOVUPSZmrk	3329
-VMOVUPSZrm	3330
-VMOVUPSZrmk	3331
-VMOVUPSZrmkz	3332
-VMOVUPSZrr	3333
-VMOVUPSZrr_REV	3334
-VMOVUPSZrrk	3335
-VMOVUPSZrrk_REV	3336
-VMOVUPSZrrkz	3337
-VMOVUPSZrrkz_REV	3338
-VMOVUPSmr	3339
-VMOVUPSrm	3340
-VMOVUPSrr	3341
-VMOVUPSrr_REV	3342
-VMOVW	3343
-VMOVWmr	3344
-VMOVWrm	3345
-VMOVZPDILo	3346
-VMOVZPQILo	3347
-VMOVZPWILo	3348
-VMPSADBWYrmi	3349
-VMPSADBWYrri	3350
-VMPSADBWZ	3351
-VMPSADBWZrmi	3352
-VMPSADBWZrmik	3353
-VMPSADBWZrmikz	3354
-VMPSADBWZrri	3355
-VMPSADBWZrrik	3356
-VMPSADBWZrrikz	3357
-VMPSADBWrmi	3358
-VMPSADBWrri	3359
-VMPTRLDm	3360
-VMPTRSTm	3361
-VMREAD	3362
-VMRESUME	3363
-VMRUN	3364
-VMSAVE	3365
-VMULBF	3366
-VMULPDYrm	3367
-VMULPDYrr	3368
-VMULPDZ	3369
-VMULPDZrm	3370
-VMULPDZrmb	3371
-VMULPDZrmbk	3372
-VMULPDZrmbkz	3373
-VMULPDZrmk	3374
-VMULPDZrmkz	3375
-VMULPDZrr	3376
-VMULPDZrrb	3377
-VMULPDZrrbk	3378
-VMULPDZrrbkz	3379
-VMULPDZrrk	3380
-VMULPDZrrkz	3381
-VMULPDrm	3382
-VMULPDrr	3383
-VMULPHZ	3384
-VMULPHZrm	3385
-VMULPHZrmb	3386
-VMULPHZrmbk	3387
-VMULPHZrmbkz	3388
-VMULPHZrmk	3389
-VMULPHZrmkz	3390
-VMULPHZrr	3391
-VMULPHZrrb	3392
-VMULPHZrrbk	3393
-VMULPHZrrbkz	3394
-VMULPHZrrk	3395
-VMULPHZrrkz	3396
-VMULPSYrm	3397
-VMULPSYrr	3398
-VMULPSZ	3399
-VMULPSZrm	3400
-VMULPSZrmb	3401
-VMULPSZrmbk	3402
-VMULPSZrmbkz	3403
-VMULPSZrmk	3404
-VMULPSZrmkz	3405
-VMULPSZrr	3406
-VMULPSZrrb	3407
-VMULPSZrrbk	3408
-VMULPSZrrbkz	3409
-VMULPSZrrk	3410
-VMULPSZrrkz	3411
-VMULPSrm	3412
-VMULPSrr	3413
-VMULSDZrm	3414
-VMULSDZrm_Int	3415
-VMULSDZrmk_Int	3416
-VMULSDZrmkz_Int	3417
-VMULSDZrr	3418
-VMULSDZrr_Int	3419
-VMULSDZrrb_Int	3420
-VMULSDZrrbk_Int	3421
-VMULSDZrrbkz_Int	3422
-VMULSDZrrk_Int	3423
-VMULSDZrrkz_Int	3424
-VMULSDrm	3425
-VMULSDrm_Int	3426
-VMULSDrr	3427
-VMULSDrr_Int	3428
-VMULSHZrm	3429
-VMULSHZrm_Int	3430
-VMULSHZrmk_Int	3431
-VMULSHZrmkz_Int	3432
-VMULSHZrr	3433
-VMULSHZrr_Int	3434
-VMULSHZrrb_Int	3435
-VMULSHZrrbk_Int	3436
-VMULSHZrrbkz_Int	3437
-VMULSHZrrk_Int	3438
-VMULSHZrrkz_Int	3439
-VMULSSZrm	3440
-VMULSSZrm_Int	3441
-VMULSSZrmk_Int	3442
-VMULSSZrmkz_Int	3443
-VMULSSZrr	3444
-VMULSSZrr_Int	3445
-VMULSSZrrb_Int	3446
-VMULSSZrrbk_Int	3447
-VMULSSZrrbkz_Int	3448
-VMULSSZrrk_Int	3449
-VMULSSZrrkz_Int	3450
-VMULSSrm	3451
-VMULSSrm_Int	3452
-VMULSSrr	3453
-VMULSSrr_Int	3454
-VMWRITE	3455
-VMXOFF	3456
-VMXON	3457
-VORPDYrm	3458
-VORPDYrr	3459
-VORPDZ	3460
-VORPDZrm	3461
-VORPDZrmb	3462
-VORPDZrmbk	3463
-VORPDZrmbkz	3464
-VORPDZrmk	3465
-VORPDZrmkz	3466
-VORPDZrr	3467
-VORPDZrrk	3468
-VORPDZrrkz	3469
-VORPDrm	3470
-VORPDrr	3471
-VORPSYrm	3472
-VORPSYrr	3473
-VORPSZ	3474
-VORPSZrm	3475
-VORPSZrmb	3476
-VORPSZrmbk	3477
-VORPSZrmbkz	3478
-VORPSZrmk	3479
-VORPSZrmkz	3480
-VORPSZrr	3481
-VORPSZrrk	3482
-VORPSZrrkz	3483
-VORPSrm	3484
-VORPSrr	3485
-VP	3486
-VPABSBYrm	3487
-VPABSBYrr	3488
-VPABSBZ	3489
-VPABSBZrm	3490
-VPABSBZrmk	3491
-VPABSBZrmkz	3492
-VPABSBZrr	3493
-VPABSBZrrk	3494
-VPABSBZrrkz	3495
-VPABSBrm	3496
-VPABSBrr	3497
-VPABSDYrm	3498
-VPABSDYrr	3499
-VPABSDZ	3500
-VPABSDZrm	3501
-VPABSDZrmb	3502
-VPABSDZrmbk	3503
-VPABSDZrmbkz	3504
-VPABSDZrmk	3505
-VPABSDZrmkz	3506
-VPABSDZrr	3507
-VPABSDZrrk	3508
-VPABSDZrrkz	3509
-VPABSDrm	3510
-VPABSDrr	3511
-VPABSQZ	3512
-VPABSQZrm	3513
-VPABSQZrmb	3514
-VPABSQZrmbk	3515
-VPABSQZrmbkz	3516
-VPABSQZrmk	3517
-VPABSQZrmkz	3518
-VPABSQZrr	3519
-VPABSQZrrk	3520
-VPABSQZrrkz	3521
-VPABSWYrm	3522
-VPABSWYrr	3523
-VPABSWZ	3524
-VPABSWZrm	3525
-VPABSWZrmk	3526
-VPABSWZrmkz	3527
-VPABSWZrr	3528
-VPABSWZrrk	3529
-VPABSWZrrkz	3530
-VPABSWrm	3531
-VPABSWrr	3532
-VPACKSSDWYrm	3533
-VPACKSSDWYrr	3534
-VPACKSSDWZ	3535
-VPACKSSDWZrm	3536
-VPACKSSDWZrmb	3537
-VPACKSSDWZrmbk	3538
-VPACKSSDWZrmbkz	3539
-VPACKSSDWZrmk	3540
-VPACKSSDWZrmkz	3541
-VPACKSSDWZrr	3542
-VPACKSSDWZrrk	3543
-VPACKSSDWZrrkz	3544
-VPACKSSDWrm	3545
-VPACKSSDWrr	3546
-VPACKSSWBYrm	3547
-VPACKSSWBYrr	3548
-VPACKSSWBZ	3549
-VPACKSSWBZrm	3550
-VPACKSSWBZrmk	3551
-VPACKSSWBZrmkz	3552
-VPACKSSWBZrr	3553
-VPACKSSWBZrrk	3554
-VPACKSSWBZrrkz	3555
-VPACKSSWBrm	3556
-VPACKSSWBrr	3557
-VPACKUSDWYrm	3558
-VPACKUSDWYrr	3559
-VPACKUSDWZ	3560
-VPACKUSDWZrm	3561
-VPACKUSDWZrmb	3562
-VPACKUSDWZrmbk	3563
-VPACKUSDWZrmbkz	3564
-VPACKUSDWZrmk	3565
-VPACKUSDWZrmkz	3566
-VPACKUSDWZrr	3567
-VPACKUSDWZrrk	3568
-VPACKUSDWZrrkz	3569
-VPACKUSDWrm	3570
-VPACKUSDWrr	3571
-VPACKUSWBYrm	3572
-VPACKUSWBYrr	3573
-VPACKUSWBZ	3574
-VPACKUSWBZrm	3575
-VPACKUSWBZrmk	3576
-VPACKUSWBZrmkz	3577
-VPACKUSWBZrr	3578
-VPACKUSWBZrrk	3579
-VPACKUSWBZrrkz	3580
-VPACKUSWBrm	3581
-VPACKUSWBrr	3582
-VPADDBYrm	3583
-VPADDBYrr	3584
-VPADDBZ	3585
-VPADDBZrm	3586
-VPADDBZrmk	3587
-VPADDBZrmkz	3588
-VPADDBZrr	3589
-VPADDBZrrk	3590
-VPADDBZrrkz	3591
-VPADDBrm	3592
-VPADDBrr	3593
-VPADDDYrm	3594
-VPADDDYrr	3595
-VPADDDZ	3596
-VPADDDZrm	3597
-VPADDDZrmb	3598
-VPADDDZrmbk	3599
-VPADDDZrmbkz	3600
-VPADDDZrmk	3601
-VPADDDZrmkz	3602
-VPADDDZrr	3603
-VPADDDZrrk	3604
-VPADDDZrrkz	3605
-VPADDDrm	3606
-VPADDDrr	3607
-VPADDQYrm	3608
-VPADDQYrr	3609
-VPADDQZ	3610
-VPADDQZrm	3611
-VPADDQZrmb	3612
-VPADDQZrmbk	3613
-VPADDQZrmbkz	3614
-VPADDQZrmk	3615
-VPADDQZrmkz	3616
-VPADDQZrr	3617
-VPADDQZrrk	3618
-VPADDQZrrkz	3619
-VPADDQrm	3620
-VPADDQrr	3621
-VPADDSBYrm	3622
-VPADDSBYrr	3623
-VPADDSBZ	3624
-VPADDSBZrm	3625
-VPADDSBZrmk	3626
-VPADDSBZrmkz	3627
-VPADDSBZrr	3628
-VPADDSBZrrk	3629
-VPADDSBZrrkz	3630
-VPADDSBrm	3631
-VPADDSBrr	3632
-VPADDSWYrm	3633
-VPADDSWYrr	3634
-VPADDSWZ	3635
-VPADDSWZrm	3636
-VPADDSWZrmk	3637
-VPADDSWZrmkz	3638
-VPADDSWZrr	3639
-VPADDSWZrrk	3640
-VPADDSWZrrkz	3641
-VPADDSWrm	3642
-VPADDSWrr	3643
-VPADDUSBYrm	3644
-VPADDUSBYrr	3645
-VPADDUSBZ	3646
-VPADDUSBZrm	3647
-VPADDUSBZrmk	3648
-VPADDUSBZrmkz	3649
-VPADDUSBZrr	3650
-VPADDUSBZrrk	3651
-VPADDUSBZrrkz	3652
-VPADDUSBrm	3653
-VPADDUSBrr	3654
-VPADDUSWYrm	3655
-VPADDUSWYrr	3656
-VPADDUSWZ	3657
-VPADDUSWZrm	3658
-VPADDUSWZrmk	3659
-VPADDUSWZrmkz	3660
-VPADDUSWZrr	3661
-VPADDUSWZrrk	3662
-VPADDUSWZrrkz	3663
-VPADDUSWrm	3664
-VPADDUSWrr	3665
-VPADDWYrm	3666
-VPADDWYrr	3667
-VPADDWZ	3668
-VPADDWZrm	3669
-VPADDWZrmk	3670
-VPADDWZrmkz	3671
-VPADDWZrr	3672
-VPADDWZrrk	3673
-VPADDWZrrkz	3674
-VPADDWrm	3675
-VPADDWrr	3676
-VPALIGNRYrmi	3677
-VPALIGNRYrri	3678
-VPALIGNRZ	3679
-VPALIGNRZrmi	3680
-VPALIGNRZrmik	3681
-VPALIGNRZrmikz	3682
-VPALIGNRZrri	3683
-VPALIGNRZrrik	3684
-VPALIGNRZrrikz	3685
-VPALIGNRrmi	3686
-VPALIGNRrri	3687
-VPANDDZ	3688
-VPANDDZrm	3689
-VPANDDZrmb	3690
-VPANDDZrmbk	3691
-VPANDDZrmbkz	3692
-VPANDDZrmk	3693
-VPANDDZrmkz	3694
-VPANDDZrr	3695
-VPANDDZrrk	3696
-VPANDDZrrkz	3697
-VPANDNDZ	3698
-VPANDNDZrm	3699
-VPANDNDZrmb	3700
-VPANDNDZrmbk	3701
-VPANDNDZrmbkz	3702
-VPANDNDZrmk	3703
-VPANDNDZrmkz	3704
-VPANDNDZrr	3705
-VPANDNDZrrk	3706
-VPANDNDZrrkz	3707
-VPANDNQZ	3708
-VPANDNQZrm	3709
-VPANDNQZrmb	3710
-VPANDNQZrmbk	3711
-VPANDNQZrmbkz	3712
-VPANDNQZrmk	3713
-VPANDNQZrmkz	3714
-VPANDNQZrr	3715
-VPANDNQZrrk	3716
-VPANDNQZrrkz	3717
-VPANDNYrm	3718
-VPANDNYrr	3719
-VPANDNrm	3720
-VPANDNrr	3721
-VPANDQZ	3722
-VPANDQZrm	3723
-VPANDQZrmb	3724
-VPANDQZrmbk	3725
-VPANDQZrmbkz	3726
-VPANDQZrmk	3727
-VPANDQZrmkz	3728
-VPANDQZrr	3729
-VPANDQZrrk	3730
-VPANDQZrrkz	3731
-VPANDYrm	3732
-VPANDYrr	3733
-VPANDrm	3734
-VPANDrr	3735
-VPAVGBYrm	3736
-VPAVGBYrr	3737
-VPAVGBZ	3738
-VPAVGBZrm	3739
-VPAVGBZrmk	3740
-VPAVGBZrmkz	3741
-VPAVGBZrr	3742
-VPAVGBZrrk	3743
-VPAVGBZrrkz	3744
-VPAVGBrm	3745
-VPAVGBrr	3746
-VPAVGWYrm	3747
-VPAVGWYrr	3748
-VPAVGWZ	3749
-VPAVGWZrm	3750
-VPAVGWZrmk	3751
-VPAVGWZrmkz	3752
-VPAVGWZrr	3753
-VPAVGWZrrk	3754
-VPAVGWZrrkz	3755
-VPAVGWrm	3756
-VPAVGWrr	3757
-VPBLENDDYrmi	3758
-VPBLENDDYrri	3759
-VPBLENDDrmi	3760
-VPBLENDDrri	3761
-VPBLENDMBZ	3762
-VPBLENDMBZrm	3763
-VPBLENDMBZrmk	3764
-VPBLENDMBZrmkz	3765
-VPBLENDMBZrr	3766
-VPBLENDMBZrrk	3767
-VPBLENDMBZrrkz	3768
-VPBLENDMDZ	3769
-VPBLENDMDZrm	3770
-VPBLENDMDZrmb	3771
-VPBLENDMDZrmbk	3772
-VPBLENDMDZrmbkz	3773
-VPBLENDMDZrmk	3774
-VPBLENDMDZrmkz	3775
-VPBLENDMDZrr	3776
-VPBLENDMDZrrk	3777
-VPBLENDMDZrrkz	3778
-VPBLENDMQZ	3779
-VPBLENDMQZrm	3780
-VPBLENDMQZrmb	3781
-VPBLENDMQZrmbk	3782
-VPBLENDMQZrmbkz	3783
-VPBLENDMQZrmk	3784
-VPBLENDMQZrmkz	3785
-VPBLENDMQZrr	3786
-VPBLENDMQZrrk	3787
-VPBLENDMQZrrkz	3788
-VPBLENDMWZ	3789
-VPBLENDMWZrm	3790
-VPBLENDMWZrmk	3791
-VPBLENDMWZrmkz	3792
-VPBLENDMWZrr	3793
-VPBLENDMWZrrk	3794
-VPBLENDMWZrrkz	3795
-VPBLENDVBYrmr	3796
-VPBLENDVBYrrr	3797
-VPBLENDVBrmr	3798
-VPBLENDVBrrr	3799
-VPBLENDWYrmi	3800
-VPBLENDWYrri	3801
-VPBLENDWrmi	3802
-VPBLENDWrri	3803
-VPBROADCASTBYrm	3804
-VPBROADCASTBYrr	3805
-VPBROADCASTBZ	3806
-VPBROADCASTBZrm	3807
-VPBROADCASTBZrmk	3808
-VPBROADCASTBZrmkz	3809
-VPBROADCASTBZrr	3810
-VPBROADCASTBZrrk	3811
-VPBROADCASTBZrrkz	3812
-VPBROADCASTBrZ	3813
-VPBROADCASTBrZrr	3814
-VPBROADCASTBrZrrk	3815
-VPBROADCASTBrZrrkz	3816
-VPBROADCASTBrm	3817
-VPBROADCASTBrr	3818
-VPBROADCASTDYrm	3819
-VPBROADCASTDYrr	3820
-VPBROADCASTDZ	3821
-VPBROADCASTDZrm	3822
-VPBROADCASTDZrmk	3823
-VPBROADCASTDZrmkz	3824
-VPBROADCASTDZrr	3825
-VPBROADCASTDZrrk	3826
-VPBROADCASTDZrrkz	3827
-VPBROADCASTDrZ	3828
-VPBROADCASTDrZrr	3829
-VPBROADCASTDrZrrk	3830
-VPBROADCASTDrZrrkz	3831
-VPBROADCASTDrm	3832
-VPBROADCASTDrr	3833
-VPBROADCASTMB	3834
-VPBROADCASTMW	3835
-VPBROADCASTQYrm	3836
-VPBROADCASTQYrr	3837
-VPBROADCASTQZ	3838
-VPBROADCASTQZrm	3839
-VPBROADCASTQZrmk	3840
-VPBROADCASTQZrmkz	3841
-VPBROADCASTQZrr	3842
-VPBROADCASTQZrrk	3843
-VPBROADCASTQZrrkz	3844
-VPBROADCASTQrZ	3845
-VPBROADCASTQrZrr	3846
-VPBROADCASTQrZrrk	3847
-VPBROADCASTQrZrrkz	3848
-VPBROADCASTQrm	3849
-VPBROADCASTQrr	3850
-VPBROADCASTWYrm	3851
-VPBROADCASTWYrr	3852
-VPBROADCASTWZ	3853
-VPBROADCASTWZrm	3854
-VPBROADCASTWZrmk	3855
-VPBROADCASTWZrmkz	3856
-VPBROADCASTWZrr	3857
-VPBROADCASTWZrrk	3858
-VPBROADCASTWZrrkz	3859
-VPBROADCASTWrZ	3860
-VPBROADCASTWrZrr	3861
-VPBROADCASTWrZrrk	3862
-VPBROADCASTWrZrrkz	3863
-VPBROADCASTWrm	3864
-VPBROADCASTWrr	3865
-VPCLMULQDQYrmi	3866
-VPCLMULQDQYrri	3867
-VPCLMULQDQZ	3868
-VPCLMULQDQZrmi	3869
-VPCLMULQDQZrri	3870
-VPCLMULQDQrmi	3871
-VPCLMULQDQrri	3872
-VPCMOVYrmr	3873
-VPCMOVYrrm	3874
-VPCMOVYrrr	3875
-VPCMOVYrrr_REV	3876
-VPCMOVrmr	3877
-VPCMOVrrm	3878
-VPCMOVrrr	3879
-VPCMOVrrr_REV	3880
-VPCMPBZ	3881
-VPCMPBZrmi	3882
-VPCMPBZrmik	3883
-VPCMPBZrri	3884
-VPCMPBZrrik	3885
-VPCMPDZ	3886
-VPCMPDZrmbi	3887
-VPCMPDZrmbik	3888
-VPCMPDZrmi	3889
-VPCMPDZrmik	3890
-VPCMPDZrri	3891
-VPCMPDZrrik	3892
-VPCMPEQBYrm	3893
-VPCMPEQBYrr	3894
-VPCMPEQBZ	3895
-VPCMPEQBZrm	3896
-VPCMPEQBZrmk	3897
-VPCMPEQBZrr	3898
-VPCMPEQBZrrk	3899
-VPCMPEQBrm	3900
-VPCMPEQBrr	3901
-VPCMPEQDYrm	3902
-VPCMPEQDYrr	3903
-VPCMPEQDZ	3904
-VPCMPEQDZrm	3905
-VPCMPEQDZrmb	3906
-VPCMPEQDZrmbk	3907
-VPCMPEQDZrmk	3908
-VPCMPEQDZrr	3909
-VPCMPEQDZrrk	3910
-VPCMPEQDrm	3911
-VPCMPEQDrr	3912
-VPCMPEQQYrm	3913
-VPCMPEQQYrr	3914
-VPCMPEQQZ	3915
-VPCMPEQQZrm	3916
-VPCMPEQQZrmb	3917
-VPCMPEQQZrmbk	3918
-VPCMPEQQZrmk	3919
-VPCMPEQQZrr	3920
-VPCMPEQQZrrk	3921
-VPCMPEQQrm	3922
-VPCMPEQQrr	3923
-VPCMPEQWYrm	3924
-VPCMPEQWYrr	3925
-VPCMPEQWZ	3926
-VPCMPEQWZrm	3927
-VPCMPEQWZrmk	3928
-VPCMPEQWZrr	3929
-VPCMPEQWZrrk	3930
-VPCMPEQWrm	3931
-VPCMPEQWrr	3932
-VPCMPESTRIrmi	3933
-VPCMPESTRIrri	3934
-VPCMPESTRMrmi	3935
-VPCMPESTRMrri	3936
-VPCMPGTBYrm	3937
-VPCMPGTBYrr	3938
-VPCMPGTBZ	3939
-VPCMPGTBZrm	3940
-VPCMPGTBZrmk	3941
-VPCMPGTBZrr	3942
-VPCMPGTBZrrk	3943
-VPCMPGTBrm	3944
-VPCMPGTBrr	3945
-VPCMPGTDYrm	3946
-VPCMPGTDYrr	3947
-VPCMPGTDZ	3948
-VPCMPGTDZrm	3949
-VPCMPGTDZrmb	3950
-VPCMPGTDZrmbk	3951
-VPCMPGTDZrmk	3952
-VPCMPGTDZrr	3953
-VPCMPGTDZrrk	3954
-VPCMPGTDrm	3955
-VPCMPGTDrr	3956
-VPCMPGTQYrm	3957
-VPCMPGTQYrr	3958
-VPCMPGTQZ	3959
-VPCMPGTQZrm	3960
-VPCMPGTQZrmb	3961
-VPCMPGTQZrmbk	3962
-VPCMPGTQZrmk	3963
-VPCMPGTQZrr	3964
-VPCMPGTQZrrk	3965
-VPCMPGTQrm	3966
-VPCMPGTQrr	3967
-VPCMPGTWYrm	3968
-VPCMPGTWYrr	3969
-VPCMPGTWZ	3970
-VPCMPGTWZrm	3971
-VPCMPGTWZrmk	3972
-VPCMPGTWZrr	3973
-VPCMPGTWZrrk	3974
-VPCMPGTWrm	3975
-VPCMPGTWrr	3976
-VPCMPISTRIrmi	3977
-VPCMPISTRIrri	3978
-VPCMPISTRMrmi	3979
-VPCMPISTRMrri	3980
-VPCMPQZ	3981
-VPCMPQZrmbi	3982
-VPCMPQZrmbik	3983
-VPCMPQZrmi	3984
-VPCMPQZrmik	3985
-VPCMPQZrri	3986
-VPCMPQZrrik	3987
-VPCMPUBZ	3988
-VPCMPUBZrmi	3989
-VPCMPUBZrmik	3990
-VPCMPUBZrri	3991
-VPCMPUBZrrik	3992
-VPCMPUDZ	3993
-VPCMPUDZrmbi	3994
-VPCMPUDZrmbik	3995
-VPCMPUDZrmi	3996
-VPCMPUDZrmik	3997
-VPCMPUDZrri	3998
-VPCMPUDZrrik	3999
-VPCMPUQZ	4000
-VPCMPUQZrmbi	4001
-VPCMPUQZrmbik	4002
-VPCMPUQZrmi	4003
-VPCMPUQZrmik	4004
-VPCMPUQZrri	4005
-VPCMPUQZrrik	4006
-VPCMPUWZ	4007
-VPCMPUWZrmi	4008
-VPCMPUWZrmik	4009
-VPCMPUWZrri	4010
-VPCMPUWZrrik	4011
-VPCMPWZ	4012
-VPCMPWZrmi	4013
-VPCMPWZrmik	4014
-VPCMPWZrri	4015
-VPCMPWZrrik	4016
-VPCOMBmi	4017
-VPCOMBri	4018
-VPCOMDmi	4019
-VPCOMDri	4020
-VPCOMPRESSBZ	4021
-VPCOMPRESSBZmr	4022
-VPCOMPRESSBZmrk	4023
-VPCOMPRESSBZrr	4024
-VPCOMPRESSBZrrk	4025
-VPCOMPRESSBZrrkz	4026
-VPCOMPRESSDZ	4027
-VPCOMPRESSDZmr	4028
-VPCOMPRESSDZmrk	4029
-VPCOMPRESSDZrr	4030
-VPCOMPRESSDZrrk	4031
-VPCOMPRESSDZrrkz	4032
-VPCOMPRESSQZ	4033
-VPCOMPRESSQZmr	4034
-VPCOMPRESSQZmrk	4035
-VPCOMPRESSQZrr	4036
-VPCOMPRESSQZrrk	4037
-VPCOMPRESSQZrrkz	4038
-VPCOMPRESSWZ	4039
-VPCOMPRESSWZmr	4040
-VPCOMPRESSWZmrk	4041
-VPCOMPRESSWZrr	4042
-VPCOMPRESSWZrrk	4043
-VPCOMPRESSWZrrkz	4044
-VPCOMQmi	4045
-VPCOMQri	4046
-VPCOMUBmi	4047
-VPCOMUBri	4048
-VPCOMUDmi	4049
-VPCOMUDri	4050
-VPCOMUQmi	4051
-VPCOMUQri	4052
-VPCOMUWmi	4053
-VPCOMUWri	4054
-VPCOMWmi	4055
-VPCOMWri	4056
-VPCONFLICTDZ	4057
-VPCONFLICTDZrm	4058
-VPCONFLICTDZrmb	4059
-VPCONFLICTDZrmbk	4060
-VPCONFLICTDZrmbkz	4061
-VPCONFLICTDZrmk	4062
-VPCONFLICTDZrmkz	4063
-VPCONFLICTDZrr	4064
-VPCONFLICTDZrrk	4065
-VPCONFLICTDZrrkz	4066
-VPCONFLICTQZ	4067
-VPCONFLICTQZrm	4068
-VPCONFLICTQZrmb	4069
-VPCONFLICTQZrmbk	4070
-VPCONFLICTQZrmbkz	4071
-VPCONFLICTQZrmk	4072
-VPCONFLICTQZrmkz	4073
-VPCONFLICTQZrr	4074
-VPCONFLICTQZrrk	4075
-VPCONFLICTQZrrkz	4076
-VPDPBSSDSYrm	4077
-VPDPBSSDSYrr	4078
-VPDPBSSDSZ	4079
-VPDPBSSDSZrm	4080
-VPDPBSSDSZrmb	4081
-VPDPBSSDSZrmbk	4082
-VPDPBSSDSZrmbkz	4083
-VPDPBSSDSZrmk	4084
-VPDPBSSDSZrmkz	4085
-VPDPBSSDSZrr	4086
-VPDPBSSDSZrrk	4087
-VPDPBSSDSZrrkz	4088
-VPDPBSSDSrm	4089
-VPDPBSSDSrr	4090
-VPDPBSSDYrm	4091
-VPDPBSSDYrr	4092
-VPDPBSSDZ	4093
-VPDPBSSDZrm	4094
-VPDPBSSDZrmb	4095
-VPDPBSSDZrmbk	4096
-VPDPBSSDZrmbkz	4097
-VPDPBSSDZrmk	4098
-VPDPBSSDZrmkz	4099
-VPDPBSSDZrr	4100
-VPDPBSSDZrrk	4101
-VPDPBSSDZrrkz	4102
-VPDPBSSDrm	4103
-VPDPBSSDrr	4104
-VPDPBSUDSYrm	4105
-VPDPBSUDSYrr	4106
-VPDPBSUDSZ	4107
-VPDPBSUDSZrm	4108
-VPDPBSUDSZrmb	4109
-VPDPBSUDSZrmbk	4110
-VPDPBSUDSZrmbkz	4111
-VPDPBSUDSZrmk	4112
-VPDPBSUDSZrmkz	4113
-VPDPBSUDSZrr	4114
-VPDPBSUDSZrrk	4115
-VPDPBSUDSZrrkz	4116
-VPDPBSUDSrm	4117
-VPDPBSUDSrr	4118
-VPDPBSUDYrm	4119
-VPDPBSUDYrr	4120
-VPDPBSUDZ	4121
-VPDPBSUDZrm	4122
-VPDPBSUDZrmb	4123
-VPDPBSUDZrmbk	4124
-VPDPBSUDZrmbkz	4125
-VPDPBSUDZrmk	4126
-VPDPBSUDZrmkz	4127
-VPDPBSUDZrr	4128
-VPDPBSUDZrrk	4129
-VPDPBSUDZrrkz	4130
-VPDPBSUDrm	4131
-VPDPBSUDrr	4132
-VPDPBUSDSYrm	4133
-VPDPBUSDSYrr	4134
-VPDPBUSDSZ	4135
-VPDPBUSDSZrm	4136
-VPDPBUSDSZrmb	4137
-VPDPBUSDSZrmbk	4138
-VPDPBUSDSZrmbkz	4139
-VPDPBUSDSZrmk	4140
-VPDPBUSDSZrmkz	4141
-VPDPBUSDSZrr	4142
-VPDPBUSDSZrrk	4143
-VPDPBUSDSZrrkz	4144
-VPDPBUSDSrm	4145
-VPDPBUSDSrr	4146
-VPDPBUSDYrm	4147
-VPDPBUSDYrr	4148
-VPDPBUSDZ	4149
-VPDPBUSDZrm	4150
-VPDPBUSDZrmb	4151
-VPDPBUSDZrmbk	4152
-VPDPBUSDZrmbkz	4153
-VPDPBUSDZrmk	4154
-VPDPBUSDZrmkz	4155
-VPDPBUSDZrr	4156
-VPDPBUSDZrrk	4157
-VPDPBUSDZrrkz	4158
-VPDPBUSDrm	4159
-VPDPBUSDrr	4160
-VPDPBUUDSYrm	4161
-VPDPBUUDSYrr	4162
-VPDPBUUDSZ	4163
-VPDPBUUDSZrm	4164
-VPDPBUUDSZrmb	4165
-VPDPBUUDSZrmbk	4166
-VPDPBUUDSZrmbkz	4167
-VPDPBUUDSZrmk	4168
-VPDPBUUDSZrmkz	4169
-VPDPBUUDSZrr	4170
-VPDPBUUDSZrrk	4171
-VPDPBUUDSZrrkz	4172
-VPDPBUUDSrm	4173
-VPDPBUUDSrr	4174
-VPDPBUUDYrm	4175
-VPDPBUUDYrr	4176
-VPDPBUUDZ	4177
-VPDPBUUDZrm	4178
-VPDPBUUDZrmb	4179
-VPDPBUUDZrmbk	4180
-VPDPBUUDZrmbkz	4181
-VPDPBUUDZrmk	4182
-VPDPBUUDZrmkz	4183
-VPDPBUUDZrr	4184
-VPDPBUUDZrrk	4185
-VPDPBUUDZrrkz	4186
-VPDPBUUDrm	4187
-VPDPBUUDrr	4188
-VPDPWSSDSYrm	4189
-VPDPWSSDSYrr	4190
-VPDPWSSDSZ	4191
-VPDPWSSDSZrm	4192
-VPDPWSSDSZrmb	4193
-VPDPWSSDSZrmbk	4194
-VPDPWSSDSZrmbkz	4195
-VPDPWSSDSZrmk	4196
-VPDPWSSDSZrmkz	4197
-VPDPWSSDSZrr	4198
-VPDPWSSDSZrrk	4199
-VPDPWSSDSZrrkz	4200
-VPDPWSSDSrm	4201
-VPDPWSSDSrr	4202
-VPDPWSSDYrm	4203
-VPDPWSSDYrr	4204
-VPDPWSSDZ	4205
-VPDPWSSDZrm	4206
-VPDPWSSDZrmb	4207
-VPDPWSSDZrmbk	4208
-VPDPWSSDZrmbkz	4209
-VPDPWSSDZrmk	4210
-VPDPWSSDZrmkz	4211
-VPDPWSSDZrr	4212
-VPDPWSSDZrrk	4213
-VPDPWSSDZrrkz	4214
-VPDPWSSDrm	4215
-VPDPWSSDrr	4216
-VPDPWSUDSYrm	4217
-VPDPWSUDSYrr	4218
-VPDPWSUDSZ	4219
-VPDPWSUDSZrm	4220
-VPDPWSUDSZrmb	4221
-VPDPWSUDSZrmbk	4222
-VPDPWSUDSZrmbkz	4223
-VPDPWSUDSZrmk	4224
-VPDPWSUDSZrmkz	4225
-VPDPWSUDSZrr	4226
-VPDPWSUDSZrrk	4227
-VPDPWSUDSZrrkz	4228
-VPDPWSUDSrm	4229
-VPDPWSUDSrr	4230
-VPDPWSUDYrm	4231
-VPDPWSUDYrr	4232
-VPDPWSUDZ	4233
-VPDPWSUDZrm	4234
-VPDPWSUDZrmb	4235
-VPDPWSUDZrmbk	4236
-VPDPWSUDZrmbkz	4237
-VPDPWSUDZrmk	4238
-VPDPWSUDZrmkz	4239
-VPDPWSUDZrr	4240
-VPDPWSUDZrrk	4241
-VPDPWSUDZrrkz	4242
-VPDPWSUDrm	4243
-VPDPWSUDrr	4244
-VPDPWUSDSYrm	4245
-VPDPWUSDSYrr	4246
-VPDPWUSDSZ	4247
-VPDPWUSDSZrm	4248
-VPDPWUSDSZrmb	4249
-VPDPWUSDSZrmbk	4250
-VPDPWUSDSZrmbkz	4251
-VPDPWUSDSZrmk	4252
-VPDPWUSDSZrmkz	4253
-VPDPWUSDSZrr	4254
-VPDPWUSDSZrrk	4255
-VPDPWUSDSZrrkz	4256
-VPDPWUSDSrm	4257
-VPDPWUSDSrr	4258
-VPDPWUSDYrm	4259
-VPDPWUSDYrr	4260
-VPDPWUSDZ	4261
-VPDPWUSDZrm	4262
-VPDPWUSDZrmb	4263
-VPDPWUSDZrmbk	4264
-VPDPWUSDZrmbkz	4265
-VPDPWUSDZrmk	4266
-VPDPWUSDZrmkz	4267
-VPDPWUSDZrr	4268
-VPDPWUSDZrrk	4269
-VPDPWUSDZrrkz	4270
-VPDPWUSDrm	4271
-VPDPWUSDrr	4272
-VPDPWUUDSYrm	4273
-VPDPWUUDSYrr	4274
-VPDPWUUDSZ	4275
-VPDPWUUDSZrm	4276
-VPDPWUUDSZrmb	4277
-VPDPWUUDSZrmbk	4278
-VPDPWUUDSZrmbkz	4279
-VPDPWUUDSZrmk	4280
-VPDPWUUDSZrmkz	4281
-VPDPWUUDSZrr	4282
-VPDPWUUDSZrrk	4283
-VPDPWUUDSZrrkz	4284
-VPDPWUUDSrm	4285
-VPDPWUUDSrr	4286
-VPDPWUUDYrm	4287
-VPDPWUUDYrr	4288
-VPDPWUUDZ	4289
-VPDPWUUDZrm	4290
-VPDPWUUDZrmb	4291
-VPDPWUUDZrmbk	4292
-VPDPWUUDZrmbkz	4293
-VPDPWUUDZrmk	4294
-VPDPWUUDZrmkz	4295
-VPDPWUUDZrr	4296
-VPDPWUUDZrrk	4297
-VPDPWUUDZrrkz	4298
-VPDPWUUDrm	4299
-VPDPWUUDrr	4300
-VPERM	4301
-VPERMBZ	4302
-VPERMBZrm	4303
-VPERMBZrmk	4304
-VPERMBZrmkz	4305
-VPERMBZrr	4306
-VPERMBZrrk	4307
-VPERMBZrrkz	4308
-VPERMDYrm	4309
-VPERMDYrr	4310
-VPERMDZ	4311
-VPERMDZrm	4312
-VPERMDZrmb	4313
-VPERMDZrmbk	4314
-VPERMDZrmbkz	4315
-VPERMDZrmk	4316
-VPERMDZrmkz	4317
-VPERMDZrr	4318
-VPERMDZrrk	4319
-VPERMDZrrkz	4320
-VPERMI	4321
-VPERMIL	4322
-VPERMILPDYmi	4323
-VPERMILPDYri	4324
-VPERMILPDYrm	4325
-VPERMILPDYrr	4326
-VPERMILPDZ	4327
-VPERMILPDZmbi	4328
-VPERMILPDZmbik	4329
-VPERMILPDZmbikz	4330
-VPERMILPDZmi	4331
-VPERMILPDZmik	4332
-VPERMILPDZmikz	4333
-VPERMILPDZri	4334
-VPERMILPDZrik	4335
-VPERMILPDZrikz	4336
-VPERMILPDZrm	4337
-VPERMILPDZrmb	4338
-VPERMILPDZrmbk	4339
-VPERMILPDZrmbkz	4340
-VPERMILPDZrmk	4341
-VPERMILPDZrmkz	4342
-VPERMILPDZrr	4343
-VPERMILPDZrrk	4344
-VPERMILPDZrrkz	4345
-VPERMILPDmi	4346
-VPERMILPDri	4347
-VPERMILPDrm	4348
-VPERMILPDrr	4349
-VPERMILPSYmi	4350
-VPERMILPSYri	4351
-VPERMILPSYrm	4352
-VPERMILPSYrr	4353
-VPERMILPSZ	4354
-VPERMILPSZmbi	4355
-VPERMILPSZmbik	4356
-VPERMILPSZmbikz	4357
-VPERMILPSZmi	4358
-VPERMILPSZmik	4359
-VPERMILPSZmikz	4360
-VPERMILPSZri	4361
-VPERMILPSZrik	4362
-VPERMILPSZrikz	4363
-VPERMILPSZrm	4364
-VPERMILPSZrmb	4365
-VPERMILPSZrmbk	4366
-VPERMILPSZrmbkz	4367
-VPERMILPSZrmk	4368
-VPERMILPSZrmkz	4369
-VPERMILPSZrr	4370
-VPERMILPSZrrk	4371
-VPERMILPSZrrkz	4372
-VPERMILPSmi	4373
-VPERMILPSri	4374
-VPERMILPSrm	4375
-VPERMILPSrr	4376
-VPERMPDYmi	4377
-VPERMPDYri	4378
-VPERMPDZ	4379
-VPERMPDZmbi	4380
-VPERMPDZmbik	4381
-VPERMPDZmbikz	4382
-VPERMPDZmi	4383
-VPERMPDZmik	4384
-VPERMPDZmikz	4385
-VPERMPDZri	4386
-VPERMPDZrik	4387
-VPERMPDZrikz	4388
-VPERMPDZrm	4389
-VPERMPDZrmb	4390
-VPERMPDZrmbk	4391
-VPERMPDZrmbkz	4392
-VPERMPDZrmk	4393
-VPERMPDZrmkz	4394
-VPERMPDZrr	4395
-VPERMPDZrrk	4396
-VPERMPDZrrkz	4397
-VPERMPSYrm	4398
-VPERMPSYrr	4399
-VPERMPSZ	4400
-VPERMPSZrm	4401
-VPERMPSZrmb	4402
-VPERMPSZrmbk	4403
-VPERMPSZrmbkz	4404
-VPERMPSZrmk	4405
-VPERMPSZrmkz	4406
-VPERMPSZrr	4407
-VPERMPSZrrk	4408
-VPERMPSZrrkz	4409
-VPERMQYmi	4410
-VPERMQYri	4411
-VPERMQZ	4412
-VPERMQZmbi	4413
-VPERMQZmbik	4414
-VPERMQZmbikz	4415
-VPERMQZmi	4416
-VPERMQZmik	4417
-VPERMQZmikz	4418
-VPERMQZri	4419
-VPERMQZrik	4420
-VPERMQZrikz	4421
-VPERMQZrm	4422
-VPERMQZrmb	4423
-VPERMQZrmbk	4424
-VPERMQZrmbkz	4425
-VPERMQZrmk	4426
-VPERMQZrmkz	4427
-VPERMQZrr	4428
-VPERMQZrrk	4429
-VPERMQZrrkz	4430
-VPERMT	4431
-VPERMWZ	4432
-VPERMWZrm	4433
-VPERMWZrmk	4434
-VPERMWZrmkz	4435
-VPERMWZrr	4436
-VPERMWZrrk	4437
-VPERMWZrrkz	4438
-VPEXPANDBZ	4439
-VPEXPANDBZrm	4440
-VPEXPANDBZrmk	4441
-VPEXPANDBZrmkz	4442
-VPEXPANDBZrr	4443
-VPEXPANDBZrrk	4444
-VPEXPANDBZrrkz	4445
-VPEXPANDDZ	4446
-VPEXPANDDZrm	4447
-VPEXPANDDZrmk	4448
-VPEXPANDDZrmkz	4449
-VPEXPANDDZrr	4450
-VPEXPANDDZrrk	4451
-VPEXPANDDZrrkz	4452
-VPEXPANDQZ	4453
-VPEXPANDQZrm	4454
-VPEXPANDQZrmk	4455
-VPEXPANDQZrmkz	4456
-VPEXPANDQZrr	4457
-VPEXPANDQZrrk	4458
-VPEXPANDQZrrkz	4459
-VPEXPANDWZ	4460
-VPEXPANDWZrm	4461
-VPEXPANDWZrmk	4462
-VPEXPANDWZrmkz	4463
-VPEXPANDWZrr	4464
-VPEXPANDWZrrk	4465
-VPEXPANDWZrrkz	4466
-VPEXTRBZmri	4467
-VPEXTRBZrri	4468
-VPEXTRBmri	4469
-VPEXTRBrri	4470
-VPEXTRDZmri	4471
-VPEXTRDZrri	4472
-VPEXTRDmri	4473
-VPEXTRDrri	4474
-VPEXTRQZmri	4475
-VPEXTRQZrri	4476
-VPEXTRQmri	4477
-VPEXTRQrri	4478
-VPEXTRWZmri	4479
-VPEXTRWZrri	4480
-VPEXTRWZrri_REV	4481
-VPEXTRWmri	4482
-VPEXTRWrri	4483
-VPEXTRWrri_REV	4484
-VPGATHERDDYrm	4485
-VPGATHERDDZ	4486
-VPGATHERDDZrm	4487
-VPGATHERDDrm	4488
-VPGATHERDQYrm	4489
-VPGATHERDQZ	4490
-VPGATHERDQZrm	4491
-VPGATHERDQrm	4492
-VPGATHERQDYrm	4493
-VPGATHERQDZ	4494
-VPGATHERQDZrm	4495
-VPGATHERQDrm	4496
-VPGATHERQQYrm	4497
-VPGATHERQQZ	4498
-VPGATHERQQZrm	4499
-VPGATHERQQrm	4500
-VPHADDBDrm	4501
-VPHADDBDrr	4502
-VPHADDBQrm	4503
-VPHADDBQrr	4504
-VPHADDBWrm	4505
-VPHADDBWrr	4506
-VPHADDDQrm	4507
-VPHADDDQrr	4508
-VPHADDDYrm	4509
-VPHADDDYrr	4510
-VPHADDDrm	4511
-VPHADDDrr	4512
-VPHADDSWYrm	4513
-VPHADDSWYrr	4514
-VPHADDSWrm	4515
-VPHADDSWrr	4516
-VPHADDUBDrm	4517
-VPHADDUBDrr	4518
-VPHADDUBQrm	4519
-VPHADDUBQrr	4520
-VPHADDUBWrm	4521
-VPHADDUBWrr	4522
-VPHADDUDQrm	4523
-VPHADDUDQrr	4524
-VPHADDUWDrm	4525
-VPHADDUWDrr	4526
-VPHADDUWQrm	4527
-VPHADDUWQrr	4528
-VPHADDWDrm	4529
-VPHADDWDrr	4530
-VPHADDWQrm	4531
-VPHADDWQrr	4532
-VPHADDWYrm	4533
-VPHADDWYrr	4534
-VPHADDWrm	4535
-VPHADDWrr	4536
-VPHMINPOSUWrm	4537
-VPHMINPOSUWrr	4538
-VPHSUBBWrm	4539
-VPHSUBBWrr	4540
-VPHSUBDQrm	4541
-VPHSUBDQrr	4542
-VPHSUBDYrm	4543
-VPHSUBDYrr	4544
-VPHSUBDrm	4545
-VPHSUBDrr	4546
-VPHSUBSWYrm	4547
-VPHSUBSWYrr	4548
-VPHSUBSWrm	4549
-VPHSUBSWrr	4550
-VPHSUBWDrm	4551
-VPHSUBWDrr	4552
-VPHSUBWYrm	4553
-VPHSUBWYrr	4554
-VPHSUBWrm	4555
-VPHSUBWrr	4556
-VPINSRBZrmi	4557
-VPINSRBZrri	4558
-VPINSRBrmi	4559
-VPINSRBrri	4560
-VPINSRDZrmi	4561
-VPINSRDZrri	4562
-VPINSRDrmi	4563
-VPINSRDrri	4564
-VPINSRQZrmi	4565
-VPINSRQZrri	4566
-VPINSRQrmi	4567
-VPINSRQrri	4568
-VPINSRWZrmi	4569
-VPINSRWZrri	4570
-VPINSRWrmi	4571
-VPINSRWrri	4572
-VPLZCNTDZ	4573
-VPLZCNTDZrm	4574
-VPLZCNTDZrmb	4575
-VPLZCNTDZrmbk	4576
-VPLZCNTDZrmbkz	4577
-VPLZCNTDZrmk	4578
-VPLZCNTDZrmkz	4579
-VPLZCNTDZrr	4580
-VPLZCNTDZrrk	4581
-VPLZCNTDZrrkz	4582
-VPLZCNTQZ	4583
-VPLZCNTQZrm	4584
-VPLZCNTQZrmb	4585
-VPLZCNTQZrmbk	4586
-VPLZCNTQZrmbkz	4587
-VPLZCNTQZrmk	4588
-VPLZCNTQZrmkz	4589
-VPLZCNTQZrr	4590
-VPLZCNTQZrrk	4591
-VPLZCNTQZrrkz	4592
-VPMACSDDrm	4593
-VPMACSDDrr	4594
-VPMACSDQHrm	4595
-VPMACSDQHrr	4596
-VPMACSDQLrm	4597
-VPMACSDQLrr	4598
-VPMACSSDDrm	4599
-VPMACSSDDrr	4600
-VPMACSSDQHrm	4601
-VPMACSSDQHrr	4602
-VPMACSSDQLrm	4603
-VPMACSSDQLrr	4604
-VPMACSSWDrm	4605
-VPMACSSWDrr	4606
-VPMACSSWWrm	4607
-VPMACSSWWrr	4608
-VPMACSWDrm	4609
-VPMACSWDrr	4610
-VPMACSWWrm	4611
-VPMACSWWrr	4612
-VPMADCSSWDrm	4613
-VPMADCSSWDrr	4614
-VPMADCSWDrm	4615
-VPMADCSWDrr	4616
-VPMADD	4617
-VPMADDUBSWYrm	4618
-VPMADDUBSWYrr	4619
-VPMADDUBSWZ	4620
-VPMADDUBSWZrm	4621
-VPMADDUBSWZrmk	4622
-VPMADDUBSWZrmkz	4623
-VPMADDUBSWZrr	4624
-VPMADDUBSWZrrk	4625
-VPMADDUBSWZrrkz	4626
-VPMADDUBSWrm	4627
-VPMADDUBSWrr	4628
-VPMADDWDYrm	4629
-VPMADDWDYrr	4630
-VPMADDWDZ	4631
-VPMADDWDZrm	4632
-VPMADDWDZrmk	4633
-VPMADDWDZrmkz	4634
-VPMADDWDZrr	4635
-VPMADDWDZrrk	4636
-VPMADDWDZrrkz	4637
-VPMADDWDrm	4638
-VPMADDWDrr	4639
-VPMASKMOVDYmr	4640
-VPMASKMOVDYrm	4641
-VPMASKMOVDmr	4642
-VPMASKMOVDrm	4643
-VPMASKMOVQYmr	4644
-VPMASKMOVQYrm	4645
-VPMASKMOVQmr	4646
-VPMASKMOVQrm	4647
-VPMAXSBYrm	4648
-VPMAXSBYrr	4649
-VPMAXSBZ	4650
-VPMAXSBZrm	4651
-VPMAXSBZrmk	4652
-VPMAXSBZrmkz	4653
-VPMAXSBZrr	4654
-VPMAXSBZrrk	4655
-VPMAXSBZrrkz	4656
-VPMAXSBrm	4657
-VPMAXSBrr	4658
-VPMAXSDYrm	4659
-VPMAXSDYrr	4660
-VPMAXSDZ	4661
-VPMAXSDZrm	4662
-VPMAXSDZrmb	4663
-VPMAXSDZrmbk	4664
-VPMAXSDZrmbkz	4665
-VPMAXSDZrmk	4666
-VPMAXSDZrmkz	4667
-VPMAXSDZrr	4668
-VPMAXSDZrrk	4669
-VPMAXSDZrrkz	4670
-VPMAXSDrm	4671
-VPMAXSDrr	4672
-VPMAXSQZ	4673
-VPMAXSQZrm	4674
-VPMAXSQZrmb	4675
-VPMAXSQZrmbk	4676
-VPMAXSQZrmbkz	4677
-VPMAXSQZrmk	4678
-VPMAXSQZrmkz	4679
-VPMAXSQZrr	4680
-VPMAXSQZrrk	4681
-VPMAXSQZrrkz	4682
-VPMAXSWYrm	4683
-VPMAXSWYrr	4684
-VPMAXSWZ	4685
-VPMAXSWZrm	4686
-VPMAXSWZrmk	4687
-VPMAXSWZrmkz	4688
-VPMAXSWZrr	4689
-VPMAXSWZrrk	4690
-VPMAXSWZrrkz	4691
-VPMAXSWrm	4692
-VPMAXSWrr	4693
-VPMAXUBYrm	4694
-VPMAXUBYrr	4695
-VPMAXUBZ	4696
-VPMAXUBZrm	4697
-VPMAXUBZrmk	4698
-VPMAXUBZrmkz	4699
-VPMAXUBZrr	4700
-VPMAXUBZrrk	4701
-VPMAXUBZrrkz	4702
-VPMAXUBrm	4703
-VPMAXUBrr	4704
-VPMAXUDYrm	4705
-VPMAXUDYrr	4706
-VPMAXUDZ	4707
-VPMAXUDZrm	4708
-VPMAXUDZrmb	4709
-VPMAXUDZrmbk	4710
-VPMAXUDZrmbkz	4711
-VPMAXUDZrmk	4712
-VPMAXUDZrmkz	4713
-VPMAXUDZrr	4714
-VPMAXUDZrrk	4715
-VPMAXUDZrrkz	4716
-VPMAXUDrm	4717
-VPMAXUDrr	4718
-VPMAXUQZ	4719
-VPMAXUQZrm	4720
-VPMAXUQZrmb	4721
-VPMAXUQZrmbk	4722
-VPMAXUQZrmbkz	4723
-VPMAXUQZrmk	4724
-VPMAXUQZrmkz	4725
-VPMAXUQZrr	4726
-VPMAXUQZrrk	4727
-VPMAXUQZrrkz	4728
-VPMAXUWYrm	4729
-VPMAXUWYrr	4730
-VPMAXUWZ	4731
-VPMAXUWZrm	4732
-VPMAXUWZrmk	4733
-VPMAXUWZrmkz	4734
-VPMAXUWZrr	4735
-VPMAXUWZrrk	4736
-VPMAXUWZrrkz	4737
-VPMAXUWrm	4738
-VPMAXUWrr	4739
-VPMINSBYrm	4740
-VPMINSBYrr	4741
-VPMINSBZ	4742
-VPMINSBZrm	4743
-VPMINSBZrmk	4744
-VPMINSBZrmkz	4745
-VPMINSBZrr	4746
-VPMINSBZrrk	4747
-VPMINSBZrrkz	4748
-VPMINSBrm	4749
-VPMINSBrr	4750
-VPMINSDYrm	4751
-VPMINSDYrr	4752
-VPMINSDZ	4753
-VPMINSDZrm	4754
-VPMINSDZrmb	4755
-VPMINSDZrmbk	4756
-VPMINSDZrmbkz	4757
-VPMINSDZrmk	4758
-VPMINSDZrmkz	4759
-VPMINSDZrr	4760
-VPMINSDZrrk	4761
-VPMINSDZrrkz	4762
-VPMINSDrm	4763
-VPMINSDrr	4764
-VPMINSQZ	4765
-VPMINSQZrm	4766
-VPMINSQZrmb	4767
-VPMINSQZrmbk	4768
-VPMINSQZrmbkz	4769
-VPMINSQZrmk	4770
-VPMINSQZrmkz	4771
-VPMINSQZrr	4772
-VPMINSQZrrk	4773
-VPMINSQZrrkz	4774
-VPMINSWYrm	4775
-VPMINSWYrr	4776
-VPMINSWZ	4777
-VPMINSWZrm	4778
-VPMINSWZrmk	4779
-VPMINSWZrmkz	4780
-VPMINSWZrr	4781
-VPMINSWZrrk	4782
-VPMINSWZrrkz	4783
-VPMINSWrm	4784
-VPMINSWrr	4785
-VPMINUBYrm	4786
-VPMINUBYrr	4787
-VPMINUBZ	4788
-VPMINUBZrm	4789
-VPMINUBZrmk	4790
-VPMINUBZrmkz	4791
-VPMINUBZrr	4792
-VPMINUBZrrk	4793
-VPMINUBZrrkz	4794
-VPMINUBrm	4795
-VPMINUBrr	4796
-VPMINUDYrm	4797
-VPMINUDYrr	4798
-VPMINUDZ	4799
-VPMINUDZrm	4800
-VPMINUDZrmb	4801
-VPMINUDZrmbk	4802
-VPMINUDZrmbkz	4803
-VPMINUDZrmk	4804
-VPMINUDZrmkz	4805
-VPMINUDZrr	4806
-VPMINUDZrrk	4807
-VPMINUDZrrkz	4808
-VPMINUDrm	4809
-VPMINUDrr	4810
-VPMINUQZ	4811
-VPMINUQZrm	4812
-VPMINUQZrmb	4813
-VPMINUQZrmbk	4814
-VPMINUQZrmbkz	4815
-VPMINUQZrmk	4816
-VPMINUQZrmkz	4817
-VPMINUQZrr	4818
-VPMINUQZrrk	4819
-VPMINUQZrrkz	4820
-VPMINUWYrm	4821
-VPMINUWYrr	4822
-VPMINUWZ	4823
-VPMINUWZrm	4824
-VPMINUWZrmk	4825
-VPMINUWZrmkz	4826
-VPMINUWZrr	4827
-VPMINUWZrrk	4828
-VPMINUWZrrkz	4829
-VPMINUWrm	4830
-VPMINUWrr	4831
-VPMOVB	4832
-VPMOVD	4833
-VPMOVDBZ	4834
-VPMOVDBZmr	4835
-VPMOVDBZmrk	4836
-VPMOVDBZrr	4837
-VPMOVDBZrrk	4838
-VPMOVDBZrrkz	4839
-VPMOVDWZ	4840
-VPMOVDWZmr	4841
-VPMOVDWZmrk	4842
-VPMOVDWZrr	4843
-VPMOVDWZrrk	4844
-VPMOVDWZrrkz	4845
-VPMOVM	4846
-VPMOVMSKBYrr	4847
-VPMOVMSKBrr	4848
-VPMOVQ	4849
-VPMOVQBZ	4850
-VPMOVQBZmr	4851
-VPMOVQBZmrk	4852
-VPMOVQBZrr	4853
-VPMOVQBZrrk	4854
-VPMOVQBZrrkz	4855
-VPMOVQDZ	4856
-VPMOVQDZmr	4857
-VPMOVQDZmrk	4858
-VPMOVQDZrr	4859
-VPMOVQDZrrk	4860
-VPMOVQDZrrkz	4861
-VPMOVQWZ	4862
-VPMOVQWZmr	4863
-VPMOVQWZmrk	4864
-VPMOVQWZrr	4865
-VPMOVQWZrrk	4866
-VPMOVQWZrrkz	4867
-VPMOVSDBZ	4868
-VPMOVSDBZmr	4869
-VPMOVSDBZmrk	4870
-VPMOVSDBZrr	4871
-VPMOVSDBZrrk	4872
-VPMOVSDBZrrkz	4873
-VPMOVSDWZ	4874
-VPMOVSDWZmr	4875
-VPMOVSDWZmrk	4876
-VPMOVSDWZrr	4877
-VPMOVSDWZrrk	4878
-VPMOVSDWZrrkz	4879
-VPMOVSQBZ	4880
-VPMOVSQBZmr	4881
-VPMOVSQBZmrk	4882
-VPMOVSQBZrr	4883
-VPMOVSQBZrrk	4884
-VPMOVSQBZrrkz	4885
-VPMOVSQDZ	4886
-VPMOVSQDZmr	4887
-VPMOVSQDZmrk	4888
-VPMOVSQDZrr	4889
-VPMOVSQDZrrk	4890
-VPMOVSQDZrrkz	4891
-VPMOVSQWZ	4892
-VPMOVSQWZmr	4893
-VPMOVSQWZmrk	4894
-VPMOVSQWZrr	4895
-VPMOVSQWZrrk	4896
-VPMOVSQWZrrkz	4897
-VPMOVSWBZ	4898
-VPMOVSWBZmr	4899
-VPMOVSWBZmrk	4900
-VPMOVSWBZrr	4901
-VPMOVSWBZrrk	4902
-VPMOVSWBZrrkz	4903
-VPMOVSXBDYrm	4904
-VPMOVSXBDYrr	4905
-VPMOVSXBDZ	4906
-VPMOVSXBDZrm	4907
-VPMOVSXBDZrmk	4908
-VPMOVSXBDZrmkz	4909
-VPMOVSXBDZrr	4910
-VPMOVSXBDZrrk	4911
-VPMOVSXBDZrrkz	4912
-VPMOVSXBDrm	4913
-VPMOVSXBDrr	4914
-VPMOVSXBQYrm	4915
-VPMOVSXBQYrr	4916
-VPMOVSXBQZ	4917
-VPMOVSXBQZrm	4918
-VPMOVSXBQZrmk	4919
-VPMOVSXBQZrmkz	4920
-VPMOVSXBQZrr	4921
-VPMOVSXBQZrrk	4922
-VPMOVSXBQZrrkz	4923
-VPMOVSXBQrm	4924
-VPMOVSXBQrr	4925
-VPMOVSXBWYrm	4926
-VPMOVSXBWYrr	4927
-VPMOVSXBWZ	4928
-VPMOVSXBWZrm	4929
-VPMOVSXBWZrmk	4930
-VPMOVSXBWZrmkz	4931
-VPMOVSXBWZrr	4932
-VPMOVSXBWZrrk	4933
-VPMOVSXBWZrrkz	4934
-VPMOVSXBWrm	4935
-VPMOVSXBWrr	4936
-VPMOVSXDQYrm	4937
-VPMOVSXDQYrr	4938
-VPMOVSXDQZ	4939
-VPMOVSXDQZrm	4940
-VPMOVSXDQZrmk	4941
-VPMOVSXDQZrmkz	4942
-VPMOVSXDQZrr	4943
-VPMOVSXDQZrrk	4944
-VPMOVSXDQZrrkz	4945
-VPMOVSXDQrm	4946
-VPMOVSXDQrr	4947
-VPMOVSXWDYrm	4948
-VPMOVSXWDYrr	4949
-VPMOVSXWDZ	4950
-VPMOVSXWDZrm	4951
-VPMOVSXWDZrmk	4952
-VPMOVSXWDZrmkz	4953
-VPMOVSXWDZrr	4954
-VPMOVSXWDZrrk	4955
-VPMOVSXWDZrrkz	4956
-VPMOVSXWDrm	4957
-VPMOVSXWDrr	4958
-VPMOVSXWQYrm	4959
-VPMOVSXWQYrr	4960
-VPMOVSXWQZ	4961
-VPMOVSXWQZrm	4962
-VPMOVSXWQZrmk	4963
-VPMOVSXWQZrmkz	4964
-VPMOVSXWQZrr	4965
-VPMOVSXWQZrrk	4966
-VPMOVSXWQZrrkz	4967
-VPMOVSXWQrm	4968
-VPMOVSXWQrr	4969
-VPMOVUSDBZ	4970
-VPMOVUSDBZmr	4971
-VPMOVUSDBZmrk	4972
-VPMOVUSDBZrr	4973
-VPMOVUSDBZrrk	4974
-VPMOVUSDBZrrkz	4975
-VPMOVUSDWZ	4976
-VPMOVUSDWZmr	4977
-VPMOVUSDWZmrk	4978
-VPMOVUSDWZrr	4979
-VPMOVUSDWZrrk	4980
-VPMOVUSDWZrrkz	4981
-VPMOVUSQBZ	4982
-VPMOVUSQBZmr	4983
-VPMOVUSQBZmrk	4984
-VPMOVUSQBZrr	4985
-VPMOVUSQBZrrk	4986
-VPMOVUSQBZrrkz	4987
-VPMOVUSQDZ	4988
-VPMOVUSQDZmr	4989
-VPMOVUSQDZmrk	4990
-VPMOVUSQDZrr	4991
-VPMOVUSQDZrrk	4992
-VPMOVUSQDZrrkz	4993
-VPMOVUSQWZ	4994
-VPMOVUSQWZmr	4995
-VPMOVUSQWZmrk	4996
-VPMOVUSQWZrr	4997
-VPMOVUSQWZrrk	4998
-VPMOVUSQWZrrkz	4999
-VPMOVUSWBZ	5000
-VPMOVUSWBZmr	5001
-VPMOVUSWBZmrk	5002
-VPMOVUSWBZrr	5003
-VPMOVUSWBZrrk	5004
-VPMOVUSWBZrrkz	5005
-VPMOVW	5006
-VPMOVWBZ	5007
-VPMOVWBZmr	5008
-VPMOVWBZmrk	5009
-VPMOVWBZrr	5010
-VPMOVWBZrrk	5011
-VPMOVWBZrrkz	5012
-VPMOVZXBDYrm	5013
-VPMOVZXBDYrr	5014
-VPMOVZXBDZ	5015
-VPMOVZXBDZrm	5016
-VPMOVZXBDZrmk	5017
-VPMOVZXBDZrmkz	5018
-VPMOVZXBDZrr	5019
-VPMOVZXBDZrrk	5020
-VPMOVZXBDZrrkz	5021
-VPMOVZXBDrm	5022
-VPMOVZXBDrr	5023
-VPMOVZXBQYrm	5024
-VPMOVZXBQYrr	5025
-VPMOVZXBQZ	5026
-VPMOVZXBQZrm	5027
-VPMOVZXBQZrmk	5028
-VPMOVZXBQZrmkz	5029
-VPMOVZXBQZrr	5030
-VPMOVZXBQZrrk	5031
-VPMOVZXBQZrrkz	5032
-VPMOVZXBQrm	5033
-VPMOVZXBQrr	5034
-VPMOVZXBWYrm	5035
-VPMOVZXBWYrr	5036
-VPMOVZXBWZ	5037
-VPMOVZXBWZrm	5038
-VPMOVZXBWZrmk	5039
-VPMOVZXBWZrmkz	5040
-VPMOVZXBWZrr	5041
-VPMOVZXBWZrrk	5042
-VPMOVZXBWZrrkz	5043
-VPMOVZXBWrm	5044
-VPMOVZXBWrr	5045
-VPMOVZXDQYrm	5046
-VPMOVZXDQYrr	5047
-VPMOVZXDQZ	5048
-VPMOVZXDQZrm	5049
-VPMOVZXDQZrmk	5050
-VPMOVZXDQZrmkz	5051
-VPMOVZXDQZrr	5052
-VPMOVZXDQZrrk	5053
-VPMOVZXDQZrrkz	5054
-VPMOVZXDQrm	5055
-VPMOVZXDQrr	5056
-VPMOVZXWDYrm	5057
-VPMOVZXWDYrr	5058
-VPMOVZXWDZ	5059
-VPMOVZXWDZrm	5060
-VPMOVZXWDZrmk	5061
-VPMOVZXWDZrmkz	5062
-VPMOVZXWDZrr	5063
-VPMOVZXWDZrrk	5064
-VPMOVZXWDZrrkz	5065
-VPMOVZXWDrm	5066
-VPMOVZXWDrr	5067
-VPMOVZXWQYrm	5068
-VPMOVZXWQYrr	5069
-VPMOVZXWQZ	5070
-VPMOVZXWQZrm	5071
-VPMOVZXWQZrmk	5072
-VPMOVZXWQZrmkz	5073
-VPMOVZXWQZrr	5074
-VPMOVZXWQZrrk	5075
-VPMOVZXWQZrrkz	5076
-VPMOVZXWQrm	5077
-VPMOVZXWQrr	5078
-VPMULDQYrm	5079
-VPMULDQYrr	5080
-VPMULDQZ	5081
-VPMULDQZrm	5082
-VPMULDQZrmb	5083
-VPMULDQZrmbk	5084
-VPMULDQZrmbkz	5085
-VPMULDQZrmk	5086
-VPMULDQZrmkz	5087
-VPMULDQZrr	5088
-VPMULDQZrrk	5089
-VPMULDQZrrkz	5090
-VPMULDQrm	5091
-VPMULDQrr	5092
-VPMULHRSWYrm	5093
-VPMULHRSWYrr	5094
-VPMULHRSWZ	5095
-VPMULHRSWZrm	5096
-VPMULHRSWZrmk	5097
-VPMULHRSWZrmkz	5098
-VPMULHRSWZrr	5099
-VPMULHRSWZrrk	5100
-VPMULHRSWZrrkz	5101
-VPMULHRSWrm	5102
-VPMULHRSWrr	5103
-VPMULHUWYrm	5104
-VPMULHUWYrr	5105
-VPMULHUWZ	5106
-VPMULHUWZrm	5107
-VPMULHUWZrmk	5108
-VPMULHUWZrmkz	5109
-VPMULHUWZrr	5110
-VPMULHUWZrrk	5111
-VPMULHUWZrrkz	5112
-VPMULHUWrm	5113
-VPMULHUWrr	5114
-VPMULHWYrm	5115
-VPMULHWYrr	5116
-VPMULHWZ	5117
-VPMULHWZrm	5118
-VPMULHWZrmk	5119
-VPMULHWZrmkz	5120
-VPMULHWZrr	5121
-VPMULHWZrrk	5122
-VPMULHWZrrkz	5123
-VPMULHWrm	5124
-VPMULHWrr	5125
-VPMULLDYrm	5126
-VPMULLDYrr	5127
-VPMULLDZ	5128
-VPMULLDZrm	5129
-VPMULLDZrmb	5130
-VPMULLDZrmbk	5131
-VPMULLDZrmbkz	5132
-VPMULLDZrmk	5133
-VPMULLDZrmkz	5134
-VPMULLDZrr	5135
-VPMULLDZrrk	5136
-VPMULLDZrrkz	5137
-VPMULLDrm	5138
-VPMULLDrr	5139
-VPMULLQZ	5140
-VPMULLQZrm	5141
-VPMULLQZrmb	5142
-VPMULLQZrmbk	5143
-VPMULLQZrmbkz	5144
-VPMULLQZrmk	5145
-VPMULLQZrmkz	5146
-VPMULLQZrr	5147
-VPMULLQZrrk	5148
-VPMULLQZrrkz	5149
-VPMULLWYrm	5150
-VPMULLWYrr	5151
-VPMULLWZ	5152
-VPMULLWZrm	5153
-VPMULLWZrmk	5154
-VPMULLWZrmkz	5155
-VPMULLWZrr	5156
-VPMULLWZrrk	5157
-VPMULLWZrrkz	5158
-VPMULLWrm	5159
-VPMULLWrr	5160
-VPMULTISHIFTQBZ	5161
-VPMULTISHIFTQBZrm	5162
-VPMULTISHIFTQBZrmb	5163
-VPMULTISHIFTQBZrmbk	5164
-VPMULTISHIFTQBZrmbkz	5165
-VPMULTISHIFTQBZrmk	5166
-VPMULTISHIFTQBZrmkz	5167
-VPMULTISHIFTQBZrr	5168
-VPMULTISHIFTQBZrrk	5169
-VPMULTISHIFTQBZrrkz	5170
-VPMULUDQYrm	5171
-VPMULUDQYrr	5172
-VPMULUDQZ	5173
-VPMULUDQZrm	5174
-VPMULUDQZrmb	5175
-VPMULUDQZrmbk	5176
-VPMULUDQZrmbkz	5177
-VPMULUDQZrmk	5178
-VPMULUDQZrmkz	5179
-VPMULUDQZrr	5180
-VPMULUDQZrrk	5181
-VPMULUDQZrrkz	5182
-VPMULUDQrm	5183
-VPMULUDQrr	5184
-VPOPCNTBZ	5185
-VPOPCNTBZrm	5186
-VPOPCNTBZrmk	5187
-VPOPCNTBZrmkz	5188
-VPOPCNTBZrr	5189
-VPOPCNTBZrrk	5190
-VPOPCNTBZrrkz	5191
-VPOPCNTDZ	5192
-VPOPCNTDZrm	5193
-VPOPCNTDZrmb	5194
-VPOPCNTDZrmbk	5195
-VPOPCNTDZrmbkz	5196
-VPOPCNTDZrmk	5197
-VPOPCNTDZrmkz	5198
-VPOPCNTDZrr	5199
-VPOPCNTDZrrk	5200
-VPOPCNTDZrrkz	5201
-VPOPCNTQZ	5202
-VPOPCNTQZrm	5203
-VPOPCNTQZrmb	5204
-VPOPCNTQZrmbk	5205
-VPOPCNTQZrmbkz	5206
-VPOPCNTQZrmk	5207
-VPOPCNTQZrmkz	5208
-VPOPCNTQZrr	5209
-VPOPCNTQZrrk	5210
-VPOPCNTQZrrkz	5211
-VPOPCNTWZ	5212
-VPOPCNTWZrm	5213
-VPOPCNTWZrmk	5214
-VPOPCNTWZrmkz	5215
-VPOPCNTWZrr	5216
-VPOPCNTWZrrk	5217
-VPOPCNTWZrrkz	5218
-VPORDZ	5219
-VPORDZrm	5220
-VPORDZrmb	5221
-VPORDZrmbk	5222
-VPORDZrmbkz	5223
-VPORDZrmk	5224
-VPORDZrmkz	5225
-VPORDZrr	5226
-VPORDZrrk	5227
-VPORDZrrkz	5228
-VPORQZ	5229
-VPORQZrm	5230
-VPORQZrmb	5231
-VPORQZrmbk	5232
-VPORQZrmbkz	5233
-VPORQZrmk	5234
-VPORQZrmkz	5235
-VPORQZrr	5236
-VPORQZrrk	5237
-VPORQZrrkz	5238
-VPORYrm	5239
-VPORYrr	5240
-VPORrm	5241
-VPORrr	5242
-VPPERMrmr	5243
-VPPERMrrm	5244
-VPPERMrrr	5245
-VPPERMrrr_REV	5246
-VPROLDZ	5247
-VPROLDZmbi	5248
-VPROLDZmbik	5249
-VPROLDZmbikz	5250
-VPROLDZmi	5251
-VPROLDZmik	5252
-VPROLDZmikz	5253
-VPROLDZri	5254
-VPROLDZrik	5255
-VPROLDZrikz	5256
-VPROLQZ	5257
-VPROLQZmbi	5258
-VPROLQZmbik	5259
-VPROLQZmbikz	5260
-VPROLQZmi	5261
-VPROLQZmik	5262
-VPROLQZmikz	5263
-VPROLQZri	5264
-VPROLQZrik	5265
-VPROLQZrikz	5266
-VPROLVDZ	5267
-VPROLVDZrm	5268
-VPROLVDZrmb	5269
-VPROLVDZrmbk	5270
-VPROLVDZrmbkz	5271
-VPROLVDZrmk	5272
-VPROLVDZrmkz	5273
-VPROLVDZrr	5274
-VPROLVDZrrk	5275
-VPROLVDZrrkz	5276
-VPROLVQZ	5277
-VPROLVQZrm	5278
-VPROLVQZrmb	5279
-VPROLVQZrmbk	5280
-VPROLVQZrmbkz	5281
-VPROLVQZrmk	5282
-VPROLVQZrmkz	5283
-VPROLVQZrr	5284
-VPROLVQZrrk	5285
-VPROLVQZrrkz	5286
-VPRORDZ	5287
-VPRORDZmbi	5288
-VPRORDZmbik	5289
-VPRORDZmbikz	5290
-VPRORDZmi	5291
-VPRORDZmik	5292
-VPRORDZmikz	5293
-VPRORDZri	5294
-VPRORDZrik	5295
-VPRORDZrikz	5296
-VPRORQZ	5297
-VPRORQZmbi	5298
-VPRORQZmbik	5299
-VPRORQZmbikz	5300
-VPRORQZmi	5301
-VPRORQZmik	5302
-VPRORQZmikz	5303
-VPRORQZri	5304
-VPRORQZrik	5305
-VPRORQZrikz	5306
-VPRORVDZ	5307
-VPRORVDZrm	5308
-VPRORVDZrmb	5309
-VPRORVDZrmbk	5310
-VPRORVDZrmbkz	5311
-VPRORVDZrmk	5312
-VPRORVDZrmkz	5313
-VPRORVDZrr	5314
-VPRORVDZrrk	5315
-VPRORVDZrrkz	5316
-VPRORVQZ	5317
-VPRORVQZrm	5318
-VPRORVQZrmb	5319
-VPRORVQZrmbk	5320
-VPRORVQZrmbkz	5321
-VPRORVQZrmk	5322
-VPRORVQZrmkz	5323
-VPRORVQZrr	5324
-VPRORVQZrrk	5325
-VPRORVQZrrkz	5326
-VPROTBmi	5327
-VPROTBmr	5328
-VPROTBri	5329
-VPROTBrm	5330
-VPROTBrr	5331
-VPROTBrr_REV	5332
-VPROTDmi	5333
-VPROTDmr	5334
-VPROTDri	5335
-VPROTDrm	5336
-VPROTDrr	5337
-VPROTDrr_REV	5338
-VPROTQmi	5339
-VPROTQmr	5340
-VPROTQri	5341
-VPROTQrm	5342
-VPROTQrr	5343
-VPROTQrr_REV	5344
-VPROTWmi	5345
-VPROTWmr	5346
-VPROTWri	5347
-VPROTWrm	5348
-VPROTWrr	5349
-VPROTWrr_REV	5350
-VPSADBWYrm	5351
-VPSADBWYrr	5352
-VPSADBWZ	5353
-VPSADBWZrm	5354
-VPSADBWZrr	5355
-VPSADBWrm	5356
-VPSADBWrr	5357
-VPSCATTERDDZ	5358
-VPSCATTERDDZmr	5359
-VPSCATTERDQZ	5360
-VPSCATTERDQZmr	5361
-VPSCATTERQDZ	5362
-VPSCATTERQDZmr	5363
-VPSCATTERQQZ	5364
-VPSCATTERQQZmr	5365
-VPSHABmr	5366
-VPSHABrm	5367
-VPSHABrr	5368
-VPSHABrr_REV	5369
-VPSHADmr	5370
-VPSHADrm	5371
-VPSHADrr	5372
-VPSHADrr_REV	5373
-VPSHAQmr	5374
-VPSHAQrm	5375
-VPSHAQrr	5376
-VPSHAQrr_REV	5377
-VPSHAWmr	5378
-VPSHAWrm	5379
-VPSHAWrr	5380
-VPSHAWrr_REV	5381
-VPSHLBmr	5382
-VPSHLBrm	5383
-VPSHLBrr	5384
-VPSHLBrr_REV	5385
-VPSHLDDZ	5386
-VPSHLDDZrmbi	5387
-VPSHLDDZrmbik	5388
-VPSHLDDZrmbikz	5389
-VPSHLDDZrmi	5390
-VPSHLDDZrmik	5391
-VPSHLDDZrmikz	5392
-VPSHLDDZrri	5393
-VPSHLDDZrrik	5394
-VPSHLDDZrrikz	5395
-VPSHLDQZ	5396
-VPSHLDQZrmbi	5397
-VPSHLDQZrmbik	5398
-VPSHLDQZrmbikz	5399
-VPSHLDQZrmi	5400
-VPSHLDQZrmik	5401
-VPSHLDQZrmikz	5402
-VPSHLDQZrri	5403
-VPSHLDQZrrik	5404
-VPSHLDQZrrikz	5405
-VPSHLDVDZ	5406
-VPSHLDVDZm	5407
-VPSHLDVDZmb	5408
-VPSHLDVDZmbk	5409
-VPSHLDVDZmbkz	5410
-VPSHLDVDZmk	5411
-VPSHLDVDZmkz	5412
-VPSHLDVDZr	5413
-VPSHLDVDZrk	5414
-VPSHLDVDZrkz	5415
-VPSHLDVQZ	5416
-VPSHLDVQZm	5417
-VPSHLDVQZmb	5418
-VPSHLDVQZmbk	5419
-VPSHLDVQZmbkz	5420
-VPSHLDVQZmk	5421
-VPSHLDVQZmkz	5422
-VPSHLDVQZr	5423
-VPSHLDVQZrk	5424
-VPSHLDVQZrkz	5425
-VPSHLDVWZ	5426
-VPSHLDVWZm	5427
-VPSHLDVWZmk	5428
-VPSHLDVWZmkz	5429
-VPSHLDVWZr	5430
-VPSHLDVWZrk	5431
-VPSHLDVWZrkz	5432
-VPSHLDWZ	5433
-VPSHLDWZrmi	5434
-VPSHLDWZrmik	5435
-VPSHLDWZrmikz	5436
-VPSHLDWZrri	5437
-VPSHLDWZrrik	5438
-VPSHLDWZrrikz	5439
-VPSHLDmr	5440
-VPSHLDrm	5441
-VPSHLDrr	5442
-VPSHLDrr_REV	5443
-VPSHLQmr	5444
-VPSHLQrm	5445
-VPSHLQrr	5446
-VPSHLQrr_REV	5447
-VPSHLWmr	5448
-VPSHLWrm	5449
-VPSHLWrr	5450
-VPSHLWrr_REV	5451
-VPSHRDDZ	5452
-VPSHRDDZrmbi	5453
-VPSHRDDZrmbik	5454
-VPSHRDDZrmbikz	5455
-VPSHRDDZrmi	5456
-VPSHRDDZrmik	5457
-VPSHRDDZrmikz	5458
-VPSHRDDZrri	5459
-VPSHRDDZrrik	5460
-VPSHRDDZrrikz	5461
-VPSHRDQZ	5462
-VPSHRDQZrmbi	5463
-VPSHRDQZrmbik	5464
-VPSHRDQZrmbikz	5465
-VPSHRDQZrmi	5466
-VPSHRDQZrmik	5467
-VPSHRDQZrmikz	5468
-VPSHRDQZrri	5469
-VPSHRDQZrrik	5470
-VPSHRDQZrrikz	5471
-VPSHRDVDZ	5472
-VPSHRDVDZm	5473
-VPSHRDVDZmb	5474
-VPSHRDVDZmbk	5475
-VPSHRDVDZmbkz	5476
-VPSHRDVDZmk	5477
-VPSHRDVDZmkz	5478
-VPSHRDVDZr	5479
-VPSHRDVDZrk	5480
-VPSHRDVDZrkz	5481
-VPSHRDVQZ	5482
-VPSHRDVQZm	5483
-VPSHRDVQZmb	5484
-VPSHRDVQZmbk	5485
-VPSHRDVQZmbkz	5486
-VPSHRDVQZmk	5487
-VPSHRDVQZmkz	5488
-VPSHRDVQZr	5489
-VPSHRDVQZrk	5490
-VPSHRDVQZrkz	5491
-VPSHRDVWZ	5492
-VPSHRDVWZm	5493
-VPSHRDVWZmk	5494
-VPSHRDVWZmkz	5495
-VPSHRDVWZr	5496
-VPSHRDVWZrk	5497
-VPSHRDVWZrkz	5498
-VPSHRDWZ	5499
-VPSHRDWZrmi	5500
-VPSHRDWZrmik	5501
-VPSHRDWZrmikz	5502
-VPSHRDWZrri	5503
-VPSHRDWZrrik	5504
-VPSHRDWZrrikz	5505
-VPSHUFBITQMBZ	5506
-VPSHUFBITQMBZrm	5507
-VPSHUFBITQMBZrmk	5508
-VPSHUFBITQMBZrr	5509
-VPSHUFBITQMBZrrk	5510
-VPSHUFBYrm	5511
-VPSHUFBYrr	5512
-VPSHUFBZ	5513
-VPSHUFBZrm	5514
-VPSHUFBZrmk	5515
-VPSHUFBZrmkz	5516
-VPSHUFBZrr	5517
-VPSHUFBZrrk	5518
-VPSHUFBZrrkz	5519
-VPSHUFBrm	5520
-VPSHUFBrr	5521
-VPSHUFDYmi	5522
-VPSHUFDYri	5523
-VPSHUFDZ	5524
-VPSHUFDZmbi	5525
-VPSHUFDZmbik	5526
-VPSHUFDZmbikz	5527
-VPSHUFDZmi	5528
-VPSHUFDZmik	5529
-VPSHUFDZmikz	5530
-VPSHUFDZri	5531
-VPSHUFDZrik	5532
-VPSHUFDZrikz	5533
-VPSHUFDmi	5534
-VPSHUFDri	5535
-VPSHUFHWYmi	5536
-VPSHUFHWYri	5537
-VPSHUFHWZ	5538
-VPSHUFHWZmi	5539
-VPSHUFHWZmik	5540
-VPSHUFHWZmikz	5541
-VPSHUFHWZri	5542
-VPSHUFHWZrik	5543
-VPSHUFHWZrikz	5544
-VPSHUFHWmi	5545
-VPSHUFHWri	5546
-VPSHUFLWYmi	5547
-VPSHUFLWYri	5548
-VPSHUFLWZ	5549
-VPSHUFLWZmi	5550
-VPSHUFLWZmik	5551
-VPSHUFLWZmikz	5552
-VPSHUFLWZri	5553
-VPSHUFLWZrik	5554
-VPSHUFLWZrikz	5555
-VPSHUFLWmi	5556
-VPSHUFLWri	5557
-VPSIGNBYrm	5558
-VPSIGNBYrr	5559
-VPSIGNBrm	5560
-VPSIGNBrr	5561
-VPSIGNDYrm	5562
-VPSIGNDYrr	5563
-VPSIGNDrm	5564
-VPSIGNDrr	5565
-VPSIGNWYrm	5566
-VPSIGNWYrr	5567
-VPSIGNWrm	5568
-VPSIGNWrr	5569
-VPSLLDQYri	5570
-VPSLLDQZ	5571
-VPSLLDQZmi	5572
-VPSLLDQZri	5573
-VPSLLDQri	5574
-VPSLLDYri	5575
-VPSLLDYrm	5576
-VPSLLDYrr	5577
-VPSLLDZ	5578
-VPSLLDZmbi	5579
-VPSLLDZmbik	5580
-VPSLLDZmbikz	5581
-VPSLLDZmi	5582
-VPSLLDZmik	5583
-VPSLLDZmikz	5584
-VPSLLDZri	5585
-VPSLLDZrik	5586
-VPSLLDZrikz	5587
-VPSLLDZrm	5588
-VPSLLDZrmk	5589
-VPSLLDZrmkz	5590
-VPSLLDZrr	5591
-VPSLLDZrrk	5592
-VPSLLDZrrkz	5593
-VPSLLDri	5594
-VPSLLDrm	5595
-VPSLLDrr	5596
-VPSLLQYri	5597
-VPSLLQYrm	5598
-VPSLLQYrr	5599
-VPSLLQZ	5600
-VPSLLQZmbi	5601
-VPSLLQZmbik	5602
-VPSLLQZmbikz	5603
-VPSLLQZmi	5604
-VPSLLQZmik	5605
-VPSLLQZmikz	5606
-VPSLLQZri	5607
-VPSLLQZrik	5608
-VPSLLQZrikz	5609
-VPSLLQZrm	5610
-VPSLLQZrmk	5611
-VPSLLQZrmkz	5612
-VPSLLQZrr	5613
-VPSLLQZrrk	5614
-VPSLLQZrrkz	5615
-VPSLLQri	5616
-VPSLLQrm	5617
-VPSLLQrr	5618
-VPSLLVDYrm	5619
-VPSLLVDYrr	5620
-VPSLLVDZ	5621
-VPSLLVDZrm	5622
-VPSLLVDZrmb	5623
-VPSLLVDZrmbk	5624
-VPSLLVDZrmbkz	5625
-VPSLLVDZrmk	5626
-VPSLLVDZrmkz	5627
-VPSLLVDZrr	5628
-VPSLLVDZrrk	5629
-VPSLLVDZrrkz	5630
-VPSLLVDrm	5631
-VPSLLVDrr	5632
-VPSLLVQYrm	5633
-VPSLLVQYrr	5634
-VPSLLVQZ	5635
-VPSLLVQZrm	5636
-VPSLLVQZrmb	5637
-VPSLLVQZrmbk	5638
-VPSLLVQZrmbkz	5639
-VPSLLVQZrmk	5640
-VPSLLVQZrmkz	5641
-VPSLLVQZrr	5642
-VPSLLVQZrrk	5643
-VPSLLVQZrrkz	5644
-VPSLLVQrm	5645
-VPSLLVQrr	5646
-VPSLLVWZ	5647
-VPSLLVWZrm	5648
-VPSLLVWZrmk	5649
-VPSLLVWZrmkz	5650
-VPSLLVWZrr	5651
-VPSLLVWZrrk	5652
-VPSLLVWZrrkz	5653
-VPSLLWYri	5654
-VPSLLWYrm	5655
-VPSLLWYrr	5656
-VPSLLWZ	5657
-VPSLLWZmi	5658
-VPSLLWZmik	5659
-VPSLLWZmikz	5660
-VPSLLWZri	5661
-VPSLLWZrik	5662
-VPSLLWZrikz	5663
-VPSLLWZrm	5664
-VPSLLWZrmk	5665
-VPSLLWZrmkz	5666
-VPSLLWZrr	5667
-VPSLLWZrrk	5668
-VPSLLWZrrkz	5669
-VPSLLWri	5670
-VPSLLWrm	5671
-VPSLLWrr	5672
-VPSRADYri	5673
-VPSRADYrm	5674
-VPSRADYrr	5675
-VPSRADZ	5676
-VPSRADZmbi	5677
-VPSRADZmbik	5678
-VPSRADZmbikz	5679
-VPSRADZmi	5680
-VPSRADZmik	5681
-VPSRADZmikz	5682
-VPSRADZri	5683
-VPSRADZrik	5684
-VPSRADZrikz	5685
-VPSRADZrm	5686
-VPSRADZrmk	5687
-VPSRADZrmkz	5688
-VPSRADZrr	5689
-VPSRADZrrk	5690
-VPSRADZrrkz	5691
-VPSRADri	5692
-VPSRADrm	5693
-VPSRADrr	5694
-VPSRAQZ	5695
-VPSRAQZmbi	5696
-VPSRAQZmbik	5697
-VPSRAQZmbikz	5698
-VPSRAQZmi	5699
-VPSRAQZmik	5700
-VPSRAQZmikz	5701
-VPSRAQZri	5702
-VPSRAQZrik	5703
-VPSRAQZrikz	5704
-VPSRAQZrm	5705
-VPSRAQZrmk	5706
-VPSRAQZrmkz	5707
-VPSRAQZrr	5708
-VPSRAQZrrk	5709
-VPSRAQZrrkz	5710
-VPSRAVDYrm	5711
-VPSRAVDYrr	5712
-VPSRAVDZ	5713
-VPSRAVDZrm	5714
-VPSRAVDZrmb	5715
-VPSRAVDZrmbk	5716
-VPSRAVDZrmbkz	5717
-VPSRAVDZrmk	5718
-VPSRAVDZrmkz	5719
-VPSRAVDZrr	5720
-VPSRAVDZrrk	5721
-VPSRAVDZrrkz	5722
-VPSRAVDrm	5723
-VPSRAVDrr	5724
-VPSRAVQZ	5725
-VPSRAVQZrm	5726
-VPSRAVQZrmb	5727
-VPSRAVQZrmbk	5728
-VPSRAVQZrmbkz	5729
-VPSRAVQZrmk	5730
-VPSRAVQZrmkz	5731
-VPSRAVQZrr	5732
-VPSRAVQZrrk	5733
-VPSRAVQZrrkz	5734
-VPSRAVWZ	5735
-VPSRAVWZrm	5736
-VPSRAVWZrmk	5737
-VPSRAVWZrmkz	5738
-VPSRAVWZrr	5739
-VPSRAVWZrrk	5740
-VPSRAVWZrrkz	5741
-VPSRAWYri	5742
-VPSRAWYrm	5743
-VPSRAWYrr	5744
-VPSRAWZ	5745
-VPSRAWZmi	5746
-VPSRAWZmik	5747
-VPSRAWZmikz	5748
-VPSRAWZri	5749
-VPSRAWZrik	5750
-VPSRAWZrikz	5751
-VPSRAWZrm	5752
-VPSRAWZrmk	5753
-VPSRAWZrmkz	5754
-VPSRAWZrr	5755
-VPSRAWZrrk	5756
-VPSRAWZrrkz	5757
-VPSRAWri	5758
-VPSRAWrm	5759
-VPSRAWrr	5760
-VPSRLDQYri	5761
-VPSRLDQZ	5762
-VPSRLDQZmi	5763
-VPSRLDQZri	5764
-VPSRLDQri	5765
-VPSRLDYri	5766
-VPSRLDYrm	5767
-VPSRLDYrr	5768
-VPSRLDZ	5769
-VPSRLDZmbi	5770
-VPSRLDZmbik	5771
-VPSRLDZmbikz	5772
-VPSRLDZmi	5773
-VPSRLDZmik	5774
-VPSRLDZmikz	5775
-VPSRLDZri	5776
-VPSRLDZrik	5777
-VPSRLDZrikz	5778
-VPSRLDZrm	5779
-VPSRLDZrmk	5780
-VPSRLDZrmkz	5781
-VPSRLDZrr	5782
-VPSRLDZrrk	5783
-VPSRLDZrrkz	5784
-VPSRLDri	5785
-VPSRLDrm	5786
-VPSRLDrr	5787
-VPSRLQYri	5788
-VPSRLQYrm	5789
-VPSRLQYrr	5790
-VPSRLQZ	5791
-VPSRLQZmbi	5792
-VPSRLQZmbik	5793
-VPSRLQZmbikz	5794
-VPSRLQZmi	5795
-VPSRLQZmik	5796
-VPSRLQZmikz	5797
-VPSRLQZri	5798
-VPSRLQZrik	5799
-VPSRLQZrikz	5800
-VPSRLQZrm	5801
-VPSRLQZrmk	5802
-VPSRLQZrmkz	5803
-VPSRLQZrr	5804
-VPSRLQZrrk	5805
-VPSRLQZrrkz	5806
-VPSRLQri	5807
-VPSRLQrm	5808
-VPSRLQrr	5809
-VPSRLVDYrm	5810
-VPSRLVDYrr	5811
-VPSRLVDZ	5812
-VPSRLVDZrm	5813
-VPSRLVDZrmb	5814
-VPSRLVDZrmbk	5815
-VPSRLVDZrmbkz	5816
-VPSRLVDZrmk	5817
-VPSRLVDZrmkz	5818
-VPSRLVDZrr	5819
-VPSRLVDZrrk	5820
-VPSRLVDZrrkz	5821
-VPSRLVDrm	5822
-VPSRLVDrr	5823
-VPSRLVQYrm	5824
-VPSRLVQYrr	5825
-VPSRLVQZ	5826
-VPSRLVQZrm	5827
-VPSRLVQZrmb	5828
-VPSRLVQZrmbk	5829
-VPSRLVQZrmbkz	5830
-VPSRLVQZrmk	5831
-VPSRLVQZrmkz	5832
-VPSRLVQZrr	5833
-VPSRLVQZrrk	5834
-VPSRLVQZrrkz	5835
-VPSRLVQrm	5836
-VPSRLVQrr	5837
-VPSRLVWZ	5838
-VPSRLVWZrm	5839
-VPSRLVWZrmk	5840
-VPSRLVWZrmkz	5841
-VPSRLVWZrr	5842
-VPSRLVWZrrk	5843
-VPSRLVWZrrkz	5844
-VPSRLWYri	5845
-VPSRLWYrm	5846
-VPSRLWYrr	5847
-VPSRLWZ	5848
-VPSRLWZmi	5849
-VPSRLWZmik	5850
-VPSRLWZmikz	5851
-VPSRLWZri	5852
-VPSRLWZrik	5853
-VPSRLWZrikz	5854
-VPSRLWZrm	5855
-VPSRLWZrmk	5856
-VPSRLWZrmkz	5857
-VPSRLWZrr	5858
-VPSRLWZrrk	5859
-VPSRLWZrrkz	5860
-VPSRLWri	5861
-VPSRLWrm	5862
-VPSRLWrr	5863
-VPSUBBYrm	5864
-VPSUBBYrr	5865
-VPSUBBZ	5866
-VPSUBBZrm	5867
-VPSUBBZrmk	5868
-VPSUBBZrmkz	5869
-VPSUBBZrr	5870
-VPSUBBZrrk	5871
-VPSUBBZrrkz	5872
-VPSUBBrm	5873
-VPSUBBrr	5874
-VPSUBDYrm	5875
-VPSUBDYrr	5876
-VPSUBDZ	5877
-VPSUBDZrm	5878
-VPSUBDZrmb	5879
-VPSUBDZrmbk	5880
-VPSUBDZrmbkz	5881
-VPSUBDZrmk	5882
-VPSUBDZrmkz	5883
-VPSUBDZrr	5884
-VPSUBDZrrk	5885
-VPSUBDZrrkz	5886
-VPSUBDrm	5887
-VPSUBDrr	5888
-VPSUBQYrm	5889
-VPSUBQYrr	5890
-VPSUBQZ	5891
-VPSUBQZrm	5892
-VPSUBQZrmb	5893
-VPSUBQZrmbk	5894
-VPSUBQZrmbkz	5895
-VPSUBQZrmk	5896
-VPSUBQZrmkz	5897
-VPSUBQZrr	5898
-VPSUBQZrrk	5899
-VPSUBQZrrkz	5900
-VPSUBQrm	5901
-VPSUBQrr	5902
-VPSUBSBYrm	5903
-VPSUBSBYrr	5904
-VPSUBSBZ	5905
-VPSUBSBZrm	5906
-VPSUBSBZrmk	5907
-VPSUBSBZrmkz	5908
-VPSUBSBZrr	5909
-VPSUBSBZrrk	5910
-VPSUBSBZrrkz	5911
-VPSUBSBrm	5912
-VPSUBSBrr	5913
-VPSUBSWYrm	5914
-VPSUBSWYrr	5915
-VPSUBSWZ	5916
-VPSUBSWZrm	5917
-VPSUBSWZrmk	5918
-VPSUBSWZrmkz	5919
-VPSUBSWZrr	5920
-VPSUBSWZrrk	5921
-VPSUBSWZrrkz	5922
-VPSUBSWrm	5923
-VPSUBSWrr	5924
-VPSUBUSBYrm	5925
-VPSUBUSBYrr	5926
-VPSUBUSBZ	5927
-VPSUBUSBZrm	5928
-VPSUBUSBZrmk	5929
-VPSUBUSBZrmkz	5930
-VPSUBUSBZrr	5931
-VPSUBUSBZrrk	5932
-VPSUBUSBZrrkz	5933
-VPSUBUSBrm	5934
-VPSUBUSBrr	5935
-VPSUBUSWYrm	5936
-VPSUBUSWYrr	5937
-VPSUBUSWZ	5938
-VPSUBUSWZrm	5939
-VPSUBUSWZrmk	5940
-VPSUBUSWZrmkz	5941
-VPSUBUSWZrr	5942
-VPSUBUSWZrrk	5943
-VPSUBUSWZrrkz	5944
-VPSUBUSWrm	5945
-VPSUBUSWrr	5946
-VPSUBWYrm	5947
-VPSUBWYrr	5948
-VPSUBWZ	5949
-VPSUBWZrm	5950
-VPSUBWZrmk	5951
-VPSUBWZrmkz	5952
-VPSUBWZrr	5953
-VPSUBWZrrk	5954
-VPSUBWZrrkz	5955
-VPSUBWrm	5956
-VPSUBWrr	5957
-VPTERNLOGDZ	5958
-VPTERNLOGDZrmbi	5959
-VPTERNLOGDZrmbik	5960
-VPTERNLOGDZrmbikz	5961
-VPTERNLOGDZrmi	5962
-VPTERNLOGDZrmik	5963
-VPTERNLOGDZrmikz	5964
-VPTERNLOGDZrri	5965
-VPTERNLOGDZrrik	5966
-VPTERNLOGDZrrikz	5967
-VPTERNLOGQZ	5968
-VPTERNLOGQZrmbi	5969
-VPTERNLOGQZrmbik	5970
-VPTERNLOGQZrmbikz	5971
-VPTERNLOGQZrmi	5972
-VPTERNLOGQZrmik	5973
-VPTERNLOGQZrmikz	5974
-VPTERNLOGQZrri	5975
-VPTERNLOGQZrrik	5976
-VPTERNLOGQZrrikz	5977
-VPTESTMBZ	5978
-VPTESTMBZrm	5979
-VPTESTMBZrmk	5980
-VPTESTMBZrr	5981
-VPTESTMBZrrk	5982
-VPTESTMDZ	5983
-VPTESTMDZrm	5984
-VPTESTMDZrmb	5985
-VPTESTMDZrmbk	5986
-VPTESTMDZrmk	5987
-VPTESTMDZrr	5988
-VPTESTMDZrrk	5989
-VPTESTMQZ	5990
-VPTESTMQZrm	5991
-VPTESTMQZrmb	5992
-VPTESTMQZrmbk	5993
-VPTESTMQZrmk	5994
-VPTESTMQZrr	5995
-VPTESTMQZrrk	5996
-VPTESTMWZ	5997
-VPTESTMWZrm	5998
-VPTESTMWZrmk	5999
-VPTESTMWZrr	6000
-VPTESTMWZrrk	6001
-VPTESTNMBZ	6002
-VPTESTNMBZrm	6003
-VPTESTNMBZrmk	6004
-VPTESTNMBZrr	6005
-VPTESTNMBZrrk	6006
-VPTESTNMDZ	6007
-VPTESTNMDZrm	6008
-VPTESTNMDZrmb	6009
-VPTESTNMDZrmbk	6010
-VPTESTNMDZrmk	6011
-VPTESTNMDZrr	6012
-VPTESTNMDZrrk	6013
-VPTESTNMQZ	6014
-VPTESTNMQZrm	6015
-VPTESTNMQZrmb	6016
-VPTESTNMQZrmbk	6017
-VPTESTNMQZrmk	6018
-VPTESTNMQZrr	6019
-VPTESTNMQZrrk	6020
-VPTESTNMWZ	6021
-VPTESTNMWZrm	6022
-VPTESTNMWZrmk	6023
-VPTESTNMWZrr	6024
-VPTESTNMWZrrk	6025
-VPTESTYrm	6026
-VPTESTYrr	6027
-VPTESTrm	6028
-VPTESTrr	6029
-VPUNPCKHBWYrm	6030
-VPUNPCKHBWYrr	6031
-VPUNPCKHBWZ	6032
-VPUNPCKHBWZrm	6033
-VPUNPCKHBWZrmk	6034
-VPUNPCKHBWZrmkz	6035
-VPUNPCKHBWZrr	6036
-VPUNPCKHBWZrrk	6037
-VPUNPCKHBWZrrkz	6038
-VPUNPCKHBWrm	6039
-VPUNPCKHBWrr	6040
-VPUNPCKHDQYrm	6041
-VPUNPCKHDQYrr	6042
-VPUNPCKHDQZ	6043
-VPUNPCKHDQZrm	6044
-VPUNPCKHDQZrmb	6045
-VPUNPCKHDQZrmbk	6046
-VPUNPCKHDQZrmbkz	6047
-VPUNPCKHDQZrmk	6048
-VPUNPCKHDQZrmkz	6049
-VPUNPCKHDQZrr	6050
-VPUNPCKHDQZrrk	6051
-VPUNPCKHDQZrrkz	6052
-VPUNPCKHDQrm	6053
-VPUNPCKHDQrr	6054
-VPUNPCKHQDQYrm	6055
-VPUNPCKHQDQYrr	6056
-VPUNPCKHQDQZ	6057
-VPUNPCKHQDQZrm	6058
-VPUNPCKHQDQZrmb	6059
-VPUNPCKHQDQZrmbk	6060
-VPUNPCKHQDQZrmbkz	6061
-VPUNPCKHQDQZrmk	6062
-VPUNPCKHQDQZrmkz	6063
-VPUNPCKHQDQZrr	6064
-VPUNPCKHQDQZrrk	6065
-VPUNPCKHQDQZrrkz	6066
-VPUNPCKHQDQrm	6067
-VPUNPCKHQDQrr	6068
-VPUNPCKHWDYrm	6069
-VPUNPCKHWDYrr	6070
-VPUNPCKHWDZ	6071
-VPUNPCKHWDZrm	6072
-VPUNPCKHWDZrmk	6073
-VPUNPCKHWDZrmkz	6074
-VPUNPCKHWDZrr	6075
-VPUNPCKHWDZrrk	6076
-VPUNPCKHWDZrrkz	6077
-VPUNPCKHWDrm	6078
-VPUNPCKHWDrr	6079
-VPUNPCKLBWYrm	6080
-VPUNPCKLBWYrr	6081
-VPUNPCKLBWZ	6082
-VPUNPCKLBWZrm	6083
-VPUNPCKLBWZrmk	6084
-VPUNPCKLBWZrmkz	6085
-VPUNPCKLBWZrr	6086
-VPUNPCKLBWZrrk	6087
-VPUNPCKLBWZrrkz	6088
-VPUNPCKLBWrm	6089
-VPUNPCKLBWrr	6090
-VPUNPCKLDQYrm	6091
-VPUNPCKLDQYrr	6092
-VPUNPCKLDQZ	6093
-VPUNPCKLDQZrm	6094
-VPUNPCKLDQZrmb	6095
-VPUNPCKLDQZrmbk	6096
-VPUNPCKLDQZrmbkz	6097
-VPUNPCKLDQZrmk	6098
-VPUNPCKLDQZrmkz	6099
-VPUNPCKLDQZrr	6100
-VPUNPCKLDQZrrk	6101
-VPUNPCKLDQZrrkz	6102
-VPUNPCKLDQrm	6103
-VPUNPCKLDQrr	6104
-VPUNPCKLQDQYrm	6105
-VPUNPCKLQDQYrr	6106
-VPUNPCKLQDQZ	6107
-VPUNPCKLQDQZrm	6108
-VPUNPCKLQDQZrmb	6109
-VPUNPCKLQDQZrmbk	6110
-VPUNPCKLQDQZrmbkz	6111
-VPUNPCKLQDQZrmk	6112
-VPUNPCKLQDQZrmkz	6113
-VPUNPCKLQDQZrr	6114
-VPUNPCKLQDQZrrk	6115
-VPUNPCKLQDQZrrkz	6116
-VPUNPCKLQDQrm	6117
-VPUNPCKLQDQrr	6118
-VPUNPCKLWDYrm	6119
-VPUNPCKLWDYrr	6120
-VPUNPCKLWDZ	6121
-VPUNPCKLWDZrm	6122
-VPUNPCKLWDZrmk	6123
-VPUNPCKLWDZrmkz	6124
-VPUNPCKLWDZrr	6125
-VPUNPCKLWDZrrk	6126
-VPUNPCKLWDZrrkz	6127
-VPUNPCKLWDrm	6128
-VPUNPCKLWDrr	6129
-VPXORDZ	6130
-VPXORDZrm	6131
-VPXORDZrmb	6132
-VPXORDZrmbk	6133
-VPXORDZrmbkz	6134
-VPXORDZrmk	6135
-VPXORDZrmkz	6136
-VPXORDZrr	6137
-VPXORDZrrk	6138
-VPXORDZrrkz	6139
-VPXORQZ	6140
-VPXORQZrm	6141
-VPXORQZrmb	6142
-VPXORQZrmbk	6143
-VPXORQZrmbkz	6144
-VPXORQZrmk	6145
-VPXORQZrmkz	6146
-VPXORQZrr	6147
-VPXORQZrrk	6148
-VPXORQZrrkz	6149
-VPXORYrm	6150
-VPXORYrr	6151
-VPXORrm	6152
-VPXORrr	6153
-VRANGEPDZ	6154
-VRANGEPDZrmbi	6155
-VRANGEPDZrmbik	6156
-VRANGEPDZrmbikz	6157
-VRANGEPDZrmi	6158
-VRANGEPDZrmik	6159
-VRANGEPDZrmikz	6160
-VRANGEPDZrri	6161
-VRANGEPDZrrib	6162
-VRANGEPDZrribk	6163
-VRANGEPDZrribkz	6164
-VRANGEPDZrrik	6165
-VRANGEPDZrrikz	6166
-VRANGEPSZ	6167
-VRANGEPSZrmbi	6168
-VRANGEPSZrmbik	6169
-VRANGEPSZrmbikz	6170
-VRANGEPSZrmi	6171
-VRANGEPSZrmik	6172
-VRANGEPSZrmikz	6173
-VRANGEPSZrri	6174
-VRANGEPSZrrib	6175
-VRANGEPSZrribk	6176
-VRANGEPSZrribkz	6177
-VRANGEPSZrrik	6178
-VRANGEPSZrrikz	6179
-VRANGESDZrmi	6180
-VRANGESDZrmik	6181
-VRANGESDZrmikz	6182
-VRANGESDZrri	6183
-VRANGESDZrrib	6184
-VRANGESDZrribk	6185
-VRANGESDZrribkz	6186
-VRANGESDZrrik	6187
-VRANGESDZrrikz	6188
-VRANGESSZrmi	6189
-VRANGESSZrmik	6190
-VRANGESSZrmikz	6191
-VRANGESSZrri	6192
-VRANGESSZrrib	6193
-VRANGESSZrribk	6194
-VRANGESSZrribkz	6195
-VRANGESSZrrik	6196
-VRANGESSZrrikz	6197
-VRCP	6198
-VRCPBF	6199
-VRCPPHZ	6200
-VRCPPHZm	6201
-VRCPPHZmb	6202
-VRCPPHZmbk	6203
-VRCPPHZmbkz	6204
-VRCPPHZmk	6205
-VRCPPHZmkz	6206
-VRCPPHZr	6207
-VRCPPHZrk	6208
-VRCPPHZrkz	6209
-VRCPPSYm	6210
-VRCPPSYr	6211
-VRCPPSm	6212
-VRCPPSr	6213
-VRCPSHZrm	6214
-VRCPSHZrmk	6215
-VRCPSHZrmkz	6216
-VRCPSHZrr	6217
-VRCPSHZrrk	6218
-VRCPSHZrrkz	6219
-VRCPSSm	6220
-VRCPSSm_Int	6221
-VRCPSSr	6222
-VRCPSSr_Int	6223
-VREDUCEBF	6224
-VREDUCEPDZ	6225
-VREDUCEPDZrmbi	6226
-VREDUCEPDZrmbik	6227
-VREDUCEPDZrmbikz	6228
-VREDUCEPDZrmi	6229
-VREDUCEPDZrmik	6230
-VREDUCEPDZrmikz	6231
-VREDUCEPDZrri	6232
-VREDUCEPDZrrib	6233
-VREDUCEPDZrribk	6234
-VREDUCEPDZrribkz	6235
-VREDUCEPDZrrik	6236
-VREDUCEPDZrrikz	6237
-VREDUCEPHZ	6238
-VREDUCEPHZrmbi	6239
-VREDUCEPHZrmbik	6240
-VREDUCEPHZrmbikz	6241
-VREDUCEPHZrmi	6242
-VREDUCEPHZrmik	6243
-VREDUCEPHZrmikz	6244
-VREDUCEPHZrri	6245
-VREDUCEPHZrrib	6246
-VREDUCEPHZrribk	6247
-VREDUCEPHZrribkz	6248
-VREDUCEPHZrrik	6249
-VREDUCEPHZrrikz	6250
-VREDUCEPSZ	6251
-VREDUCEPSZrmbi	6252
-VREDUCEPSZrmbik	6253
-VREDUCEPSZrmbikz	6254
-VREDUCEPSZrmi	6255
-VREDUCEPSZrmik	6256
-VREDUCEPSZrmikz	6257
-VREDUCEPSZrri	6258
-VREDUCEPSZrrib	6259
-VREDUCEPSZrribk	6260
-VREDUCEPSZrribkz	6261
-VREDUCEPSZrrik	6262
-VREDUCEPSZrrikz	6263
-VREDUCESDZrmi	6264
-VREDUCESDZrmik	6265
-VREDUCESDZrmikz	6266
-VREDUCESDZrri	6267
-VREDUCESDZrrib	6268
-VREDUCESDZrribk	6269
-VREDUCESDZrribkz	6270
-VREDUCESDZrrik	6271
-VREDUCESDZrrikz	6272
-VREDUCESHZrmi	6273
-VREDUCESHZrmik	6274
-VREDUCESHZrmikz	6275
-VREDUCESHZrri	6276
-VREDUCESHZrrib	6277
-VREDUCESHZrribk	6278
-VREDUCESHZrribkz	6279
-VREDUCESHZrrik	6280
-VREDUCESHZrrikz	6281
-VREDUCESSZrmi	6282
-VREDUCESSZrmik	6283
-VREDUCESSZrmikz	6284
-VREDUCESSZrri	6285
-VREDUCESSZrrib	6286
-VREDUCESSZrribk	6287
-VREDUCESSZrribkz	6288
-VREDUCESSZrrik	6289
-VREDUCESSZrrikz	6290
-VRNDSCALEBF	6291
-VRNDSCALEPDZ	6292
-VRNDSCALEPDZrmbi	6293
-VRNDSCALEPDZrmbik	6294
-VRNDSCALEPDZrmbikz	6295
-VRNDSCALEPDZrmi	6296
-VRNDSCALEPDZrmik	6297
-VRNDSCALEPDZrmikz	6298
-VRNDSCALEPDZrri	6299
-VRNDSCALEPDZrrib	6300
-VRNDSCALEPDZrribk	6301
-VRNDSCALEPDZrribkz	6302
-VRNDSCALEPDZrrik	6303
-VRNDSCALEPDZrrikz	6304
-VRNDSCALEPHZ	6305
-VRNDSCALEPHZrmbi	6306
-VRNDSCALEPHZrmbik	6307
-VRNDSCALEPHZrmbikz	6308
-VRNDSCALEPHZrmi	6309
-VRNDSCALEPHZrmik	6310
-VRNDSCALEPHZrmikz	6311
-VRNDSCALEPHZrri	6312
-VRNDSCALEPHZrrib	6313
-VRNDSCALEPHZrribk	6314
-VRNDSCALEPHZrribkz	6315
-VRNDSCALEPHZrrik	6316
-VRNDSCALEPHZrrikz	6317
-VRNDSCALEPSZ	6318
-VRNDSCALEPSZrmbi	6319
-VRNDSCALEPSZrmbik	6320
-VRNDSCALEPSZrmbikz	6321
-VRNDSCALEPSZrmi	6322
-VRNDSCALEPSZrmik	6323
-VRNDSCALEPSZrmikz	6324
-VRNDSCALEPSZrri	6325
-VRNDSCALEPSZrrib	6326
-VRNDSCALEPSZrribk	6327
-VRNDSCALEPSZrribkz	6328
-VRNDSCALEPSZrrik	6329
-VRNDSCALEPSZrrikz	6330
-VRNDSCALESDZrmi	6331
-VRNDSCALESDZrmi_Int	6332
-VRNDSCALESDZrmik_Int	6333
-VRNDSCALESDZrmikz_Int	6334
-VRNDSCALESDZrri	6335
-VRNDSCALESDZrri_Int	6336
-VRNDSCALESDZrrib_Int	6337
-VRNDSCALESDZrribk_Int	6338
-VRNDSCALESDZrribkz_Int	6339
-VRNDSCALESDZrrik_Int	6340
-VRNDSCALESDZrrikz_Int	6341
-VRNDSCALESHZrmi	6342
-VRNDSCALESHZrmi_Int	6343
-VRNDSCALESHZrmik_Int	6344
-VRNDSCALESHZrmikz_Int	6345
-VRNDSCALESHZrri	6346
-VRNDSCALESHZrri_Int	6347
-VRNDSCALESHZrrib_Int	6348
-VRNDSCALESHZrribk_Int	6349
-VRNDSCALESHZrribkz_Int	6350
-VRNDSCALESHZrrik_Int	6351
-VRNDSCALESHZrrikz_Int	6352
-VRNDSCALESSZrmi	6353
-VRNDSCALESSZrmi_Int	6354
-VRNDSCALESSZrmik_Int	6355
-VRNDSCALESSZrmikz_Int	6356
-VRNDSCALESSZrri	6357
-VRNDSCALESSZrri_Int	6358
-VRNDSCALESSZrrib_Int	6359
-VRNDSCALESSZrribk_Int	6360
-VRNDSCALESSZrribkz_Int	6361
-VRNDSCALESSZrrik_Int	6362
-VRNDSCALESSZrrikz_Int	6363
-VROUNDPDYmi	6364
-VROUNDPDYri	6365
-VROUNDPDmi	6366
-VROUNDPDri	6367
-VROUNDPSYmi	6368
-VROUNDPSYri	6369
-VROUNDPSmi	6370
-VROUNDPSri	6371
-VROUNDSDmi	6372
-VROUNDSDmi_Int	6373
-VROUNDSDri	6374
-VROUNDSDri_Int	6375
-VROUNDSSmi	6376
-VROUNDSSmi_Int	6377
-VROUNDSSri	6378
-VROUNDSSri_Int	6379
-VRSQRT	6380
-VRSQRTBF	6381
-VRSQRTPHZ	6382
-VRSQRTPHZm	6383
-VRSQRTPHZmb	6384
-VRSQRTPHZmbk	6385
-VRSQRTPHZmbkz	6386
-VRSQRTPHZmk	6387
-VRSQRTPHZmkz	6388
-VRSQRTPHZr	6389
-VRSQRTPHZrk	6390
-VRSQRTPHZrkz	6391
-VRSQRTPSYm	6392
-VRSQRTPSYr	6393
-VRSQRTPSm	6394
-VRSQRTPSr	6395
-VRSQRTSHZrm	6396
-VRSQRTSHZrmk	6397
-VRSQRTSHZrmkz	6398
-VRSQRTSHZrr	6399
-VRSQRTSHZrrk	6400
-VRSQRTSHZrrkz	6401
-VRSQRTSSm	6402
-VRSQRTSSm_Int	6403
-VRSQRTSSr	6404
-VRSQRTSSr_Int	6405
-VSCALEFBF	6406
-VSCALEFPDZ	6407
-VSCALEFPDZrm	6408
-VSCALEFPDZrmb	6409
-VSCALEFPDZrmbk	6410
-VSCALEFPDZrmbkz	6411
-VSCALEFPDZrmk	6412
-VSCALEFPDZrmkz	6413
-VSCALEFPDZrr	6414
-VSCALEFPDZrrb	6415
-VSCALEFPDZrrbk	6416
-VSCALEFPDZrrbkz	6417
-VSCALEFPDZrrk	6418
-VSCALEFPDZrrkz	6419
-VSCALEFPHZ	6420
-VSCALEFPHZrm	6421
-VSCALEFPHZrmb	6422
-VSCALEFPHZrmbk	6423
-VSCALEFPHZrmbkz	6424
-VSCALEFPHZrmk	6425
-VSCALEFPHZrmkz	6426
-VSCALEFPHZrr	6427
-VSCALEFPHZrrb	6428
-VSCALEFPHZrrbk	6429
-VSCALEFPHZrrbkz	6430
-VSCALEFPHZrrk	6431
-VSCALEFPHZrrkz	6432
-VSCALEFPSZ	6433
-VSCALEFPSZrm	6434
-VSCALEFPSZrmb	6435
-VSCALEFPSZrmbk	6436
-VSCALEFPSZrmbkz	6437
-VSCALEFPSZrmk	6438
-VSCALEFPSZrmkz	6439
-VSCALEFPSZrr	6440
-VSCALEFPSZrrb	6441
-VSCALEFPSZrrbk	6442
-VSCALEFPSZrrbkz	6443
-VSCALEFPSZrrk	6444
-VSCALEFPSZrrkz	6445
-VSCALEFSDZrm	6446
-VSCALEFSDZrmk	6447
-VSCALEFSDZrmkz	6448
-VSCALEFSDZrr	6449
-VSCALEFSDZrrb_Int	6450
-VSCALEFSDZrrbk_Int	6451
-VSCALEFSDZrrbkz_Int	6452
-VSCALEFSDZrrk	6453
-VSCALEFSDZrrkz	6454
-VSCALEFSHZrm	6455
-VSCALEFSHZrmk	6456
-VSCALEFSHZrmkz	6457
-VSCALEFSHZrr	6458
-VSCALEFSHZrrb_Int	6459
-VSCALEFSHZrrbk_Int	6460
-VSCALEFSHZrrbkz_Int	6461
-VSCALEFSHZrrk	6462
-VSCALEFSHZrrkz	6463
-VSCALEFSSZrm	6464
-VSCALEFSSZrmk	6465
-VSCALEFSSZrmkz	6466
-VSCALEFSSZrr	6467
-VSCALEFSSZrrb_Int	6468
-VSCALEFSSZrrbk_Int	6469
-VSCALEFSSZrrbkz_Int	6470
-VSCALEFSSZrrk	6471
-VSCALEFSSZrrkz	6472
-VSCATTERDPDZ	6473
-VSCATTERDPDZmr	6474
-VSCATTERDPSZ	6475
-VSCATTERDPSZmr	6476
-VSCATTERPF	6477
-VSCATTERQPDZ	6478
-VSCATTERQPDZmr	6479
-VSCATTERQPSZ	6480
-VSCATTERQPSZmr	6481
-VSHA	6482
-VSHUFF	6483
-VSHUFI	6484
-VSHUFPDYrmi	6485
-VSHUFPDYrri	6486
-VSHUFPDZ	6487
-VSHUFPDZrmbi	6488
-VSHUFPDZrmbik	6489
-VSHUFPDZrmbikz	6490
-VSHUFPDZrmi	6491
-VSHUFPDZrmik	6492
-VSHUFPDZrmikz	6493
-VSHUFPDZrri	6494
-VSHUFPDZrrik	6495
-VSHUFPDZrrikz	6496
-VSHUFPDrmi	6497
-VSHUFPDrri	6498
-VSHUFPSYrmi	6499
-VSHUFPSYrri	6500
-VSHUFPSZ	6501
-VSHUFPSZrmbi	6502
-VSHUFPSZrmbik	6503
-VSHUFPSZrmbikz	6504
-VSHUFPSZrmi	6505
-VSHUFPSZrmik	6506
-VSHUFPSZrmikz	6507
-VSHUFPSZrri	6508
-VSHUFPSZrrik	6509
-VSHUFPSZrrikz	6510
-VSHUFPSrmi	6511
-VSHUFPSrri	6512
-VSM	6513
-VSQRTBF	6514
-VSQRTPDYm	6515
-VSQRTPDYr	6516
-VSQRTPDZ	6517
-VSQRTPDZm	6518
-VSQRTPDZmb	6519
-VSQRTPDZmbk	6520
-VSQRTPDZmbkz	6521
-VSQRTPDZmk	6522
-VSQRTPDZmkz	6523
-VSQRTPDZr	6524
-VSQRTPDZrb	6525
-VSQRTPDZrbk	6526
-VSQRTPDZrbkz	6527
-VSQRTPDZrk	6528
-VSQRTPDZrkz	6529
-VSQRTPDm	6530
-VSQRTPDr	6531
-VSQRTPHZ	6532
-VSQRTPHZm	6533
-VSQRTPHZmb	6534
-VSQRTPHZmbk	6535
-VSQRTPHZmbkz	6536
-VSQRTPHZmk	6537
-VSQRTPHZmkz	6538
-VSQRTPHZr	6539
-VSQRTPHZrb	6540
-VSQRTPHZrbk	6541
-VSQRTPHZrbkz	6542
-VSQRTPHZrk	6543
-VSQRTPHZrkz	6544
-VSQRTPSYm	6545
-VSQRTPSYr	6546
-VSQRTPSZ	6547
-VSQRTPSZm	6548
-VSQRTPSZmb	6549
-VSQRTPSZmbk	6550
-VSQRTPSZmbkz	6551
-VSQRTPSZmk	6552
-VSQRTPSZmkz	6553
-VSQRTPSZr	6554
-VSQRTPSZrb	6555
-VSQRTPSZrbk	6556
-VSQRTPSZrbkz	6557
-VSQRTPSZrk	6558
-VSQRTPSZrkz	6559
-VSQRTPSm	6560
-VSQRTPSr	6561
-VSQRTSDZm	6562
-VSQRTSDZm_Int	6563
-VSQRTSDZmk_Int	6564
-VSQRTSDZmkz_Int	6565
-VSQRTSDZr	6566
-VSQRTSDZr_Int	6567
-VSQRTSDZrb_Int	6568
-VSQRTSDZrbk_Int	6569
-VSQRTSDZrbkz_Int	6570
-VSQRTSDZrk_Int	6571
-VSQRTSDZrkz_Int	6572
-VSQRTSDm	6573
-VSQRTSDm_Int	6574
-VSQRTSDr	6575
-VSQRTSDr_Int	6576
-VSQRTSHZm	6577
-VSQRTSHZm_Int	6578
-VSQRTSHZmk_Int	6579
-VSQRTSHZmkz_Int	6580
-VSQRTSHZr	6581
-VSQRTSHZr_Int	6582
-VSQRTSHZrb_Int	6583
-VSQRTSHZrbk_Int	6584
-VSQRTSHZrbkz_Int	6585
-VSQRTSHZrk_Int	6586
-VSQRTSHZrkz_Int	6587
-VSQRTSSZm	6588
-VSQRTSSZm_Int	6589
-VSQRTSSZmk_Int	6590
-VSQRTSSZmkz_Int	6591
-VSQRTSSZr	6592
-VSQRTSSZr_Int	6593
-VSQRTSSZrb_Int	6594
-VSQRTSSZrbk_Int	6595
-VSQRTSSZrbkz_Int	6596
-VSQRTSSZrk_Int	6597
-VSQRTSSZrkz_Int	6598
-VSQRTSSm	6599
-VSQRTSSm_Int	6600
-VSQRTSSr	6601
-VSQRTSSr_Int	6602
-VSTMXCSR	6603
-VSUBBF	6604
-VSUBPDYrm	6605
-VSUBPDYrr	6606
-VSUBPDZ	6607
-VSUBPDZrm	6608
-VSUBPDZrmb	6609
-VSUBPDZrmbk	6610
-VSUBPDZrmbkz	6611
-VSUBPDZrmk	6612
-VSUBPDZrmkz	6613
-VSUBPDZrr	6614
-VSUBPDZrrb	6615
-VSUBPDZrrbk	6616
-VSUBPDZrrbkz	6617
-VSUBPDZrrk	6618
-VSUBPDZrrkz	6619
-VSUBPDrm	6620
-VSUBPDrr	6621
-VSUBPHZ	6622
-VSUBPHZrm	6623
-VSUBPHZrmb	6624
-VSUBPHZrmbk	6625
-VSUBPHZrmbkz	6626
-VSUBPHZrmk	6627
-VSUBPHZrmkz	6628
-VSUBPHZrr	6629
-VSUBPHZrrb	6630
-VSUBPHZrrbk	6631
-VSUBPHZrrbkz	6632
-VSUBPHZrrk	6633
-VSUBPHZrrkz	6634
-VSUBPSYrm	6635
-VSUBPSYrr	6636
-VSUBPSZ	6637
-VSUBPSZrm	6638
-VSUBPSZrmb	6639
-VSUBPSZrmbk	6640
-VSUBPSZrmbkz	6641
-VSUBPSZrmk	6642
-VSUBPSZrmkz	6643
-VSUBPSZrr	6644
-VSUBPSZrrb	6645
-VSUBPSZrrbk	6646
-VSUBPSZrrbkz	6647
-VSUBPSZrrk	6648
-VSUBPSZrrkz	6649
-VSUBPSrm	6650
-VSUBPSrr	6651
-VSUBSDZrm	6652
-VSUBSDZrm_Int	6653
-VSUBSDZrmk_Int	6654
-VSUBSDZrmkz_Int	6655
-VSUBSDZrr	6656
-VSUBSDZrr_Int	6657
-VSUBSDZrrb_Int	6658
-VSUBSDZrrbk_Int	6659
-VSUBSDZrrbkz_Int	6660
-VSUBSDZrrk_Int	6661
-VSUBSDZrrkz_Int	6662
-VSUBSDrm	6663
-VSUBSDrm_Int	6664
-VSUBSDrr	6665
-VSUBSDrr_Int	6666
-VSUBSHZrm	6667
-VSUBSHZrm_Int	6668
-VSUBSHZrmk_Int	6669
-VSUBSHZrmkz_Int	6670
-VSUBSHZrr	6671
-VSUBSHZrr_Int	6672
-VSUBSHZrrb_Int	6673
-VSUBSHZrrbk_Int	6674
-VSUBSHZrrbkz_Int	6675
-VSUBSHZrrk_Int	6676
-VSUBSHZrrkz_Int	6677
-VSUBSSZrm	6678
-VSUBSSZrm_Int	6679
-VSUBSSZrmk_Int	6680
-VSUBSSZrmkz_Int	6681
-VSUBSSZrr	6682
-VSUBSSZrr_Int	6683
-VSUBSSZrrb_Int	6684
-VSUBSSZrrbk_Int	6685
-VSUBSSZrrbkz_Int	6686
-VSUBSSZrrk_Int	6687
-VSUBSSZrrkz_Int	6688
-VSUBSSrm	6689
-VSUBSSrm_Int	6690
-VSUBSSrr	6691
-VSUBSSrr_Int	6692
-VTESTPDYrm	6693
-VTESTPDYrr	6694
-VTESTPDrm	6695
-VTESTPDrr	6696
-VTESTPSYrm	6697
-VTESTPSYrr	6698
-VTESTPSrm	6699
-VTESTPSrr	6700
-VUCOMISDZrm	6701
-VUCOMISDZrm_Int	6702
-VUCOMISDZrr	6703
-VUCOMISDZrr_Int	6704
-VUCOMISDZrrb	6705
-VUCOMISDrm	6706
-VUCOMISDrm_Int	6707
-VUCOMISDrr	6708
-VUCOMISDrr_Int	6709
-VUCOMISHZrm	6710
-VUCOMISHZrm_Int	6711
-VUCOMISHZrr	6712
-VUCOMISHZrr_Int	6713
-VUCOMISHZrrb	6714
-VUCOMISSZrm	6715
-VUCOMISSZrm_Int	6716
-VUCOMISSZrr	6717
-VUCOMISSZrr_Int	6718
-VUCOMISSZrrb	6719
-VUCOMISSrm	6720
-VUCOMISSrm_Int	6721
-VUCOMISSrr	6722
-VUCOMISSrr_Int	6723
-VUCOMXSDZrm	6724
-VUCOMXSDZrm_Int	6725
-VUCOMXSDZrr	6726
-VUCOMXSDZrr_Int	6727
-VUCOMXSDZrrb_Int	6728
-VUCOMXSHZrm	6729
-VUCOMXSHZrm_Int	6730
-VUCOMXSHZrr	6731
-VUCOMXSHZrr_Int	6732
-VUCOMXSHZrrb_Int	6733
-VUCOMXSSZrm	6734
-VUCOMXSSZrm_Int	6735
-VUCOMXSSZrr	6736
-VUCOMXSSZrr_Int	6737
-VUCOMXSSZrrb_Int	6738
-VUNPCKHPDYrm	6739
-VUNPCKHPDYrr	6740
-VUNPCKHPDZ	6741
-VUNPCKHPDZrm	6742
-VUNPCKHPDZrmb	6743
-VUNPCKHPDZrmbk	6744
-VUNPCKHPDZrmbkz	6745
-VUNPCKHPDZrmk	6746
-VUNPCKHPDZrmkz	6747
-VUNPCKHPDZrr	6748
-VUNPCKHPDZrrk	6749
-VUNPCKHPDZrrkz	6750
-VUNPCKHPDrm	6751
-VUNPCKHPDrr	6752
-VUNPCKHPSYrm	6753
-VUNPCKHPSYrr	6754
-VUNPCKHPSZ	6755
-VUNPCKHPSZrm	6756
-VUNPCKHPSZrmb	6757
-VUNPCKHPSZrmbk	6758
-VUNPCKHPSZrmbkz	6759
-VUNPCKHPSZrmk	6760
-VUNPCKHPSZrmkz	6761
-VUNPCKHPSZrr	6762
-VUNPCKHPSZrrk	6763
-VUNPCKHPSZrrkz	6764
-VUNPCKHPSrm	6765
-VUNPCKHPSrr	6766
-VUNPCKLPDYrm	6767
-VUNPCKLPDYrr	6768
-VUNPCKLPDZ	6769
-VUNPCKLPDZrm	6770
-VUNPCKLPDZrmb	6771
-VUNPCKLPDZrmbk	6772
-VUNPCKLPDZrmbkz	6773
-VUNPCKLPDZrmk	6774
-VUNPCKLPDZrmkz	6775
-VUNPCKLPDZrr	6776
-VUNPCKLPDZrrk	6777
-VUNPCKLPDZrrkz	6778
-VUNPCKLPDrm	6779
-VUNPCKLPDrr	6780
-VUNPCKLPSYrm	6781
-VUNPCKLPSYrr	6782
-VUNPCKLPSZ	6783
-VUNPCKLPSZrm	6784
-VUNPCKLPSZrmb	6785
-VUNPCKLPSZrmbk	6786
-VUNPCKLPSZrmbkz	6787
-VUNPCKLPSZrmk	6788
-VUNPCKLPSZrmkz	6789
-VUNPCKLPSZrr	6790
-VUNPCKLPSZrrk	6791
-VUNPCKLPSZrrkz	6792
-VUNPCKLPSrm	6793
-VUNPCKLPSrr	6794
-VXORPDYrm	6795
-VXORPDYrr	6796
-VXORPDZ	6797
-VXORPDZrm	6798
-VXORPDZrmb	6799
-VXORPDZrmbk	6800
-VXORPDZrmbkz	6801
-VXORPDZrmk	6802
-VXORPDZrmkz	6803
-VXORPDZrr	6804
-VXORPDZrrk	6805
-VXORPDZrrkz	6806
-VXORPDrm	6807
-VXORPDrr	6808
-VXORPSYrm	6809
-VXORPSYrr	6810
-VXORPSZ	6811
-VXORPSZrm	6812
-VXORPSZrmb	6813
-VXORPSZrmbk	6814
-VXORPSZrmbkz	6815
-VXORPSZrmk	6816
-VXORPSZrmkz	6817
-VXORPSZrr	6818
-VXORPSZrrk	6819
-VXORPSZrrkz	6820
-VXORPSrm	6821
-VXORPSrr	6822
-VZEROALL	6823
-VZEROUPPER	6824
-V_SET	6825
-V_SETALLONES	6826
-WAIT	6827
-WBINVD	6828
-WBNOINVD	6829
-WRFLAGS	6830
-WRFSBASE	6831
-WRGSBASE	6832
-WRMSR	6833
-WRMSRLIST	6834
-WRMSRNS	6835
-WRMSRNSir	6836
-WRMSRNSir_EVEX	6837
-WRPKRUr	6838
-WRSSD	6839
-WRSSD_EVEX	6840
-WRSSQ	6841
-WRSSQ_EVEX	6842
-WRUSSD	6843
-WRUSSD_EVEX	6844
-WRUSSQ	6845
-WRUSSQ_EVEX	6846
-XABORT	6847
-XABORT_DEF	6848
-XACQUIRE_PREFIX	6849
-XADD	6850
-XAM_F	6851
-XAM_Fp	6852
-XBEGIN	6853
-XCHG	6854
-XCH_F	6855
-XCRYPTCBC	6856
-XCRYPTCFB	6857
-XCRYPTCTR	6858
-XCRYPTECB	6859
-XCRYPTOFB	6860
-XEND	6861
-XGETBV	6862
-XLAT	6863
-XOR	6864
-XORPDrm	6865
-XORPDrr	6866
-XORPSrm	6867
-XORPSrr	6868
-XRELEASE_PREFIX	6869
-XRESLDTRK	6870
-XRSTOR	6871
-XRSTORS	6872
-XSAVE	6873
-XSAVEC	6874
-XSAVEOPT	6875
-XSAVES	6876
-XSETBV	6877
-XSHA	6878
-XSTORE	6879
-XSUSLDTRK	6880
-XTEST	6881
-Immediate	6882
-CImmediate	6883
-FPImmediate	6884
-MBB	6885
-FrameIndex	6886
-ConstantPoolIndex	6887
-TargetIndex	6888
-JumpTableIndex	6889
-ExternalSymbol	6890
-GlobalAddress	6891
-BlockAddress	6892
-RegisterMask	6893
-RegisterLiveOut	6894
-Metadata	6895
-MCSymbol	6896
-CFIIndex	6897
-IntrinsicID	6898
-Predicate	6899
-ShuffleMask	6900
-PhyReg_GR8	6901
-PhyReg_GRH8	6902
-PhyReg_GR8_NOREX2	6903
-PhyReg_GR8_NOREX	6904
-PhyReg_GR8_ABCD_H	6905
-PhyReg_GR8_ABCD_L	6906
-PhyReg_GRH16	6907
-PhyReg_GR16	6908
-PhyReg_GR16_NOREX2	6909
-PhyReg_GR16_NOREX	6910
-PhyReg_VK1	6911
-PhyReg_VK16	6912
-PhyReg_VK2	6913
-PhyReg_VK4	6914
-PhyReg_VK8	6915
-PhyReg_VK16WM	6916
-PhyReg_VK1WM	6917
-PhyReg_VK2WM	6918
-PhyReg_VK4WM	6919
-PhyReg_VK8WM	6920
-PhyReg_SEGMENT_REG	6921
-PhyReg_GR16_ABCD	6922
-PhyReg_FPCCR	6923
-PhyReg_FR16X	6924
-PhyReg_FR16	6925
-PhyReg_VK16PAIR	6926
-PhyReg_VK1PAIR	6927
-PhyReg_VK2PAIR	6928
-PhyReg_VK4PAIR	6929
-PhyReg_VK8PAIR	6930
-PhyReg_VK1PAIR_with_sub_mask_0_in_VK1WM	6931
-PhyReg_LOW32_ADDR_ACCESS_RBP	6932
-PhyReg_LOW32_ADDR_ACCESS	6933
-PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit	6934
-PhyReg_FR32X	6935
-PhyReg_GR32	6936
-PhyReg_GR32_NOSP	6937
-PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2	6938
-PhyReg_DEBUG_REG	6939
-PhyReg_FR32	6940
-PhyReg_GR32_NOREX2	6941
-PhyReg_GR32_NOREX2_NOSP	6942
-PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX	6943
-PhyReg_GR32_NOREX	6944
-PhyReg_VK32	6945
-PhyReg_GR32_NOREX_NOSP	6946
-PhyReg_RFP32	6947
-PhyReg_VK32WM	6948
-PhyReg_GR32_ABCD	6949
-PhyReg_GR32_TC	6950
-PhyReg_GR32_ABCD_and_GR32_TC	6951
-PhyReg_GR32_AD	6952
-PhyReg_GR32_ArgRef	6953
-PhyReg_GR32_BPSP	6954
-PhyReg_GR32_BSI	6955
-PhyReg_GR32_CB	6956
-PhyReg_GR32_DC	6957
-PhyReg_GR32_DIBP	6958
-PhyReg_GR32_SIDI	6959
-PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit	6960
-PhyReg_CCR	6961
-PhyReg_DFCCR	6962
-PhyReg_GR32_ABCD_and_GR32_BSI	6963
-PhyReg_GR32_AD_and_GR32_ArgRef	6964
-PhyReg_GR32_ArgRef_and_GR32_CB	6965
-PhyReg_GR32_BPSP_and_GR32_DIBP	6966
-PhyReg_GR32_BPSP_and_GR32_TC	6967
-PhyReg_GR32_BSI_and_GR32_SIDI	6968
-PhyReg_GR32_DIBP_and_GR32_SIDI	6969
-PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit	6970
-PhyReg_LOW32_ADDR_ACCESS_with_sub_32bit	6971
-PhyReg_RFP64	6972
-PhyReg_GR64	6973
-PhyReg_FR64X	6974
-PhyReg_GR64_with_sub_8bit	6975
-PhyReg_GR64_NOSP	6976
-PhyReg_GR64_NOREX2	6977
-PhyReg_CONTROL_REG	6978
-PhyReg_FR64	6979
-PhyReg_GR64_with_sub_16bit_in_GR16_NOREX2	6980
-PhyReg_GR64_NOREX2_NOSP	6981
-PhyReg_GR64PLTSafe	6982
-PhyReg_GR64_TC	6983
-PhyReg_GR64_NOREX	6984
-PhyReg_GR64_TCW64	6985
-PhyReg_GR64_TC_with_sub_8bit	6986
-PhyReg_GR64_NOREX2_NOSP_and_GR64_TC	6987
-PhyReg_GR64_TCW64_with_sub_8bit	6988
-PhyReg_GR64_TC_and_GR64_TCW64	6989
-PhyReg_GR64_with_sub_16bit_in_GR16_NOREX	6990
-PhyReg_VK64	6991
-PhyReg_VR64	6992
-PhyReg_GR64PLTSafe_and_GR64_TC	6993
-PhyReg_GR64_NOREX2_NOSP_and_GR64_TCW64	6994
-PhyReg_GR64_NOREX_NOSP	6995
-PhyReg_GR64_NOREX_and_GR64_TC	6996
-PhyReg_GR64_TCW64_and_GR64_TC_with_sub_8bit	6997
-PhyReg_VK64WM	6998
-PhyReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64	6999
-PhyReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX	7000
-PhyReg_GR64PLTSafe_and_GR64_TCW64	7001
-PhyReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC	7002
-PhyReg_GR64_NOREX_and_GR64_TCW64	7003
-PhyReg_GR64_ABCD	7004
-PhyReg_GR64_with_sub_32bit_in_GR32_TC	7005
-PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC	7006
-PhyReg_GR64_AD	7007
-PhyReg_GR64_ArgRef	7008
-PhyReg_GR64_and_LOW32_ADDR_ACCESS_RBP	7009
-PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef	7010
-PhyReg_GR64_with_sub_32bit_in_GR32_BPSP	7011
-PhyReg_GR64_with_sub_32bit_in_GR32_BSI	7012
-PhyReg_GR64_with_sub_32bit_in_GR32_CB	7013
-PhyReg_GR64_with_sub_32bit_in_GR32_DIBP	7014
-PhyReg_GR64_with_sub_32bit_in_GR32_SIDI	7015
-PhyReg_GR64_A	7016
-PhyReg_GR64_ArgRef_and_GR64_TC	7017
-PhyReg_GR64_and_LOW32_ADDR_ACCESS	7018
-PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI	7019
-PhyReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef	7020
-PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB	7021
-PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP	7022
-PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC	7023
-PhyReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI	7024
-PhyReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI	7025
-PhyReg_RST	7026
-PhyReg_RFP80	7027
-PhyReg_RFP80_7	7028
-PhyReg_VR128X	7029
-PhyReg_VR128	7030
-PhyReg_VR256X	7031
-PhyReg_VR256	7032
-PhyReg_VR512	7033
-PhyReg_VR512_0_15	7034
-PhyReg_TILE	7035
-PhyReg_TILEPAIR	7036
-VirtReg_GR8	7037
-VirtReg_GRH8	7038
-VirtReg_GR8_NOREX2	7039
-VirtReg_GR8_NOREX	7040
-VirtReg_GR8_ABCD_H	7041
-VirtReg_GR8_ABCD_L	7042
-VirtReg_GRH16	7043
-VirtReg_GR16	7044
-VirtReg_GR16_NOREX2	7045
-VirtReg_GR16_NOREX	7046
-VirtReg_VK1	7047
-VirtReg_VK16	7048
-VirtReg_VK2	7049
-VirtReg_VK4	7050
-VirtReg_VK8	7051
-VirtReg_VK16WM	7052
-VirtReg_VK1WM	7053
-VirtReg_VK2WM	7054
-VirtReg_VK4WM	7055
-VirtReg_VK8WM	7056
-VirtReg_SEGMENT_REG	7057
-VirtReg_GR16_ABCD	7058
-VirtReg_FPCCR	7059
-VirtReg_FR16X	7060
-VirtReg_FR16	7061
-VirtReg_VK16PAIR	7062
-VirtReg_VK1PAIR	7063
-VirtReg_VK2PAIR	7064
-VirtReg_VK4PAIR	7065
-VirtReg_VK8PAIR	7066
-VirtReg_VK1PAIR_with_sub_mask_0_in_VK1WM	7067
-VirtReg_LOW32_ADDR_ACCESS_RBP	7068
-VirtReg_LOW32_ADDR_ACCESS	7069
-VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit	7070
-VirtReg_FR32X	7071
-VirtReg_GR32	7072
-VirtReg_GR32_NOSP	7073
-VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2	7074
-VirtReg_DEBUG_REG	7075
-VirtReg_FR32	7076
-VirtReg_GR32_NOREX2	7077
-VirtReg_GR32_NOREX2_NOSP	7078
-VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX	7079
-VirtReg_GR32_NOREX	7080
-VirtReg_VK32	7081
-VirtReg_GR32_NOREX_NOSP	7082
-VirtReg_RFP32	7083
-VirtReg_VK32WM	7084
-VirtReg_GR32_ABCD	7085
-VirtReg_GR32_TC	7086
-VirtReg_GR32_ABCD_and_GR32_TC	7087
-VirtReg_GR32_AD	7088
-VirtReg_GR32_ArgRef	7089
-VirtReg_GR32_BPSP	7090
-VirtReg_GR32_BSI	7091
-VirtReg_GR32_CB	7092
-VirtReg_GR32_DC	7093
-VirtReg_GR32_DIBP	7094
-VirtReg_GR32_SIDI	7095
-VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit	7096
-VirtReg_CCR	7097
-VirtReg_DFCCR	7098
-VirtReg_GR32_ABCD_and_GR32_BSI	7099
-VirtReg_GR32_AD_and_GR32_ArgRef	7100
-VirtReg_GR32_ArgRef_and_GR32_CB	7101
-VirtReg_GR32_BPSP_and_GR32_DIBP	7102
-VirtReg_GR32_BPSP_and_GR32_TC	7103
-VirtReg_GR32_BSI_and_GR32_SIDI	7104
-VirtReg_GR32_DIBP_and_GR32_SIDI	7105
-VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit	7106
-VirtReg_LOW32_ADDR_ACCESS_with_sub_32bit	7107
-VirtReg_RFP64	7108
-VirtReg_GR64	7109
-VirtReg_FR64X	7110
-VirtReg_GR64_with_sub_8bit	7111
-VirtReg_GR64_NOSP	7112
-VirtReg_GR64_NOREX2	7113
-VirtReg_CONTROL_REG	7114
-VirtReg_FR64	7115
-VirtReg_GR64_with_sub_16bit_in_GR16_NOREX2	7116
-VirtReg_GR64_NOREX2_NOSP	7117
-VirtReg_GR64PLTSafe	7118
-VirtReg_GR64_TC	7119
-VirtReg_GR64_NOREX	7120
-VirtReg_GR64_TCW64	7121
-VirtReg_GR64_TC_with_sub_8bit	7122
-VirtReg_GR64_NOREX2_NOSP_and_GR64_TC	7123
-VirtReg_GR64_TCW64_with_sub_8bit	7124
-VirtReg_GR64_TC_and_GR64_TCW64	7125
-VirtReg_GR64_with_sub_16bit_in_GR16_NOREX	7126
-VirtReg_VK64	7127
-VirtReg_VR64	7128
-VirtReg_GR64PLTSafe_and_GR64_TC	7129
-VirtReg_GR64_NOREX2_NOSP_and_GR64_TCW64	7130
-VirtReg_GR64_NOREX_NOSP	7131
-VirtReg_GR64_NOREX_and_GR64_TC	7132
-VirtReg_GR64_TCW64_and_GR64_TC_with_sub_8bit	7133
-VirtReg_VK64WM	7134
-VirtReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64	7135
-VirtReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX	7136
-VirtReg_GR64PLTSafe_and_GR64_TCW64	7137
-VirtReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC	7138
-VirtReg_GR64_NOREX_and_GR64_TCW64	7139
-VirtReg_GR64_ABCD	7140
-VirtReg_GR64_with_sub_32bit_in_GR32_TC	7141
-VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC	7142
-VirtReg_GR64_AD	7143
-VirtReg_GR64_ArgRef	7144
-VirtReg_GR64_and_LOW32_ADDR_ACCESS_RBP	7145
-VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef	7146
-VirtReg_GR64_with_sub_32bit_in_GR32_BPSP	7147
-VirtReg_GR64_with_sub_32bit_in_GR32_BSI	7148
-VirtReg_GR64_with_sub_32bit_in_GR32_CB	7149
-VirtReg_GR64_with_sub_32bit_in_GR32_DIBP	7150
-VirtReg_GR64_with_sub_32bit_in_GR32_SIDI	7151
-VirtReg_GR64_A	7152
-VirtReg_GR64_ArgRef_and_GR64_TC	7153
-VirtReg_GR64_and_LOW32_ADDR_ACCESS	7154
-VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI	7155
-VirtReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef	7156
-VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB	7157
-VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP	7158
-VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC	7159
-VirtReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI	7160
-VirtReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI	7161
-VirtReg_RST	7162
-VirtReg_RFP80	7163
-VirtReg_RFP80_7	7164
-VirtReg_VR128X	7165
-VirtReg_VR128	7166
-VirtReg_VR256X	7167
-VirtReg_VR256	7168
-VirtReg_VR512	7169
-VirtReg_VR512_0_15	7170
-VirtReg_TILE	7171
-VirtReg_TILEPAIR	7172
+PTCMMIMFP	1441
+PTCMMRLFP	1442
+PTCVTROWD	1443
+PTCVTROWPS	1444
+PTDPBF	1445
+PTDPBHF	1446
+PTDPBSSD	1447
+PTDPBSSDV	1448
+PTDPBSUD	1449
+PTDPBSUDV	1450
+PTDPBUSD	1451
+PTDPBUSDV	1452
+PTDPBUUD	1453
+PTDPBUUDV	1454
+PTDPFP	1455
+PTDPHBF	1456
+PTDPHF	1457
+PTESTrm	1458
+PTESTrr	1459
+PTILELOADD	1460
+PTILELOADDRS	1461
+PTILELOADDRST	1462
+PTILELOADDRSV	1463
+PTILELOADDT	1464
+PTILELOADDV	1465
+PTILEMOVROWrre	1466
+PTILEMOVROWrreV	1467
+PTILEMOVROWrri	1468
+PTILEMOVROWrriV	1469
+PTILESTORED	1470
+PTILESTOREDV	1471
+PTILEZERO	1472
+PTILEZEROV	1473
+PTMMULTF	1474
+PTWRITE	1475
+PTWRITEm	1476
+PTWRITEr	1477
+PUNPCKHBWrm	1478
+PUNPCKHBWrr	1479
+PUNPCKHDQrm	1480
+PUNPCKHDQrr	1481
+PUNPCKHQDQrm	1482
+PUNPCKHQDQrr	1483
+PUNPCKHWDrm	1484
+PUNPCKHWDrr	1485
+PUNPCKLBWrm	1486
+PUNPCKLBWrr	1487
+PUNPCKLDQrm	1488
+PUNPCKLDQrr	1489
+PUNPCKLQDQrm	1490
+PUNPCKLQDQrr	1491
+PUNPCKLWDrm	1492
+PUNPCKLWDrr	1493
+PUSH	1494
+PUSHA	1495
+PUSHCS	1496
+PUSHDS	1497
+PUSHES	1498
+PUSHF	1499
+PUSHFS	1500
+PUSHGS	1501
+PUSHP	1502
+PUSHSS	1503
+PVALIDATE	1504
+PXORrm	1505
+PXORrr	1506
+RCL	1507
+RCPPSm	1508
+RCPPSr	1509
+RCPSSm	1510
+RCPSSm_Int	1511
+RCPSSr	1512
+RCPSSr_Int	1513
+RCR	1514
+RDFLAGS	1515
+RDFSBASE	1516
+RDGSBASE	1517
+RDMSR	1518
+RDMSRLIST	1519
+RDMSRri	1520
+RDMSRri_EVEX	1521
+RDPID	1522
+RDPKRUr	1523
+RDPMC	1524
+RDPRU	1525
+RDRAND	1526
+RDSEED	1527
+RDSSPD	1528
+RDSSPQ	1529
+RDTSC	1530
+RDTSCP	1531
+REG_SEQUENCE	1532
+REPNE_PREFIX	1533
+REP_MOVSB	1534
+REP_MOVSD	1535
+REP_MOVSQ	1536
+REP_MOVSW	1537
+REP_PREFIX	1538
+REP_STOSB	1539
+REP_STOSD	1540
+REP_STOSQ	1541
+REP_STOSW	1542
+RET	1543
+RETI	1544
+REX	1545
+RMPADJUST	1546
+RMPQUERY	1547
+RMPUPDATE	1548
+ROL	1549
+ROR	1550
+RORX	1551
+ROUNDPDmi	1552
+ROUNDPDri	1553
+ROUNDPSmi	1554
+ROUNDPSri	1555
+ROUNDSDmi	1556
+ROUNDSDmi_Int	1557
+ROUNDSDri	1558
+ROUNDSDri_Int	1559
+ROUNDSSmi	1560
+ROUNDSSmi_Int	1561
+ROUNDSSri	1562
+ROUNDSSri_Int	1563
+RSM	1564
+RSQRTPSm	1565
+RSQRTPSr	1566
+RSQRTSSm	1567
+RSQRTSSm_Int	1568
+RSQRTSSr	1569
+RSQRTSSr_Int	1570
+RSTORSSP	1571
+SAHF	1572
+SALC	1573
+SAR	1574
+SARX	1575
+SAVEPREVSSP	1576
+SBB	1577
+SCASB	1578
+SCASL	1579
+SCASQ	1580
+SCASW	1581
+SEAMCALL	1582
+SEAMOPS	1583
+SEAMRET	1584
+SEG_ALLOCA	1585
+SEH_BeginEpilogue	1586
+SEH_EndEpilogue	1587
+SEH_EndPrologue	1588
+SEH_PushFrame	1589
+SEH_PushReg	1590
+SEH_SaveReg	1591
+SEH_SaveXMM	1592
+SEH_SetFrame	1593
+SEH_StackAlign	1594
+SEH_StackAlloc	1595
+SEH_UnwindV	1596
+SEH_UnwindVersion	1597
+SENDUIPI	1598
+SERIALIZE	1599
+SETB_C	1600
+SETCCm	1601
+SETCCm_EVEX	1602
+SETCCr	1603
+SETCCr_EVEX	1604
+SETSSBSY	1605
+SETZUCCm	1606
+SETZUCCr	1607
+SFENCE	1608
+SGDT	1609
+SHA	1610
+SHL	1611
+SHLD	1612
+SHLDROT	1613
+SHLX	1614
+SHR	1615
+SHRD	1616
+SHRDROT	1617
+SHRX	1618
+SHUFPDrmi	1619
+SHUFPDrri	1620
+SHUFPSrmi	1621
+SHUFPSrri	1622
+SIDT	1623
+SKINIT	1624
+SLDT	1625
+SLWPCB	1626
+SMSW	1627
+SQRTPDm	1628
+SQRTPDr	1629
+SQRTPSm	1630
+SQRTPSr	1631
+SQRTSDm	1632
+SQRTSDm_Int	1633
+SQRTSDr	1634
+SQRTSDr_Int	1635
+SQRTSSm	1636
+SQRTSSm_Int	1637
+SQRTSSr	1638
+SQRTSSr_Int	1639
+SQRT_F	1640
+SQRT_Fp	1641
+SS_PREFIX	1642
+STAC	1643
+STACKALLOC_W_PROBING	1644
+STACKMAP	1645
+STATEPOINT	1646
+STC	1647
+STD	1648
+STGI	1649
+STI	1650
+STMXCSR	1651
+STOSB	1652
+STOSL	1653
+STOSQ	1654
+STOSW	1655
+STR	1656
+STRm	1657
+STTILECFG	1658
+STTILECFG_EVEX	1659
+STUI	1660
+ST_F	1661
+ST_FP	1662
+ST_FPrr	1663
+ST_Fp	1664
+ST_FpP	1665
+ST_Frr	1666
+SUB	1667
+SUBPDrm	1668
+SUBPDrr	1669
+SUBPSrm	1670
+SUBPSrr	1671
+SUBREG_TO_REG	1672
+SUBR_F	1673
+SUBR_FI	1674
+SUBR_FPrST	1675
+SUBR_FST	1676
+SUBR_Fp	1677
+SUBR_FpI	1678
+SUBR_FrST	1679
+SUBSDrm	1680
+SUBSDrm_Int	1681
+SUBSDrr	1682
+SUBSDrr_Int	1683
+SUBSSrm	1684
+SUBSSrm_Int	1685
+SUBSSrr	1686
+SUBSSrr_Int	1687
+SUB_F	1688
+SUB_FI	1689
+SUB_FPrST	1690
+SUB_FST	1691
+SUB_Fp	1692
+SUB_FpI	1693
+SUB_FrST	1694
+SWAPGS	1695
+SYSCALL	1696
+SYSENTER	1697
+SYSEXIT	1698
+SYSRET	1699
+T	1700
+TAILJMPd	1701
+TAILJMPd_CC	1702
+TAILJMPm	1703
+TAILJMPr	1704
+TCMMIMFP	1705
+TCMMRLFP	1706
+TCRETURN_HIPE	1707
+TCRETURN_WIN	1708
+TCRETURN_WINmi	1709
+TCRETURNdi	1710
+TCRETURNdicc	1711
+TCRETURNmi	1712
+TCRETURNri	1713
+TCVTROWD	1714
+TCVTROWPS	1715
+TDCALL	1716
+TDPBF	1717
+TDPBHF	1718
+TDPBSSD	1719
+TDPBSUD	1720
+TDPBUSD	1721
+TDPBUUD	1722
+TDPFP	1723
+TDPHBF	1724
+TDPHF	1725
+TEST	1726
+TESTUI	1727
+TILELOADD	1728
+TILELOADDRS	1729
+TILELOADDRST	1730
+TILELOADDRS_EVEX	1731
+TILELOADDT	1732
+TILELOADD_EVEX	1733
+TILEMOVROWrre	1734
+TILEMOVROWrri	1735
+TILERELEASE	1736
+TILESTORED	1737
+TILESTORED_EVEX	1738
+TILEZERO	1739
+TLBSYNC	1740
+TLSCall	1741
+TLS_addr	1742
+TLS_addrX	1743
+TLS_base_addr	1744
+TLS_base_addrX	1745
+TLS_desc	1746
+TMMULTF	1747
+TPAUSE	1748
+TRAP	1749
+TST_F	1750
+TST_Fp	1751
+TZCNT	1752
+TZMSK	1753
+UBSAN_UD	1754
+UCOMISDrm	1755
+UCOMISDrm_Int	1756
+UCOMISDrr	1757
+UCOMISDrr_Int	1758
+UCOMISSrm	1759
+UCOMISSrm_Int	1760
+UCOMISSrr	1761
+UCOMISSrr_Int	1762
+UCOM_FIPr	1763
+UCOM_FIr	1764
+UCOM_FPPr	1765
+UCOM_FPr	1766
+UCOM_FpIr	1767
+UCOM_Fpr	1768
+UCOM_Fr	1769
+UD	1770
+UIRET	1771
+UMONITOR	1772
+UMWAIT	1773
+UNPCKHPDrm	1774
+UNPCKHPDrr	1775
+UNPCKHPSrm	1776
+UNPCKHPSrr	1777
+UNPCKLPDrm	1778
+UNPCKLPDrr	1779
+UNPCKLPSrm	1780
+UNPCKLPSrr	1781
+URDMSRri	1782
+URDMSRri_EVEX	1783
+URDMSRrr	1784
+URDMSRrr_EVEX	1785
+UWRMSRir	1786
+UWRMSRir_EVEX	1787
+UWRMSRrr	1788
+UWRMSRrr_EVEX	1789
+V	1790
+VAARG	1791
+VAARG_X	1792
+VADDBF	1793
+VADDPDYrm	1794
+VADDPDYrr	1795
+VADDPDZ	1796
+VADDPDZrm	1797
+VADDPDZrmb	1798
+VADDPDZrmbk	1799
+VADDPDZrmbkz	1800
+VADDPDZrmk	1801
+VADDPDZrmkz	1802
+VADDPDZrr	1803
+VADDPDZrrb	1804
+VADDPDZrrbk	1805
+VADDPDZrrbkz	1806
+VADDPDZrrk	1807
+VADDPDZrrkz	1808
+VADDPDrm	1809
+VADDPDrr	1810
+VADDPHZ	1811
+VADDPHZrm	1812
+VADDPHZrmb	1813
+VADDPHZrmbk	1814
+VADDPHZrmbkz	1815
+VADDPHZrmk	1816
+VADDPHZrmkz	1817
+VADDPHZrr	1818
+VADDPHZrrb	1819
+VADDPHZrrbk	1820
+VADDPHZrrbkz	1821
+VADDPHZrrk	1822
+VADDPHZrrkz	1823
+VADDPSYrm	1824
+VADDPSYrr	1825
+VADDPSZ	1826
+VADDPSZrm	1827
+VADDPSZrmb	1828
+VADDPSZrmbk	1829
+VADDPSZrmbkz	1830
+VADDPSZrmk	1831
+VADDPSZrmkz	1832
+VADDPSZrr	1833
+VADDPSZrrb	1834
+VADDPSZrrbk	1835
+VADDPSZrrbkz	1836
+VADDPSZrrk	1837
+VADDPSZrrkz	1838
+VADDPSrm	1839
+VADDPSrr	1840
+VADDSDZrm	1841
+VADDSDZrm_Int	1842
+VADDSDZrmk_Int	1843
+VADDSDZrmkz_Int	1844
+VADDSDZrr	1845
+VADDSDZrr_Int	1846
+VADDSDZrrb_Int	1847
+VADDSDZrrbk_Int	1848
+VADDSDZrrbkz_Int	1849
+VADDSDZrrk_Int	1850
+VADDSDZrrkz_Int	1851
+VADDSDrm	1852
+VADDSDrm_Int	1853
+VADDSDrr	1854
+VADDSDrr_Int	1855
+VADDSHZrm	1856
+VADDSHZrm_Int	1857
+VADDSHZrmk_Int	1858
+VADDSHZrmkz_Int	1859
+VADDSHZrr	1860
+VADDSHZrr_Int	1861
+VADDSHZrrb_Int	1862
+VADDSHZrrbk_Int	1863
+VADDSHZrrbkz_Int	1864
+VADDSHZrrk_Int	1865
+VADDSHZrrkz_Int	1866
+VADDSSZrm	1867
+VADDSSZrm_Int	1868
+VADDSSZrmk_Int	1869
+VADDSSZrmkz_Int	1870
+VADDSSZrr	1871
+VADDSSZrr_Int	1872
+VADDSSZrrb_Int	1873
+VADDSSZrrbk_Int	1874
+VADDSSZrrbkz_Int	1875
+VADDSSZrrk_Int	1876
+VADDSSZrrkz_Int	1877
+VADDSSrm	1878
+VADDSSrm_Int	1879
+VADDSSrr	1880
+VADDSSrr_Int	1881
+VADDSUBPDYrm	1882
+VADDSUBPDYrr	1883
+VADDSUBPDrm	1884
+VADDSUBPDrr	1885
+VADDSUBPSYrm	1886
+VADDSUBPSYrr	1887
+VADDSUBPSrm	1888
+VADDSUBPSrr	1889
+VAESDECLASTYrm	1890
+VAESDECLASTYrr	1891
+VAESDECLASTZ	1892
+VAESDECLASTZrm	1893
+VAESDECLASTZrr	1894
+VAESDECLASTrm	1895
+VAESDECLASTrr	1896
+VAESDECYrm	1897
+VAESDECYrr	1898
+VAESDECZ	1899
+VAESDECZrm	1900
+VAESDECZrr	1901
+VAESDECrm	1902
+VAESDECrr	1903
+VAESENCLASTYrm	1904
+VAESENCLASTYrr	1905
+VAESENCLASTZ	1906
+VAESENCLASTZrm	1907
+VAESENCLASTZrr	1908
+VAESENCLASTrm	1909
+VAESENCLASTrr	1910
+VAESENCYrm	1911
+VAESENCYrr	1912
+VAESENCZ	1913
+VAESENCZrm	1914
+VAESENCZrr	1915
+VAESENCrm	1916
+VAESENCrr	1917
+VAESIMCrm	1918
+VAESIMCrr	1919
+VAESKEYGENASSISTrmi	1920
+VAESKEYGENASSISTrri	1921
+VALIGNDZ	1922
+VALIGNDZrmbi	1923
+VALIGNDZrmbik	1924
+VALIGNDZrmbikz	1925
+VALIGNDZrmi	1926
+VALIGNDZrmik	1927
+VALIGNDZrmikz	1928
+VALIGNDZrri	1929
+VALIGNDZrrik	1930
+VALIGNDZrrikz	1931
+VALIGNQZ	1932
+VALIGNQZrmbi	1933
+VALIGNQZrmbik	1934
+VALIGNQZrmbikz	1935
+VALIGNQZrmi	1936
+VALIGNQZrmik	1937
+VALIGNQZrmikz	1938
+VALIGNQZrri	1939
+VALIGNQZrrik	1940
+VALIGNQZrrikz	1941
+VANDNPDYrm	1942
+VANDNPDYrr	1943
+VANDNPDZ	1944
+VANDNPDZrm	1945
+VANDNPDZrmb	1946
+VANDNPDZrmbk	1947
+VANDNPDZrmbkz	1948
+VANDNPDZrmk	1949
+VANDNPDZrmkz	1950
+VANDNPDZrr	1951
+VANDNPDZrrk	1952
+VANDNPDZrrkz	1953
+VANDNPDrm	1954
+VANDNPDrr	1955
+VANDNPSYrm	1956
+VANDNPSYrr	1957
+VANDNPSZ	1958
+VANDNPSZrm	1959
+VANDNPSZrmb	1960
+VANDNPSZrmbk	1961
+VANDNPSZrmbkz	1962
+VANDNPSZrmk	1963
+VANDNPSZrmkz	1964
+VANDNPSZrr	1965
+VANDNPSZrrk	1966
+VANDNPSZrrkz	1967
+VANDNPSrm	1968
+VANDNPSrr	1969
+VANDPDYrm	1970
+VANDPDYrr	1971
+VANDPDZ	1972
+VANDPDZrm	1973
+VANDPDZrmb	1974
+VANDPDZrmbk	1975
+VANDPDZrmbkz	1976
+VANDPDZrmk	1977
+VANDPDZrmkz	1978
+VANDPDZrr	1979
+VANDPDZrrk	1980
+VANDPDZrrkz	1981
+VANDPDrm	1982
+VANDPDrr	1983
+VANDPSYrm	1984
+VANDPSYrr	1985
+VANDPSZ	1986
+VANDPSZrm	1987
+VANDPSZrmb	1988
+VANDPSZrmbk	1989
+VANDPSZrmbkz	1990
+VANDPSZrmk	1991
+VANDPSZrmkz	1992
+VANDPSZrr	1993
+VANDPSZrrk	1994
+VANDPSZrrkz	1995
+VANDPSrm	1996
+VANDPSrr	1997
+VASTART_SAVE_XMM_REGS	1998
+VBCSTNEBF	1999
+VBCSTNESH	2000
+VBLENDMPDZ	2001
+VBLENDMPDZrm	2002
+VBLENDMPDZrmb	2003
+VBLENDMPDZrmbk	2004
+VBLENDMPDZrmbkz	2005
+VBLENDMPDZrmk	2006
+VBLENDMPDZrmkz	2007
+VBLENDMPDZrr	2008
+VBLENDMPDZrrk	2009
+VBLENDMPDZrrkz	2010
+VBLENDMPSZ	2011
+VBLENDMPSZrm	2012
+VBLENDMPSZrmb	2013
+VBLENDMPSZrmbk	2014
+VBLENDMPSZrmbkz	2015
+VBLENDMPSZrmk	2016
+VBLENDMPSZrmkz	2017
+VBLENDMPSZrr	2018
+VBLENDMPSZrrk	2019
+VBLENDMPSZrrkz	2020
+VBLENDPDYrmi	2021
+VBLENDPDYrri	2022
+VBLENDPDrmi	2023
+VBLENDPDrri	2024
+VBLENDPSYrmi	2025
+VBLENDPSYrri	2026
+VBLENDPSrmi	2027
+VBLENDPSrri	2028
+VBLENDVPDYrmr	2029
+VBLENDVPDYrrr	2030
+VBLENDVPDrmr	2031
+VBLENDVPDrrr	2032
+VBLENDVPSYrmr	2033
+VBLENDVPSYrrr	2034
+VBLENDVPSrmr	2035
+VBLENDVPSrrr	2036
+VBROADCASTF	2037
+VBROADCASTI	2038
+VBROADCASTSDYrm	2039
+VBROADCASTSDYrr	2040
+VBROADCASTSDZ	2041
+VBROADCASTSDZrm	2042
+VBROADCASTSDZrmk	2043
+VBROADCASTSDZrmkz	2044
+VBROADCASTSDZrr	2045
+VBROADCASTSDZrrk	2046
+VBROADCASTSDZrrkz	2047
+VBROADCASTSSYrm	2048
+VBROADCASTSSYrr	2049
+VBROADCASTSSZ	2050
+VBROADCASTSSZrm	2051
+VBROADCASTSSZrmk	2052
+VBROADCASTSSZrmkz	2053
+VBROADCASTSSZrr	2054
+VBROADCASTSSZrrk	2055
+VBROADCASTSSZrrkz	2056
+VBROADCASTSSrm	2057
+VBROADCASTSSrr	2058
+VCMPBF	2059
+VCMPPDYrmi	2060
+VCMPPDYrri	2061
+VCMPPDZ	2062
+VCMPPDZrmbi	2063
+VCMPPDZrmbik	2064
+VCMPPDZrmi	2065
+VCMPPDZrmik	2066
+VCMPPDZrri	2067
+VCMPPDZrrib	2068
+VCMPPDZrribk	2069
+VCMPPDZrrik	2070
+VCMPPDrmi	2071
+VCMPPDrri	2072
+VCMPPHZ	2073
+VCMPPHZrmbi	2074
+VCMPPHZrmbik	2075
+VCMPPHZrmi	2076
+VCMPPHZrmik	2077
+VCMPPHZrri	2078
+VCMPPHZrrib	2079
+VCMPPHZrribk	2080
+VCMPPHZrrik	2081
+VCMPPSYrmi	2082
+VCMPPSYrri	2083
+VCMPPSZ	2084
+VCMPPSZrmbi	2085
+VCMPPSZrmbik	2086
+VCMPPSZrmi	2087
+VCMPPSZrmik	2088
+VCMPPSZrri	2089
+VCMPPSZrrib	2090
+VCMPPSZrribk	2091
+VCMPPSZrrik	2092
+VCMPPSrmi	2093
+VCMPPSrri	2094
+VCMPSDZrmi	2095
+VCMPSDZrmi_Int	2096
+VCMPSDZrmik_Int	2097
+VCMPSDZrri	2098
+VCMPSDZrri_Int	2099
+VCMPSDZrrib_Int	2100
+VCMPSDZrribk_Int	2101
+VCMPSDZrrik_Int	2102
+VCMPSDrmi	2103
+VCMPSDrmi_Int	2104
+VCMPSDrri	2105
+VCMPSDrri_Int	2106
+VCMPSHZrmi	2107
+VCMPSHZrmi_Int	2108
+VCMPSHZrmik_Int	2109
+VCMPSHZrri	2110
+VCMPSHZrri_Int	2111
+VCMPSHZrrib_Int	2112
+VCMPSHZrribk_Int	2113
+VCMPSHZrrik_Int	2114
+VCMPSSZrmi	2115
+VCMPSSZrmi_Int	2116
+VCMPSSZrmik_Int	2117
+VCMPSSZrri	2118
+VCMPSSZrri_Int	2119
+VCMPSSZrrib_Int	2120
+VCMPSSZrribk_Int	2121
+VCMPSSZrrik_Int	2122
+VCMPSSrmi	2123
+VCMPSSrmi_Int	2124
+VCMPSSrri	2125
+VCMPSSrri_Int	2126
+VCOMISBF	2127
+VCOMISDZrm	2128
+VCOMISDZrm_Int	2129
+VCOMISDZrr	2130
+VCOMISDZrr_Int	2131
+VCOMISDZrrb	2132
+VCOMISDrm	2133
+VCOMISDrm_Int	2134
+VCOMISDrr	2135
+VCOMISDrr_Int	2136
+VCOMISHZrm	2137
+VCOMISHZrm_Int	2138
+VCOMISHZrr	2139
+VCOMISHZrr_Int	2140
+VCOMISHZrrb	2141
+VCOMISSZrm	2142
+VCOMISSZrm_Int	2143
+VCOMISSZrr	2144
+VCOMISSZrr_Int	2145
+VCOMISSZrrb	2146
+VCOMISSrm	2147
+VCOMISSrm_Int	2148
+VCOMISSrr	2149
+VCOMISSrr_Int	2150
+VCOMPRESSPDZ	2151
+VCOMPRESSPDZmr	2152
+VCOMPRESSPDZmrk	2153
+VCOMPRESSPDZrr	2154
+VCOMPRESSPDZrrk	2155
+VCOMPRESSPDZrrkz	2156
+VCOMPRESSPSZ	2157
+VCOMPRESSPSZmr	2158
+VCOMPRESSPSZmrk	2159
+VCOMPRESSPSZrr	2160
+VCOMPRESSPSZrrk	2161
+VCOMPRESSPSZrrkz	2162
+VCOMXSDZrm_Int	2163
+VCOMXSDZrr_Int	2164
+VCOMXSDZrrb_Int	2165
+VCOMXSHZrm_Int	2166
+VCOMXSHZrr_Int	2167
+VCOMXSHZrrb_Int	2168
+VCOMXSSZrm_Int	2169
+VCOMXSSZrr_Int	2170
+VCOMXSSZrrb_Int	2171
+VCVT	2172
+VCVTBF	2173
+VCVTBIASPH	2174
+VCVTDQ	2175
+VCVTHF	2176
+VCVTNE	2177
+VCVTNEEBF	2178
+VCVTNEEPH	2179
+VCVTNEOBF	2180
+VCVTNEOPH	2181
+VCVTNEPS	2182
+VCVTPD	2183
+VCVTPH	2184
+VCVTPS	2185
+VCVTQQ	2186
+VCVTSD	2187
+VCVTSH	2188
+VCVTSI	2189
+VCVTSS	2190
+VCVTTBF	2191
+VCVTTPD	2192
+VCVTTPH	2193
+VCVTTPS	2194
+VCVTTSD	2195
+VCVTTSH	2196
+VCVTTSS	2197
+VCVTUDQ	2198
+VCVTUQQ	2199
+VCVTUSI	2200
+VCVTUW	2201
+VCVTW	2202
+VDBPSADBWZ	2203
+VDBPSADBWZrmi	2204
+VDBPSADBWZrmik	2205
+VDBPSADBWZrmikz	2206
+VDBPSADBWZrri	2207
+VDBPSADBWZrrik	2208
+VDBPSADBWZrrikz	2209
+VDIVBF	2210
+VDIVPDYrm	2211
+VDIVPDYrr	2212
+VDIVPDZ	2213
+VDIVPDZrm	2214
+VDIVPDZrmb	2215
+VDIVPDZrmbk	2216
+VDIVPDZrmbkz	2217
+VDIVPDZrmk	2218
+VDIVPDZrmkz	2219
+VDIVPDZrr	2220
+VDIVPDZrrb	2221
+VDIVPDZrrbk	2222
+VDIVPDZrrbkz	2223
+VDIVPDZrrk	2224
+VDIVPDZrrkz	2225
+VDIVPDrm	2226
+VDIVPDrr	2227
+VDIVPHZ	2228
+VDIVPHZrm	2229
+VDIVPHZrmb	2230
+VDIVPHZrmbk	2231
+VDIVPHZrmbkz	2232
+VDIVPHZrmk	2233
+VDIVPHZrmkz	2234
+VDIVPHZrr	2235
+VDIVPHZrrb	2236
+VDIVPHZrrbk	2237
+VDIVPHZrrbkz	2238
+VDIVPHZrrk	2239
+VDIVPHZrrkz	2240
+VDIVPSYrm	2241
+VDIVPSYrr	2242
+VDIVPSZ	2243
+VDIVPSZrm	2244
+VDIVPSZrmb	2245
+VDIVPSZrmbk	2246
+VDIVPSZrmbkz	2247
+VDIVPSZrmk	2248
+VDIVPSZrmkz	2249
+VDIVPSZrr	2250
+VDIVPSZrrb	2251
+VDIVPSZrrbk	2252
+VDIVPSZrrbkz	2253
+VDIVPSZrrk	2254
+VDIVPSZrrkz	2255
+VDIVPSrm	2256
+VDIVPSrr	2257
+VDIVSDZrm	2258
+VDIVSDZrm_Int	2259
+VDIVSDZrmk_Int	2260
+VDIVSDZrmkz_Int	2261
+VDIVSDZrr	2262
+VDIVSDZrr_Int	2263
+VDIVSDZrrb_Int	2264
+VDIVSDZrrbk_Int	2265
+VDIVSDZrrbkz_Int	2266
+VDIVSDZrrk_Int	2267
+VDIVSDZrrkz_Int	2268
+VDIVSDrm	2269
+VDIVSDrm_Int	2270
+VDIVSDrr	2271
+VDIVSDrr_Int	2272
+VDIVSHZrm	2273
+VDIVSHZrm_Int	2274
+VDIVSHZrmk_Int	2275
+VDIVSHZrmkz_Int	2276
+VDIVSHZrr	2277
+VDIVSHZrr_Int	2278
+VDIVSHZrrb_Int	2279
+VDIVSHZrrbk_Int	2280
+VDIVSHZrrbkz_Int	2281
+VDIVSHZrrk_Int	2282
+VDIVSHZrrkz_Int	2283
+VDIVSSZrm	2284
+VDIVSSZrm_Int	2285
+VDIVSSZrmk_Int	2286
+VDIVSSZrmkz_Int	2287
+VDIVSSZrr	2288
+VDIVSSZrr_Int	2289
+VDIVSSZrrb_Int	2290
+VDIVSSZrrbk_Int	2291
+VDIVSSZrrbkz_Int	2292
+VDIVSSZrrk_Int	2293
+VDIVSSZrrkz_Int	2294
+VDIVSSrm	2295
+VDIVSSrm_Int	2296
+VDIVSSrr	2297
+VDIVSSrr_Int	2298
+VDPBF	2299
+VDPPDrmi	2300
+VDPPDrri	2301
+VDPPHPSZ	2302
+VDPPHPSZm	2303
+VDPPHPSZmb	2304
+VDPPHPSZmbk	2305
+VDPPHPSZmbkz	2306
+VDPPHPSZmk	2307
+VDPPHPSZmkz	2308
+VDPPHPSZr	2309
+VDPPHPSZrk	2310
+VDPPHPSZrkz	2311
+VDPPSYrmi	2312
+VDPPSYrri	2313
+VDPPSrmi	2314
+VDPPSrri	2315
+VERRm	2316
+VERRr	2317
+VERWm	2318
+VERWr	2319
+VEXP	2320
+VEXPANDPDZ	2321
+VEXPANDPDZrm	2322
+VEXPANDPDZrmk	2323
+VEXPANDPDZrmkz	2324
+VEXPANDPDZrr	2325
+VEXPANDPDZrrk	2326
+VEXPANDPDZrrkz	2327
+VEXPANDPSZ	2328
+VEXPANDPSZrm	2329
+VEXPANDPSZrmk	2330
+VEXPANDPSZrmkz	2331
+VEXPANDPSZrr	2332
+VEXPANDPSZrrk	2333
+VEXPANDPSZrrkz	2334
+VEXTRACTF	2335
+VEXTRACTI	2336
+VEXTRACTPSZmri	2337
+VEXTRACTPSZrri	2338
+VEXTRACTPSmri	2339
+VEXTRACTPSrri	2340
+VFCMADDCPHZ	2341
+VFCMADDCPHZm	2342
+VFCMADDCPHZmb	2343
+VFCMADDCPHZmbk	2344
+VFCMADDCPHZmbkz	2345
+VFCMADDCPHZmk	2346
+VFCMADDCPHZmkz	2347
+VFCMADDCPHZr	2348
+VFCMADDCPHZrb	2349
+VFCMADDCPHZrbk	2350
+VFCMADDCPHZrbkz	2351
+VFCMADDCPHZrk	2352
+VFCMADDCPHZrkz	2353
+VFCMADDCSHZm	2354
+VFCMADDCSHZmk	2355
+VFCMADDCSHZmkz	2356
+VFCMADDCSHZr	2357
+VFCMADDCSHZrb	2358
+VFCMADDCSHZrbk	2359
+VFCMADDCSHZrbkz	2360
+VFCMADDCSHZrk	2361
+VFCMADDCSHZrkz	2362
+VFCMULCPHZ	2363
+VFCMULCPHZrm	2364
+VFCMULCPHZrmb	2365
+VFCMULCPHZrmbk	2366
+VFCMULCPHZrmbkz	2367
+VFCMULCPHZrmk	2368
+VFCMULCPHZrmkz	2369
+VFCMULCPHZrr	2370
+VFCMULCPHZrrb	2371
+VFCMULCPHZrrbk	2372
+VFCMULCPHZrrbkz	2373
+VFCMULCPHZrrk	2374
+VFCMULCPHZrrkz	2375
+VFCMULCSHZrm	2376
+VFCMULCSHZrmk	2377
+VFCMULCSHZrmkz	2378
+VFCMULCSHZrr	2379
+VFCMULCSHZrrb	2380
+VFCMULCSHZrrbk	2381
+VFCMULCSHZrrbkz	2382
+VFCMULCSHZrrk	2383
+VFCMULCSHZrrkz	2384
+VFIXUPIMMPDZ	2385
+VFIXUPIMMPDZrmbi	2386
+VFIXUPIMMPDZrmbik	2387
+VFIXUPIMMPDZrmbikz	2388
+VFIXUPIMMPDZrmi	2389
+VFIXUPIMMPDZrmik	2390
+VFIXUPIMMPDZrmikz	2391
+VFIXUPIMMPDZrri	2392
+VFIXUPIMMPDZrrib	2393
+VFIXUPIMMPDZrribk	2394
+VFIXUPIMMPDZrribkz	2395
+VFIXUPIMMPDZrrik	2396
+VFIXUPIMMPDZrrikz	2397
+VFIXUPIMMPSZ	2398
+VFIXUPIMMPSZrmbi	2399
+VFIXUPIMMPSZrmbik	2400
+VFIXUPIMMPSZrmbikz	2401
+VFIXUPIMMPSZrmi	2402
+VFIXUPIMMPSZrmik	2403
+VFIXUPIMMPSZrmikz	2404
+VFIXUPIMMPSZrri	2405
+VFIXUPIMMPSZrrib	2406
+VFIXUPIMMPSZrribk	2407
+VFIXUPIMMPSZrribkz	2408
+VFIXUPIMMPSZrrik	2409
+VFIXUPIMMPSZrrikz	2410
+VFIXUPIMMSDZrmi	2411
+VFIXUPIMMSDZrmik	2412
+VFIXUPIMMSDZrmikz	2413
+VFIXUPIMMSDZrri	2414
+VFIXUPIMMSDZrrib	2415
+VFIXUPIMMSDZrribk	2416
+VFIXUPIMMSDZrribkz	2417
+VFIXUPIMMSDZrrik	2418
+VFIXUPIMMSDZrrikz	2419
+VFIXUPIMMSSZrmi	2420
+VFIXUPIMMSSZrmik	2421
+VFIXUPIMMSSZrmikz	2422
+VFIXUPIMMSSZrri	2423
+VFIXUPIMMSSZrrib	2424
+VFIXUPIMMSSZrribk	2425
+VFIXUPIMMSSZrribkz	2426
+VFIXUPIMMSSZrrik	2427
+VFIXUPIMMSSZrrikz	2428
+VFMADD	2429
+VFMADDCPHZ	2430
+VFMADDCPHZm	2431
+VFMADDCPHZmb	2432
+VFMADDCPHZmbk	2433
+VFMADDCPHZmbkz	2434
+VFMADDCPHZmk	2435
+VFMADDCPHZmkz	2436
+VFMADDCPHZr	2437
+VFMADDCPHZrb	2438
+VFMADDCPHZrbk	2439
+VFMADDCPHZrbkz	2440
+VFMADDCPHZrk	2441
+VFMADDCPHZrkz	2442
+VFMADDCSHZm	2443
+VFMADDCSHZmk	2444
+VFMADDCSHZmkz	2445
+VFMADDCSHZr	2446
+VFMADDCSHZrb	2447
+VFMADDCSHZrbk	2448
+VFMADDCSHZrbkz	2449
+VFMADDCSHZrk	2450
+VFMADDCSHZrkz	2451
+VFMADDPD	2452
+VFMADDPS	2453
+VFMADDSD	2454
+VFMADDSS	2455
+VFMADDSUB	2456
+VFMADDSUBPD	2457
+VFMADDSUBPS	2458
+VFMSUB	2459
+VFMSUBADD	2460
+VFMSUBADDPD	2461
+VFMSUBADDPS	2462
+VFMSUBPD	2463
+VFMSUBPS	2464
+VFMSUBSD	2465
+VFMSUBSS	2466
+VFMULCPHZ	2467
+VFMULCPHZrm	2468
+VFMULCPHZrmb	2469
+VFMULCPHZrmbk	2470
+VFMULCPHZrmbkz	2471
+VFMULCPHZrmk	2472
+VFMULCPHZrmkz	2473
+VFMULCPHZrr	2474
+VFMULCPHZrrb	2475
+VFMULCPHZrrbk	2476
+VFMULCPHZrrbkz	2477
+VFMULCPHZrrk	2478
+VFMULCPHZrrkz	2479
+VFMULCSHZrm	2480
+VFMULCSHZrmk	2481
+VFMULCSHZrmkz	2482
+VFMULCSHZrr	2483
+VFMULCSHZrrb	2484
+VFMULCSHZrrbk	2485
+VFMULCSHZrrbkz	2486
+VFMULCSHZrrk	2487
+VFMULCSHZrrkz	2488
+VFNMADD	2489
+VFNMADDPD	2490
+VFNMADDPS	2491
+VFNMADDSD	2492
+VFNMADDSS	2493
+VFNMSUB	2494
+VFNMSUBPD	2495
+VFNMSUBPS	2496
+VFNMSUBSD	2497
+VFNMSUBSS	2498
+VFPCLASSBF	2499
+VFPCLASSPDZ	2500
+VFPCLASSPDZmbi	2501
+VFPCLASSPDZmbik	2502
+VFPCLASSPDZmi	2503
+VFPCLASSPDZmik	2504
+VFPCLASSPDZri	2505
+VFPCLASSPDZrik	2506
+VFPCLASSPHZ	2507
+VFPCLASSPHZmbi	2508
+VFPCLASSPHZmbik	2509
+VFPCLASSPHZmi	2510
+VFPCLASSPHZmik	2511
+VFPCLASSPHZri	2512
+VFPCLASSPHZrik	2513
+VFPCLASSPSZ	2514
+VFPCLASSPSZmbi	2515
+VFPCLASSPSZmbik	2516
+VFPCLASSPSZmi	2517
+VFPCLASSPSZmik	2518
+VFPCLASSPSZri	2519
+VFPCLASSPSZrik	2520
+VFPCLASSSDZmi	2521
+VFPCLASSSDZmik	2522
+VFPCLASSSDZri	2523
+VFPCLASSSDZrik	2524
+VFPCLASSSHZmi	2525
+VFPCLASSSHZmik	2526
+VFPCLASSSHZri	2527
+VFPCLASSSHZrik	2528
+VFPCLASSSSZmi	2529
+VFPCLASSSSZmik	2530
+VFPCLASSSSZri	2531
+VFPCLASSSSZrik	2532
+VFRCZPDYrm	2533
+VFRCZPDYrr	2534
+VFRCZPDrm	2535
+VFRCZPDrr	2536
+VFRCZPSYrm	2537
+VFRCZPSYrr	2538
+VFRCZPSrm	2539
+VFRCZPSrr	2540
+VFRCZSDrm	2541
+VFRCZSDrr	2542
+VFRCZSSrm	2543
+VFRCZSSrr	2544
+VGATHERDPDYrm	2545
+VGATHERDPDZ	2546
+VGATHERDPDZrm	2547
+VGATHERDPDrm	2548
+VGATHERDPSYrm	2549
+VGATHERDPSZ	2550
+VGATHERDPSZrm	2551
+VGATHERDPSrm	2552
+VGATHERPF	2553
+VGATHERQPDYrm	2554
+VGATHERQPDZ	2555
+VGATHERQPDZrm	2556
+VGATHERQPDrm	2557
+VGATHERQPSYrm	2558
+VGATHERQPSZ	2559
+VGATHERQPSZrm	2560
+VGATHERQPSrm	2561
+VGETEXPBF	2562
+VGETEXPPDZ	2563
+VGETEXPPDZm	2564
+VGETEXPPDZmb	2565
+VGETEXPPDZmbk	2566
+VGETEXPPDZmbkz	2567
+VGETEXPPDZmk	2568
+VGETEXPPDZmkz	2569
+VGETEXPPDZr	2570
+VGETEXPPDZrb	2571
+VGETEXPPDZrbk	2572
+VGETEXPPDZrbkz	2573
+VGETEXPPDZrk	2574
+VGETEXPPDZrkz	2575
+VGETEXPPHZ	2576
+VGETEXPPHZm	2577
+VGETEXPPHZmb	2578
+VGETEXPPHZmbk	2579
+VGETEXPPHZmbkz	2580
+VGETEXPPHZmk	2581
+VGETEXPPHZmkz	2582
+VGETEXPPHZr	2583
+VGETEXPPHZrb	2584
+VGETEXPPHZrbk	2585
+VGETEXPPHZrbkz	2586
+VGETEXPPHZrk	2587
+VGETEXPPHZrkz	2588
+VGETEXPPSZ	2589
+VGETEXPPSZm	2590
+VGETEXPPSZmb	2591
+VGETEXPPSZmbk	2592
+VGETEXPPSZmbkz	2593
+VGETEXPPSZmk	2594
+VGETEXPPSZmkz	2595
+VGETEXPPSZr	2596
+VGETEXPPSZrb	2597
+VGETEXPPSZrbk	2598
+VGETEXPPSZrbkz	2599
+VGETEXPPSZrk	2600
+VGETEXPPSZrkz	2601
+VGETEXPSDZm	2602
+VGETEXPSDZmk	2603
+VGETEXPSDZmkz	2604
+VGETEXPSDZr	2605
+VGETEXPSDZrb	2606
+VGETEXPSDZrbk	2607
+VGETEXPSDZrbkz	2608
+VGETEXPSDZrk	2609
+VGETEXPSDZrkz	2610
+VGETEXPSHZm	2611
+VGETEXPSHZmk	2612
+VGETEXPSHZmkz	2613
+VGETEXPSHZr	2614
+VGETEXPSHZrb	2615
+VGETEXPSHZrbk	2616
+VGETEXPSHZrbkz	2617
+VGETEXPSHZrk	2618
+VGETEXPSHZrkz	2619
+VGETEXPSSZm	2620
+VGETEXPSSZmk	2621
+VGETEXPSSZmkz	2622
+VGETEXPSSZr	2623
+VGETEXPSSZrb	2624
+VGETEXPSSZrbk	2625
+VGETEXPSSZrbkz	2626
+VGETEXPSSZrk	2627
+VGETEXPSSZrkz	2628
+VGETMANTBF	2629
+VGETMANTPDZ	2630
+VGETMANTPDZrmbi	2631
+VGETMANTPDZrmbik	2632
+VGETMANTPDZrmbikz	2633
+VGETMANTPDZrmi	2634
+VGETMANTPDZrmik	2635
+VGETMANTPDZrmikz	2636
+VGETMANTPDZrri	2637
+VGETMANTPDZrrib	2638
+VGETMANTPDZrribk	2639
+VGETMANTPDZrribkz	2640
+VGETMANTPDZrrik	2641
+VGETMANTPDZrrikz	2642
+VGETMANTPHZ	2643
+VGETMANTPHZrmbi	2644
+VGETMANTPHZrmbik	2645
+VGETMANTPHZrmbikz	2646
+VGETMANTPHZrmi	2647
+VGETMANTPHZrmik	2648
+VGETMANTPHZrmikz	2649
+VGETMANTPHZrri	2650
+VGETMANTPHZrrib	2651
+VGETMANTPHZrribk	2652
+VGETMANTPHZrribkz	2653
+VGETMANTPHZrrik	2654
+VGETMANTPHZrrikz	2655
+VGETMANTPSZ	2656
+VGETMANTPSZrmbi	2657
+VGETMANTPSZrmbik	2658
+VGETMANTPSZrmbikz	2659
+VGETMANTPSZrmi	2660
+VGETMANTPSZrmik	2661
+VGETMANTPSZrmikz	2662
+VGETMANTPSZrri	2663
+VGETMANTPSZrrib	2664
+VGETMANTPSZrribk	2665
+VGETMANTPSZrribkz	2666
+VGETMANTPSZrrik	2667
+VGETMANTPSZrrikz	2668
+VGETMANTSDZrmi	2669
+VGETMANTSDZrmik	2670
+VGETMANTSDZrmikz	2671
+VGETMANTSDZrri	2672
+VGETMANTSDZrrib	2673
+VGETMANTSDZrribk	2674
+VGETMANTSDZrribkz	2675
+VGETMANTSDZrrik	2676
+VGETMANTSDZrrikz	2677
+VGETMANTSHZrmi	2678
+VGETMANTSHZrmik	2679
+VGETMANTSHZrmikz	2680
+VGETMANTSHZrri	2681
+VGETMANTSHZrrib	2682
+VGETMANTSHZrribk	2683
+VGETMANTSHZrribkz	2684
+VGETMANTSHZrrik	2685
+VGETMANTSHZrrikz	2686
+VGETMANTSSZrmi	2687
+VGETMANTSSZrmik	2688
+VGETMANTSSZrmikz	2689
+VGETMANTSSZrri	2690
+VGETMANTSSZrrib	2691
+VGETMANTSSZrribk	2692
+VGETMANTSSZrribkz	2693
+VGETMANTSSZrrik	2694
+VGETMANTSSZrrikz	2695
+VGF	2696
+VHADDPDYrm	2697
+VHADDPDYrr	2698
+VHADDPDrm	2699
+VHADDPDrr	2700
+VHADDPSYrm	2701
+VHADDPSYrr	2702
+VHADDPSrm	2703
+VHADDPSrr	2704
+VHSUBPDYrm	2705
+VHSUBPDYrr	2706
+VHSUBPDrm	2707
+VHSUBPDrr	2708
+VHSUBPSYrm	2709
+VHSUBPSYrr	2710
+VHSUBPSrm	2711
+VHSUBPSrr	2712
+VINSERTF	2713
+VINSERTI	2714
+VINSERTPSZrmi	2715
+VINSERTPSZrri	2716
+VINSERTPSrmi	2717
+VINSERTPSrri	2718
+VLDDQUYrm	2719
+VLDDQUrm	2720
+VLDMXCSR	2721
+VMASKMOVDQU	2722
+VMASKMOVPDYmr	2723
+VMASKMOVPDYrm	2724
+VMASKMOVPDmr	2725
+VMASKMOVPDrm	2726
+VMASKMOVPSYmr	2727
+VMASKMOVPSYrm	2728
+VMASKMOVPSmr	2729
+VMASKMOVPSrm	2730
+VMAXBF	2731
+VMAXCPDYrm	2732
+VMAXCPDYrr	2733
+VMAXCPDZ	2734
+VMAXCPDZrm	2735
+VMAXCPDZrmb	2736
+VMAXCPDZrmbk	2737
+VMAXCPDZrmbkz	2738
+VMAXCPDZrmk	2739
+VMAXCPDZrmkz	2740
+VMAXCPDZrr	2741
+VMAXCPDZrrk	2742
+VMAXCPDZrrkz	2743
+VMAXCPDrm	2744
+VMAXCPDrr	2745
+VMAXCPHZ	2746
+VMAXCPHZrm	2747
+VMAXCPHZrmb	2748
+VMAXCPHZrmbk	2749
+VMAXCPHZrmbkz	2750
+VMAXCPHZrmk	2751
+VMAXCPHZrmkz	2752
+VMAXCPHZrr	2753
+VMAXCPHZrrk	2754
+VMAXCPHZrrkz	2755
+VMAXCPSYrm	2756
+VMAXCPSYrr	2757
+VMAXCPSZ	2758
+VMAXCPSZrm	2759
+VMAXCPSZrmb	2760
+VMAXCPSZrmbk	2761
+VMAXCPSZrmbkz	2762
+VMAXCPSZrmk	2763
+VMAXCPSZrmkz	2764
+VMAXCPSZrr	2765
+VMAXCPSZrrk	2766
+VMAXCPSZrrkz	2767
+VMAXCPSrm	2768
+VMAXCPSrr	2769
+VMAXCSDZrm	2770
+VMAXCSDZrr	2771
+VMAXCSDrm	2772
+VMAXCSDrr	2773
+VMAXCSHZrm	2774
+VMAXCSHZrr	2775
+VMAXCSSZrm	2776
+VMAXCSSZrr	2777
+VMAXCSSrm	2778
+VMAXCSSrr	2779
+VMAXPDYrm	2780
+VMAXPDYrr	2781
+VMAXPDZ	2782
+VMAXPDZrm	2783
+VMAXPDZrmb	2784
+VMAXPDZrmbk	2785
+VMAXPDZrmbkz	2786
+VMAXPDZrmk	2787
+VMAXPDZrmkz	2788
+VMAXPDZrr	2789
+VMAXPDZrrb	2790
+VMAXPDZrrbk	2791
+VMAXPDZrrbkz	2792
+VMAXPDZrrk	2793
+VMAXPDZrrkz	2794
+VMAXPDrm	2795
+VMAXPDrr	2796
+VMAXPHZ	2797
+VMAXPHZrm	2798
+VMAXPHZrmb	2799
+VMAXPHZrmbk	2800
+VMAXPHZrmbkz	2801
+VMAXPHZrmk	2802
+VMAXPHZrmkz	2803
+VMAXPHZrr	2804
+VMAXPHZrrb	2805
+VMAXPHZrrbk	2806
+VMAXPHZrrbkz	2807
+VMAXPHZrrk	2808
+VMAXPHZrrkz	2809
+VMAXPSYrm	2810
+VMAXPSYrr	2811
+VMAXPSZ	2812
+VMAXPSZrm	2813
+VMAXPSZrmb	2814
+VMAXPSZrmbk	2815
+VMAXPSZrmbkz	2816
+VMAXPSZrmk	2817
+VMAXPSZrmkz	2818
+VMAXPSZrr	2819
+VMAXPSZrrb	2820
+VMAXPSZrrbk	2821
+VMAXPSZrrbkz	2822
+VMAXPSZrrk	2823
+VMAXPSZrrkz	2824
+VMAXPSrm	2825
+VMAXPSrr	2826
+VMAXSDZrm	2827
+VMAXSDZrm_Int	2828
+VMAXSDZrmk_Int	2829
+VMAXSDZrmkz_Int	2830
+VMAXSDZrr	2831
+VMAXSDZrr_Int	2832
+VMAXSDZrrb_Int	2833
+VMAXSDZrrbk_Int	2834
+VMAXSDZrrbkz_Int	2835
+VMAXSDZrrk_Int	2836
+VMAXSDZrrkz_Int	2837
+VMAXSDrm	2838
+VMAXSDrm_Int	2839
+VMAXSDrr	2840
+VMAXSDrr_Int	2841
+VMAXSHZrm	2842
+VMAXSHZrm_Int	2843
+VMAXSHZrmk_Int	2844
+VMAXSHZrmkz_Int	2845
+VMAXSHZrr	2846
+VMAXSHZrr_Int	2847
+VMAXSHZrrb_Int	2848
+VMAXSHZrrbk_Int	2849
+VMAXSHZrrbkz_Int	2850
+VMAXSHZrrk_Int	2851
+VMAXSHZrrkz_Int	2852
+VMAXSSZrm	2853
+VMAXSSZrm_Int	2854
+VMAXSSZrmk_Int	2855
+VMAXSSZrmkz_Int	2856
+VMAXSSZrr	2857
+VMAXSSZrr_Int	2858
+VMAXSSZrrb_Int	2859
+VMAXSSZrrbk_Int	2860
+VMAXSSZrrbkz_Int	2861
+VMAXSSZrrk_Int	2862
+VMAXSSZrrkz_Int	2863
+VMAXSSrm	2864
+VMAXSSrm_Int	2865
+VMAXSSrr	2866
+VMAXSSrr_Int	2867
+VMCALL	2868
+VMCLEARm	2869
+VMFUNC	2870
+VMINBF	2871
+VMINCPDYrm	2872
+VMINCPDYrr	2873
+VMINCPDZ	2874
+VMINCPDZrm	2875
+VMINCPDZrmb	2876
+VMINCPDZrmbk	2877
+VMINCPDZrmbkz	2878
+VMINCPDZrmk	2879
+VMINCPDZrmkz	2880
+VMINCPDZrr	2881
+VMINCPDZrrk	2882
+VMINCPDZrrkz	2883
+VMINCPDrm	2884
+VMINCPDrr	2885
+VMINCPHZ	2886
+VMINCPHZrm	2887
+VMINCPHZrmb	2888
+VMINCPHZrmbk	2889
+VMINCPHZrmbkz	2890
+VMINCPHZrmk	2891
+VMINCPHZrmkz	2892
+VMINCPHZrr	2893
+VMINCPHZrrk	2894
+VMINCPHZrrkz	2895
+VMINCPSYrm	2896
+VMINCPSYrr	2897
+VMINCPSZ	2898
+VMINCPSZrm	2899
+VMINCPSZrmb	2900
+VMINCPSZrmbk	2901
+VMINCPSZrmbkz	2902
+VMINCPSZrmk	2903
+VMINCPSZrmkz	2904
+VMINCPSZrr	2905
+VMINCPSZrrk	2906
+VMINCPSZrrkz	2907
+VMINCPSrm	2908
+VMINCPSrr	2909
+VMINCSDZrm	2910
+VMINCSDZrr	2911
+VMINCSDrm	2912
+VMINCSDrr	2913
+VMINCSHZrm	2914
+VMINCSHZrr	2915
+VMINCSSZrm	2916
+VMINCSSZrr	2917
+VMINCSSrm	2918
+VMINCSSrr	2919
+VMINMAXBF	2920
+VMINMAXPDZ	2921
+VMINMAXPDZrmbi	2922
+VMINMAXPDZrmbik	2923
+VMINMAXPDZrmbikz	2924
+VMINMAXPDZrmi	2925
+VMINMAXPDZrmik	2926
+VMINMAXPDZrmikz	2927
+VMINMAXPDZrri	2928
+VMINMAXPDZrrib	2929
+VMINMAXPDZrribk	2930
+VMINMAXPDZrribkz	2931
+VMINMAXPDZrrik	2932
+VMINMAXPDZrrikz	2933
+VMINMAXPHZ	2934
+VMINMAXPHZrmbi	2935
+VMINMAXPHZrmbik	2936
+VMINMAXPHZrmbikz	2937
+VMINMAXPHZrmi	2938
+VMINMAXPHZrmik	2939
+VMINMAXPHZrmikz	2940
+VMINMAXPHZrri	2941
+VMINMAXPHZrrib	2942
+VMINMAXPHZrribk	2943
+VMINMAXPHZrribkz	2944
+VMINMAXPHZrrik	2945
+VMINMAXPHZrrikz	2946
+VMINMAXPSZ	2947
+VMINMAXPSZrmbi	2948
+VMINMAXPSZrmbik	2949
+VMINMAXPSZrmbikz	2950
+VMINMAXPSZrmi	2951
+VMINMAXPSZrmik	2952
+VMINMAXPSZrmikz	2953
+VMINMAXPSZrri	2954
+VMINMAXPSZrrib	2955
+VMINMAXPSZrribk	2956
+VMINMAXPSZrribkz	2957
+VMINMAXPSZrrik	2958
+VMINMAXPSZrrikz	2959
+VMINMAXSDrmi	2960
+VMINMAXSDrmi_Int	2961
+VMINMAXSDrmik_Int	2962
+VMINMAXSDrmikz_Int	2963
+VMINMAXSDrri	2964
+VMINMAXSDrri_Int	2965
+VMINMAXSDrrib_Int	2966
+VMINMAXSDrribk_Int	2967
+VMINMAXSDrribkz_Int	2968
+VMINMAXSDrrik_Int	2969
+VMINMAXSDrrikz_Int	2970
+VMINMAXSHrmi	2971
+VMINMAXSHrmi_Int	2972
+VMINMAXSHrmik_Int	2973
+VMINMAXSHrmikz_Int	2974
+VMINMAXSHrri	2975
+VMINMAXSHrri_Int	2976
+VMINMAXSHrrib_Int	2977
+VMINMAXSHrribk_Int	2978
+VMINMAXSHrribkz_Int	2979
+VMINMAXSHrrik_Int	2980
+VMINMAXSHrrikz_Int	2981
+VMINMAXSSrmi	2982
+VMINMAXSSrmi_Int	2983
+VMINMAXSSrmik_Int	2984
+VMINMAXSSrmikz_Int	2985
+VMINMAXSSrri	2986
+VMINMAXSSrri_Int	2987
+VMINMAXSSrrib_Int	2988
+VMINMAXSSrribk_Int	2989
+VMINMAXSSrribkz_Int	2990
+VMINMAXSSrrik_Int	2991
+VMINMAXSSrrikz_Int	2992
+VMINPDYrm	2993
+VMINPDYrr	2994
+VMINPDZ	2995
+VMINPDZrm	2996
+VMINPDZrmb	2997
+VMINPDZrmbk	2998
+VMINPDZrmbkz	2999
+VMINPDZrmk	3000
+VMINPDZrmkz	3001
+VMINPDZrr	3002
+VMINPDZrrb	3003
+VMINPDZrrbk	3004
+VMINPDZrrbkz	3005
+VMINPDZrrk	3006
+VMINPDZrrkz	3007
+VMINPDrm	3008
+VMINPDrr	3009
+VMINPHZ	3010
+VMINPHZrm	3011
+VMINPHZrmb	3012
+VMINPHZrmbk	3013
+VMINPHZrmbkz	3014
+VMINPHZrmk	3015
+VMINPHZrmkz	3016
+VMINPHZrr	3017
+VMINPHZrrb	3018
+VMINPHZrrbk	3019
+VMINPHZrrbkz	3020
+VMINPHZrrk	3021
+VMINPHZrrkz	3022
+VMINPSYrm	3023
+VMINPSYrr	3024
+VMINPSZ	3025
+VMINPSZrm	3026
+VMINPSZrmb	3027
+VMINPSZrmbk	3028
+VMINPSZrmbkz	3029
+VMINPSZrmk	3030
+VMINPSZrmkz	3031
+VMINPSZrr	3032
+VMINPSZrrb	3033
+VMINPSZrrbk	3034
+VMINPSZrrbkz	3035
+VMINPSZrrk	3036
+VMINPSZrrkz	3037
+VMINPSrm	3038
+VMINPSrr	3039
+VMINSDZrm	3040
+VMINSDZrm_Int	3041
+VMINSDZrmk_Int	3042
+VMINSDZrmkz_Int	3043
+VMINSDZrr	3044
+VMINSDZrr_Int	3045
+VMINSDZrrb_Int	3046
+VMINSDZrrbk_Int	3047
+VMINSDZrrbkz_Int	3048
+VMINSDZrrk_Int	3049
+VMINSDZrrkz_Int	3050
+VMINSDrm	3051
+VMINSDrm_Int	3052
+VMINSDrr	3053
+VMINSDrr_Int	3054
+VMINSHZrm	3055
+VMINSHZrm_Int	3056
+VMINSHZrmk_Int	3057
+VMINSHZrmkz_Int	3058
+VMINSHZrr	3059
+VMINSHZrr_Int	3060
+VMINSHZrrb_Int	3061
+VMINSHZrrbk_Int	3062
+VMINSHZrrbkz_Int	3063
+VMINSHZrrk_Int	3064
+VMINSHZrrkz_Int	3065
+VMINSSZrm	3066
+VMINSSZrm_Int	3067
+VMINSSZrmk_Int	3068
+VMINSSZrmkz_Int	3069
+VMINSSZrr	3070
+VMINSSZrr_Int	3071
+VMINSSZrrb_Int	3072
+VMINSSZrrbk_Int	3073
+VMINSSZrrbkz_Int	3074
+VMINSSZrrk_Int	3075
+VMINSSZrrkz_Int	3076
+VMINSSrm	3077
+VMINSSrm_Int	3078
+VMINSSrr	3079
+VMINSSrr_Int	3080
+VMLAUNCH	3081
+VMLOAD	3082
+VMMCALL	3083
+VMOV	3084
+VMOVAPDYmr	3085
+VMOVAPDYrm	3086
+VMOVAPDYrr	3087
+VMOVAPDYrr_REV	3088
+VMOVAPDZ	3089
+VMOVAPDZmr	3090
+VMOVAPDZmrk	3091
+VMOVAPDZrm	3092
+VMOVAPDZrmk	3093
+VMOVAPDZrmkz	3094
+VMOVAPDZrr	3095
+VMOVAPDZrr_REV	3096
+VMOVAPDZrrk	3097
+VMOVAPDZrrk_REV	3098
+VMOVAPDZrrkz	3099
+VMOVAPDZrrkz_REV	3100
+VMOVAPDmr	3101
+VMOVAPDrm	3102
+VMOVAPDrr	3103
+VMOVAPDrr_REV	3104
+VMOVAPSYmr	3105
+VMOVAPSYrm	3106
+VMOVAPSYrr	3107
+VMOVAPSYrr_REV	3108
+VMOVAPSZ	3109
+VMOVAPSZmr	3110
+VMOVAPSZmrk	3111
+VMOVAPSZrm	3112
+VMOVAPSZrmk	3113
+VMOVAPSZrmkz	3114
+VMOVAPSZrr	3115
+VMOVAPSZrr_REV	3116
+VMOVAPSZrrk	3117
+VMOVAPSZrrk_REV	3118
+VMOVAPSZrrkz	3119
+VMOVAPSZrrkz_REV	3120
+VMOVAPSmr	3121
+VMOVAPSrm	3122
+VMOVAPSrr	3123
+VMOVAPSrr_REV	3124
+VMOVDDUPYrm	3125
+VMOVDDUPYrr	3126
+VMOVDDUPZ	3127
+VMOVDDUPZrm	3128
+VMOVDDUPZrmk	3129
+VMOVDDUPZrmkz	3130
+VMOVDDUPZrr	3131
+VMOVDDUPZrrk	3132
+VMOVDDUPZrrkz	3133
+VMOVDDUPrm	3134
+VMOVDDUPrr	3135
+VMOVDI	3136
+VMOVDQA	3137
+VMOVDQAYmr	3138
+VMOVDQAYrm	3139
+VMOVDQAYrr	3140
+VMOVDQAYrr_REV	3141
+VMOVDQAmr	3142
+VMOVDQArm	3143
+VMOVDQArr	3144
+VMOVDQArr_REV	3145
+VMOVDQU	3146
+VMOVDQUYmr	3147
+VMOVDQUYrm	3148
+VMOVDQUYrr	3149
+VMOVDQUYrr_REV	3150
+VMOVDQUmr	3151
+VMOVDQUrm	3152
+VMOVDQUrr	3153
+VMOVDQUrr_REV	3154
+VMOVHLPSZrr	3155
+VMOVHLPSrr	3156
+VMOVHPDZ	3157
+VMOVHPDmr	3158
+VMOVHPDrm	3159
+VMOVHPSZ	3160
+VMOVHPSmr	3161
+VMOVHPSrm	3162
+VMOVLHPSZrr	3163
+VMOVLHPSrr	3164
+VMOVLPDZ	3165
+VMOVLPDmr	3166
+VMOVLPDrm	3167
+VMOVLPSZ	3168
+VMOVLPSmr	3169
+VMOVLPSrm	3170
+VMOVMSKPDYrr	3171
+VMOVMSKPDrr	3172
+VMOVMSKPSYrr	3173
+VMOVMSKPSrr	3174
+VMOVNTDQAYrm	3175
+VMOVNTDQAZ	3176
+VMOVNTDQAZrm	3177
+VMOVNTDQArm	3178
+VMOVNTDQYmr	3179
+VMOVNTDQZ	3180
+VMOVNTDQZmr	3181
+VMOVNTDQmr	3182
+VMOVNTPDYmr	3183
+VMOVNTPDZ	3184
+VMOVNTPDZmr	3185
+VMOVNTPDmr	3186
+VMOVNTPSYmr	3187
+VMOVNTPSZ	3188
+VMOVNTPSZmr	3189
+VMOVNTPSmr	3190
+VMOVPDI	3191
+VMOVPQI	3192
+VMOVPQIto	3193
+VMOVQI	3194
+VMOVRSBZ	3195
+VMOVRSBZm	3196
+VMOVRSBZmk	3197
+VMOVRSBZmkz	3198
+VMOVRSDZ	3199
+VMOVRSDZm	3200
+VMOVRSDZmk	3201
+VMOVRSDZmkz	3202
+VMOVRSQZ	3203
+VMOVRSQZm	3204
+VMOVRSQZmk	3205
+VMOVRSQZmkz	3206
+VMOVRSWZ	3207
+VMOVRSWZm	3208
+VMOVRSWZmk	3209
+VMOVRSWZmkz	3210
+VMOVSDZmr	3211
+VMOVSDZmrk	3212
+VMOVSDZrm	3213
+VMOVSDZrm_alt	3214
+VMOVSDZrmk	3215
+VMOVSDZrmkz	3216
+VMOVSDZrr	3217
+VMOVSDZrr_REV	3218
+VMOVSDZrrk	3219
+VMOVSDZrrk_REV	3220
+VMOVSDZrrkz	3221
+VMOVSDZrrkz_REV	3222
+VMOVSDmr	3223
+VMOVSDrm	3224
+VMOVSDrm_alt	3225
+VMOVSDrr	3226
+VMOVSDrr_REV	3227
+VMOVSDto	3228
+VMOVSH	3229
+VMOVSHDUPYrm	3230
+VMOVSHDUPYrr	3231
+VMOVSHDUPZ	3232
+VMOVSHDUPZrm	3233
+VMOVSHDUPZrmk	3234
+VMOVSHDUPZrmkz	3235
+VMOVSHDUPZrr	3236
+VMOVSHDUPZrrk	3237
+VMOVSHDUPZrrkz	3238
+VMOVSHDUPrm	3239
+VMOVSHDUPrr	3240
+VMOVSHZmr	3241
+VMOVSHZmrk	3242
+VMOVSHZrm	3243
+VMOVSHZrm_alt	3244
+VMOVSHZrmk	3245
+VMOVSHZrmkz	3246
+VMOVSHZrr	3247
+VMOVSHZrr_REV	3248
+VMOVSHZrrk	3249
+VMOVSHZrrk_REV	3250
+VMOVSHZrrkz	3251
+VMOVSHZrrkz_REV	3252
+VMOVSHtoW	3253
+VMOVSLDUPYrm	3254
+VMOVSLDUPYrr	3255
+VMOVSLDUPZ	3256
+VMOVSLDUPZrm	3257
+VMOVSLDUPZrmk	3258
+VMOVSLDUPZrmkz	3259
+VMOVSLDUPZrr	3260
+VMOVSLDUPZrrk	3261
+VMOVSLDUPZrrkz	3262
+VMOVSLDUPrm	3263
+VMOVSLDUPrr	3264
+VMOVSS	3265
+VMOVSSZmr	3266
+VMOVSSZmrk	3267
+VMOVSSZrm	3268
+VMOVSSZrm_alt	3269
+VMOVSSZrmk	3270
+VMOVSSZrmkz	3271
+VMOVSSZrr	3272
+VMOVSSZrr_REV	3273
+VMOVSSZrrk	3274
+VMOVSSZrrk_REV	3275
+VMOVSSZrrkz	3276
+VMOVSSZrrkz_REV	3277
+VMOVSSmr	3278
+VMOVSSrm	3279
+VMOVSSrm_alt	3280
+VMOVSSrr	3281
+VMOVSSrr_REV	3282
+VMOVUPDYmr	3283
+VMOVUPDYrm	3284
+VMOVUPDYrr	3285
+VMOVUPDYrr_REV	3286
+VMOVUPDZ	3287
+VMOVUPDZmr	3288
+VMOVUPDZmrk	3289
+VMOVUPDZrm	3290
+VMOVUPDZrmk	3291
+VMOVUPDZrmkz	3292
+VMOVUPDZrr	3293
+VMOVUPDZrr_REV	3294
+VMOVUPDZrrk	3295
+VMOVUPDZrrk_REV	3296
+VMOVUPDZrrkz	3297
+VMOVUPDZrrkz_REV	3298
+VMOVUPDmr	3299
+VMOVUPDrm	3300
+VMOVUPDrr	3301
+VMOVUPDrr_REV	3302
+VMOVUPSYmr	3303
+VMOVUPSYrm	3304
+VMOVUPSYrr	3305
+VMOVUPSYrr_REV	3306
+VMOVUPSZ	3307
+VMOVUPSZmr	3308
+VMOVUPSZmrk	3309
+VMOVUPSZrm	3310
+VMOVUPSZrmk	3311
+VMOVUPSZrmkz	3312
+VMOVUPSZrr	3313
+VMOVUPSZrr_REV	3314
+VMOVUPSZrrk	3315
+VMOVUPSZrrk_REV	3316
+VMOVUPSZrrkz	3317
+VMOVUPSZrrkz_REV	3318
+VMOVUPSmr	3319
+VMOVUPSrm	3320
+VMOVUPSrr	3321
+VMOVUPSrr_REV	3322
+VMOVW	3323
+VMOVWmr	3324
+VMOVWrm	3325
+VMOVZPDILo	3326
+VMOVZPQILo	3327
+VMOVZPWILo	3328
+VMPSADBWYrmi	3329
+VMPSADBWYrri	3330
+VMPSADBWZ	3331
+VMPSADBWZrmi	3332
+VMPSADBWZrmik	3333
+VMPSADBWZrmikz	3334
+VMPSADBWZrri	3335
+VMPSADBWZrrik	3336
+VMPSADBWZrrikz	3337
+VMPSADBWrmi	3338
+VMPSADBWrri	3339
+VMPTRLDm	3340
+VMPTRSTm	3341
+VMREAD	3342
+VMRESUME	3343
+VMRUN	3344
+VMSAVE	3345
+VMULBF	3346
+VMULPDYrm	3347
+VMULPDYrr	3348
+VMULPDZ	3349
+VMULPDZrm	3350
+VMULPDZrmb	3351
+VMULPDZrmbk	3352
+VMULPDZrmbkz	3353
+VMULPDZrmk	3354
+VMULPDZrmkz	3355
+VMULPDZrr	3356
+VMULPDZrrb	3357
+VMULPDZrrbk	3358
+VMULPDZrrbkz	3359
+VMULPDZrrk	3360
+VMULPDZrrkz	3361
+VMULPDrm	3362
+VMULPDrr	3363
+VMULPHZ	3364
+VMULPHZrm	3365
+VMULPHZrmb	3366
+VMULPHZrmbk	3367
+VMULPHZrmbkz	3368
+VMULPHZrmk	3369
+VMULPHZrmkz	3370
+VMULPHZrr	3371
+VMULPHZrrb	3372
+VMULPHZrrbk	3373
+VMULPHZrrbkz	3374
+VMULPHZrrk	3375
+VMULPHZrrkz	3376
+VMULPSYrm	3377
+VMULPSYrr	3378
+VMULPSZ	3379
+VMULPSZrm	3380
+VMULPSZrmb	3381
+VMULPSZrmbk	3382
+VMULPSZrmbkz	3383
+VMULPSZrmk	3384
+VMULPSZrmkz	3385
+VMULPSZrr	3386
+VMULPSZrrb	3387
+VMULPSZrrbk	3388
+VMULPSZrrbkz	3389
+VMULPSZrrk	3390
+VMULPSZrrkz	3391
+VMULPSrm	3392
+VMULPSrr	3393
+VMULSDZrm	3394
+VMULSDZrm_Int	3395
+VMULSDZrmk_Int	3396
+VMULSDZrmkz_Int	3397
+VMULSDZrr	3398
+VMULSDZrr_Int	3399
+VMULSDZrrb_Int	3400
+VMULSDZrrbk_Int	3401
+VMULSDZrrbkz_Int	3402
+VMULSDZrrk_Int	3403
+VMULSDZrrkz_Int	3404
+VMULSDrm	3405
+VMULSDrm_Int	3406
+VMULSDrr	3407
+VMULSDrr_Int	3408
+VMULSHZrm	3409
+VMULSHZrm_Int	3410
+VMULSHZrmk_Int	3411
+VMULSHZrmkz_Int	3412
+VMULSHZrr	3413
+VMULSHZrr_Int	3414
+VMULSHZrrb_Int	3415
+VMULSHZrrbk_Int	3416
+VMULSHZrrbkz_Int	3417
+VMULSHZrrk_Int	3418
+VMULSHZrrkz_Int	3419
+VMULSSZrm	3420
+VMULSSZrm_Int	3421
+VMULSSZrmk_Int	3422
+VMULSSZrmkz_Int	3423
+VMULSSZrr	3424
+VMULSSZrr_Int	3425
+VMULSSZrrb_Int	3426
+VMULSSZrrbk_Int	3427
+VMULSSZrrbkz_Int	3428
+VMULSSZrrk_Int	3429
+VMULSSZrrkz_Int	3430
+VMULSSrm	3431
+VMULSSrm_Int	3432
+VMULSSrr	3433
+VMULSSrr_Int	3434
+VMWRITE	3435
+VMXOFF	3436
+VMXON	3437
+VORPDYrm	3438
+VORPDYrr	3439
+VORPDZ	3440
+VORPDZrm	3441
+VORPDZrmb	3442
+VORPDZrmbk	3443
+VORPDZrmbkz	3444
+VORPDZrmk	3445
+VORPDZrmkz	3446
+VORPDZrr	3447
+VORPDZrrk	3448
+VORPDZrrkz	3449
+VORPDrm	3450
+VORPDrr	3451
+VORPSYrm	3452
+VORPSYrr	3453
+VORPSZ	3454
+VORPSZrm	3455
+VORPSZrmb	3456
+VORPSZrmbk	3457
+VORPSZrmbkz	3458
+VORPSZrmk	3459
+VORPSZrmkz	3460
+VORPSZrr	3461
+VORPSZrrk	3462
+VORPSZrrkz	3463
+VORPSrm	3464
+VORPSrr	3465
+VP	3466
+VPABSBYrm	3467
+VPABSBYrr	3468
+VPABSBZ	3469
+VPABSBZrm	3470
+VPABSBZrmk	3471
+VPABSBZrmkz	3472
+VPABSBZrr	3473
+VPABSBZrrk	3474
+VPABSBZrrkz	3475
+VPABSBrm	3476
+VPABSBrr	3477
+VPABSDYrm	3478
+VPABSDYrr	3479
+VPABSDZ	3480
+VPABSDZrm	3481
+VPABSDZrmb	3482
+VPABSDZrmbk	3483
+VPABSDZrmbkz	3484
+VPABSDZrmk	3485
+VPABSDZrmkz	3486
+VPABSDZrr	3487
+VPABSDZrrk	3488
+VPABSDZrrkz	3489
+VPABSDrm	3490
+VPABSDrr	3491
+VPABSQZ	3492
+VPABSQZrm	3493
+VPABSQZrmb	3494
+VPABSQZrmbk	3495
+VPABSQZrmbkz	3496
+VPABSQZrmk	3497
+VPABSQZrmkz	3498
+VPABSQZrr	3499
+VPABSQZrrk	3500
+VPABSQZrrkz	3501
+VPABSWYrm	3502
+VPABSWYrr	3503
+VPABSWZ	3504
+VPABSWZrm	3505
+VPABSWZrmk	3506
+VPABSWZrmkz	3507
+VPABSWZrr	3508
+VPABSWZrrk	3509
+VPABSWZrrkz	3510
+VPABSWrm	3511
+VPABSWrr	3512
+VPACKSSDWYrm	3513
+VPACKSSDWYrr	3514
+VPACKSSDWZ	3515
+VPACKSSDWZrm	3516
+VPACKSSDWZrmb	3517
+VPACKSSDWZrmbk	3518
+VPACKSSDWZrmbkz	3519
+VPACKSSDWZrmk	3520
+VPACKSSDWZrmkz	3521
+VPACKSSDWZrr	3522
+VPACKSSDWZrrk	3523
+VPACKSSDWZrrkz	3524
+VPACKSSDWrm	3525
+VPACKSSDWrr	3526
+VPACKSSWBYrm	3527
+VPACKSSWBYrr	3528
+VPACKSSWBZ	3529
+VPACKSSWBZrm	3530
+VPACKSSWBZrmk	3531
+VPACKSSWBZrmkz	3532
+VPACKSSWBZrr	3533
+VPACKSSWBZrrk	3534
+VPACKSSWBZrrkz	3535
+VPACKSSWBrm	3536
+VPACKSSWBrr	3537
+VPACKUSDWYrm	3538
+VPACKUSDWYrr	3539
+VPACKUSDWZ	3540
+VPACKUSDWZrm	3541
+VPACKUSDWZrmb	3542
+VPACKUSDWZrmbk	3543
+VPACKUSDWZrmbkz	3544
+VPACKUSDWZrmk	3545
+VPACKUSDWZrmkz	3546
+VPACKUSDWZrr	3547
+VPACKUSDWZrrk	3548
+VPACKUSDWZrrkz	3549
+VPACKUSDWrm	3550
+VPACKUSDWrr	3551
+VPACKUSWBYrm	3552
+VPACKUSWBYrr	3553
+VPACKUSWBZ	3554
+VPACKUSWBZrm	3555
+VPACKUSWBZrmk	3556
+VPACKUSWBZrmkz	3557
+VPACKUSWBZrr	3558
+VPACKUSWBZrrk	3559
+VPACKUSWBZrrkz	3560
+VPACKUSWBrm	3561
+VPACKUSWBrr	3562
+VPADDBYrm	3563
+VPADDBYrr	3564
+VPADDBZ	3565
+VPADDBZrm	3566
+VPADDBZrmk	3567
+VPADDBZrmkz	3568
+VPADDBZrr	3569
+VPADDBZrrk	3570
+VPADDBZrrkz	3571
+VPADDBrm	3572
+VPADDBrr	3573
+VPADDDYrm	3574
+VPADDDYrr	3575
+VPADDDZ	3576
+VPADDDZrm	3577
+VPADDDZrmb	3578
+VPADDDZrmbk	3579
+VPADDDZrmbkz	3580
+VPADDDZrmk	3581
+VPADDDZrmkz	3582
+VPADDDZrr	3583
+VPADDDZrrk	3584
+VPADDDZrrkz	3585
+VPADDDrm	3586
+VPADDDrr	3587
+VPADDQYrm	3588
+VPADDQYrr	3589
+VPADDQZ	3590
+VPADDQZrm	3591
+VPADDQZrmb	3592
+VPADDQZrmbk	3593
+VPADDQZrmbkz	3594
+VPADDQZrmk	3595
+VPADDQZrmkz	3596
+VPADDQZrr	3597
+VPADDQZrrk	3598
+VPADDQZrrkz	3599
+VPADDQrm	3600
+VPADDQrr	3601
+VPADDSBYrm	3602
+VPADDSBYrr	3603
+VPADDSBZ	3604
+VPADDSBZrm	3605
+VPADDSBZrmk	3606
+VPADDSBZrmkz	3607
+VPADDSBZrr	3608
+VPADDSBZrrk	3609
+VPADDSBZrrkz	3610
+VPADDSBrm	3611
+VPADDSBrr	3612
+VPADDSWYrm	3613
+VPADDSWYrr	3614
+VPADDSWZ	3615
+VPADDSWZrm	3616
+VPADDSWZrmk	3617
+VPADDSWZrmkz	3618
+VPADDSWZrr	3619
+VPADDSWZrrk	3620
+VPADDSWZrrkz	3621
+VPADDSWrm	3622
+VPADDSWrr	3623
+VPADDUSBYrm	3624
+VPADDUSBYrr	3625
+VPADDUSBZ	3626
+VPADDUSBZrm	3627
+VPADDUSBZrmk	3628
+VPADDUSBZrmkz	3629
+VPADDUSBZrr	3630
+VPADDUSBZrrk	3631
+VPADDUSBZrrkz	3632
+VPADDUSBrm	3633
+VPADDUSBrr	3634
+VPADDUSWYrm	3635
+VPADDUSWYrr	3636
+VPADDUSWZ	3637
+VPADDUSWZrm	3638
+VPADDUSWZrmk	3639
+VPADDUSWZrmkz	3640
+VPADDUSWZrr	3641
+VPADDUSWZrrk	3642
+VPADDUSWZrrkz	3643
+VPADDUSWrm	3644
+VPADDUSWrr	3645
+VPADDWYrm	3646
+VPADDWYrr	3647
+VPADDWZ	3648
+VPADDWZrm	3649
+VPADDWZrmk	3650
+VPADDWZrmkz	3651
+VPADDWZrr	3652
+VPADDWZrrk	3653
+VPADDWZrrkz	3654
+VPADDWrm	3655
+VPADDWrr	3656
+VPALIGNRYrmi	3657
+VPALIGNRYrri	3658
+VPALIGNRZ	3659
+VPALIGNRZrmi	3660
+VPALIGNRZrmik	3661
+VPALIGNRZrmikz	3662
+VPALIGNRZrri	3663
+VPALIGNRZrrik	3664
+VPALIGNRZrrikz	3665
+VPALIGNRrmi	3666
+VPALIGNRrri	3667
+VPANDDZ	3668
+VPANDDZrm	3669
+VPANDDZrmb	3670
+VPANDDZrmbk	3671
+VPANDDZrmbkz	3672
+VPANDDZrmk	3673
+VPANDDZrmkz	3674
+VPANDDZrr	3675
+VPANDDZrrk	3676
+VPANDDZrrkz	3677
+VPANDNDZ	3678
+VPANDNDZrm	3679
+VPANDNDZrmb	3680
+VPANDNDZrmbk	3681
+VPANDNDZrmbkz	3682
+VPANDNDZrmk	3683
+VPANDNDZrmkz	3684
+VPANDNDZrr	3685
+VPANDNDZrrk	3686
+VPANDNDZrrkz	3687
+VPANDNQZ	3688
+VPANDNQZrm	3689
+VPANDNQZrmb	3690
+VPANDNQZrmbk	3691
+VPANDNQZrmbkz	3692
+VPANDNQZrmk	3693
+VPANDNQZrmkz	3694
+VPANDNQZrr	3695
+VPANDNQZrrk	3696
+VPANDNQZrrkz	3697
+VPANDNYrm	3698
+VPANDNYrr	3699
+VPANDNrm	3700
+VPANDNrr	3701
+VPANDQZ	3702
+VPANDQZrm	3703
+VPANDQZrmb	3704
+VPANDQZrmbk	3705
+VPANDQZrmbkz	3706
+VPANDQZrmk	3707
+VPANDQZrmkz	3708
+VPANDQZrr	3709
+VPANDQZrrk	3710
+VPANDQZrrkz	3711
+VPANDYrm	3712
+VPANDYrr	3713
+VPANDrm	3714
+VPANDrr	3715
+VPAVGBYrm	3716
+VPAVGBYrr	3717
+VPAVGBZ	3718
+VPAVGBZrm	3719
+VPAVGBZrmk	3720
+VPAVGBZrmkz	3721
+VPAVGBZrr	3722
+VPAVGBZrrk	3723
+VPAVGBZrrkz	3724
+VPAVGBrm	3725
+VPAVGBrr	3726
+VPAVGWYrm	3727
+VPAVGWYrr	3728
+VPAVGWZ	3729
+VPAVGWZrm	3730
+VPAVGWZrmk	3731
+VPAVGWZrmkz	3732
+VPAVGWZrr	3733
+VPAVGWZrrk	3734
+VPAVGWZrrkz	3735
+VPAVGWrm	3736
+VPAVGWrr	3737
+VPBLENDDYrmi	3738
+VPBLENDDYrri	3739
+VPBLENDDrmi	3740
+VPBLENDDrri	3741
+VPBLENDMBZ	3742
+VPBLENDMBZrm	3743
+VPBLENDMBZrmk	3744
+VPBLENDMBZrmkz	3745
+VPBLENDMBZrr	3746
+VPBLENDMBZrrk	3747
+VPBLENDMBZrrkz	3748
+VPBLENDMDZ	3749
+VPBLENDMDZrm	3750
+VPBLENDMDZrmb	3751
+VPBLENDMDZrmbk	3752
+VPBLENDMDZrmbkz	3753
+VPBLENDMDZrmk	3754
+VPBLENDMDZrmkz	3755
+VPBLENDMDZrr	3756
+VPBLENDMDZrrk	3757
+VPBLENDMDZrrkz	3758
+VPBLENDMQZ	3759
+VPBLENDMQZrm	3760
+VPBLENDMQZrmb	3761
+VPBLENDMQZrmbk	3762
+VPBLENDMQZrmbkz	3763
+VPBLENDMQZrmk	3764
+VPBLENDMQZrmkz	3765
+VPBLENDMQZrr	3766
+VPBLENDMQZrrk	3767
+VPBLENDMQZrrkz	3768
+VPBLENDMWZ	3769
+VPBLENDMWZrm	3770
+VPBLENDMWZrmk	3771
+VPBLENDMWZrmkz	3772
+VPBLENDMWZrr	3773
+VPBLENDMWZrrk	3774
+VPBLENDMWZrrkz	3775
+VPBLENDVBYrmr	3776
+VPBLENDVBYrrr	3777
+VPBLENDVBrmr	3778
+VPBLENDVBrrr	3779
+VPBLENDWYrmi	3780
+VPBLENDWYrri	3781
+VPBLENDWrmi	3782
+VPBLENDWrri	3783
+VPBROADCASTBYrm	3784
+VPBROADCASTBYrr	3785
+VPBROADCASTBZ	3786
+VPBROADCASTBZrm	3787
+VPBROADCASTBZrmk	3788
+VPBROADCASTBZrmkz	3789
+VPBROADCASTBZrr	3790
+VPBROADCASTBZrrk	3791
+VPBROADCASTBZrrkz	3792
+VPBROADCASTBrZ	3793
+VPBROADCASTBrZrr	3794
+VPBROADCASTBrZrrk	3795
+VPBROADCASTBrZrrkz	3796
+VPBROADCASTBrm	3797
+VPBROADCASTBrr	3798
+VPBROADCASTDYrm	3799
+VPBROADCASTDYrr	3800
+VPBROADCASTDZ	3801
+VPBROADCASTDZrm	3802
+VPBROADCASTDZrmk	3803
+VPBROADCASTDZrmkz	3804
+VPBROADCASTDZrr	3805
+VPBROADCASTDZrrk	3806
+VPBROADCASTDZrrkz	3807
+VPBROADCASTDrZ	3808
+VPBROADCASTDrZrr	3809
+VPBROADCASTDrZrrk	3810
+VPBROADCASTDrZrrkz	3811
+VPBROADCASTDrm	3812
+VPBROADCASTDrr	3813
+VPBROADCASTMB	3814
+VPBROADCASTMW	3815
+VPBROADCASTQYrm	3816
+VPBROADCASTQYrr	3817
+VPBROADCASTQZ	3818
+VPBROADCASTQZrm	3819
+VPBROADCASTQZrmk	3820
+VPBROADCASTQZrmkz	3821
+VPBROADCASTQZrr	3822
+VPBROADCASTQZrrk	3823
+VPBROADCASTQZrrkz	3824
+VPBROADCASTQrZ	3825
+VPBROADCASTQrZrr	3826
+VPBROADCASTQrZrrk	3827
+VPBROADCASTQrZrrkz	3828
+VPBROADCASTQrm	3829
+VPBROADCASTQrr	3830
+VPBROADCASTWYrm	3831
+VPBROADCASTWYrr	3832
+VPBROADCASTWZ	3833
+VPBROADCASTWZrm	3834
+VPBROADCASTWZrmk	3835
+VPBROADCASTWZrmkz	3836
+VPBROADCASTWZrr	3837
+VPBROADCASTWZrrk	3838
+VPBROADCASTWZrrkz	3839
+VPBROADCASTWrZ	3840
+VPBROADCASTWrZrr	3841
+VPBROADCASTWrZrrk	3842
+VPBROADCASTWrZrrkz	3843
+VPBROADCASTWrm	3844
+VPBROADCASTWrr	3845
+VPCLMULQDQYrmi	3846
+VPCLMULQDQYrri	3847
+VPCLMULQDQZ	3848
+VPCLMULQDQZrmi	3849
+VPCLMULQDQZrri	3850
+VPCLMULQDQrmi	3851
+VPCLMULQDQrri	3852
+VPCMOVYrmr	3853
+VPCMOVYrrm	3854
+VPCMOVYrrr	3855
+VPCMOVYrrr_REV	3856
+VPCMOVrmr	3857
+VPCMOVrrm	3858
+VPCMOVrrr	3859
+VPCMOVrrr_REV	3860
+VPCMPBZ	3861
+VPCMPBZrmi	3862
+VPCMPBZrmik	3863
+VPCMPBZrri	3864
+VPCMPBZrrik	3865
+VPCMPDZ	3866
+VPCMPDZrmbi	3867
+VPCMPDZrmbik	3868
+VPCMPDZrmi	3869
+VPCMPDZrmik	3870
+VPCMPDZrri	3871
+VPCMPDZrrik	3872
+VPCMPEQBYrm	3873
+VPCMPEQBYrr	3874
+VPCMPEQBZ	3875
+VPCMPEQBZrm	3876
+VPCMPEQBZrmk	3877
+VPCMPEQBZrr	3878
+VPCMPEQBZrrk	3879
+VPCMPEQBrm	3880
+VPCMPEQBrr	3881
+VPCMPEQDYrm	3882
+VPCMPEQDYrr	3883
+VPCMPEQDZ	3884
+VPCMPEQDZrm	3885
+VPCMPEQDZrmb	3886
+VPCMPEQDZrmbk	3887
+VPCMPEQDZrmk	3888
+VPCMPEQDZrr	3889
+VPCMPEQDZrrk	3890
+VPCMPEQDrm	3891
+VPCMPEQDrr	3892
+VPCMPEQQYrm	3893
+VPCMPEQQYrr	3894
+VPCMPEQQZ	3895
+VPCMPEQQZrm	3896
+VPCMPEQQZrmb	3897
+VPCMPEQQZrmbk	3898
+VPCMPEQQZrmk	3899
+VPCMPEQQZrr	3900
+VPCMPEQQZrrk	3901
+VPCMPEQQrm	3902
+VPCMPEQQrr	3903
+VPCMPEQWYrm	3904
+VPCMPEQWYrr	3905
+VPCMPEQWZ	3906
+VPCMPEQWZrm	3907
+VPCMPEQWZrmk	3908
+VPCMPEQWZrr	3909
+VPCMPEQWZrrk	3910
+VPCMPEQWrm	3911
+VPCMPEQWrr	3912
+VPCMPESTRIrmi	3913
+VPCMPESTRIrri	3914
+VPCMPESTRMrmi	3915
+VPCMPESTRMrri	3916
+VPCMPGTBYrm	3917
+VPCMPGTBYrr	3918
+VPCMPGTBZ	3919
+VPCMPGTBZrm	3920
+VPCMPGTBZrmk	3921
+VPCMPGTBZrr	3922
+VPCMPGTBZrrk	3923
+VPCMPGTBrm	3924
+VPCMPGTBrr	3925
+VPCMPGTDYrm	3926
+VPCMPGTDYrr	3927
+VPCMPGTDZ	3928
+VPCMPGTDZrm	3929
+VPCMPGTDZrmb	3930
+VPCMPGTDZrmbk	3931
+VPCMPGTDZrmk	3932
+VPCMPGTDZrr	3933
+VPCMPGTDZrrk	3934
+VPCMPGTDrm	3935
+VPCMPGTDrr	3936
+VPCMPGTQYrm	3937
+VPCMPGTQYrr	3938
+VPCMPGTQZ	3939
+VPCMPGTQZrm	3940
+VPCMPGTQZrmb	3941
+VPCMPGTQZrmbk	3942
+VPCMPGTQZrmk	3943
+VPCMPGTQZrr	3944
+VPCMPGTQZrrk	3945
+VPCMPGTQrm	3946
+VPCMPGTQrr	3947
+VPCMPGTWYrm	3948
+VPCMPGTWYrr	3949
+VPCMPGTWZ	3950
+VPCMPGTWZrm	3951
+VPCMPGTWZrmk	3952
+VPCMPGTWZrr	3953
+VPCMPGTWZrrk	3954
+VPCMPGTWrm	3955
+VPCMPGTWrr	3956
+VPCMPISTRIrmi	3957
+VPCMPISTRIrri	3958
+VPCMPISTRMrmi	3959
+VPCMPISTRMrri	3960
+VPCMPQZ	3961
+VPCMPQZrmbi	3962
+VPCMPQZrmbik	3963
+VPCMPQZrmi	3964
+VPCMPQZrmik	3965
+VPCMPQZrri	3966
+VPCMPQZrrik	3967
+VPCMPUBZ	3968
+VPCMPUBZrmi	3969
+VPCMPUBZrmik	3970
+VPCMPUBZrri	3971
+VPCMPUBZrrik	3972
+VPCMPUDZ	3973
+VPCMPUDZrmbi	3974
+VPCMPUDZrmbik	3975
+VPCMPUDZrmi	3976
+VPCMPUDZrmik	3977
+VPCMPUDZrri	3978
+VPCMPUDZrrik	3979
+VPCMPUQZ	3980
+VPCMPUQZrmbi	3981
+VPCMPUQZrmbik	3982
+VPCMPUQZrmi	3983
+VPCMPUQZrmik	3984
+VPCMPUQZrri	3985
+VPCMPUQZrrik	3986
+VPCMPUWZ	3987
+VPCMPUWZrmi	3988
+VPCMPUWZrmik	3989
+VPCMPUWZrri	3990
+VPCMPUWZrrik	3991
+VPCMPWZ	3992
+VPCMPWZrmi	3993
+VPCMPWZrmik	3994
+VPCMPWZrri	3995
+VPCMPWZrrik	3996
+VPCOMBmi	3997
+VPCOMBri	3998
+VPCOMDmi	3999
+VPCOMDri	4000
+VPCOMPRESSBZ	4001
+VPCOMPRESSBZmr	4002
+VPCOMPRESSBZmrk	4003
+VPCOMPRESSBZrr	4004
+VPCOMPRESSBZrrk	4005
+VPCOMPRESSBZrrkz	4006
+VPCOMPRESSDZ	4007
+VPCOMPRESSDZmr	4008
+VPCOMPRESSDZmrk	4009
+VPCOMPRESSDZrr	4010
+VPCOMPRESSDZrrk	4011
+VPCOMPRESSDZrrkz	4012
+VPCOMPRESSQZ	4013
+VPCOMPRESSQZmr	4014
+VPCOMPRESSQZmrk	4015
+VPCOMPRESSQZrr	4016
+VPCOMPRESSQZrrk	4017
+VPCOMPRESSQZrrkz	4018
+VPCOMPRESSWZ	4019
+VPCOMPRESSWZmr	4020
+VPCOMPRESSWZmrk	4021
+VPCOMPRESSWZrr	4022
+VPCOMPRESSWZrrk	4023
+VPCOMPRESSWZrrkz	4024
+VPCOMQmi	4025
+VPCOMQri	4026
+VPCOMUBmi	4027
+VPCOMUBri	4028
+VPCOMUDmi	4029
+VPCOMUDri	4030
+VPCOMUQmi	4031
+VPCOMUQri	4032
+VPCOMUWmi	4033
+VPCOMUWri	4034
+VPCOMWmi	4035
+VPCOMWri	4036
+VPCONFLICTDZ	4037
+VPCONFLICTDZrm	4038
+VPCONFLICTDZrmb	4039
+VPCONFLICTDZrmbk	4040
+VPCONFLICTDZrmbkz	4041
+VPCONFLICTDZrmk	4042
+VPCONFLICTDZrmkz	4043
+VPCONFLICTDZrr	4044
+VPCONFLICTDZrrk	4045
+VPCONFLICTDZrrkz	4046
+VPCONFLICTQZ	4047
+VPCONFLICTQZrm	4048
+VPCONFLICTQZrmb	4049
+VPCONFLICTQZrmbk	4050
+VPCONFLICTQZrmbkz	4051
+VPCONFLICTQZrmk	4052
+VPCONFLICTQZrmkz	4053
+VPCONFLICTQZrr	4054
+VPCONFLICTQZrrk	4055
+VPCONFLICTQZrrkz	4056
+VPDPBSSDSYrm	4057
+VPDPBSSDSYrr	4058
+VPDPBSSDSZ	4059
+VPDPBSSDSZrm	4060
+VPDPBSSDSZrmb	4061
+VPDPBSSDSZrmbk	4062
+VPDPBSSDSZrmbkz	4063
+VPDPBSSDSZrmk	4064
+VPDPBSSDSZrmkz	4065
+VPDPBSSDSZrr	4066
+VPDPBSSDSZrrk	4067
+VPDPBSSDSZrrkz	4068
+VPDPBSSDSrm	4069
+VPDPBSSDSrr	4070
+VPDPBSSDYrm	4071
+VPDPBSSDYrr	4072
+VPDPBSSDZ	4073
+VPDPBSSDZrm	4074
+VPDPBSSDZrmb	4075
+VPDPBSSDZrmbk	4076
+VPDPBSSDZrmbkz	4077
+VPDPBSSDZrmk	4078
+VPDPBSSDZrmkz	4079
+VPDPBSSDZrr	4080
+VPDPBSSDZrrk	4081
+VPDPBSSDZrrkz	4082
+VPDPBSSDrm	4083
+VPDPBSSDrr	4084
+VPDPBSUDSYrm	4085
+VPDPBSUDSYrr	4086
+VPDPBSUDSZ	4087
+VPDPBSUDSZrm	4088
+VPDPBSUDSZrmb	4089
+VPDPBSUDSZrmbk	4090
+VPDPBSUDSZrmbkz	4091
+VPDPBSUDSZrmk	4092
+VPDPBSUDSZrmkz	4093
+VPDPBSUDSZrr	4094
+VPDPBSUDSZrrk	4095
+VPDPBSUDSZrrkz	4096
+VPDPBSUDSrm	4097
+VPDPBSUDSrr	4098
+VPDPBSUDYrm	4099
+VPDPBSUDYrr	4100
+VPDPBSUDZ	4101
+VPDPBSUDZrm	4102
+VPDPBSUDZrmb	4103
+VPDPBSUDZrmbk	4104
+VPDPBSUDZrmbkz	4105
+VPDPBSUDZrmk	4106
+VPDPBSUDZrmkz	4107
+VPDPBSUDZrr	4108
+VPDPBSUDZrrk	4109
+VPDPBSUDZrrkz	4110
+VPDPBSUDrm	4111
+VPDPBSUDrr	4112
+VPDPBUSDSYrm	4113
+VPDPBUSDSYrr	4114
+VPDPBUSDSZ	4115
+VPDPBUSDSZrm	4116
+VPDPBUSDSZrmb	4117
+VPDPBUSDSZrmbk	4118
+VPDPBUSDSZrmbkz	4119
+VPDPBUSDSZrmk	4120
+VPDPBUSDSZrmkz	4121
+VPDPBUSDSZrr	4122
+VPDPBUSDSZrrk	4123
+VPDPBUSDSZrrkz	4124
+VPDPBUSDSrm	4125
+VPDPBUSDSrr	4126
+VPDPBUSDYrm	4127
+VPDPBUSDYrr	4128
+VPDPBUSDZ	4129
+VPDPBUSDZrm	4130
+VPDPBUSDZrmb	4131
+VPDPBUSDZrmbk	4132
+VPDPBUSDZrmbkz	4133
+VPDPBUSDZrmk	4134
+VPDPBUSDZrmkz	4135
+VPDPBUSDZrr	4136
+VPDPBUSDZrrk	4137
+VPDPBUSDZrrkz	4138
+VPDPBUSDrm	4139
+VPDPBUSDrr	4140
+VPDPBUUDSYrm	4141
+VPDPBUUDSYrr	4142
+VPDPBUUDSZ	4143
+VPDPBUUDSZrm	4144
+VPDPBUUDSZrmb	4145
+VPDPBUUDSZrmbk	4146
+VPDPBUUDSZrmbkz	4147
+VPDPBUUDSZrmk	4148
+VPDPBUUDSZrmkz	4149
+VPDPBUUDSZrr	4150
+VPDPBUUDSZrrk	4151
+VPDPBUUDSZrrkz	4152
+VPDPBUUDSrm	4153
+VPDPBUUDSrr	4154
+VPDPBUUDYrm	4155
+VPDPBUUDYrr	4156
+VPDPBUUDZ	4157
+VPDPBUUDZrm	4158
+VPDPBUUDZrmb	4159
+VPDPBUUDZrmbk	4160
+VPDPBUUDZrmbkz	4161
+VPDPBUUDZrmk	4162
+VPDPBUUDZrmkz	4163
+VPDPBUUDZrr	4164
+VPDPBUUDZrrk	4165
+VPDPBUUDZrrkz	4166
+VPDPBUUDrm	4167
+VPDPBUUDrr	4168
+VPDPWSSDSYrm	4169
+VPDPWSSDSYrr	4170
+VPDPWSSDSZ	4171
+VPDPWSSDSZrm	4172
+VPDPWSSDSZrmb	4173
+VPDPWSSDSZrmbk	4174
+VPDPWSSDSZrmbkz	4175
+VPDPWSSDSZrmk	4176
+VPDPWSSDSZrmkz	4177
+VPDPWSSDSZrr	4178
+VPDPWSSDSZrrk	4179
+VPDPWSSDSZrrkz	4180
+VPDPWSSDSrm	4181
+VPDPWSSDSrr	4182
+VPDPWSSDYrm	4183
+VPDPWSSDYrr	4184
+VPDPWSSDZ	4185
+VPDPWSSDZrm	4186
+VPDPWSSDZrmb	4187
+VPDPWSSDZrmbk	4188
+VPDPWSSDZrmbkz	4189
+VPDPWSSDZrmk	4190
+VPDPWSSDZrmkz	4191
+VPDPWSSDZrr	4192
+VPDPWSSDZrrk	4193
+VPDPWSSDZrrkz	4194
+VPDPWSSDrm	4195
+VPDPWSSDrr	4196
+VPDPWSUDSYrm	4197
+VPDPWSUDSYrr	4198
+VPDPWSUDSZ	4199
+VPDPWSUDSZrm	4200
+VPDPWSUDSZrmb	4201
+VPDPWSUDSZrmbk	4202
+VPDPWSUDSZrmbkz	4203
+VPDPWSUDSZrmk	4204
+VPDPWSUDSZrmkz	4205
+VPDPWSUDSZrr	4206
+VPDPWSUDSZrrk	4207
+VPDPWSUDSZrrkz	4208
+VPDPWSUDSrm	4209
+VPDPWSUDSrr	4210
+VPDPWSUDYrm	4211
+VPDPWSUDYrr	4212
+VPDPWSUDZ	4213
+VPDPWSUDZrm	4214
+VPDPWSUDZrmb	4215
+VPDPWSUDZrmbk	4216
+VPDPWSUDZrmbkz	4217
+VPDPWSUDZrmk	4218
+VPDPWSUDZrmkz	4219
+VPDPWSUDZrr	4220
+VPDPWSUDZrrk	4221
+VPDPWSUDZrrkz	4222
+VPDPWSUDrm	4223
+VPDPWSUDrr	4224
+VPDPWUSDSYrm	4225
+VPDPWUSDSYrr	4226
+VPDPWUSDSZ	4227
+VPDPWUSDSZrm	4228
+VPDPWUSDSZrmb	4229
+VPDPWUSDSZrmbk	4230
+VPDPWUSDSZrmbkz	4231
+VPDPWUSDSZrmk	4232
+VPDPWUSDSZrmkz	4233
+VPDPWUSDSZrr	4234
+VPDPWUSDSZrrk	4235
+VPDPWUSDSZrrkz	4236
+VPDPWUSDSrm	4237
+VPDPWUSDSrr	4238
+VPDPWUSDYrm	4239
+VPDPWUSDYrr	4240
+VPDPWUSDZ	4241
+VPDPWUSDZrm	4242
+VPDPWUSDZrmb	4243
+VPDPWUSDZrmbk	4244
+VPDPWUSDZrmbkz	4245
+VPDPWUSDZrmk	4246
+VPDPWUSDZrmkz	4247
+VPDPWUSDZrr	4248
+VPDPWUSDZrrk	4249
+VPDPWUSDZrrkz	4250
+VPDPWUSDrm	4251
+VPDPWUSDrr	4252
+VPDPWUUDSYrm	4253
+VPDPWUUDSYrr	4254
+VPDPWUUDSZ	4255
+VPDPWUUDSZrm	4256
+VPDPWUUDSZrmb	4257
+VPDPWUUDSZrmbk	4258
+VPDPWUUDSZrmbkz	4259
+VPDPWUUDSZrmk	4260
+VPDPWUUDSZrmkz	4261
+VPDPWUUDSZrr	4262
+VPDPWUUDSZrrk	4263
+VPDPWUUDSZrrkz	4264
+VPDPWUUDSrm	4265
+VPDPWUUDSrr	4266
+VPDPWUUDYrm	4267
+VPDPWUUDYrr	4268
+VPDPWUUDZ	4269
+VPDPWUUDZrm	4270
+VPDPWUUDZrmb	4271
+VPDPWUUDZrmbk	4272
+VPDPWUUDZrmbkz	4273
+VPDPWUUDZrmk	4274
+VPDPWUUDZrmkz	4275
+VPDPWUUDZrr	4276
+VPDPWUUDZrrk	4277
+VPDPWUUDZrrkz	4278
+VPDPWUUDrm	4279
+VPDPWUUDrr	4280
+VPERM	4281
+VPERMBZ	4282
+VPERMBZrm	4283
+VPERMBZrmk	4284
+VPERMBZrmkz	4285
+VPERMBZrr	4286
+VPERMBZrrk	4287
+VPERMBZrrkz	4288
+VPERMDYrm	4289
+VPERMDYrr	4290
+VPERMDZ	4291
+VPERMDZrm	4292
+VPERMDZrmb	4293
+VPERMDZrmbk	4294
+VPERMDZrmbkz	4295
+VPERMDZrmk	4296
+VPERMDZrmkz	4297
+VPERMDZrr	4298
+VPERMDZrrk	4299
+VPERMDZrrkz	4300
+VPERMI	4301
+VPERMIL	4302
+VPERMILPDYmi	4303
+VPERMILPDYri	4304
+VPERMILPDYrm	4305
+VPERMILPDYrr	4306
+VPERMILPDZ	4307
+VPERMILPDZmbi	4308
+VPERMILPDZmbik	4309
+VPERMILPDZmbikz	4310
+VPERMILPDZmi	4311
+VPERMILPDZmik	4312
+VPERMILPDZmikz	4313
+VPERMILPDZri	4314
+VPERMILPDZrik	4315
+VPERMILPDZrikz	4316
+VPERMILPDZrm	4317
+VPERMILPDZrmb	4318
+VPERMILPDZrmbk	4319
+VPERMILPDZrmbkz	4320
+VPERMILPDZrmk	4321
+VPERMILPDZrmkz	4322
+VPERMILPDZrr	4323
+VPERMILPDZrrk	4324
+VPERMILPDZrrkz	4325
+VPERMILPDmi	4326
+VPERMILPDri	4327
+VPERMILPDrm	4328
+VPERMILPDrr	4329
+VPERMILPSYmi	4330
+VPERMILPSYri	4331
+VPERMILPSYrm	4332
+VPERMILPSYrr	4333
+VPERMILPSZ	4334
+VPERMILPSZmbi	4335
+VPERMILPSZmbik	4336
+VPERMILPSZmbikz	4337
+VPERMILPSZmi	4338
+VPERMILPSZmik	4339
+VPERMILPSZmikz	4340
+VPERMILPSZri	4341
+VPERMILPSZrik	4342
+VPERMILPSZrikz	4343
+VPERMILPSZrm	4344
+VPERMILPSZrmb	4345
+VPERMILPSZrmbk	4346
+VPERMILPSZrmbkz	4347
+VPERMILPSZrmk	4348
+VPERMILPSZrmkz	4349
+VPERMILPSZrr	4350
+VPERMILPSZrrk	4351
+VPERMILPSZrrkz	4352
+VPERMILPSmi	4353
+VPERMILPSri	4354
+VPERMILPSrm	4355
+VPERMILPSrr	4356
+VPERMPDYmi	4357
+VPERMPDYri	4358
+VPERMPDZ	4359
+VPERMPDZmbi	4360
+VPERMPDZmbik	4361
+VPERMPDZmbikz	4362
+VPERMPDZmi	4363
+VPERMPDZmik	4364
+VPERMPDZmikz	4365
+VPERMPDZri	4366
+VPERMPDZrik	4367
+VPERMPDZrikz	4368
+VPERMPDZrm	4369
+VPERMPDZrmb	4370
+VPERMPDZrmbk	4371
+VPERMPDZrmbkz	4372
+VPERMPDZrmk	4373
+VPERMPDZrmkz	4374
+VPERMPDZrr	4375
+VPERMPDZrrk	4376
+VPERMPDZrrkz	4377
+VPERMPSYrm	4378
+VPERMPSYrr	4379
+VPERMPSZ	4380
+VPERMPSZrm	4381
+VPERMPSZrmb	4382
+VPERMPSZrmbk	4383
+VPERMPSZrmbkz	4384
+VPERMPSZrmk	4385
+VPERMPSZrmkz	4386
+VPERMPSZrr	4387
+VPERMPSZrrk	4388
+VPERMPSZrrkz	4389
+VPERMQYmi	4390
+VPERMQYri	4391
+VPERMQZ	4392
+VPERMQZmbi	4393
+VPERMQZmbik	4394
+VPERMQZmbikz	4395
+VPERMQZmi	4396
+VPERMQZmik	4397
+VPERMQZmikz	4398
+VPERMQZri	4399
+VPERMQZrik	4400
+VPERMQZrikz	4401
+VPERMQZrm	4402
+VPERMQZrmb	4403
+VPERMQZrmbk	4404
+VPERMQZrmbkz	4405
+VPERMQZrmk	4406
+VPERMQZrmkz	4407
+VPERMQZrr	4408
+VPERMQZrrk	4409
+VPERMQZrrkz	4410
+VPERMT	4411
+VPERMWZ	4412
+VPERMWZrm	4413
+VPERMWZrmk	4414
+VPERMWZrmkz	4415
+VPERMWZrr	4416
+VPERMWZrrk	4417
+VPERMWZrrkz	4418
+VPEXPANDBZ	4419
+VPEXPANDBZrm	4420
+VPEXPANDBZrmk	4421
+VPEXPANDBZrmkz	4422
+VPEXPANDBZrr	4423
+VPEXPANDBZrrk	4424
+VPEXPANDBZrrkz	4425
+VPEXPANDDZ	4426
+VPEXPANDDZrm	4427
+VPEXPANDDZrmk	4428
+VPEXPANDDZrmkz	4429
+VPEXPANDDZrr	4430
+VPEXPANDDZrrk	4431
+VPEXPANDDZrrkz	4432
+VPEXPANDQZ	4433
+VPEXPANDQZrm	4434
+VPEXPANDQZrmk	4435
+VPEXPANDQZrmkz	4436
+VPEXPANDQZrr	4437
+VPEXPANDQZrrk	4438
+VPEXPANDQZrrkz	4439
+VPEXPANDWZ	4440
+VPEXPANDWZrm	4441
+VPEXPANDWZrmk	4442
+VPEXPANDWZrmkz	4443
+VPEXPANDWZrr	4444
+VPEXPANDWZrrk	4445
+VPEXPANDWZrrkz	4446
+VPEXTRBZmri	4447
+VPEXTRBZrri	4448
+VPEXTRBmri	4449
+VPEXTRBrri	4450
+VPEXTRDZmri	4451
+VPEXTRDZrri	4452
+VPEXTRDmri	4453
+VPEXTRDrri	4454
+VPEXTRQZmri	4455
+VPEXTRQZrri	4456
+VPEXTRQmri	4457
+VPEXTRQrri	4458
+VPEXTRWZmri	4459
+VPEXTRWZrri	4460
+VPEXTRWZrri_REV	4461
+VPEXTRWmri	4462
+VPEXTRWrri	4463
+VPEXTRWrri_REV	4464
+VPGATHERDDYrm	4465
+VPGATHERDDZ	4466
+VPGATHERDDZrm	4467
+VPGATHERDDrm	4468
+VPGATHERDQYrm	4469
+VPGATHERDQZ	4470
+VPGATHERDQZrm	4471
+VPGATHERDQrm	4472
+VPGATHERQDYrm	4473
+VPGATHERQDZ	4474
+VPGATHERQDZrm	4475
+VPGATHERQDrm	4476
+VPGATHERQQYrm	4477
+VPGATHERQQZ	4478
+VPGATHERQQZrm	4479
+VPGATHERQQrm	4480
+VPHADDBDrm	4481
+VPHADDBDrr	4482
+VPHADDBQrm	4483
+VPHADDBQrr	4484
+VPHADDBWrm	4485
+VPHADDBWrr	4486
+VPHADDDQrm	4487
+VPHADDDQrr	4488
+VPHADDDYrm	4489
+VPHADDDYrr	4490
+VPHADDDrm	4491
+VPHADDDrr	4492
+VPHADDSWYrm	4493
+VPHADDSWYrr	4494
+VPHADDSWrm	4495
+VPHADDSWrr	4496
+VPHADDUBDrm	4497
+VPHADDUBDrr	4498
+VPHADDUBQrm	4499
+VPHADDUBQrr	4500
+VPHADDUBWrm	4501
+VPHADDUBWrr	4502
+VPHADDUDQrm	4503
+VPHADDUDQrr	4504
+VPHADDUWDrm	4505
+VPHADDUWDrr	4506
+VPHADDUWQrm	4507
+VPHADDUWQrr	4508
+VPHADDWDrm	4509
+VPHADDWDrr	4510
+VPHADDWQrm	4511
+VPHADDWQrr	4512
+VPHADDWYrm	4513
+VPHADDWYrr	4514
+VPHADDWrm	4515
+VPHADDWrr	4516
+VPHMINPOSUWrm	4517
+VPHMINPOSUWrr	4518
+VPHSUBBWrm	4519
+VPHSUBBWrr	4520
+VPHSUBDQrm	4521
+VPHSUBDQrr	4522
+VPHSUBDYrm	4523
+VPHSUBDYrr	4524
+VPHSUBDrm	4525
+VPHSUBDrr	4526
+VPHSUBSWYrm	4527
+VPHSUBSWYrr	4528
+VPHSUBSWrm	4529
+VPHSUBSWrr	4530
+VPHSUBWDrm	4531
+VPHSUBWDrr	4532
+VPHSUBWYrm	4533
+VPHSUBWYrr	4534
+VPHSUBWrm	4535
+VPHSUBWrr	4536
+VPINSRBZrmi	4537
+VPINSRBZrri	4538
+VPINSRBrmi	4539
+VPINSRBrri	4540
+VPINSRDZrmi	4541
+VPINSRDZrri	4542
+VPINSRDrmi	4543
+VPINSRDrri	4544
+VPINSRQZrmi	4545
+VPINSRQZrri	4546
+VPINSRQrmi	4547
+VPINSRQrri	4548
+VPINSRWZrmi	4549
+VPINSRWZrri	4550
+VPINSRWrmi	4551
+VPINSRWrri	4552
+VPLZCNTDZ	4553
+VPLZCNTDZrm	4554
+VPLZCNTDZrmb	4555
+VPLZCNTDZrmbk	4556
+VPLZCNTDZrmbkz	4557
+VPLZCNTDZrmk	4558
+VPLZCNTDZrmkz	4559
+VPLZCNTDZrr	4560
+VPLZCNTDZrrk	4561
+VPLZCNTDZrrkz	4562
+VPLZCNTQZ	4563
+VPLZCNTQZrm	4564
+VPLZCNTQZrmb	4565
+VPLZCNTQZrmbk	4566
+VPLZCNTQZrmbkz	4567
+VPLZCNTQZrmk	4568
+VPLZCNTQZrmkz	4569
+VPLZCNTQZrr	4570
+VPLZCNTQZrrk	4571
+VPLZCNTQZrrkz	4572
+VPMACSDDrm	4573
+VPMACSDDrr	4574
+VPMACSDQHrm	4575
+VPMACSDQHrr	4576
+VPMACSDQLrm	4577
+VPMACSDQLrr	4578
+VPMACSSDDrm	4579
+VPMACSSDDrr	4580
+VPMACSSDQHrm	4581
+VPMACSSDQHrr	4582
+VPMACSSDQLrm	4583
+VPMACSSDQLrr	4584
+VPMACSSWDrm	4585
+VPMACSSWDrr	4586
+VPMACSSWWrm	4587
+VPMACSSWWrr	4588
+VPMACSWDrm	4589
+VPMACSWDrr	4590
+VPMACSWWrm	4591
+VPMACSWWrr	4592
+VPMADCSSWDrm	4593
+VPMADCSSWDrr	4594
+VPMADCSWDrm	4595
+VPMADCSWDrr	4596
+VPMADD	4597
+VPMADDUBSWYrm	4598
+VPMADDUBSWYrr	4599
+VPMADDUBSWZ	4600
+VPMADDUBSWZrm	4601
+VPMADDUBSWZrmk	4602
+VPMADDUBSWZrmkz	4603
+VPMADDUBSWZrr	4604
+VPMADDUBSWZrrk	4605
+VPMADDUBSWZrrkz	4606
+VPMADDUBSWrm	4607
+VPMADDUBSWrr	4608
+VPMADDWDYrm	4609
+VPMADDWDYrr	4610
+VPMADDWDZ	4611
+VPMADDWDZrm	4612
+VPMADDWDZrmk	4613
+VPMADDWDZrmkz	4614
+VPMADDWDZrr	4615
+VPMADDWDZrrk	4616
+VPMADDWDZrrkz	4617
+VPMADDWDrm	4618
+VPMADDWDrr	4619
+VPMASKMOVDYmr	4620
+VPMASKMOVDYrm	4621
+VPMASKMOVDmr	4622
+VPMASKMOVDrm	4623
+VPMASKMOVQYmr	4624
+VPMASKMOVQYrm	4625
+VPMASKMOVQmr	4626
+VPMASKMOVQrm	4627
+VPMAXSBYrm	4628
+VPMAXSBYrr	4629
+VPMAXSBZ	4630
+VPMAXSBZrm	4631
+VPMAXSBZrmk	4632
+VPMAXSBZrmkz	4633
+VPMAXSBZrr	4634
+VPMAXSBZrrk	4635
+VPMAXSBZrrkz	4636
+VPMAXSBrm	4637
+VPMAXSBrr	4638
+VPMAXSDYrm	4639
+VPMAXSDYrr	4640
+VPMAXSDZ	4641
+VPMAXSDZrm	4642
+VPMAXSDZrmb	4643
+VPMAXSDZrmbk	4644
+VPMAXSDZrmbkz	4645
+VPMAXSDZrmk	4646
+VPMAXSDZrmkz	4647
+VPMAXSDZrr	4648
+VPMAXSDZrrk	4649
+VPMAXSDZrrkz	4650
+VPMAXSDrm	4651
+VPMAXSDrr	4652
+VPMAXSQZ	4653
+VPMAXSQZrm	4654
+VPMAXSQZrmb	4655
+VPMAXSQZrmbk	4656
+VPMAXSQZrmbkz	4657
+VPMAXSQZrmk	4658
+VPMAXSQZrmkz	4659
+VPMAXSQZrr	4660
+VPMAXSQZrrk	4661
+VPMAXSQZrrkz	4662
+VPMAXSWYrm	4663
+VPMAXSWYrr	4664
+VPMAXSWZ	4665
+VPMAXSWZrm	4666
+VPMAXSWZrmk	4667
+VPMAXSWZrmkz	4668
+VPMAXSWZrr	4669
+VPMAXSWZrrk	4670
+VPMAXSWZrrkz	4671
+VPMAXSWrm	4672
+VPMAXSWrr	4673
+VPMAXUBYrm	4674
+VPMAXUBYrr	4675
+VPMAXUBZ	4676
+VPMAXUBZrm	4677
+VPMAXUBZrmk	4678
+VPMAXUBZrmkz	4679
+VPMAXUBZrr	4680
+VPMAXUBZrrk	4681
+VPMAXUBZrrkz	4682
+VPMAXUBrm	4683
+VPMAXUBrr	4684
+VPMAXUDYrm	4685
+VPMAXUDYrr	4686
+VPMAXUDZ	4687
+VPMAXUDZrm	4688
+VPMAXUDZrmb	4689
+VPMAXUDZrmbk	4690
+VPMAXUDZrmbkz	4691
+VPMAXUDZrmk	4692
+VPMAXUDZrmkz	4693
+VPMAXUDZrr	4694
+VPMAXUDZrrk	4695
+VPMAXUDZrrkz	4696
+VPMAXUDrm	4697
+VPMAXUDrr	4698
+VPMAXUQZ	4699
+VPMAXUQZrm	4700
+VPMAXUQZrmb	4701
+VPMAXUQZrmbk	4702
+VPMAXUQZrmbkz	4703
+VPMAXUQZrmk	4704
+VPMAXUQZrmkz	4705
+VPMAXUQZrr	4706
+VPMAXUQZrrk	4707
+VPMAXUQZrrkz	4708
+VPMAXUWYrm	4709
+VPMAXUWYrr	4710
+VPMAXUWZ	4711
+VPMAXUWZrm	4712
+VPMAXUWZrmk	4713
+VPMAXUWZrmkz	4714
+VPMAXUWZrr	4715
+VPMAXUWZrrk	4716
+VPMAXUWZrrkz	4717
+VPMAXUWrm	4718
+VPMAXUWrr	4719
+VPMINSBYrm	4720
+VPMINSBYrr	4721
+VPMINSBZ	4722
+VPMINSBZrm	4723
+VPMINSBZrmk	4724
+VPMINSBZrmkz	4725
+VPMINSBZrr	4726
+VPMINSBZrrk	4727
+VPMINSBZrrkz	4728
+VPMINSBrm	4729
+VPMINSBrr	4730
+VPMINSDYrm	4731
+VPMINSDYrr	4732
+VPMINSDZ	4733
+VPMINSDZrm	4734
+VPMINSDZrmb	4735
+VPMINSDZrmbk	4736
+VPMINSDZrmbkz	4737
+VPMINSDZrmk	4738
+VPMINSDZrmkz	4739
+VPMINSDZrr	4740
+VPMINSDZrrk	4741
+VPMINSDZrrkz	4742
+VPMINSDrm	4743
+VPMINSDrr	4744
+VPMINSQZ	4745
+VPMINSQZrm	4746
+VPMINSQZrmb	4747
+VPMINSQZrmbk	4748
+VPMINSQZrmbkz	4749
+VPMINSQZrmk	4750
+VPMINSQZrmkz	4751
+VPMINSQZrr	4752
+VPMINSQZrrk	4753
+VPMINSQZrrkz	4754
+VPMINSWYrm	4755
+VPMINSWYrr	4756
+VPMINSWZ	4757
+VPMINSWZrm	4758
+VPMINSWZrmk	4759
+VPMINSWZrmkz	4760
+VPMINSWZrr	4761
+VPMINSWZrrk	4762
+VPMINSWZrrkz	4763
+VPMINSWrm	4764
+VPMINSWrr	4765
+VPMINUBYrm	4766
+VPMINUBYrr	4767
+VPMINUBZ	4768
+VPMINUBZrm	4769
+VPMINUBZrmk	4770
+VPMINUBZrmkz	4771
+VPMINUBZrr	4772
+VPMINUBZrrk	4773
+VPMINUBZrrkz	4774
+VPMINUBrm	4775
+VPMINUBrr	4776
+VPMINUDYrm	4777
+VPMINUDYrr	4778
+VPMINUDZ	4779
+VPMINUDZrm	4780
+VPMINUDZrmb	4781
+VPMINUDZrmbk	4782
+VPMINUDZrmbkz	4783
+VPMINUDZrmk	4784
+VPMINUDZrmkz	4785
+VPMINUDZrr	4786
+VPMINUDZrrk	4787
+VPMINUDZrrkz	4788
+VPMINUDrm	4789
+VPMINUDrr	4790
+VPMINUQZ	4791
+VPMINUQZrm	4792
+VPMINUQZrmb	4793
+VPMINUQZrmbk	4794
+VPMINUQZrmbkz	4795
+VPMINUQZrmk	4796
+VPMINUQZrmkz	4797
+VPMINUQZrr	4798
+VPMINUQZrrk	4799
+VPMINUQZrrkz	4800
+VPMINUWYrm	4801
+VPMINUWYrr	4802
+VPMINUWZ	4803
+VPMINUWZrm	4804
+VPMINUWZrmk	4805
+VPMINUWZrmkz	4806
+VPMINUWZrr	4807
+VPMINUWZrrk	4808
+VPMINUWZrrkz	4809
+VPMINUWrm	4810
+VPMINUWrr	4811
+VPMOVB	4812
+VPMOVD	4813
+VPMOVDBZ	4814
+VPMOVDBZmr	4815
+VPMOVDBZmrk	4816
+VPMOVDBZrr	4817
+VPMOVDBZrrk	4818
+VPMOVDBZrrkz	4819
+VPMOVDWZ	4820
+VPMOVDWZmr	4821
+VPMOVDWZmrk	4822
+VPMOVDWZrr	4823
+VPMOVDWZrrk	4824
+VPMOVDWZrrkz	4825
+VPMOVM	4826
+VPMOVMSKBYrr	4827
+VPMOVMSKBrr	4828
+VPMOVQ	4829
+VPMOVQBZ	4830
+VPMOVQBZmr	4831
+VPMOVQBZmrk	4832
+VPMOVQBZrr	4833
+VPMOVQBZrrk	4834
+VPMOVQBZrrkz	4835
+VPMOVQDZ	4836
+VPMOVQDZmr	4837
+VPMOVQDZmrk	4838
+VPMOVQDZrr	4839
+VPMOVQDZrrk	4840
+VPMOVQDZrrkz	4841
+VPMOVQWZ	4842
+VPMOVQWZmr	4843
+VPMOVQWZmrk	4844
+VPMOVQWZrr	4845
+VPMOVQWZrrk	4846
+VPMOVQWZrrkz	4847
+VPMOVSDBZ	4848
+VPMOVSDBZmr	4849
+VPMOVSDBZmrk	4850
+VPMOVSDBZrr	4851
+VPMOVSDBZrrk	4852
+VPMOVSDBZrrkz	4853
+VPMOVSDWZ	4854
+VPMOVSDWZmr	4855
+VPMOVSDWZmrk	4856
+VPMOVSDWZrr	4857
+VPMOVSDWZrrk	4858
+VPMOVSDWZrrkz	4859
+VPMOVSQBZ	4860
+VPMOVSQBZmr	4861
+VPMOVSQBZmrk	4862
+VPMOVSQBZrr	4863
+VPMOVSQBZrrk	4864
+VPMOVSQBZrrkz	4865
+VPMOVSQDZ	4866
+VPMOVSQDZmr	4867
+VPMOVSQDZmrk	4868
+VPMOVSQDZrr	4869
+VPMOVSQDZrrk	4870
+VPMOVSQDZrrkz	4871
+VPMOVSQWZ	4872
+VPMOVSQWZmr	4873
+VPMOVSQWZmrk	4874
+VPMOVSQWZrr	4875
+VPMOVSQWZrrk	4876
+VPMOVSQWZrrkz	4877
+VPMOVSWBZ	4878
+VPMOVSWBZmr	4879
+VPMOVSWBZmrk	4880
+VPMOVSWBZrr	4881
+VPMOVSWBZrrk	4882
+VPMOVSWBZrrkz	4883
+VPMOVSXBDYrm	4884
+VPMOVSXBDYrr	4885
+VPMOVSXBDZ	4886
+VPMOVSXBDZrm	4887
+VPMOVSXBDZrmk	4888
+VPMOVSXBDZrmkz	4889
+VPMOVSXBDZrr	4890
+VPMOVSXBDZrrk	4891
+VPMOVSXBDZrrkz	4892
+VPMOVSXBDrm	4893
+VPMOVSXBDrr	4894
+VPMOVSXBQYrm	4895
+VPMOVSXBQYrr	4896
+VPMOVSXBQZ	4897
+VPMOVSXBQZrm	4898
+VPMOVSXBQZrmk	4899
+VPMOVSXBQZrmkz	4900
+VPMOVSXBQZrr	4901
+VPMOVSXBQZrrk	4902
+VPMOVSXBQZrrkz	4903
+VPMOVSXBQrm	4904
+VPMOVSXBQrr	4905
+VPMOVSXBWYrm	4906
+VPMOVSXBWYrr	4907
+VPMOVSXBWZ	4908
+VPMOVSXBWZrm	4909
+VPMOVSXBWZrmk	4910
+VPMOVSXBWZrmkz	4911
+VPMOVSXBWZrr	4912
+VPMOVSXBWZrrk	4913
+VPMOVSXBWZrrkz	4914
+VPMOVSXBWrm	4915
+VPMOVSXBWrr	4916
+VPMOVSXDQYrm	4917
+VPMOVSXDQYrr	4918
+VPMOVSXDQZ	4919
+VPMOVSXDQZrm	4920
+VPMOVSXDQZrmk	4921
+VPMOVSXDQZrmkz	4922
+VPMOVSXDQZrr	4923
+VPMOVSXDQZrrk	4924
+VPMOVSXDQZrrkz	4925
+VPMOVSXDQrm	4926
+VPMOVSXDQrr	4927
+VPMOVSXWDYrm	4928
+VPMOVSXWDYrr	4929
+VPMOVSXWDZ	4930
+VPMOVSXWDZrm	4931
+VPMOVSXWDZrmk	4932
+VPMOVSXWDZrmkz	4933
+VPMOVSXWDZrr	4934
+VPMOVSXWDZrrk	4935
+VPMOVSXWDZrrkz	4936
+VPMOVSXWDrm	4937
+VPMOVSXWDrr	4938
+VPMOVSXWQYrm	4939
+VPMOVSXWQYrr	4940
+VPMOVSXWQZ	4941
+VPMOVSXWQZrm	4942
+VPMOVSXWQZrmk	4943
+VPMOVSXWQZrmkz	4944
+VPMOVSXWQZrr	4945
+VPMOVSXWQZrrk	4946
+VPMOVSXWQZrrkz	4947
+VPMOVSXWQrm	4948
+VPMOVSXWQrr	4949
+VPMOVUSDBZ	4950
+VPMOVUSDBZmr	4951
+VPMOVUSDBZmrk	4952
+VPMOVUSDBZrr	4953
+VPMOVUSDBZrrk	4954
+VPMOVUSDBZrrkz	4955
+VPMOVUSDWZ	4956
+VPMOVUSDWZmr	4957
+VPMOVUSDWZmrk	4958
+VPMOVUSDWZrr	4959
+VPMOVUSDWZrrk	4960
+VPMOVUSDWZrrkz	4961
+VPMOVUSQBZ	4962
+VPMOVUSQBZmr	4963
+VPMOVUSQBZmrk	4964
+VPMOVUSQBZrr	4965
+VPMOVUSQBZrrk	4966
+VPMOVUSQBZrrkz	4967
+VPMOVUSQDZ	4968
+VPMOVUSQDZmr	4969
+VPMOVUSQDZmrk	4970
+VPMOVUSQDZrr	4971
+VPMOVUSQDZrrk	4972
+VPMOVUSQDZrrkz	4973
+VPMOVUSQWZ	4974
+VPMOVUSQWZmr	4975
+VPMOVUSQWZmrk	4976
+VPMOVUSQWZrr	4977
+VPMOVUSQWZrrk	4978
+VPMOVUSQWZrrkz	4979
+VPMOVUSWBZ	4980
+VPMOVUSWBZmr	4981
+VPMOVUSWBZmrk	4982
+VPMOVUSWBZrr	4983
+VPMOVUSWBZrrk	4984
+VPMOVUSWBZrrkz	4985
+VPMOVW	4986
+VPMOVWBZ	4987
+VPMOVWBZmr	4988
+VPMOVWBZmrk	4989
+VPMOVWBZrr	4990
+VPMOVWBZrrk	4991
+VPMOVWBZrrkz	4992
+VPMOVZXBDYrm	4993
+VPMOVZXBDYrr	4994
+VPMOVZXBDZ	4995
+VPMOVZXBDZrm	4996
+VPMOVZXBDZrmk	4997
+VPMOVZXBDZrmkz	4998
+VPMOVZXBDZrr	4999
+VPMOVZXBDZrrk	5000
+VPMOVZXBDZrrkz	5001
+VPMOVZXBDrm	5002
+VPMOVZXBDrr	5003
+VPMOVZXBQYrm	5004
+VPMOVZXBQYrr	5005
+VPMOVZXBQZ	5006
+VPMOVZXBQZrm	5007
+VPMOVZXBQZrmk	5008
+VPMOVZXBQZrmkz	5009
+VPMOVZXBQZrr	5010
+VPMOVZXBQZrrk	5011
+VPMOVZXBQZrrkz	5012
+VPMOVZXBQrm	5013
+VPMOVZXBQrr	5014
+VPMOVZXBWYrm	5015
+VPMOVZXBWYrr	5016
+VPMOVZXBWZ	5017
+VPMOVZXBWZrm	5018
+VPMOVZXBWZrmk	5019
+VPMOVZXBWZrmkz	5020
+VPMOVZXBWZrr	5021
+VPMOVZXBWZrrk	5022
+VPMOVZXBWZrrkz	5023
+VPMOVZXBWrm	5024
+VPMOVZXBWrr	5025
+VPMOVZXDQYrm	5026
+VPMOVZXDQYrr	5027
+VPMOVZXDQZ	5028
+VPMOVZXDQZrm	5029
+VPMOVZXDQZrmk	5030
+VPMOVZXDQZrmkz	5031
+VPMOVZXDQZrr	5032
+VPMOVZXDQZrrk	5033
+VPMOVZXDQZrrkz	5034
+VPMOVZXDQrm	5035
+VPMOVZXDQrr	5036
+VPMOVZXWDYrm	5037
+VPMOVZXWDYrr	5038
+VPMOVZXWDZ	5039
+VPMOVZXWDZrm	5040
+VPMOVZXWDZrmk	5041
+VPMOVZXWDZrmkz	5042
+VPMOVZXWDZrr	5043
+VPMOVZXWDZrrk	5044
+VPMOVZXWDZrrkz	5045
+VPMOVZXWDrm	5046
+VPMOVZXWDrr	5047
+VPMOVZXWQYrm	5048
+VPMOVZXWQYrr	5049
+VPMOVZXWQZ	5050
+VPMOVZXWQZrm	5051
+VPMOVZXWQZrmk	5052
+VPMOVZXWQZrmkz	5053
+VPMOVZXWQZrr	5054
+VPMOVZXWQZrrk	5055
+VPMOVZXWQZrrkz	5056
+VPMOVZXWQrm	5057
+VPMOVZXWQrr	5058
+VPMULDQYrm	5059
+VPMULDQYrr	5060
+VPMULDQZ	5061
+VPMULDQZrm	5062
+VPMULDQZrmb	5063
+VPMULDQZrmbk	5064
+VPMULDQZrmbkz	5065
+VPMULDQZrmk	5066
+VPMULDQZrmkz	5067
+VPMULDQZrr	5068
+VPMULDQZrrk	5069
+VPMULDQZrrkz	5070
+VPMULDQrm	5071
+VPMULDQrr	5072
+VPMULHRSWYrm	5073
+VPMULHRSWYrr	5074
+VPMULHRSWZ	5075
+VPMULHRSWZrm	5076
+VPMULHRSWZrmk	5077
+VPMULHRSWZrmkz	5078
+VPMULHRSWZrr	5079
+VPMULHRSWZrrk	5080
+VPMULHRSWZrrkz	5081
+VPMULHRSWrm	5082
+VPMULHRSWrr	5083
+VPMULHUWYrm	5084
+VPMULHUWYrr	5085
+VPMULHUWZ	5086
+VPMULHUWZrm	5087
+VPMULHUWZrmk	5088
+VPMULHUWZrmkz	5089
+VPMULHUWZrr	5090
+VPMULHUWZrrk	5091
+VPMULHUWZrrkz	5092
+VPMULHUWrm	5093
+VPMULHUWrr	5094
+VPMULHWYrm	5095
+VPMULHWYrr	5096
+VPMULHWZ	5097
+VPMULHWZrm	5098
+VPMULHWZrmk	5099
+VPMULHWZrmkz	5100
+VPMULHWZrr	5101
+VPMULHWZrrk	5102
+VPMULHWZrrkz	5103
+VPMULHWrm	5104
+VPMULHWrr	5105
+VPMULLDYrm	5106
+VPMULLDYrr	5107
+VPMULLDZ	5108
+VPMULLDZrm	5109
+VPMULLDZrmb	5110
+VPMULLDZrmbk	5111
+VPMULLDZrmbkz	5112
+VPMULLDZrmk	5113
+VPMULLDZrmkz	5114
+VPMULLDZrr	5115
+VPMULLDZrrk	5116
+VPMULLDZrrkz	5117
+VPMULLDrm	5118
+VPMULLDrr	5119
+VPMULLQZ	5120
+VPMULLQZrm	5121
+VPMULLQZrmb	5122
+VPMULLQZrmbk	5123
+VPMULLQZrmbkz	5124
+VPMULLQZrmk	5125
+VPMULLQZrmkz	5126
+VPMULLQZrr	5127
+VPMULLQZrrk	5128
+VPMULLQZrrkz	5129
+VPMULLWYrm	5130
+VPMULLWYrr	5131
+VPMULLWZ	5132
+VPMULLWZrm	5133
+VPMULLWZrmk	5134
+VPMULLWZrmkz	5135
+VPMULLWZrr	5136
+VPMULLWZrrk	5137
+VPMULLWZrrkz	5138
+VPMULLWrm	5139
+VPMULLWrr	5140
+VPMULTISHIFTQBZ	5141
+VPMULTISHIFTQBZrm	5142
+VPMULTISHIFTQBZrmb	5143
+VPMULTISHIFTQBZrmbk	5144
+VPMULTISHIFTQBZrmbkz	5145
+VPMULTISHIFTQBZrmk	5146
+VPMULTISHIFTQBZrmkz	5147
+VPMULTISHIFTQBZrr	5148
+VPMULTISHIFTQBZrrk	5149
+VPMULTISHIFTQBZrrkz	5150
+VPMULUDQYrm	5151
+VPMULUDQYrr	5152
+VPMULUDQZ	5153
+VPMULUDQZrm	5154
+VPMULUDQZrmb	5155
+VPMULUDQZrmbk	5156
+VPMULUDQZrmbkz	5157
+VPMULUDQZrmk	5158
+VPMULUDQZrmkz	5159
+VPMULUDQZrr	5160
+VPMULUDQZrrk	5161
+VPMULUDQZrrkz	5162
+VPMULUDQrm	5163
+VPMULUDQrr	5164
+VPOPCNTBZ	5165
+VPOPCNTBZrm	5166
+VPOPCNTBZrmk	5167
+VPOPCNTBZrmkz	5168
+VPOPCNTBZrr	5169
+VPOPCNTBZrrk	5170
+VPOPCNTBZrrkz	5171
+VPOPCNTDZ	5172
+VPOPCNTDZrm	5173
+VPOPCNTDZrmb	5174
+VPOPCNTDZrmbk	5175
+VPOPCNTDZrmbkz	5176
+VPOPCNTDZrmk	5177
+VPOPCNTDZrmkz	5178
+VPOPCNTDZrr	5179
+VPOPCNTDZrrk	5180
+VPOPCNTDZrrkz	5181
+VPOPCNTQZ	5182
+VPOPCNTQZrm	5183
+VPOPCNTQZrmb	5184
+VPOPCNTQZrmbk	5185
+VPOPCNTQZrmbkz	5186
+VPOPCNTQZrmk	5187
+VPOPCNTQZrmkz	5188
+VPOPCNTQZrr	5189
+VPOPCNTQZrrk	5190
+VPOPCNTQZrrkz	5191
+VPOPCNTWZ	5192
+VPOPCNTWZrm	5193
+VPOPCNTWZrmk	5194
+VPOPCNTWZrmkz	5195
+VPOPCNTWZrr	5196
+VPOPCNTWZrrk	5197
+VPOPCNTWZrrkz	5198
+VPORDZ	5199
+VPORDZrm	5200
+VPORDZrmb	5201
+VPORDZrmbk	5202
+VPORDZrmbkz	5203
+VPORDZrmk	5204
+VPORDZrmkz	5205
+VPORDZrr	5206
+VPORDZrrk	5207
+VPORDZrrkz	5208
+VPORQZ	5209
+VPORQZrm	5210
+VPORQZrmb	5211
+VPORQZrmbk	5212
+VPORQZrmbkz	5213
+VPORQZrmk	5214
+VPORQZrmkz	5215
+VPORQZrr	5216
+VPORQZrrk	5217
+VPORQZrrkz	5218
+VPORYrm	5219
+VPORYrr	5220
+VPORrm	5221
+VPORrr	5222
+VPPERMrmr	5223
+VPPERMrrm	5224
+VPPERMrrr	5225
+VPPERMrrr_REV	5226
+VPROLDZ	5227
+VPROLDZmbi	5228
+VPROLDZmbik	5229
+VPROLDZmbikz	5230
+VPROLDZmi	5231
+VPROLDZmik	5232
+VPROLDZmikz	5233
+VPROLDZri	5234
+VPROLDZrik	5235
+VPROLDZrikz	5236
+VPROLQZ	5237
+VPROLQZmbi	5238
+VPROLQZmbik	5239
+VPROLQZmbikz	5240
+VPROLQZmi	5241
+VPROLQZmik	5242
+VPROLQZmikz	5243
+VPROLQZri	5244
+VPROLQZrik	5245
+VPROLQZrikz	5246
+VPROLVDZ	5247
+VPROLVDZrm	5248
+VPROLVDZrmb	5249
+VPROLVDZrmbk	5250
+VPROLVDZrmbkz	5251
+VPROLVDZrmk	5252
+VPROLVDZrmkz	5253
+VPROLVDZrr	5254
+VPROLVDZrrk	5255
+VPROLVDZrrkz	5256
+VPROLVQZ	5257
+VPROLVQZrm	5258
+VPROLVQZrmb	5259
+VPROLVQZrmbk	5260
+VPROLVQZrmbkz	5261
+VPROLVQZrmk	5262
+VPROLVQZrmkz	5263
+VPROLVQZrr	5264
+VPROLVQZrrk	5265
+VPROLVQZrrkz	5266
+VPRORDZ	5267
+VPRORDZmbi	5268
+VPRORDZmbik	5269
+VPRORDZmbikz	5270
+VPRORDZmi	5271
+VPRORDZmik	5272
+VPRORDZmikz	5273
+VPRORDZri	5274
+VPRORDZrik	5275
+VPRORDZrikz	5276
+VPRORQZ	5277
+VPRORQZmbi	5278
+VPRORQZmbik	5279
+VPRORQZmbikz	5280
+VPRORQZmi	5281
+VPRORQZmik	5282
+VPRORQZmikz	5283
+VPRORQZri	5284
+VPRORQZrik	5285
+VPRORQZrikz	5286
+VPRORVDZ	5287
+VPRORVDZrm	5288
+VPRORVDZrmb	5289
+VPRORVDZrmbk	5290
+VPRORVDZrmbkz	5291
+VPRORVDZrmk	5292
+VPRORVDZrmkz	5293
+VPRORVDZrr	5294
+VPRORVDZrrk	5295
+VPRORVDZrrkz	5296
+VPRORVQZ	5297
+VPRORVQZrm	5298
+VPRORVQZrmb	5299
+VPRORVQZrmbk	5300
+VPRORVQZrmbkz	5301
+VPRORVQZrmk	5302
+VPRORVQZrmkz	5303
+VPRORVQZrr	5304
+VPRORVQZrrk	5305
+VPRORVQZrrkz	5306
+VPROTBmi	5307
+VPROTBmr	5308
+VPROTBri	5309
+VPROTBrm	5310
+VPROTBrr	5311
+VPROTBrr_REV	5312
+VPROTDmi	5313
+VPROTDmr	5314
+VPROTDri	5315
+VPROTDrm	5316
+VPROTDrr	5317
+VPROTDrr_REV	5318
+VPROTQmi	5319
+VPROTQmr	5320
+VPROTQri	5321
+VPROTQrm	5322
+VPROTQrr	5323
+VPROTQrr_REV	5324
+VPROTWmi	5325
+VPROTWmr	5326
+VPROTWri	5327
+VPROTWrm	5328
+VPROTWrr	5329
+VPROTWrr_REV	5330
+VPSADBWYrm	5331
+VPSADBWYrr	5332
+VPSADBWZ	5333
+VPSADBWZrm	5334
+VPSADBWZrr	5335
+VPSADBWrm	5336
+VPSADBWrr	5337
+VPSCATTERDDZ	5338
+VPSCATTERDDZmr	5339
+VPSCATTERDQZ	5340
+VPSCATTERDQZmr	5341
+VPSCATTERQDZ	5342
+VPSCATTERQDZmr	5343
+VPSCATTERQQZ	5344
+VPSCATTERQQZmr	5345
+VPSHABmr	5346
+VPSHABrm	5347
+VPSHABrr	5348
+VPSHABrr_REV	5349
+VPSHADmr	5350
+VPSHADrm	5351
+VPSHADrr	5352
+VPSHADrr_REV	5353
+VPSHAQmr	5354
+VPSHAQrm	5355
+VPSHAQrr	5356
+VPSHAQrr_REV	5357
+VPSHAWmr	5358
+VPSHAWrm	5359
+VPSHAWrr	5360
+VPSHAWrr_REV	5361
+VPSHLBmr	5362
+VPSHLBrm	5363
+VPSHLBrr	5364
+VPSHLBrr_REV	5365
+VPSHLDDZ	5366
+VPSHLDDZrmbi	5367
+VPSHLDDZrmbik	5368
+VPSHLDDZrmbikz	5369
+VPSHLDDZrmi	5370
+VPSHLDDZrmik	5371
+VPSHLDDZrmikz	5372
+VPSHLDDZrri	5373
+VPSHLDDZrrik	5374
+VPSHLDDZrrikz	5375
+VPSHLDQZ	5376
+VPSHLDQZrmbi	5377
+VPSHLDQZrmbik	5378
+VPSHLDQZrmbikz	5379
+VPSHLDQZrmi	5380
+VPSHLDQZrmik	5381
+VPSHLDQZrmikz	5382
+VPSHLDQZrri	5383
+VPSHLDQZrrik	5384
+VPSHLDQZrrikz	5385
+VPSHLDVDZ	5386
+VPSHLDVDZm	5387
+VPSHLDVDZmb	5388
+VPSHLDVDZmbk	5389
+VPSHLDVDZmbkz	5390
+VPSHLDVDZmk	5391
+VPSHLDVDZmkz	5392
+VPSHLDVDZr	5393
+VPSHLDVDZrk	5394
+VPSHLDVDZrkz	5395
+VPSHLDVQZ	5396
+VPSHLDVQZm	5397
+VPSHLDVQZmb	5398
+VPSHLDVQZmbk	5399
+VPSHLDVQZmbkz	5400
+VPSHLDVQZmk	5401
+VPSHLDVQZmkz	5402
+VPSHLDVQZr	5403
+VPSHLDVQZrk	5404
+VPSHLDVQZrkz	5405
+VPSHLDVWZ	5406
+VPSHLDVWZm	5407
+VPSHLDVWZmk	5408
+VPSHLDVWZmkz	5409
+VPSHLDVWZr	5410
+VPSHLDVWZrk	5411
+VPSHLDVWZrkz	5412
+VPSHLDWZ	5413
+VPSHLDWZrmi	5414
+VPSHLDWZrmik	5415
+VPSHLDWZrmikz	5416
+VPSHLDWZrri	5417
+VPSHLDWZrrik	5418
+VPSHLDWZrrikz	5419
+VPSHLDmr	5420
+VPSHLDrm	5421
+VPSHLDrr	5422
+VPSHLDrr_REV	5423
+VPSHLQmr	5424
+VPSHLQrm	5425
+VPSHLQrr	5426
+VPSHLQrr_REV	5427
+VPSHLWmr	5428
+VPSHLWrm	5429
+VPSHLWrr	5430
+VPSHLWrr_REV	5431
+VPSHRDDZ	5432
+VPSHRDDZrmbi	5433
+VPSHRDDZrmbik	5434
+VPSHRDDZrmbikz	5435
+VPSHRDDZrmi	5436
+VPSHRDDZrmik	5437
+VPSHRDDZrmikz	5438
+VPSHRDDZrri	5439
+VPSHRDDZrrik	5440
+VPSHRDDZrrikz	5441
+VPSHRDQZ	5442
+VPSHRDQZrmbi	5443
+VPSHRDQZrmbik	5444
+VPSHRDQZrmbikz	5445
+VPSHRDQZrmi	5446
+VPSHRDQZrmik	5447
+VPSHRDQZrmikz	5448
+VPSHRDQZrri	5449
+VPSHRDQZrrik	5450
+VPSHRDQZrrikz	5451
+VPSHRDVDZ	5452
+VPSHRDVDZm	5453
+VPSHRDVDZmb	5454
+VPSHRDVDZmbk	5455
+VPSHRDVDZmbkz	5456
+VPSHRDVDZmk	5457
+VPSHRDVDZmkz	5458
+VPSHRDVDZr	5459
+VPSHRDVDZrk	5460
+VPSHRDVDZrkz	5461
+VPSHRDVQZ	5462
+VPSHRDVQZm	5463
+VPSHRDVQZmb	5464
+VPSHRDVQZmbk	5465
+VPSHRDVQZmbkz	5466
+VPSHRDVQZmk	5467
+VPSHRDVQZmkz	5468
+VPSHRDVQZr	5469
+VPSHRDVQZrk	5470
+VPSHRDVQZrkz	5471
+VPSHRDVWZ	5472
+VPSHRDVWZm	5473
+VPSHRDVWZmk	5474
+VPSHRDVWZmkz	5475
+VPSHRDVWZr	5476
+VPSHRDVWZrk	5477
+VPSHRDVWZrkz	5478
+VPSHRDWZ	5479
+VPSHRDWZrmi	5480
+VPSHRDWZrmik	5481
+VPSHRDWZrmikz	5482
+VPSHRDWZrri	5483
+VPSHRDWZrrik	5484
+VPSHRDWZrrikz	5485
+VPSHUFBITQMBZ	5486
+VPSHUFBITQMBZrm	5487
+VPSHUFBITQMBZrmk	5488
+VPSHUFBITQMBZrr	5489
+VPSHUFBITQMBZrrk	5490
+VPSHUFBYrm	5491
+VPSHUFBYrr	5492
+VPSHUFBZ	5493
+VPSHUFBZrm	5494
+VPSHUFBZrmk	5495
+VPSHUFBZrmkz	5496
+VPSHUFBZrr	5497
+VPSHUFBZrrk	5498
+VPSHUFBZrrkz	5499
+VPSHUFBrm	5500
+VPSHUFBrr	5501
+VPSHUFDYmi	5502
+VPSHUFDYri	5503
+VPSHUFDZ	5504
+VPSHUFDZmbi	5505
+VPSHUFDZmbik	5506
+VPSHUFDZmbikz	5507
+VPSHUFDZmi	5508
+VPSHUFDZmik	5509
+VPSHUFDZmikz	5510
+VPSHUFDZri	5511
+VPSHUFDZrik	5512
+VPSHUFDZrikz	5513
+VPSHUFDmi	5514
+VPSHUFDri	5515
+VPSHUFHWYmi	5516
+VPSHUFHWYri	5517
+VPSHUFHWZ	5518
+VPSHUFHWZmi	5519
+VPSHUFHWZmik	5520
+VPSHUFHWZmikz	5521
+VPSHUFHWZri	5522
+VPSHUFHWZrik	5523
+VPSHUFHWZrikz	5524
+VPSHUFHWmi	5525
+VPSHUFHWri	5526
+VPSHUFLWYmi	5527
+VPSHUFLWYri	5528
+VPSHUFLWZ	5529
+VPSHUFLWZmi	5530
+VPSHUFLWZmik	5531
+VPSHUFLWZmikz	5532
+VPSHUFLWZri	5533
+VPSHUFLWZrik	5534
+VPSHUFLWZrikz	5535
+VPSHUFLWmi	5536
+VPSHUFLWri	5537
+VPSIGNBYrm	5538
+VPSIGNBYrr	5539
+VPSIGNBrm	5540
+VPSIGNBrr	5541
+VPSIGNDYrm	5542
+VPSIGNDYrr	5543
+VPSIGNDrm	5544
+VPSIGNDrr	5545
+VPSIGNWYrm	5546
+VPSIGNWYrr	5547
+VPSIGNWrm	5548
+VPSIGNWrr	5549
+VPSLLDQYri	5550
+VPSLLDQZ	5551
+VPSLLDQZmi	5552
+VPSLLDQZri	5553
+VPSLLDQri	5554
+VPSLLDYri	5555
+VPSLLDYrm	5556
+VPSLLDYrr	5557
+VPSLLDZ	5558
+VPSLLDZmbi	5559
+VPSLLDZmbik	5560
+VPSLLDZmbikz	5561
+VPSLLDZmi	5562
+VPSLLDZmik	5563
+VPSLLDZmikz	5564
+VPSLLDZri	5565
+VPSLLDZrik	5566
+VPSLLDZrikz	5567
+VPSLLDZrm	5568
+VPSLLDZrmk	5569
+VPSLLDZrmkz	5570
+VPSLLDZrr	5571
+VPSLLDZrrk	5572
+VPSLLDZrrkz	5573
+VPSLLDri	5574
+VPSLLDrm	5575
+VPSLLDrr	5576
+VPSLLQYri	5577
+VPSLLQYrm	5578
+VPSLLQYrr	5579
+VPSLLQZ	5580
+VPSLLQZmbi	5581
+VPSLLQZmbik	5582
+VPSLLQZmbikz	5583
+VPSLLQZmi	5584
+VPSLLQZmik	5585
+VPSLLQZmikz	5586
+VPSLLQZri	5587
+VPSLLQZrik	5588
+VPSLLQZrikz	5589
+VPSLLQZrm	5590
+VPSLLQZrmk	5591
+VPSLLQZrmkz	5592
+VPSLLQZrr	5593
+VPSLLQZrrk	5594
+VPSLLQZrrkz	5595
+VPSLLQri	5596
+VPSLLQrm	5597
+VPSLLQrr	5598
+VPSLLVDYrm	5599
+VPSLLVDYrr	5600
+VPSLLVDZ	5601
+VPSLLVDZrm	5602
+VPSLLVDZrmb	5603
+VPSLLVDZrmbk	5604
+VPSLLVDZrmbkz	5605
+VPSLLVDZrmk	5606
+VPSLLVDZrmkz	5607
+VPSLLVDZrr	5608
+VPSLLVDZrrk	5609
+VPSLLVDZrrkz	5610
+VPSLLVDrm	5611
+VPSLLVDrr	5612
+VPSLLVQYrm	5613
+VPSLLVQYrr	5614
+VPSLLVQZ	5615
+VPSLLVQZrm	5616
+VPSLLVQZrmb	5617
+VPSLLVQZrmbk	5618
+VPSLLVQZrmbkz	5619
+VPSLLVQZrmk	5620
+VPSLLVQZrmkz	5621
+VPSLLVQZrr	5622
+VPSLLVQZrrk	5623
+VPSLLVQZrrkz	5624
+VPSLLVQrm	5625
+VPSLLVQrr	5626
+VPSLLVWZ	5627
+VPSLLVWZrm	5628
+VPSLLVWZrmk	5629
+VPSLLVWZrmkz	5630
+VPSLLVWZrr	5631
+VPSLLVWZrrk	5632
+VPSLLVWZrrkz	5633
+VPSLLWYri	5634
+VPSLLWYrm	5635
+VPSLLWYrr	5636
+VPSLLWZ	5637
+VPSLLWZmi	5638
+VPSLLWZmik	5639
+VPSLLWZmikz	5640
+VPSLLWZri	5641
+VPSLLWZrik	5642
+VPSLLWZrikz	5643
+VPSLLWZrm	5644
+VPSLLWZrmk	5645
+VPSLLWZrmkz	5646
+VPSLLWZrr	5647
+VPSLLWZrrk	5648
+VPSLLWZrrkz	5649
+VPSLLWri	5650
+VPSLLWrm	5651
+VPSLLWrr	5652
+VPSRADYri	5653
+VPSRADYrm	5654
+VPSRADYrr	5655
+VPSRADZ	5656
+VPSRADZmbi	5657
+VPSRADZmbik	5658
+VPSRADZmbikz	5659
+VPSRADZmi	5660
+VPSRADZmik	5661
+VPSRADZmikz	5662
+VPSRADZri	5663
+VPSRADZrik	5664
+VPSRADZrikz	5665
+VPSRADZrm	5666
+VPSRADZrmk	5667
+VPSRADZrmkz	5668
+VPSRADZrr	5669
+VPSRADZrrk	5670
+VPSRADZrrkz	5671
+VPSRADri	5672
+VPSRADrm	5673
+VPSRADrr	5674
+VPSRAQZ	5675
+VPSRAQZmbi	5676
+VPSRAQZmbik	5677
+VPSRAQZmbikz	5678
+VPSRAQZmi	5679
+VPSRAQZmik	5680
+VPSRAQZmikz	5681
+VPSRAQZri	5682
+VPSRAQZrik	5683
+VPSRAQZrikz	5684
+VPSRAQZrm	5685
+VPSRAQZrmk	5686
+VPSRAQZrmkz	5687
+VPSRAQZrr	5688
+VPSRAQZrrk	5689
+VPSRAQZrrkz	5690
+VPSRAVDYrm	5691
+VPSRAVDYrr	5692
+VPSRAVDZ	5693
+VPSRAVDZrm	5694
+VPSRAVDZrmb	5695
+VPSRAVDZrmbk	5696
+VPSRAVDZrmbkz	5697
+VPSRAVDZrmk	5698
+VPSRAVDZrmkz	5699
+VPSRAVDZrr	5700
+VPSRAVDZrrk	5701
+VPSRAVDZrrkz	5702
+VPSRAVDrm	5703
+VPSRAVDrr	5704
+VPSRAVQZ	5705
+VPSRAVQZrm	5706
+VPSRAVQZrmb	5707
+VPSRAVQZrmbk	5708
+VPSRAVQZrmbkz	5709
+VPSRAVQZrmk	5710
+VPSRAVQZrmkz	5711
+VPSRAVQZrr	5712
+VPSRAVQZrrk	5713
+VPSRAVQZrrkz	5714
+VPSRAVWZ	5715
+VPSRAVWZrm	5716
+VPSRAVWZrmk	5717
+VPSRAVWZrmkz	5718
+VPSRAVWZrr	5719
+VPSRAVWZrrk	5720
+VPSRAVWZrrkz	5721
+VPSRAWYri	5722
+VPSRAWYrm	5723
+VPSRAWYrr	5724
+VPSRAWZ	5725
+VPSRAWZmi	5726
+VPSRAWZmik	5727
+VPSRAWZmikz	5728
+VPSRAWZri	5729
+VPSRAWZrik	5730
+VPSRAWZrikz	5731
+VPSRAWZrm	5732
+VPSRAWZrmk	5733
+VPSRAWZrmkz	5734
+VPSRAWZrr	5735
+VPSRAWZrrk	5736
+VPSRAWZrrkz	5737
+VPSRAWri	5738
+VPSRAWrm	5739
+VPSRAWrr	5740
+VPSRLDQYri	5741
+VPSRLDQZ	5742
+VPSRLDQZmi	5743
+VPSRLDQZri	5744
+VPSRLDQri	5745
+VPSRLDYri	5746
+VPSRLDYrm	5747
+VPSRLDYrr	5748
+VPSRLDZ	5749
+VPSRLDZmbi	5750
+VPSRLDZmbik	5751
+VPSRLDZmbikz	5752
+VPSRLDZmi	5753
+VPSRLDZmik	5754
+VPSRLDZmikz	5755
+VPSRLDZri	5756
+VPSRLDZrik	5757
+VPSRLDZrikz	5758
+VPSRLDZrm	5759
+VPSRLDZrmk	5760
+VPSRLDZrmkz	5761
+VPSRLDZrr	5762
+VPSRLDZrrk	5763
+VPSRLDZrrkz	5764
+VPSRLDri	5765
+VPSRLDrm	5766
+VPSRLDrr	5767
+VPSRLQYri	5768
+VPSRLQYrm	5769
+VPSRLQYrr	5770
+VPSRLQZ	5771
+VPSRLQZmbi	5772
+VPSRLQZmbik	5773
+VPSRLQZmbikz	5774
+VPSRLQZmi	5775
+VPSRLQZmik	5776
+VPSRLQZmikz	5777
+VPSRLQZri	5778
+VPSRLQZrik	5779
+VPSRLQZrikz	5780
+VPSRLQZrm	5781
+VPSRLQZrmk	5782
+VPSRLQZrmkz	5783
+VPSRLQZrr	5784
+VPSRLQZrrk	5785
+VPSRLQZrrkz	5786
+VPSRLQri	5787
+VPSRLQrm	5788
+VPSRLQrr	5789
+VPSRLVDYrm	5790
+VPSRLVDYrr	5791
+VPSRLVDZ	5792
+VPSRLVDZrm	5793
+VPSRLVDZrmb	5794
+VPSRLVDZrmbk	5795
+VPSRLVDZrmbkz	5796
+VPSRLVDZrmk	5797
+VPSRLVDZrmkz	5798
+VPSRLVDZrr	5799
+VPSRLVDZrrk	5800
+VPSRLVDZrrkz	5801
+VPSRLVDrm	5802
+VPSRLVDrr	5803
+VPSRLVQYrm	5804
+VPSRLVQYrr	5805
+VPSRLVQZ	5806
+VPSRLVQZrm	5807
+VPSRLVQZrmb	5808
+VPSRLVQZrmbk	5809
+VPSRLVQZrmbkz	5810
+VPSRLVQZrmk	5811
+VPSRLVQZrmkz	5812
+VPSRLVQZrr	5813
+VPSRLVQZrrk	5814
+VPSRLVQZrrkz	5815
+VPSRLVQrm	5816
+VPSRLVQrr	5817
+VPSRLVWZ	5818
+VPSRLVWZrm	5819
+VPSRLVWZrmk	5820
+VPSRLVWZrmkz	5821
+VPSRLVWZrr	5822
+VPSRLVWZrrk	5823
+VPSRLVWZrrkz	5824
+VPSRLWYri	5825
+VPSRLWYrm	5826
+VPSRLWYrr	5827
+VPSRLWZ	5828
+VPSRLWZmi	5829
+VPSRLWZmik	5830
+VPSRLWZmikz	5831
+VPSRLWZri	5832
+VPSRLWZrik	5833
+VPSRLWZrikz	5834
+VPSRLWZrm	5835
+VPSRLWZrmk	5836
+VPSRLWZrmkz	5837
+VPSRLWZrr	5838
+VPSRLWZrrk	5839
+VPSRLWZrrkz	5840
+VPSRLWri	5841
+VPSRLWrm	5842
+VPSRLWrr	5843
+VPSUBBYrm	5844
+VPSUBBYrr	5845
+VPSUBBZ	5846
+VPSUBBZrm	5847
+VPSUBBZrmk	5848
+VPSUBBZrmkz	5849
+VPSUBBZrr	5850
+VPSUBBZrrk	5851
+VPSUBBZrrkz	5852
+VPSUBBrm	5853
+VPSUBBrr	5854
+VPSUBDYrm	5855
+VPSUBDYrr	5856
+VPSUBDZ	5857
+VPSUBDZrm	5858
+VPSUBDZrmb	5859
+VPSUBDZrmbk	5860
+VPSUBDZrmbkz	5861
+VPSUBDZrmk	5862
+VPSUBDZrmkz	5863
+VPSUBDZrr	5864
+VPSUBDZrrk	5865
+VPSUBDZrrkz	5866
+VPSUBDrm	5867
+VPSUBDrr	5868
+VPSUBQYrm	5869
+VPSUBQYrr	5870
+VPSUBQZ	5871
+VPSUBQZrm	5872
+VPSUBQZrmb	5873
+VPSUBQZrmbk	5874
+VPSUBQZrmbkz	5875
+VPSUBQZrmk	5876
+VPSUBQZrmkz	5877
+VPSUBQZrr	5878
+VPSUBQZrrk	5879
+VPSUBQZrrkz	5880
+VPSUBQrm	5881
+VPSUBQrr	5882
+VPSUBSBYrm	5883
+VPSUBSBYrr	5884
+VPSUBSBZ	5885
+VPSUBSBZrm	5886
+VPSUBSBZrmk	5887
+VPSUBSBZrmkz	5888
+VPSUBSBZrr	5889
+VPSUBSBZrrk	5890
+VPSUBSBZrrkz	5891
+VPSUBSBrm	5892
+VPSUBSBrr	5893
+VPSUBSWYrm	5894
+VPSUBSWYrr	5895
+VPSUBSWZ	5896
+VPSUBSWZrm	5897
+VPSUBSWZrmk	5898
+VPSUBSWZrmkz	5899
+VPSUBSWZrr	5900
+VPSUBSWZrrk	5901
+VPSUBSWZrrkz	5902
+VPSUBSWrm	5903
+VPSUBSWrr	5904
+VPSUBUSBYrm	5905
+VPSUBUSBYrr	5906
+VPSUBUSBZ	5907
+VPSUBUSBZrm	5908
+VPSUBUSBZrmk	5909
+VPSUBUSBZrmkz	5910
+VPSUBUSBZrr	5911
+VPSUBUSBZrrk	5912
+VPSUBUSBZrrkz	5913
+VPSUBUSBrm	5914
+VPSUBUSBrr	5915
+VPSUBUSWYrm	5916
+VPSUBUSWYrr	5917
+VPSUBUSWZ	5918
+VPSUBUSWZrm	5919
+VPSUBUSWZrmk	5920
+VPSUBUSWZrmkz	5921
+VPSUBUSWZrr	5922
+VPSUBUSWZrrk	5923
+VPSUBUSWZrrkz	5924
+VPSUBUSWrm	5925
+VPSUBUSWrr	5926
+VPSUBWYrm	5927
+VPSUBWYrr	5928
+VPSUBWZ	5929
+VPSUBWZrm	5930
+VPSUBWZrmk	5931
+VPSUBWZrmkz	5932
+VPSUBWZrr	5933
+VPSUBWZrrk	5934
+VPSUBWZrrkz	5935
+VPSUBWrm	5936
+VPSUBWrr	5937
+VPTERNLOGDZ	5938
+VPTERNLOGDZrmbi	5939
+VPTERNLOGDZrmbik	5940
+VPTERNLOGDZrmbikz	5941
+VPTERNLOGDZrmi	5942
+VPTERNLOGDZrmik	5943
+VPTERNLOGDZrmikz	5944
+VPTERNLOGDZrri	5945
+VPTERNLOGDZrrik	5946
+VPTERNLOGDZrrikz	5947
+VPTERNLOGQZ	5948
+VPTERNLOGQZrmbi	5949
+VPTERNLOGQZrmbik	5950
+VPTERNLOGQZrmbikz	5951
+VPTERNLOGQZrmi	5952
+VPTERNLOGQZrmik	5953
+VPTERNLOGQZrmikz	5954
+VPTERNLOGQZrri	5955
+VPTERNLOGQZrrik	5956
+VPTERNLOGQZrrikz	5957
+VPTESTMBZ	5958
+VPTESTMBZrm	5959
+VPTESTMBZrmk	5960
+VPTESTMBZrr	5961
+VPTESTMBZrrk	5962
+VPTESTMDZ	5963
+VPTESTMDZrm	5964
+VPTESTMDZrmb	5965
+VPTESTMDZrmbk	5966
+VPTESTMDZrmk	5967
+VPTESTMDZrr	5968
+VPTESTMDZrrk	5969
+VPTESTMQZ	5970
+VPTESTMQZrm	5971
+VPTESTMQZrmb	5972
+VPTESTMQZrmbk	5973
+VPTESTMQZrmk	5974
+VPTESTMQZrr	5975
+VPTESTMQZrrk	5976
+VPTESTMWZ	5977
+VPTESTMWZrm	5978
+VPTESTMWZrmk	5979
+VPTESTMWZrr	5980
+VPTESTMWZrrk	5981
+VPTESTNMBZ	5982
+VPTESTNMBZrm	5983
+VPTESTNMBZrmk	5984
+VPTESTNMBZrr	5985
+VPTESTNMBZrrk	5986
+VPTESTNMDZ	5987
+VPTESTNMDZrm	5988
+VPTESTNMDZrmb	5989
+VPTESTNMDZrmbk	5990
+VPTESTNMDZrmk	5991
+VPTESTNMDZrr	5992
+VPTESTNMDZrrk	5993
+VPTESTNMQZ	5994
+VPTESTNMQZrm	5995
+VPTESTNMQZrmb	5996
+VPTESTNMQZrmbk	5997
+VPTESTNMQZrmk	5998
+VPTESTNMQZrr	5999
+VPTESTNMQZrrk	6000
+VPTESTNMWZ	6001
+VPTESTNMWZrm	6002
+VPTESTNMWZrmk	6003
+VPTESTNMWZrr	6004
+VPTESTNMWZrrk	6005
+VPTESTYrm	6006
+VPTESTYrr	6007
+VPTESTrm	6008
+VPTESTrr	6009
+VPUNPCKHBWYrm	6010
+VPUNPCKHBWYrr	6011
+VPUNPCKHBWZ	6012
+VPUNPCKHBWZrm	6013
+VPUNPCKHBWZrmk	6014
+VPUNPCKHBWZrmkz	6015
+VPUNPCKHBWZrr	6016
+VPUNPCKHBWZrrk	6017
+VPUNPCKHBWZrrkz	6018
+VPUNPCKHBWrm	6019
+VPUNPCKHBWrr	6020
+VPUNPCKHDQYrm	6021
+VPUNPCKHDQYrr	6022
+VPUNPCKHDQZ	6023
+VPUNPCKHDQZrm	6024
+VPUNPCKHDQZrmb	6025
+VPUNPCKHDQZrmbk	6026
+VPUNPCKHDQZrmbkz	6027
+VPUNPCKHDQZrmk	6028
+VPUNPCKHDQZrmkz	6029
+VPUNPCKHDQZrr	6030
+VPUNPCKHDQZrrk	6031
+VPUNPCKHDQZrrkz	6032
+VPUNPCKHDQrm	6033
+VPUNPCKHDQrr	6034
+VPUNPCKHQDQYrm	6035
+VPUNPCKHQDQYrr	6036
+VPUNPCKHQDQZ	6037
+VPUNPCKHQDQZrm	6038
+VPUNPCKHQDQZrmb	6039
+VPUNPCKHQDQZrmbk	6040
+VPUNPCKHQDQZrmbkz	6041
+VPUNPCKHQDQZrmk	6042
+VPUNPCKHQDQZrmkz	6043
+VPUNPCKHQDQZrr	6044
+VPUNPCKHQDQZrrk	6045
+VPUNPCKHQDQZrrkz	6046
+VPUNPCKHQDQrm	6047
+VPUNPCKHQDQrr	6048
+VPUNPCKHWDYrm	6049
+VPUNPCKHWDYrr	6050
+VPUNPCKHWDZ	6051
+VPUNPCKHWDZrm	6052
+VPUNPCKHWDZrmk	6053
+VPUNPCKHWDZrmkz	6054
+VPUNPCKHWDZrr	6055
+VPUNPCKHWDZrrk	6056
+VPUNPCKHWDZrrkz	6057
+VPUNPCKHWDrm	6058
+VPUNPCKHWDrr	6059
+VPUNPCKLBWYrm	6060
+VPUNPCKLBWYrr	6061
+VPUNPCKLBWZ	6062
+VPUNPCKLBWZrm	6063
+VPUNPCKLBWZrmk	6064
+VPUNPCKLBWZrmkz	6065
+VPUNPCKLBWZrr	6066
+VPUNPCKLBWZrrk	6067
+VPUNPCKLBWZrrkz	6068
+VPUNPCKLBWrm	6069
+VPUNPCKLBWrr	6070
+VPUNPCKLDQYrm	6071
+VPUNPCKLDQYrr	6072
+VPUNPCKLDQZ	6073
+VPUNPCKLDQZrm	6074
+VPUNPCKLDQZrmb	6075
+VPUNPCKLDQZrmbk	6076
+VPUNPCKLDQZrmbkz	6077
+VPUNPCKLDQZrmk	6078
+VPUNPCKLDQZrmkz	6079
+VPUNPCKLDQZrr	6080
+VPUNPCKLDQZrrk	6081
+VPUNPCKLDQZrrkz	6082
+VPUNPCKLDQrm	6083
+VPUNPCKLDQrr	6084
+VPUNPCKLQDQYrm	6085
+VPUNPCKLQDQYrr	6086
+VPUNPCKLQDQZ	6087
+VPUNPCKLQDQZrm	6088
+VPUNPCKLQDQZrmb	6089
+VPUNPCKLQDQZrmbk	6090
+VPUNPCKLQDQZrmbkz	6091
+VPUNPCKLQDQZrmk	6092
+VPUNPCKLQDQZrmkz	6093
+VPUNPCKLQDQZrr	6094
+VPUNPCKLQDQZrrk	6095
+VPUNPCKLQDQZrrkz	6096
+VPUNPCKLQDQrm	6097
+VPUNPCKLQDQrr	6098
+VPUNPCKLWDYrm	6099
+VPUNPCKLWDYrr	6100
+VPUNPCKLWDZ	6101
+VPUNPCKLWDZrm	6102
+VPUNPCKLWDZrmk	6103
+VPUNPCKLWDZrmkz	6104
+VPUNPCKLWDZrr	6105
+VPUNPCKLWDZrrk	6106
+VPUNPCKLWDZrrkz	6107
+VPUNPCKLWDrm	6108
+VPUNPCKLWDrr	6109
+VPXORDZ	6110
+VPXORDZrm	6111
+VPXORDZrmb	6112
+VPXORDZrmbk	6113
+VPXORDZrmbkz	6114
+VPXORDZrmk	6115
+VPXORDZrmkz	6116
+VPXORDZrr	6117
+VPXORDZrrk	6118
+VPXORDZrrkz	6119
+VPXORQZ	6120
+VPXORQZrm	6121
+VPXORQZrmb	6122
+VPXORQZrmbk	6123
+VPXORQZrmbkz	6124
+VPXORQZrmk	6125
+VPXORQZrmkz	6126
+VPXORQZrr	6127
+VPXORQZrrk	6128
+VPXORQZrrkz	6129
+VPXORYrm	6130
+VPXORYrr	6131
+VPXORrm	6132
+VPXORrr	6133
+VRANGEPDZ	6134
+VRANGEPDZrmbi	6135
+VRANGEPDZrmbik	6136
+VRANGEPDZrmbikz	6137
+VRANGEPDZrmi	6138
+VRANGEPDZrmik	6139
+VRANGEPDZrmikz	6140
+VRANGEPDZrri	6141
+VRANGEPDZrrib	6142
+VRANGEPDZrribk	6143
+VRANGEPDZrribkz	6144
+VRANGEPDZrrik	6145
+VRANGEPDZrrikz	6146
+VRANGEPSZ	6147
+VRANGEPSZrmbi	6148
+VRANGEPSZrmbik	6149
+VRANGEPSZrmbikz	6150
+VRANGEPSZrmi	6151
+VRANGEPSZrmik	6152
+VRANGEPSZrmikz	6153
+VRANGEPSZrri	6154
+VRANGEPSZrrib	6155
+VRANGEPSZrribk	6156
+VRANGEPSZrribkz	6157
+VRANGEPSZrrik	6158
+VRANGEPSZrrikz	6159
+VRANGESDZrmi	6160
+VRANGESDZrmik	6161
+VRANGESDZrmikz	6162
+VRANGESDZrri	6163
+VRANGESDZrrib	6164
+VRANGESDZrribk	6165
+VRANGESDZrribkz	6166
+VRANGESDZrrik	6167
+VRANGESDZrrikz	6168
+VRANGESSZrmi	6169
+VRANGESSZrmik	6170
+VRANGESSZrmikz	6171
+VRANGESSZrri	6172
+VRANGESSZrrib	6173
+VRANGESSZrribk	6174
+VRANGESSZrribkz	6175
+VRANGESSZrrik	6176
+VRANGESSZrrikz	6177
+VRCP	6178
+VRCPBF	6179
+VRCPPHZ	6180
+VRCPPHZm	6181
+VRCPPHZmb	6182
+VRCPPHZmbk	6183
+VRCPPHZmbkz	6184
+VRCPPHZmk	6185
+VRCPPHZmkz	6186
+VRCPPHZr	6187
+VRCPPHZrk	6188
+VRCPPHZrkz	6189
+VRCPPSYm	6190
+VRCPPSYr	6191
+VRCPPSm	6192
+VRCPPSr	6193
+VRCPSHZrm	6194
+VRCPSHZrmk	6195
+VRCPSHZrmkz	6196
+VRCPSHZrr	6197
+VRCPSHZrrk	6198
+VRCPSHZrrkz	6199
+VRCPSSm	6200
+VRCPSSm_Int	6201
+VRCPSSr	6202
+VRCPSSr_Int	6203
+VREDUCEBF	6204
+VREDUCEPDZ	6205
+VREDUCEPDZrmbi	6206
+VREDUCEPDZrmbik	6207
+VREDUCEPDZrmbikz	6208
+VREDUCEPDZrmi	6209
+VREDUCEPDZrmik	6210
+VREDUCEPDZrmikz	6211
+VREDUCEPDZrri	6212
+VREDUCEPDZrrib	6213
+VREDUCEPDZrribk	6214
+VREDUCEPDZrribkz	6215
+VREDUCEPDZrrik	6216
+VREDUCEPDZrrikz	6217
+VREDUCEPHZ	6218
+VREDUCEPHZrmbi	6219
+VREDUCEPHZrmbik	6220
+VREDUCEPHZrmbikz	6221
+VREDUCEPHZrmi	6222
+VREDUCEPHZrmik	6223
+VREDUCEPHZrmikz	6224
+VREDUCEPHZrri	6225
+VREDUCEPHZrrib	6226
+VREDUCEPHZrribk	6227
+VREDUCEPHZrribkz	6228
+VREDUCEPHZrrik	6229
+VREDUCEPHZrrikz	6230
+VREDUCEPSZ	6231
+VREDUCEPSZrmbi	6232
+VREDUCEPSZrmbik	6233
+VREDUCEPSZrmbikz	6234
+VREDUCEPSZrmi	6235
+VREDUCEPSZrmik	6236
+VREDUCEPSZrmikz	6237
+VREDUCEPSZrri	6238
+VREDUCEPSZrrib	6239
+VREDUCEPSZrribk	6240
+VREDUCEPSZrribkz	6241
+VREDUCEPSZrrik	6242
+VREDUCEPSZrrikz	6243
+VREDUCESDZrmi	6244
+VREDUCESDZrmik	6245
+VREDUCESDZrmikz	6246
+VREDUCESDZrri	6247
+VREDUCESDZrrib	6248
+VREDUCESDZrribk	6249
+VREDUCESDZrribkz	6250
+VREDUCESDZrrik	6251
+VREDUCESDZrrikz	6252
+VREDUCESHZrmi	6253
+VREDUCESHZrmik	6254
+VREDUCESHZrmikz	6255
+VREDUCESHZrri	6256
+VREDUCESHZrrib	6257
+VREDUCESHZrribk	6258
+VREDUCESHZrribkz	6259
+VREDUCESHZrrik	6260
+VREDUCESHZrrikz	6261
+VREDUCESSZrmi	6262
+VREDUCESSZrmik	6263
+VREDUCESSZrmikz	6264
+VREDUCESSZrri	6265
+VREDUCESSZrrib	6266
+VREDUCESSZrribk	6267
+VREDUCESSZrribkz	6268
+VREDUCESSZrrik	6269
+VREDUCESSZrrikz	6270
+VRNDSCALEBF	6271
+VRNDSCALEPDZ	6272
+VRNDSCALEPDZrmbi	6273
+VRNDSCALEPDZrmbik	6274
+VRNDSCALEPDZrmbikz	6275
+VRNDSCALEPDZrmi	6276
+VRNDSCALEPDZrmik	6277
+VRNDSCALEPDZrmikz	6278
+VRNDSCALEPDZrri	6279
+VRNDSCALEPDZrrib	6280
+VRNDSCALEPDZrribk	6281
+VRNDSCALEPDZrribkz	6282
+VRNDSCALEPDZrrik	6283
+VRNDSCALEPDZrrikz	6284
+VRNDSCALEPHZ	6285
+VRNDSCALEPHZrmbi	6286
+VRNDSCALEPHZrmbik	6287
+VRNDSCALEPHZrmbikz	6288
+VRNDSCALEPHZrmi	6289
+VRNDSCALEPHZrmik	6290
+VRNDSCALEPHZrmikz	6291
+VRNDSCALEPHZrri	6292
+VRNDSCALEPHZrrib	6293
+VRNDSCALEPHZrribk	6294
+VRNDSCALEPHZrribkz	6295
+VRNDSCALEPHZrrik	6296
+VRNDSCALEPHZrrikz	6297
+VRNDSCALEPSZ	6298
+VRNDSCALEPSZrmbi	6299
+VRNDSCALEPSZrmbik	6300
+VRNDSCALEPSZrmbikz	6301
+VRNDSCALEPSZrmi	6302
+VRNDSCALEPSZrmik	6303
+VRNDSCALEPSZrmikz	6304
+VRNDSCALEPSZrri	6305
+VRNDSCALEPSZrrib	6306
+VRNDSCALEPSZrribk	6307
+VRNDSCALEPSZrribkz	6308
+VRNDSCALEPSZrrik	6309
+VRNDSCALEPSZrrikz	6310
+VRNDSCALESDZrmi	6311
+VRNDSCALESDZrmi_Int	6312
+VRNDSCALESDZrmik_Int	6313
+VRNDSCALESDZrmikz_Int	6314
+VRNDSCALESDZrri	6315
+VRNDSCALESDZrri_Int	6316
+VRNDSCALESDZrrib_Int	6317
+VRNDSCALESDZrribk_Int	6318
+VRNDSCALESDZrribkz_Int	6319
+VRNDSCALESDZrrik_Int	6320
+VRNDSCALESDZrrikz_Int	6321
+VRNDSCALESHZrmi	6322
+VRNDSCALESHZrmi_Int	6323
+VRNDSCALESHZrmik_Int	6324
+VRNDSCALESHZrmikz_Int	6325
+VRNDSCALESHZrri	6326
+VRNDSCALESHZrri_Int	6327
+VRNDSCALESHZrrib_Int	6328
+VRNDSCALESHZrribk_Int	6329
+VRNDSCALESHZrribkz_Int	6330
+VRNDSCALESHZrrik_Int	6331
+VRNDSCALESHZrrikz_Int	6332
+VRNDSCALESSZrmi	6333
+VRNDSCALESSZrmi_Int	6334
+VRNDSCALESSZrmik_Int	6335
+VRNDSCALESSZrmikz_Int	6336
+VRNDSCALESSZrri	6337
+VRNDSCALESSZrri_Int	6338
+VRNDSCALESSZrrib_Int	6339
+VRNDSCALESSZrribk_Int	6340
+VRNDSCALESSZrribkz_Int	6341
+VRNDSCALESSZrrik_Int	6342
+VRNDSCALESSZrrikz_Int	6343
+VROUNDPDYmi	6344
+VROUNDPDYri	6345
+VROUNDPDmi	6346
+VROUNDPDri	6347
+VROUNDPSYmi	6348
+VROUNDPSYri	6349
+VROUNDPSmi	6350
+VROUNDPSri	6351
+VROUNDSDmi	6352
+VROUNDSDmi_Int	6353
+VROUNDSDri	6354
+VROUNDSDri_Int	6355
+VROUNDSSmi	6356
+VROUNDSSmi_Int	6357
+VROUNDSSri	6358
+VROUNDSSri_Int	6359
+VRSQRT	6360
+VRSQRTBF	6361
+VRSQRTPHZ	6362
+VRSQRTPHZm	6363
+VRSQRTPHZmb	6364
+VRSQRTPHZmbk	6365
+VRSQRTPHZmbkz	6366
+VRSQRTPHZmk	6367
+VRSQRTPHZmkz	6368
+VRSQRTPHZr	6369
+VRSQRTPHZrk	6370
+VRSQRTPHZrkz	6371
+VRSQRTPSYm	6372
+VRSQRTPSYr	6373
+VRSQRTPSm	6374
+VRSQRTPSr	6375
+VRSQRTSHZrm	6376
+VRSQRTSHZrmk	6377
+VRSQRTSHZrmkz	6378
+VRSQRTSHZrr	6379
+VRSQRTSHZrrk	6380
+VRSQRTSHZrrkz	6381
+VRSQRTSSm	6382
+VRSQRTSSm_Int	6383
+VRSQRTSSr	6384
+VRSQRTSSr_Int	6385
+VSCALEFBF	6386
+VSCALEFPDZ	6387
+VSCALEFPDZrm	6388
+VSCALEFPDZrmb	6389
+VSCALEFPDZrmbk	6390
+VSCALEFPDZrmbkz	6391
+VSCALEFPDZrmk	6392
+VSCALEFPDZrmkz	6393
+VSCALEFPDZrr	6394
+VSCALEFPDZrrb	6395
+VSCALEFPDZrrbk	6396
+VSCALEFPDZrrbkz	6397
+VSCALEFPDZrrk	6398
+VSCALEFPDZrrkz	6399
+VSCALEFPHZ	6400
+VSCALEFPHZrm	6401
+VSCALEFPHZrmb	6402
+VSCALEFPHZrmbk	6403
+VSCALEFPHZrmbkz	6404
+VSCALEFPHZrmk	6405
+VSCALEFPHZrmkz	6406
+VSCALEFPHZrr	6407
+VSCALEFPHZrrb	6408
+VSCALEFPHZrrbk	6409
+VSCALEFPHZrrbkz	6410
+VSCALEFPHZrrk	6411
+VSCALEFPHZrrkz	6412
+VSCALEFPSZ	6413
+VSCALEFPSZrm	6414
+VSCALEFPSZrmb	6415
+VSCALEFPSZrmbk	6416
+VSCALEFPSZrmbkz	6417
+VSCALEFPSZrmk	6418
+VSCALEFPSZrmkz	6419
+VSCALEFPSZrr	6420
+VSCALEFPSZrrb	6421
+VSCALEFPSZrrbk	6422
+VSCALEFPSZrrbkz	6423
+VSCALEFPSZrrk	6424
+VSCALEFPSZrrkz	6425
+VSCALEFSDZrm	6426
+VSCALEFSDZrmk	6427
+VSCALEFSDZrmkz	6428
+VSCALEFSDZrr	6429
+VSCALEFSDZrrb_Int	6430
+VSCALEFSDZrrbk_Int	6431
+VSCALEFSDZrrbkz_Int	6432
+VSCALEFSDZrrk	6433
+VSCALEFSDZrrkz	6434
+VSCALEFSHZrm	6435
+VSCALEFSHZrmk	6436
+VSCALEFSHZrmkz	6437
+VSCALEFSHZrr	6438
+VSCALEFSHZrrb_Int	6439
+VSCALEFSHZrrbk_Int	6440
+VSCALEFSHZrrbkz_Int	6441
+VSCALEFSHZrrk	6442
+VSCALEFSHZrrkz	6443
+VSCALEFSSZrm	6444
+VSCALEFSSZrmk	6445
+VSCALEFSSZrmkz	6446
+VSCALEFSSZrr	6447
+VSCALEFSSZrrb_Int	6448
+VSCALEFSSZrrbk_Int	6449
+VSCALEFSSZrrbkz_Int	6450
+VSCALEFSSZrrk	6451
+VSCALEFSSZrrkz	6452
+VSCATTERDPDZ	6453
+VSCATTERDPDZmr	6454
+VSCATTERDPSZ	6455
+VSCATTERDPSZmr	6456
+VSCATTERPF	6457
+VSCATTERQPDZ	6458
+VSCATTERQPDZmr	6459
+VSCATTERQPSZ	6460
+VSCATTERQPSZmr	6461
+VSHA	6462
+VSHUFF	6463
+VSHUFI	6464
+VSHUFPDYrmi	6465
+VSHUFPDYrri	6466
+VSHUFPDZ	6467
+VSHUFPDZrmbi	6468
+VSHUFPDZrmbik	6469
+VSHUFPDZrmbikz	6470
+VSHUFPDZrmi	6471
+VSHUFPDZrmik	6472
+VSHUFPDZrmikz	6473
+VSHUFPDZrri	6474
+VSHUFPDZrrik	6475
+VSHUFPDZrrikz	6476
+VSHUFPDrmi	6477
+VSHUFPDrri	6478
+VSHUFPSYrmi	6479
+VSHUFPSYrri	6480
+VSHUFPSZ	6481
+VSHUFPSZrmbi	6482
+VSHUFPSZrmbik	6483
+VSHUFPSZrmbikz	6484
+VSHUFPSZrmi	6485
+VSHUFPSZrmik	6486
+VSHUFPSZrmikz	6487
+VSHUFPSZrri	6488
+VSHUFPSZrrik	6489
+VSHUFPSZrrikz	6490
+VSHUFPSrmi	6491
+VSHUFPSrri	6492
+VSM	6493
+VSQRTBF	6494
+VSQRTPDYm	6495
+VSQRTPDYr	6496
+VSQRTPDZ	6497
+VSQRTPDZm	6498
+VSQRTPDZmb	6499
+VSQRTPDZmbk	6500
+VSQRTPDZmbkz	6501
+VSQRTPDZmk	6502
+VSQRTPDZmkz	6503
+VSQRTPDZr	6504
+VSQRTPDZrb	6505
+VSQRTPDZrbk	6506
+VSQRTPDZrbkz	6507
+VSQRTPDZrk	6508
+VSQRTPDZrkz	6509
+VSQRTPDm	6510
+VSQRTPDr	6511
+VSQRTPHZ	6512
+VSQRTPHZm	6513
+VSQRTPHZmb	6514
+VSQRTPHZmbk	6515
+VSQRTPHZmbkz	6516
+VSQRTPHZmk	6517
+VSQRTPHZmkz	6518
+VSQRTPHZr	6519
+VSQRTPHZrb	6520
+VSQRTPHZrbk	6521
+VSQRTPHZrbkz	6522
+VSQRTPHZrk	6523
+VSQRTPHZrkz	6524
+VSQRTPSYm	6525
+VSQRTPSYr	6526
+VSQRTPSZ	6527
+VSQRTPSZm	6528
+VSQRTPSZmb	6529
+VSQRTPSZmbk	6530
+VSQRTPSZmbkz	6531
+VSQRTPSZmk	6532
+VSQRTPSZmkz	6533
+VSQRTPSZr	6534
+VSQRTPSZrb	6535
+VSQRTPSZrbk	6536
+VSQRTPSZrbkz	6537
+VSQRTPSZrk	6538
+VSQRTPSZrkz	6539
+VSQRTPSm	6540
+VSQRTPSr	6541
+VSQRTSDZm	6542
+VSQRTSDZm_Int	6543
+VSQRTSDZmk_Int	6544
+VSQRTSDZmkz_Int	6545
+VSQRTSDZr	6546
+VSQRTSDZr_Int	6547
+VSQRTSDZrb_Int	6548
+VSQRTSDZrbk_Int	6549
+VSQRTSDZrbkz_Int	6550
+VSQRTSDZrk_Int	6551
+VSQRTSDZrkz_Int	6552
+VSQRTSDm	6553
+VSQRTSDm_Int	6554
+VSQRTSDr	6555
+VSQRTSDr_Int	6556
+VSQRTSHZm	6557
+VSQRTSHZm_Int	6558
+VSQRTSHZmk_Int	6559
+VSQRTSHZmkz_Int	6560
+VSQRTSHZr	6561
+VSQRTSHZr_Int	6562
+VSQRTSHZrb_Int	6563
+VSQRTSHZrbk_Int	6564
+VSQRTSHZrbkz_Int	6565
+VSQRTSHZrk_Int	6566
+VSQRTSHZrkz_Int	6567
+VSQRTSSZm	6568
+VSQRTSSZm_Int	6569
+VSQRTSSZmk_Int	6570
+VSQRTSSZmkz_Int	6571
+VSQRTSSZr	6572
+VSQRTSSZr_Int	6573
+VSQRTSSZrb_Int	6574
+VSQRTSSZrbk_Int	6575
+VSQRTSSZrbkz_Int	6576
+VSQRTSSZrk_Int	6577
+VSQRTSSZrkz_Int	6578
+VSQRTSSm	6579
+VSQRTSSm_Int	6580
+VSQRTSSr	6581
+VSQRTSSr_Int	6582
+VSTMXCSR	6583
+VSUBBF	6584
+VSUBPDYrm	6585
+VSUBPDYrr	6586
+VSUBPDZ	6587
+VSUBPDZrm	6588
+VSUBPDZrmb	6589
+VSUBPDZrmbk	6590
+VSUBPDZrmbkz	6591
+VSUBPDZrmk	6592
+VSUBPDZrmkz	6593
+VSUBPDZrr	6594
+VSUBPDZrrb	6595
+VSUBPDZrrbk	6596
+VSUBPDZrrbkz	6597
+VSUBPDZrrk	6598
+VSUBPDZrrkz	6599
+VSUBPDrm	6600
+VSUBPDrr	6601
+VSUBPHZ	6602
+VSUBPHZrm	6603
+VSUBPHZrmb	6604
+VSUBPHZrmbk	6605
+VSUBPHZrmbkz	6606
+VSUBPHZrmk	6607
+VSUBPHZrmkz	6608
+VSUBPHZrr	6609
+VSUBPHZrrb	6610
+VSUBPHZrrbk	6611
+VSUBPHZrrbkz	6612
+VSUBPHZrrk	6613
+VSUBPHZrrkz	6614
+VSUBPSYrm	6615
+VSUBPSYrr	6616
+VSUBPSZ	6617
+VSUBPSZrm	6618
+VSUBPSZrmb	6619
+VSUBPSZrmbk	6620
+VSUBPSZrmbkz	6621
+VSUBPSZrmk	6622
+VSUBPSZrmkz	6623
+VSUBPSZrr	6624
+VSUBPSZrrb	6625
+VSUBPSZrrbk	6626
+VSUBPSZrrbkz	6627
+VSUBPSZrrk	6628
+VSUBPSZrrkz	6629
+VSUBPSrm	6630
+VSUBPSrr	6631
+VSUBSDZrm	6632
+VSUBSDZrm_Int	6633
+VSUBSDZrmk_Int	6634
+VSUBSDZrmkz_Int	6635
+VSUBSDZrr	6636
+VSUBSDZrr_Int	6637
+VSUBSDZrrb_Int	6638
+VSUBSDZrrbk_Int	6639
+VSUBSDZrrbkz_Int	6640
+VSUBSDZrrk_Int	6641
+VSUBSDZrrkz_Int	6642
+VSUBSDrm	6643
+VSUBSDrm_Int	6644
+VSUBSDrr	6645
+VSUBSDrr_Int	6646
+VSUBSHZrm	6647
+VSUBSHZrm_Int	6648
+VSUBSHZrmk_Int	6649
+VSUBSHZrmkz_Int	6650
+VSUBSHZrr	6651
+VSUBSHZrr_Int	6652
+VSUBSHZrrb_Int	6653
+VSUBSHZrrbk_Int	6654
+VSUBSHZrrbkz_Int	6655
+VSUBSHZrrk_Int	6656
+VSUBSHZrrkz_Int	6657
+VSUBSSZrm	6658
+VSUBSSZrm_Int	6659
+VSUBSSZrmk_Int	6660
+VSUBSSZrmkz_Int	6661
+VSUBSSZrr	6662
+VSUBSSZrr_Int	6663
+VSUBSSZrrb_Int	6664
+VSUBSSZrrbk_Int	6665
+VSUBSSZrrbkz_Int	6666
+VSUBSSZrrk_Int	6667
+VSUBSSZrrkz_Int	6668
+VSUBSSrm	6669
+VSUBSSrm_Int	6670
+VSUBSSrr	6671
+VSUBSSrr_Int	6672
+VTESTPDYrm	6673
+VTESTPDYrr	6674
+VTESTPDrm	6675
+VTESTPDrr	6676
+VTESTPSYrm	6677
+VTESTPSYrr	6678
+VTESTPSrm	6679
+VTESTPSrr	6680
+VUCOMISDZrm	6681
+VUCOMISDZrm_Int	6682
+VUCOMISDZrr	6683
+VUCOMISDZrr_Int	6684
+VUCOMISDZrrb	6685
+VUCOMISDrm	6686
+VUCOMISDrm_Int	6687
+VUCOMISDrr	6688
+VUCOMISDrr_Int	6689
+VUCOMISHZrm	6690
+VUCOMISHZrm_Int	6691
+VUCOMISHZrr	6692
+VUCOMISHZrr_Int	6693
+VUCOMISHZrrb	6694
+VUCOMISSZrm	6695
+VUCOMISSZrm_Int	6696
+VUCOMISSZrr	6697
+VUCOMISSZrr_Int	6698
+VUCOMISSZrrb	6699
+VUCOMISSrm	6700
+VUCOMISSrm_Int	6701
+VUCOMISSrr	6702
+VUCOMISSrr_Int	6703
+VUCOMXSDZrm	6704
+VUCOMXSDZrm_Int	6705
+VUCOMXSDZrr	6706
+VUCOMXSDZrr_Int	6707
+VUCOMXSDZrrb_Int	6708
+VUCOMXSHZrm	6709
+VUCOMXSHZrm_Int	6710
+VUCOMXSHZrr	6711
+VUCOMXSHZrr_Int	6712
+VUCOMXSHZrrb_Int	6713
+VUCOMXSSZrm	6714
+VUCOMXSSZrm_Int	6715
+VUCOMXSSZrr	6716
+VUCOMXSSZrr_Int	6717
+VUCOMXSSZrrb_Int	6718
+VUNPCKHPDYrm	6719
+VUNPCKHPDYrr	6720
+VUNPCKHPDZ	6721
+VUNPCKHPDZrm	6722
+VUNPCKHPDZrmb	6723
+VUNPCKHPDZrmbk	6724
+VUNPCKHPDZrmbkz	6725
+VUNPCKHPDZrmk	6726
+VUNPCKHPDZrmkz	6727
+VUNPCKHPDZrr	6728
+VUNPCKHPDZrrk	6729
+VUNPCKHPDZrrkz	6730
+VUNPCKHPDrm	6731
+VUNPCKHPDrr	6732
+VUNPCKHPSYrm	6733
+VUNPCKHPSYrr	6734
+VUNPCKHPSZ	6735
+VUNPCKHPSZrm	6736
+VUNPCKHPSZrmb	6737
+VUNPCKHPSZrmbk	6738
+VUNPCKHPSZrmbkz	6739
+VUNPCKHPSZrmk	6740
+VUNPCKHPSZrmkz	6741
+VUNPCKHPSZrr	6742
+VUNPCKHPSZrrk	6743
+VUNPCKHPSZrrkz	6744
+VUNPCKHPSrm	6745
+VUNPCKHPSrr	6746
+VUNPCKLPDYrm	6747
+VUNPCKLPDYrr	6748
+VUNPCKLPDZ	6749
+VUNPCKLPDZrm	6750
+VUNPCKLPDZrmb	6751
+VUNPCKLPDZrmbk	6752
+VUNPCKLPDZrmbkz	6753
+VUNPCKLPDZrmk	6754
+VUNPCKLPDZrmkz	6755
+VUNPCKLPDZrr	6756
+VUNPCKLPDZrrk	6757
+VUNPCKLPDZrrkz	6758
+VUNPCKLPDrm	6759
+VUNPCKLPDrr	6760
+VUNPCKLPSYrm	6761
+VUNPCKLPSYrr	6762
+VUNPCKLPSZ	6763
+VUNPCKLPSZrm	6764
+VUNPCKLPSZrmb	6765
+VUNPCKLPSZrmbk	6766
+VUNPCKLPSZrmbkz	6767
+VUNPCKLPSZrmk	6768
+VUNPCKLPSZrmkz	6769
+VUNPCKLPSZrr	6770
+VUNPCKLPSZrrk	6771
+VUNPCKLPSZrrkz	6772
+VUNPCKLPSrm	6773
+VUNPCKLPSrr	6774
+VXORPDYrm	6775
+VXORPDYrr	6776
+VXORPDZ	6777
+VXORPDZrm	6778
+VXORPDZrmb	6779
+VXORPDZrmbk	6780
+VXORPDZrmbkz	6781
+VXORPDZrmk	6782
+VXORPDZrmkz	6783
+VXORPDZrr	6784
+VXORPDZrrk	6785
+VXORPDZrrkz	6786
+VXORPDrm	6787
+VXORPDrr	6788
+VXORPSYrm	6789
+VXORPSYrr	6790
+VXORPSZ	6791
+VXORPSZrm	6792
+VXORPSZrmb	6793
+VXORPSZrmbk	6794
+VXORPSZrmbkz	6795
+VXORPSZrmk	6796
+VXORPSZrmkz	6797
+VXORPSZrr	6798
+VXORPSZrrk	6799
+VXORPSZrrkz	6800
+VXORPSrm	6801
+VXORPSrr	6802
+VZEROALL	6803
+VZEROUPPER	6804
+V_SET	6805
+V_SETALLONES	6806
+WAIT	6807
+WBINVD	6808
+WBNOINVD	6809
+WRFLAGS	6810
+WRFSBASE	6811
+WRGSBASE	6812
+WRMSR	6813
+WRMSRLIST	6814
+WRMSRNS	6815
+WRMSRNSir	6816
+WRMSRNSir_EVEX	6817
+WRPKRUr	6818
+WRSSD	6819
+WRSSD_EVEX	6820
+WRSSQ	6821
+WRSSQ_EVEX	6822
+WRUSSD	6823
+WRUSSD_EVEX	6824
+WRUSSQ	6825
+WRUSSQ_EVEX	6826
+XABORT	6827
+XABORT_DEF	6828
+XACQUIRE_PREFIX	6829
+XADD	6830
+XAM_F	6831
+XAM_Fp	6832
+XBEGIN	6833
+XCHG	6834
+XCH_F	6835
+XCRYPTCBC	6836
+XCRYPTCFB	6837
+XCRYPTCTR	6838
+XCRYPTECB	6839
+XCRYPTOFB	6840
+XEND	6841
+XGETBV	6842
+XLAT	6843
+XOR	6844
+XORPDrm	6845
+XORPDrr	6846
+XORPSrm	6847
+XORPSrr	6848
+XRELEASE_PREFIX	6849
+XRESLDTRK	6850
+XRSTOR	6851
+XRSTORS	6852
+XSAVE	6853
+XSAVEC	6854
+XSAVEOPT	6855
+XSAVES	6856
+XSETBV	6857
+XSHA	6858
+XSTORE	6859
+XSUSLDTRK	6860
+XTEST	6861
+Immediate	6862
+CImmediate	6863
+FPImmediate	6864
+MBB	6865
+FrameIndex	6866
+ConstantPoolIndex	6867
+TargetIndex	6868
+JumpTableIndex	6869
+ExternalSymbol	6870
+GlobalAddress	6871
+BlockAddress	6872
+RegisterMask	6873
+RegisterLiveOut	6874
+Metadata	6875
+MCSymbol	6876
+CFIIndex	6877
+IntrinsicID	6878
+Predicate	6879
+ShuffleMask	6880
+PhyReg_GR8	6881
+PhyReg_GRH8	6882
+PhyReg_GR8_NOREX2	6883
+PhyReg_GR8_NOREX	6884
+PhyReg_GR8_ABCD_H	6885
+PhyReg_GR8_ABCD_L	6886
+PhyReg_GRH16	6887
+PhyReg_GR16	6888
+PhyReg_GR16_NOREX2	6889
+PhyReg_GR16_NOREX	6890
+PhyReg_VK1	6891
+PhyReg_VK16	6892
+PhyReg_VK2	6893
+PhyReg_VK4	6894
+PhyReg_VK8	6895
+PhyReg_VK16WM	6896
+PhyReg_VK1WM	6897
+PhyReg_VK2WM	6898
+PhyReg_VK4WM	6899
+PhyReg_VK8WM	6900
+PhyReg_SEGMENT_REG	6901
+PhyReg_GR16_ABCD	6902
+PhyReg_FPCCR	6903
+PhyReg_FR16X	6904
+PhyReg_FR16	6905
+PhyReg_VK16PAIR	6906
+PhyReg_VK1PAIR	6907
+PhyReg_VK2PAIR	6908
+PhyReg_VK4PAIR	6909
+PhyReg_VK8PAIR	6910
+PhyReg_VK1PAIR_with_sub_mask_0_in_VK1WM	6911
+PhyReg_LOW32_ADDR_ACCESS_RBP	6912
+PhyReg_LOW32_ADDR_ACCESS	6913
+PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit	6914
+PhyReg_FR32X	6915
+PhyReg_GR32	6916
+PhyReg_GR32_NOSP	6917
+PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2	6918
+PhyReg_DEBUG_REG	6919
+PhyReg_FR32	6920
+PhyReg_GR32_NOREX2	6921
+PhyReg_GR32_NOREX2_NOSP	6922
+PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX	6923
+PhyReg_GR32_NOREX	6924
+PhyReg_VK32	6925
+PhyReg_GR32_NOREX_NOSP	6926
+PhyReg_RFP32	6927
+PhyReg_VK32WM	6928
+PhyReg_GR32_ABCD	6929
+PhyReg_GR32_TC	6930
+PhyReg_GR32_ABCD_and_GR32_TC	6931
+PhyReg_GR32_AD	6932
+PhyReg_GR32_ArgRef	6933
+PhyReg_GR32_BPSP	6934
+PhyReg_GR32_BSI	6935
+PhyReg_GR32_CB	6936
+PhyReg_GR32_DC	6937
+PhyReg_GR32_DIBP	6938
+PhyReg_GR32_SIDI	6939
+PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit	6940
+PhyReg_CCR	6941
+PhyReg_DFCCR	6942
+PhyReg_GR32_ABCD_and_GR32_BSI	6943
+PhyReg_GR32_AD_and_GR32_ArgRef	6944
+PhyReg_GR32_ArgRef_and_GR32_CB	6945
+PhyReg_GR32_BPSP_and_GR32_DIBP	6946
+PhyReg_GR32_BPSP_and_GR32_TC	6947
+PhyReg_GR32_BSI_and_GR32_SIDI	6948
+PhyReg_GR32_DIBP_and_GR32_SIDI	6949
+PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit	6950
+PhyReg_LOW32_ADDR_ACCESS_with_sub_32bit	6951
+PhyReg_RFP64	6952
+PhyReg_GR64	6953
+PhyReg_FR64X	6954
+PhyReg_GR64_with_sub_8bit	6955
+PhyReg_GR64_NOSP	6956
+PhyReg_GR64_NOREX2	6957
+PhyReg_CONTROL_REG	6958
+PhyReg_FR64	6959
+PhyReg_GR64_with_sub_16bit_in_GR16_NOREX2	6960
+PhyReg_GR64_NOREX2_NOSP	6961
+PhyReg_GR64PLTSafe	6962
+PhyReg_GR64_TC	6963
+PhyReg_GR64_NOREX	6964
+PhyReg_GR64_TCW64	6965
+PhyReg_GR64_TC_with_sub_8bit	6966
+PhyReg_GR64_NOREX2_NOSP_and_GR64_TC	6967
+PhyReg_GR64_TCW64_with_sub_8bit	6968
+PhyReg_GR64_TC_and_GR64_TCW64	6969
+PhyReg_GR64_with_sub_16bit_in_GR16_NOREX	6970
+PhyReg_VK64	6971
+PhyReg_VR64	6972
+PhyReg_GR64PLTSafe_and_GR64_TC	6973
+PhyReg_GR64_NOREX2_NOSP_and_GR64_TCW64	6974
+PhyReg_GR64_NOREX_NOSP	6975
+PhyReg_GR64_NOREX_and_GR64_TC	6976
+PhyReg_GR64_TCW64_and_GR64_TC_with_sub_8bit	6977
+PhyReg_VK64WM	6978
+PhyReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64	6979
+PhyReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX	6980
+PhyReg_GR64PLTSafe_and_GR64_TCW64	6981
+PhyReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC	6982
+PhyReg_GR64_NOREX_and_GR64_TCW64	6983
+PhyReg_GR64_ABCD	6984
+PhyReg_GR64_with_sub_32bit_in_GR32_TC	6985
+PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC	6986
+PhyReg_GR64_AD	6987
+PhyReg_GR64_ArgRef	6988
+PhyReg_GR64_and_LOW32_ADDR_ACCESS_RBP	6989
+PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef	6990
+PhyReg_GR64_with_sub_32bit_in_GR32_BPSP	6991
+PhyReg_GR64_with_sub_32bit_in_GR32_BSI	6992
+PhyReg_GR64_with_sub_32bit_in_GR32_CB	6993
+PhyReg_GR64_with_sub_32bit_in_GR32_DIBP	6994
+PhyReg_GR64_with_sub_32bit_in_GR32_SIDI	6995
+PhyReg_GR64_A	6996
+PhyReg_GR64_ArgRef_and_GR64_TC	6997
+PhyReg_GR64_and_LOW32_ADDR_ACCESS	6998
+PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI	6999
+PhyReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef	7000
+PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB	7001
+PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP	7002
+PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC	7003
+PhyReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI	7004
+PhyReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI	7005
+PhyReg_RST	7006
+PhyReg_RFP80	7007
+PhyReg_RFP80_7	7008
+PhyReg_VR128X	7009
+PhyReg_VR128	7010
+PhyReg_VR256X	7011
+PhyReg_VR256	7012
+PhyReg_VR512	7013
+PhyReg_VR512_0_15	7014
+PhyReg_TILE	7015
+VirtReg_GR8	7016
+VirtReg_GRH8	7017
+VirtReg_GR8_NOREX2	7018
+VirtReg_GR8_NOREX	7019
+VirtReg_GR8_ABCD_H	7020
+VirtReg_GR8_ABCD_L	7021
+VirtReg_GRH16	7022
+VirtReg_GR16	7023
+VirtReg_GR16_NOREX2	7024
+VirtReg_GR16_NOREX	7025
+VirtReg_VK1	7026
+VirtReg_VK16	7027
+VirtReg_VK2	7028
+VirtReg_VK4	7029
+VirtReg_VK8	7030
+VirtReg_VK16WM	7031
+VirtReg_VK1WM	7032
+VirtReg_VK2WM	7033
+VirtReg_VK4WM	7034
+VirtReg_VK8WM	7035
+VirtReg_SEGMENT_REG	7036
+VirtReg_GR16_ABCD	7037
+VirtReg_FPCCR	7038
+VirtReg_FR16X	7039
+VirtReg_FR16	7040
+VirtReg_VK16PAIR	7041
+VirtReg_VK1PAIR	7042
+VirtReg_VK2PAIR	7043
+VirtReg_VK4PAIR	7044
+VirtReg_VK8PAIR	7045
+VirtReg_VK1PAIR_with_sub_mask_0_in_VK1WM	7046
+VirtReg_LOW32_ADDR_ACCESS_RBP	7047
+VirtReg_LOW32_ADDR_ACCESS	7048
+VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit	7049
+VirtReg_FR32X	7050
+VirtReg_GR32	7051
+VirtReg_GR32_NOSP	7052
+VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2	7053
+VirtReg_DEBUG_REG	7054
+VirtReg_FR32	7055
+VirtReg_GR32_NOREX2	7056
+VirtReg_GR32_NOREX2_NOSP	7057
+VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX	7058
+VirtReg_GR32_NOREX	7059
+VirtReg_VK32	7060
+VirtReg_GR32_NOREX_NOSP	7061
+VirtReg_RFP32	7062
+VirtReg_VK32WM	7063
+VirtReg_GR32_ABCD	7064
+VirtReg_GR32_TC	7065
+VirtReg_GR32_ABCD_and_GR32_TC	7066
+VirtReg_GR32_AD	7067
+VirtReg_GR32_ArgRef	7068
+VirtReg_GR32_BPSP	7069
+VirtReg_GR32_BSI	7070
+VirtReg_GR32_CB	7071
+VirtReg_GR32_DC	7072
+VirtReg_GR32_DIBP	7073
+VirtReg_GR32_SIDI	7074
+VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit	7075
+VirtReg_CCR	7076
+VirtReg_DFCCR	7077
+VirtReg_GR32_ABCD_and_GR32_BSI	7078
+VirtReg_GR32_AD_and_GR32_ArgRef	7079
+VirtReg_GR32_ArgRef_and_GR32_CB	7080
+VirtReg_GR32_BPSP_and_GR32_DIBP	7081
+VirtReg_GR32_BPSP_and_GR32_TC	7082
+VirtReg_GR32_BSI_and_GR32_SIDI	7083
+VirtReg_GR32_DIBP_and_GR32_SIDI	7084
+VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit	7085
+VirtReg_LOW32_ADDR_ACCESS_with_sub_32bit	7086
+VirtReg_RFP64	7087
+VirtReg_GR64	7088
+VirtReg_FR64X	7089
+VirtReg_GR64_with_sub_8bit	7090
+VirtReg_GR64_NOSP	7091
+VirtReg_GR64_NOREX2	7092
+VirtReg_CONTROL_REG	7093
+VirtReg_FR64	7094
+VirtReg_GR64_with_sub_16bit_in_GR16_NOREX2	7095
+VirtReg_GR64_NOREX2_NOSP	7096
+VirtReg_GR64PLTSafe	7097
+VirtReg_GR64_TC	7098
+VirtReg_GR64_NOREX	7099
+VirtReg_GR64_TCW64	7100
+VirtReg_GR64_TC_with_sub_8bit	7101
+VirtReg_GR64_NOREX2_NOSP_and_GR64_TC	7102
+VirtReg_GR64_TCW64_with_sub_8bit	7103
+VirtReg_GR64_TC_and_GR64_TCW64	7104
+VirtReg_GR64_with_sub_16bit_in_GR16_NOREX	7105
+VirtReg_VK64	7106
+VirtReg_VR64	7107
+VirtReg_GR64PLTSafe_and_GR64_TC	7108
+VirtReg_GR64_NOREX2_NOSP_and_GR64_TCW64	7109
+VirtReg_GR64_NOREX_NOSP	7110
+VirtReg_GR64_NOREX_and_GR64_TC	7111
+VirtReg_GR64_TCW64_and_GR64_TC_with_sub_8bit	7112
+VirtReg_VK64WM	7113
+VirtReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64	7114
+VirtReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX	7115
+VirtReg_GR64PLTSafe_and_GR64_TCW64	7116
+VirtReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC	7117
+VirtReg_GR64_NOREX_and_GR64_TCW64	7118
+VirtReg_GR64_ABCD	7119
+VirtReg_GR64_with_sub_32bit_in_GR32_TC	7120
+VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC	7121
+VirtReg_GR64_AD	7122
+VirtReg_GR64_ArgRef	7123
+VirtReg_GR64_and_LOW32_ADDR_ACCESS_RBP	7124
+VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef	7125
+VirtReg_GR64_with_sub_32bit_in_GR32_BPSP	7126
+VirtReg_GR64_with_sub_32bit_in_GR32_BSI	7127
+VirtReg_GR64_with_sub_32bit_in_GR32_CB	7128
+VirtReg_GR64_with_sub_32bit_in_GR32_DIBP	7129
+VirtReg_GR64_with_sub_32bit_in_GR32_SIDI	7130
+VirtReg_GR64_A	7131
+VirtReg_GR64_ArgRef_and_GR64_TC	7132
+VirtReg_GR64_and_LOW32_ADDR_ACCESS	7133
+VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI	7134
+VirtReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef	7135
+VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB	7136
+VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP	7137
+VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC	7138
+VirtReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI	7139
+VirtReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI	7140
+VirtReg_RST	7141
+VirtReg_RFP80	7142
+VirtReg_RFP80_7	7143
+VirtReg_VR128X	7144
+VirtReg_VR128	7145
+VirtReg_VR256X	7146
+VirtReg_VR256	7147
+VirtReg_VR512	7148
+VirtReg_VR512_0_15	7149
+VirtReg_TILE	7150
diff --git a/llvm/test/tools/llvm-readobj/ELF/bb-addr-map-feature-warning.test b/llvm/test/tools/llvm-readobj/ELF/bb-addr-map-feature-warning.test
new file mode 100644
index 0000000000000..24726c34d3509
--- /dev/null
+++ b/llvm/test/tools/llvm-readobj/ELF/bb-addr-map-feature-warning.test
@@ -0,0 +1,37 @@
+## This test checks that we output a warning when the specified version is too old to support the given features.
+
+# RUN: yaml2obj %s -o %t
+# RUN: llvm-readobj --bb-addr-map %t 2>&1 | FileCheck -DFILE=%t %s
+
+--- !ELF
+FileHeader:
+  Class: ELFCLASS64
+  Data:  ELFDATA2LSB
+  Type:  ET_EXEC
+
+# CHECK: BBAddrMap [
+# CHECK-NEXT: warning: '[[FILE]]': unable to dump SHT_LLVM_BB_ADDR_MAP section with index 1: version should be >= 3 for SHT_LLVM_BB_ADDR_MAP when callsite offsets feature is enabled: version = 2 feature = 32
+Sections:
+  - Name: '.llvm_bb_addr_map (1)'
+    Type: SHT_LLVM_BB_ADDR_MAP
+    Entries:
+      - Version: 2
+        Feature: 0x20
+
+# CHECK: BBAddrMap [
+# CHECK-NEXT: warning: '[[FILE]]': unable to dump SHT_LLVM_BB_ADDR_MAP section with index 2: version should be >= 4 for SHT_LLVM_BB_ADDR_MAP when basic block hash feature is enabled: version = 3 feature = 64
+
+  - Name: '.llvm_bb_addr_map (2)'
+    Type: SHT_LLVM_BB_ADDR_MAP
+    Entries:
+      - Version: 3
+        Feature: 0x40
+
+# CHECK: BBAddrMap [
+# CHECK-NEXT: warning: '[[FILE]]': unable to dump SHT_LLVM_BB_ADDR_MAP section with index 3: version should be >= 5 for SHT_LLVM_BB_ADDR_MAP when post link cfg feature is enabled: version = 4 feature = 128
+
+  - Name: '.llvm_bb_addr_map (3)'
+    Type: SHT_LLVM_BB_ADDR_MAP
+    Entries:
+      - Version: 4
+        Feature: 0x80
diff --git a/llvm/test/tools/llvm-readobj/ELF/bb-addr-map-pgo-analysis-map.test b/llvm/test/tools/llvm-readobj/ELF/bb-addr-map-pgo-analysis-map.test
index 5faafd4d83b2f..8e9d2271b8721 100644
--- a/llvm/test/tools/llvm-readobj/ELF/bb-addr-map-pgo-analysis-map.test
+++ b/llvm/test/tools/llvm-readobj/ELF/bb-addr-map-pgo-analysis-map.test
@@ -15,7 +15,7 @@
 
 ## Check that a malformed section can be handled.
 # RUN: yaml2obj %s -DBITS=32 -DSIZE=24 -o %t2.o
-# RUN: llvm-readobj %t2.o --bb-addr-map 2>&1 | FileCheck --match-full-lines %s -DOFFSET=0x00000018 -DFILE=%t2.o --check-prefix=TRUNCATED
+# RUN: llvm-readobj %t2.o --bb-addr-map 2>&1 | FileCheck --match-full-lines %s -DOFFSET=0x00000015 -DFILE=%t2.o --check-prefix=TRUNCATED
 
 ## Check that missing features can be handled.
 # RUN: yaml2obj %s -DBITS=32 -DFEATURE=0x2 -o %t3.o
@@ -59,17 +59,20 @@
 # CHECK-NEXT:         {
 # RAW-NEXT:             Frequency: 100
 # PRETTY-NEXT:          Frequency: 1.0
+# CHECK-NEXT:           PostLink Frequency: 10
 # CHECK-NEXT:           Successors [
 # CHECK-NEXT:             {
 # CHECK-NEXT:               ID: 2
 # RAW-NEXT:                 Probability: 0x80000000
 # PRETTY-NEXT:              Probability: 0x80000000 / 0x80000000 = 100.00%
+# CHECK-NEXT:               PostLink Probability: 7
 # CHECK-NEXT:             }
 # CHECK-NEXT:           ]
 # CHECK-NEXT:         }
 # CHECK-NEXT:         {
 # RAW-NEXT:             Frequency: 100
 # PRETTY-NEXT:          Frequency: 1.0
+# CHECK-NEXT:           PostLink Frequency: 0
 # CHECK-NEXT:           Successors [
 # CHECK-NEXT:           ]
 # CHECK-NEXT:         }
@@ -172,8 +175,8 @@ Sections:
     ShSize: [[SIZE=<none>]]
     Link:   .text
     Entries:
-      - Version: 2
-        Feature: 0x7
+      - Version: 5
+        Feature: 0x87
         BBRanges:
           - BaseAddress: [[ADDR=0x11111]]
             BBEntries:
@@ -197,10 +200,12 @@ Sections:
     PGOAnalyses:
       - FuncEntryCount: 100
         PGOBBEntries:
-          - BBFreq:        100
+          - BBFreq:          100
+            PostLinkBBFreq: 10
             Successors:
-              - ID:        2
-                BrProb:    0x80000000
+              - ID:              2
+                BrProb:          0x80000000
+                PostLinkBrFreq: 7
           - BBFreq:        100
             Successors:    []
       - FuncEntryCount: 8888
diff --git a/llvm/test/tools/obj2yaml/ELF/bb-addr-map-pgo-analysis-map.yaml b/llvm/test/tools/obj2yaml/ELF/bb-addr-map-pgo-analysis-map.yaml
index 299bf463cf4bc..645507af080cb 100644
--- a/llvm/test/tools/obj2yaml/ELF/bb-addr-map-pgo-analysis-map.yaml
+++ b/llvm/test/tools/obj2yaml/ELF/bb-addr-map-pgo-analysis-map.yaml
@@ -15,7 +15,7 @@
 # VALID-NEXT:     Type: SHT_LLVM_BB_ADDR_MAP
 # VALID-NEXT:     Entries:
 # VALID-NEXT:       - Version: 2
-# VALID-NEXT:         Feature: 0x7
+# VALID-NEXT:         Feature: 0x87
 ## The 'BaseAddress' field is omitted when it's zero.
 # VALID-NEXT:         BBRanges:
 # VALID-NEXT:           - BBEntries:
@@ -43,17 +43,23 @@
 # VALID-NEXT:     PGOAnalyses:
 # VALID-NEXT:       - FuncEntryCount: 100
 # VALID-NEXT:         PGOBBEntries:
-# VALID-NEXT:           - BBFreq:        100
+# VALID-NEXT:           - BBFreq:           100
+# VALID-NEXT:             PostLinkBBFreq:   10
 # VALID-NEXT:             Successors:
-# VALID-NEXT:               - ID:        2
-# VALID-NEXT:                 BrProb:    0x80000000
-# VALID-NEXT:               - ID:        4
-# VALID-NEXT:                 BrProb:    0x80000000
-# VALID-NEXT:           - BBFreq:        50
+# VALID-NEXT:               - ID:              2
+# VALID-NEXT:                 BrProb:          0x80000000
+# VALID-NEXT:                 PostLinkBrFreq:  7
+# VALID-NEXT:               - ID:              4
+# VALID-NEXT:                 BrProb:          0x80000000
+# VALID-NEXT:                 PostLinkBrFreq:  0
+# VALID-NEXT:           - BBFreq:           50
+# VALID-NEXT:             PostLinkBBFreq:   0
 # VALID-NEXT:             Successors:
-# VALID-NEXT:               - ID:        4
-# VALID-NEXT:                 BrProb:    0xFFFFFFFF
-# VALID-NEXT:           - BBFreq:        100
+# VALID-NEXT:               - ID:              4
+# VALID-NEXT:                 BrProb:          0xFFFFFFFF
+# VALID-NEXT:                 PostLinkBrFreq:  0
+# VALID-NEXT:           - BBFreq:           100
+# VALID-NEXT:             PostLinkBBFreq:   3
 # VALID-NEXT:             Successors:    []
 # VALID-NEXT:         PGOBBEntries:
 # VALID-NEXT:           - BBFreq:        20
@@ -69,7 +75,7 @@ Sections:
     ShSize: [[SIZE=<none>]]
     Entries:
       - Version: 2
-        Feature: 0x7
+        Feature: 0x87
         BBRanges:
           - BaseAddress: 0x0
             BBEntries:
@@ -97,17 +103,20 @@ Sections:
     PGOAnalyses:
       - FuncEntryCount: 100
         PGOBBEntries:
-          - BBFreq:        100
+          - BBFreq:          100
+            PostLinkBBFreq:  10
             Successors:
-              - ID:        2
-                BrProb:    0x80000000
-              - ID:        4
-                BrProb:    0x80000000
-          - BBFreq:        50
+              - ID:              2
+                BrProb:          0x80000000
+                PostLinkBrFreq:  7
+              - ID:              4
+                BrProb:          0x80000000
+          - BBFreq:              50
             Successors:
-              - ID:        4
-                BrProb:    0xFFFFFFFF
-          - BBFreq:        100
+              - ID:              4
+                BrProb:          0xFFFFFFFF
+          - BBFreq:              100
+            PostLinkBBFreq:      3
             Successors: []
       - PGOBBEntries:
           - BBFreq:        20
diff --git a/llvm/test/tools/yaml2obj/ELF/bb-addr-map-pgo-analysis-map.yaml b/llvm/test/tools/yaml2obj/ELF/bb-addr-map-pgo-analysis-map.yaml
index a4cb572e6d993..ac9c8d402b0a6 100644
--- a/llvm/test/tools/yaml2obj/ELF/bb-addr-map-pgo-analysis-map.yaml
+++ b/llvm/test/tools/yaml2obj/ELF/bb-addr-map-pgo-analysis-map.yaml
@@ -6,8 +6,9 @@
 # Case 4: Specify Entries.
 # CHECK:        Name: .llvm_bb_addr_map (1)
 # CHECK:        SectionData (
-# CHECK-NEXT:     0000: 02072000 00000000 0000010B 010203E8
-# CHECK-NEXT:     0010: 07E80702 0CEEDDBB F70E0D91 A2C48801
+# CHECK-NEXT:     0000: 02872000 00000000 0000010B 010203E8
+# CHECK-NEXT:     0010: 07E80764 020CEEDD BBF70E28 0D91A2C4
+# CHECK-NEXT:     0020: 880100
 # CHECK-NEXT:   )
 
 # Case 7: Not including a field which is enabled in feature doesn't emit value
@@ -26,12 +27,12 @@ Sections:
 ## Test the following cases:
 
 ## 1) We can produce an .llvm_bb_addr_map section from a description with
-##    Entries and PGO Analysis data.
+##    Entries and PGO Analysis and Post Link data.
   - Name: '.llvm_bb_addr_map (1)'
     Type: SHT_LLVM_BB_ADDR_MAP
     Entries:
       - Version: 2
-        Feature: 0x7
+        Feature: 0x87
         BBRanges:
           - BaseAddress: 0x0000000000000020
             BBEntries:
@@ -42,12 +43,14 @@ Sections:
     PGOAnalyses:
       - FuncEntryCount: 1000
         PGOBBEntries:
-          - BBFreq:        1000
+          - BBFreq:          1000
+            PostLinkBBFreq:  100
             Successors:
-              - ID:        12
-                BrProb:    0xeeeeeeee
-              - ID:        13
-                BrProb:    0x11111111
+              - ID:               12
+                BrProb:           0xeeeeeeee
+                PostLinkBrFreq:   40
+              - ID:               13
+                BrProb:           0x11111111
 
 ## 2) According to feature we have FuncEntryCount but none is provided in yaml
   - Name: '.llvm_bb_addr_map (2)'
@@ -66,7 +69,7 @@ Sections:
 
 ## Check that yaml2obj generates a warning when we use unsupported feature.
 # RUN: yaml2obj --docnum=2  %s 2>&1 | FileCheck %s --check-prefix=INVALID-FEATURE
-# INVALID-FEATURE: warning: invalid encoding for BBAddrMap::Features: 0xf0
+# INVALID-FEATURE: warning: invalid encoding for BBAddrMap::Features: 0x100
 
 --- !ELF
 FileHeader:
@@ -79,4 +82,4 @@ Sections:
     Entries:
       - Version: 2
 ##  Specify unsupported feature
-        Feature: 0xF0
+        Feature: 0x100
diff --git a/llvm/test/tools/yaml2obj/ELF/bb-addr-map.yaml b/llvm/test/tools/yaml2obj/ELF/bb-addr-map.yaml
index 339e419b39458..05d77d67e4468 100644
--- a/llvm/test/tools/yaml2obj/ELF/bb-addr-map.yaml
+++ b/llvm/test/tools/yaml2obj/ELF/bb-addr-map.yaml
@@ -220,7 +220,7 @@ Sections:
 
 ## Check that yaml2obj generates a warning when we use unsupported versions.
 # RUN: yaml2obj --docnum=3  %s 2>&1 | FileCheck %s --check-prefix=INVALID-VERSION
-# INVALID-VERSION: warning: unsupported SHT_LLVM_BB_ADDR_MAP version: 5; encoding using the most recent version
+# INVALID-VERSION: warning: unsupported SHT_LLVM_BB_ADDR_MAP version: 6; encoding using the most recent version
 
 --- !ELF
 FileHeader:
@@ -232,4 +232,4 @@ Sections:
     Type: SHT_LLVM_BB_ADDR_MAP
     Entries:
 ##  Specify unsupported version
-      - Version: 5
+      - Version: 6
diff --git a/llvm/tools/bugpoint/ListReducer.h b/llvm/tools/bugpoint/ListReducer.h
index 06f8ddb255346..ceee85325129e 100644
--- a/llvm/tools/bugpoint/ListReducer.h
+++ b/llvm/tools/bugpoint/ListReducer.h
@@ -32,7 +32,7 @@ template <typename ElTy> struct ListReducer {
     KeepPrefix  // The prefix alone satisfies the predicate
   };
 
-  virtual ~ListReducer() {}
+  virtual ~ListReducer() = default;
 
   /// This virtual function should be overriden by subclasses to implement the
   /// test desired.  The testcase is only required to test to see if the Kept
diff --git a/llvm/tools/bugpoint/ToolRunner.h b/llvm/tools/bugpoint/ToolRunner.h
index c9da9afba0e46..9ff06639d311d 100644
--- a/llvm/tools/bugpoint/ToolRunner.h
+++ b/llvm/tools/bugpoint/ToolRunner.h
@@ -105,7 +105,7 @@ class AbstractInterpreter {
   createCustomExecutor(const char *Argv0, std::string &Message,
                        const std::string &ExecCommandLine);
 
-  virtual ~AbstractInterpreter() {}
+  virtual ~AbstractInterpreter() = default;
 
   /// compileProgram - Compile the specified program from bitcode to executable
   /// code.  This does not produce any output, it is only used when debugging
diff --git a/llvm/tools/dsymutil/BinaryHolder.h b/llvm/tools/dsymutil/BinaryHolder.h
index cb5bd95978144..27d71514cb73e 100644
--- a/llvm/tools/dsymutil/BinaryHolder.h
+++ b/llvm/tools/dsymutil/BinaryHolder.h
@@ -110,7 +110,7 @@ class BinaryHolder {
       std::string Filename;
       TimestampTy Timestamp;
 
-      KeyTy() {}
+      KeyTy() = default;
       KeyTy(StringRef Filename, TimestampTy Timestamp)
           : Filename(Filename.str()), Timestamp(Timestamp) {}
     };
diff --git a/llvm/tools/lli/lli.cpp b/llvm/tools/lli/lli.cpp
index 7fee06b5d7b4f..017e2102348b7 100644
--- a/llvm/tools/lli/lli.cpp
+++ b/llvm/tools/lli/lli.cpp
@@ -305,7 +305,7 @@ class LLIObjectCache : public ObjectCache {
         this->CacheDir[this->CacheDir.size() - 1] != '/')
       this->CacheDir += '/';
   }
-  ~LLIObjectCache() override {}
+  ~LLIObjectCache() override = default;
 
   void notifyObjectCompiled(const Module *M, MemoryBufferRef Obj) override {
     const std::string &ModuleID = M->getModuleIdentifier();
diff --git a/llvm/tools/llvm-config/llvm-config.cpp b/llvm/tools/llvm-config/llvm-config.cpp
index 020b1b5e093d5..5300c5c83e5ce 100644
--- a/llvm/tools/llvm-config/llvm-config.cpp
+++ b/llvm/tools/llvm-config/llvm-config.cpp
@@ -24,6 +24,7 @@
 #include "llvm/Config/config.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
+#include "llvm/Support/Program.h"
 #include "llvm/Support/WithColor.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/TargetParser/Triple.h"
@@ -232,6 +233,7 @@ Options:\n\
   --link-static     Link the component libraries statically.\n\
   --obj-root        Print the object root used to build LLVM.\n\
   --prefix          Print the installation prefix.\n\
+  --quote-paths     Quote and escape paths when needed.\n\
   --shared-mode     Print how the provided components can be collectively linked (`shared` or `static`).\n\
   --system-libs     System Libraries needed to link against LLVM components.\n\
   --targets-built   List of all targets currently built.\n\
@@ -324,7 +326,7 @@ int main(int argc, char **argv) {
   // information.
   std::string ActivePrefix, ActiveBinDir, ActiveIncludeDir, ActiveLibDir,
               ActiveCMakeDir;
-  std::string ActiveIncludeOption;
+  std::vector<std::string> ActiveIncludeOptions;
   if (IsInDevelopmentTree) {
     ActiveIncludeDir = std::string(LLVM_SRC_ROOT) + "/include";
     ActivePrefix = CurrentExecPrefix;
@@ -350,8 +352,8 @@ int main(int argc, char **argv) {
     }
 
     // We need to include files from both the source and object trees.
-    ActiveIncludeOption =
-        ("-I" + ActiveIncludeDir + " " + "-I" + ActiveObjRoot + "/include");
+    ActiveIncludeOptions.push_back(ActiveIncludeDir);
+    ActiveIncludeOptions.push_back(ActiveObjRoot + "/include");
   } else {
     ActivePrefix = CurrentExecPrefix;
     {
@@ -370,7 +372,7 @@ int main(int argc, char **argv) {
       sys::path::make_absolute(ActivePrefix, Path);
       ActiveCMakeDir = std::string(Path);
     }
-    ActiveIncludeOption = "-I" + ActiveIncludeDir;
+    ActiveIncludeOptions.push_back(ActiveIncludeDir);
   }
 
   /// We only use `shared library` mode in cases where the static library form
@@ -399,7 +401,9 @@ int main(int argc, char **argv) {
       llvm::replace(ActiveBinDir, '/', '\\');
       llvm::replace(ActiveLibDir, '/', '\\');
       llvm::replace(ActiveCMakeDir, '/', '\\');
-      llvm::replace(ActiveIncludeOption, '/', '\\');
+      llvm::replace(ActiveIncludeDir, '/', '\\');
+      for (auto &Include : ActiveIncludeOptions)
+        llvm::replace(Include, '/', '\\');
     }
     SharedDir = ActiveBinDir;
     StaticDir = ActiveLibDir;
@@ -501,6 +505,32 @@ int main(int argc, char **argv) {
   };
 
   raw_ostream &OS = outs();
+
+  // Check if we want quoting and escaping.
+  bool QuotePaths = std::any_of(&argv[0], &argv[argc], [](const char *Arg) {
+    return StringRef(Arg) == "--quote-paths";
+  });
+
+  auto MaybePrintQuoted = [&](StringRef Str) {
+    if (QuotePaths)
+      sys::printArg(OS, Str, /*Quote=*/false); // only add quotes if necessary
+    else
+      OS << Str;
+  };
+
+  // Render include paths and associated flags
+  auto RenderFlags = [&](StringRef Flags) {
+    bool First = true;
+    for (auto &Include : ActiveIncludeOptions) {
+      if (!First)
+        OS << ' ';
+      std::string FlagsStr = "-I" + Include;
+      MaybePrintQuoted(FlagsStr);
+      First = false;
+    }
+    OS << ' ' << Flags << '\n';
+  };
+
   for (int i = 1; i != argc; ++i) {
     StringRef Arg = argv[i];
 
@@ -509,24 +539,32 @@ int main(int argc, char **argv) {
       if (Arg == "--version") {
         OS << PACKAGE_VERSION << '\n';
       } else if (Arg == "--prefix") {
-        OS << ActivePrefix << '\n';
+        MaybePrintQuoted(ActivePrefix);
+        OS << '\n';
       } else if (Arg == "--bindir") {
-        OS << ActiveBinDir << '\n';
+        MaybePrintQuoted(ActiveBinDir);
+        OS << '\n';
       } else if (Arg == "--includedir") {
-        OS << ActiveIncludeDir << '\n';
+        MaybePrintQuoted(ActiveIncludeDir);
+        OS << '\n';
       } else if (Arg == "--libdir") {
-        OS << ActiveLibDir << '\n';
+        MaybePrintQuoted(ActiveLibDir);
+        OS << '\n';
       } else if (Arg == "--cmakedir") {
-        OS << ActiveCMakeDir << '\n';
+        MaybePrintQuoted(ActiveCMakeDir);
+        OS << '\n';
       } else if (Arg == "--cppflags") {
-        OS << ActiveIncludeOption << ' ' << LLVM_CPPFLAGS << '\n';
+        RenderFlags(LLVM_CPPFLAGS);
       } else if (Arg == "--cflags") {
-        OS << ActiveIncludeOption << ' ' << LLVM_CFLAGS << '\n';
+        RenderFlags(LLVM_CFLAGS);
       } else if (Arg == "--cxxflags") {
-        OS << ActiveIncludeOption << ' ' << LLVM_CXXFLAGS << '\n';
+        RenderFlags(LLVM_CXXFLAGS);
       } else if (Arg == "--ldflags") {
-        OS << ((HostTriple.isWindowsMSVCEnvironment()) ? "-LIBPATH:" : "-L")
-           << ActiveLibDir << ' ' << LLVM_LDFLAGS << '\n';
+        std::string LDFlags =
+            HostTriple.isWindowsMSVCEnvironment() ? "-LIBPATH:" : "-L";
+        LDFlags += ActiveLibDir;
+        MaybePrintQuoted(LDFlags);
+        OS << ' ' << LLVM_LDFLAGS << '\n';
       } else if (Arg == "--system-libs") {
         PrintSystemLibs = true;
       } else if (Arg == "--libs") {
@@ -580,7 +618,8 @@ int main(int argc, char **argv) {
       } else if (Arg == "--shared-mode") {
         PrintSharedMode = true;
       } else if (Arg == "--obj-root") {
-        OS << ActivePrefix << '\n';
+        MaybePrintQuoted(ActivePrefix);
+        OS << '\n';
       } else if (Arg == "--ignore-libllvm") {
         LinkDyLib = false;
         LinkMode = BuiltSharedLibs ? LinkModeShared : LinkModeAuto;
@@ -590,6 +629,8 @@ int main(int argc, char **argv) {
         LinkMode = LinkModeStatic;
       } else if (Arg == "--help") {
         usage(false);
+      } else if (Arg == "--quote-paths") {
+        // Was already handled above this loop.
       } else {
         usage();
       }
@@ -682,26 +723,30 @@ int main(int argc, char **argv) {
 
       auto PrintForLib = [&](const StringRef &Lib) {
         const bool Shared = LinkMode == LinkModeShared;
+        std::string LibFileName;
         if (PrintLibNames) {
-          OS << GetComponentLibraryFileName(Lib, Shared);
+          LibFileName = GetComponentLibraryFileName(Lib, Shared);
         } else if (PrintLibFiles) {
-          OS << GetComponentLibraryPath(Lib, Shared);
+          LibFileName = GetComponentLibraryPath(Lib, Shared);
         } else if (PrintLibs) {
           // On Windows, output full path to library without parameters.
           // Elsewhere, if this is a typical library name, include it using -l.
           if (HostTriple.isWindowsMSVCEnvironment()) {
-            OS << GetComponentLibraryPath(Lib, Shared);
+            LibFileName = GetComponentLibraryPath(Lib, Shared);
           } else {
+            LibFileName = "-l";
             StringRef LibName;
             if (GetComponentLibraryNameSlice(Lib, LibName)) {
               // Extract library name (remove prefix and suffix).
-              OS << "-l" << LibName;
+              LibFileName += LibName;
             } else {
               // Lib is already a library name without prefix and suffix.
-              OS << "-l" << Lib;
+              LibFileName += Lib;
             }
           }
         }
+        if (!LibFileName.empty())
+          MaybePrintQuoted(LibFileName);
       };
 
       if (LinkMode == LinkModeShared && LinkDyLib)
diff --git a/llvm/tools/llvm-cov/CoverageExporter.h b/llvm/tools/llvm-cov/CoverageExporter.h
index 751e55dc09161..ba946a14e6e5c 100644
--- a/llvm/tools/llvm-cov/CoverageExporter.h
+++ b/llvm/tools/llvm-cov/CoverageExporter.h
@@ -37,7 +37,7 @@ class CoverageExporter {
       : Coverage(CoverageMapping), Options(Options), OS(OS) {}
 
 public:
-  virtual ~CoverageExporter(){};
+  virtual ~CoverageExporter() = default;
 
   /// Render the CoverageMapping object.
   virtual void renderRoot(const CoverageFilters &IgnoreFilters) = 0;
diff --git a/llvm/tools/llvm-cov/CoverageFilters.h b/llvm/tools/llvm-cov/CoverageFilters.h
index 5345b0c87cc27..3cee23ae50dbf 100644
--- a/llvm/tools/llvm-cov/CoverageFilters.h
+++ b/llvm/tools/llvm-cov/CoverageFilters.h
@@ -28,7 +28,7 @@ struct FunctionRecord;
 /// Matches specific functions that pass the requirement of this filter.
 class CoverageFilter {
 public:
-  virtual ~CoverageFilter() {}
+  virtual ~CoverageFilter() = default;
 
   /// Return true if the function passes the requirements of this filter.
   virtual bool matches(const coverage::CoverageMapping &CM,
diff --git a/llvm/tools/llvm-cov/SourceCoverageView.h b/llvm/tools/llvm-cov/SourceCoverageView.h
index 43fb890ad7687..bde187ea35ed1 100644
--- a/llvm/tools/llvm-cov/SourceCoverageView.h
+++ b/llvm/tools/llvm-cov/SourceCoverageView.h
@@ -122,7 +122,7 @@ class CoveragePrinter {
   static std::unique_ptr<CoveragePrinter>
   create(const CoverageViewOptions &Opts);
 
-  virtual ~CoveragePrinter() {}
+  virtual ~CoveragePrinter() = default;
 
   /// @name File Creation Interface
   /// @{
@@ -288,7 +288,7 @@ class SourceCoverageView {
   create(StringRef SourceName, const MemoryBuffer &File,
          const CoverageViewOptions &Options, CoverageData &&CoverageInfo);
 
-  virtual ~SourceCoverageView() {}
+  virtual ~SourceCoverageView() = default;
 
   /// Return the source name formatted for the host OS.
   std::string getSourceName() const;
diff --git a/llvm/tools/llvm-diff/lib/DiffConsumer.h b/llvm/tools/llvm-diff/lib/DiffConsumer.h
index 08c3afcbe111e..d4f339bd560f4 100644
--- a/llvm/tools/llvm-diff/lib/DiffConsumer.h
+++ b/llvm/tools/llvm-diff/lib/DiffConsumer.h
@@ -49,7 +49,7 @@ class StringRef;
     virtual void logd(const DiffLogBuilder &Log) = 0;
 
   protected:
-    virtual ~Consumer() {}
+    virtual ~Consumer() = default;
   };
 
   class DiffConsumer : public Consumer {
diff --git a/llvm/tools/llvm-diff/lib/DifferenceEngine.h b/llvm/tools/llvm-diff/lib/DifferenceEngine.h
index 436a35566360f..b829b2cd0bcbc 100644
--- a/llvm/tools/llvm-diff/lib/DifferenceEngine.h
+++ b/llvm/tools/llvm-diff/lib/DifferenceEngine.h
@@ -54,7 +54,7 @@ namespace llvm {
       virtual bool operator()(const Value *L, const Value *R) = 0;
 
     protected:
-      virtual ~Oracle() {}
+      virtual ~Oracle() = default;
     };
 
     DifferenceEngine(Consumer &consumer)
diff --git a/llvm/tools/llvm-exegesis/lib/AArch64/Target.cpp b/llvm/tools/llvm-exegesis/lib/AArch64/Target.cpp
index 2c13dd514a744..0e73adab15d86 100644
--- a/llvm/tools/llvm-exegesis/lib/AArch64/Target.cpp
+++ b/llvm/tools/llvm-exegesis/lib/AArch64/Target.cpp
@@ -112,7 +112,7 @@ namespace {
 
 // Use X19 as the loop counter register since it's a callee-saved register
 // that's available for temporary use.
-constexpr const MCPhysReg kDefaultLoopCounterReg = AArch64::X19;
+constexpr MCPhysReg kDefaultLoopCounterReg = AArch64::X19;
 
 class ExegesisAArch64Target : public ExegesisTarget {
 public:
diff --git a/llvm/tools/llvm-exegesis/lib/Analysis.cpp b/llvm/tools/llvm-exegesis/lib/Analysis.cpp
index fb843285ada2a..f3bf9690d2a6e 100644
--- a/llvm/tools/llvm-exegesis/lib/Analysis.cpp
+++ b/llvm/tools/llvm-exegesis/lib/Analysis.cpp
@@ -446,7 +446,7 @@ void Analysis::printClusterRawHtml(const BenchmarkClustering::ClusterId &Id,
 
 } // namespace exegesis
 
-static constexpr const char kHtmlHead[] = R"(
+static constexpr char kHtmlHead[] = R"(
 <head>
 <title>llvm-exegesis Analysis Results</title>
 <style>
diff --git a/llvm/tools/llvm-exegesis/lib/Assembler.cpp b/llvm/tools/llvm-exegesis/lib/Assembler.cpp
index fd7924db08441..163f1419b370c 100644
--- a/llvm/tools/llvm-exegesis/lib/Assembler.cpp
+++ b/llvm/tools/llvm-exegesis/lib/Assembler.cpp
@@ -44,8 +44,8 @@
 namespace llvm {
 namespace exegesis {
 
-static constexpr const char ModuleID[] = "ExegesisInfoTest";
-static constexpr const char FunctionID[] = "foo";
+static constexpr char ModuleID[] = "ExegesisInfoTest";
+static constexpr char FunctionID[] = "foo";
 static const Align kFunctionAlignment(4096);
 
 // Fills the given basic block with register setup code, and returns true if
diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.cpp
index 1823a534a301a..c6164b6323e20 100644
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.cpp
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.cpp
@@ -21,9 +21,9 @@
 #include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
 
-static constexpr const char kIntegerPrefix[] = "i_0x";
-static constexpr const char kDoublePrefix[] = "f_";
-static constexpr const char kInvalidOperand[] = "INVALID";
+static constexpr char kIntegerPrefix[] = "i_0x";
+static constexpr char kDoublePrefix[] = "f_";
+static constexpr char kInvalidOperand[] = "INVALID";
 
 namespace llvm {
 
@@ -202,7 +202,7 @@ struct CustomMappingTraits<std::map<exegesis::ValidationEvent, int64_t>> {
       Io.setError("Key is not a valid validation event");
       return;
     }
-    Io.mapRequired(KeyStr.str().c_str(), VI[*Key]);
+    Io.mapRequired(KeyStr, VI[*Key]);
   }
 
   static void output(IO &Io, std::map<exegesis::ValidationEvent, int64_t> &VI) {
@@ -245,8 +245,8 @@ template <> struct SequenceElementTraits<exegesis::RegisterValue> {
 };
 
 template <> struct ScalarTraits<exegesis::RegisterValue> {
-  static constexpr const unsigned kRadix = 16;
-  static constexpr const bool kSigned = false;
+  static constexpr unsigned kRadix = 16;
+  static constexpr bool kSigned = false;
 
   static void output(const exegesis::RegisterValue &RV, void *Ctx,
                      raw_ostream &Out) {
diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
index 1fd0a15dcfa91..12fad7d57444f 100644
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
@@ -155,7 +155,7 @@ class InProcessFunctionExecutorImpl : public BenchmarkRunner::FunctionExecutor {
 #ifdef LLVM_ON_UNIX
         // See "Exit Status for Commands":
         // https://pubs.opengroup.org/onlinepubs/9699919799/xrat/V4_xcu_chap02.html
-        constexpr const int kSigOffset = 128;
+        constexpr int kSigOffset = 128;
         return make_error<SnippetSignal>(CRC.RetCode - kSigOffset);
 #else
         // The exit code of the process on windows is not meaningful as a
@@ -877,7 +877,7 @@ Error BenchmarkRunner::getValidationCountersToRun(
   return Error::success();
 }
 
-BenchmarkRunner::FunctionExecutor::~FunctionExecutor() {}
+BenchmarkRunner::FunctionExecutor::~FunctionExecutor() = default;
 
 } // namespace exegesis
 } // namespace llvm
diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h
index e688b814d1c83..16d3c9ccd7658 100644
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h
@@ -73,8 +73,8 @@ class BenchmarkRunner {
 
   // Scratch space to run instructions that touch memory.
   struct ScratchSpace {
-    static constexpr const size_t kAlignment = 1024;
-    static constexpr const size_t kSize = 1 << 20; // 1MB.
+    static constexpr size_t kAlignment = 1024;
+    static constexpr size_t kSize = 1 << 20; // 1MB.
     ScratchSpace()
         : UnalignedPtr(std::make_unique<char[]>(kSize + kAlignment)),
           AlignedPtr(
diff --git a/llvm/tools/llvm-exegesis/lib/Clustering.h b/llvm/tools/llvm-exegesis/lib/Clustering.h
index 9d6c110e2e854..2b0f5b4e4a147 100644
--- a/llvm/tools/llvm-exegesis/lib/Clustering.h
+++ b/llvm/tools/llvm-exegesis/lib/Clustering.h
@@ -67,11 +67,11 @@ class BenchmarkClustering {
     ClusterId(size_t Id, bool IsUnstable = false)
         : Id_(Id), IsUnstable_(IsUnstable) {}
 
-    static constexpr const size_t kMaxValid =
+    static constexpr size_t kMaxValid =
         (std::numeric_limits<size_t>::max() >> 1) - 4;
-    static constexpr const size_t kNoise = kMaxValid + 1;
-    static constexpr const size_t kError = kMaxValid + 2;
-    static constexpr const size_t kUndef = kMaxValid + 3;
+    static constexpr size_t kNoise = kMaxValid + 1;
+    static constexpr size_t kError = kMaxValid + 2;
+    static constexpr size_t kUndef = kMaxValid + 3;
 
     size_t Id_ : (std::numeric_limits<size_t>::digits - 1);
     size_t IsUnstable_ : 1;
diff --git a/llvm/tools/llvm-exegesis/lib/Error.h b/llvm/tools/llvm-exegesis/lib/Error.h
index 9b71fe8f56897..c899023e46607 100644
--- a/llvm/tools/llvm-exegesis/lib/Error.h
+++ b/llvm/tools/llvm-exegesis/lib/Error.h
@@ -81,7 +81,7 @@ class SnippetSignal : public SnippetExecutionFailure {
 struct PerfCounterNotFullyEnabled
     : public ErrorInfo<PerfCounterNotFullyEnabled> {
   static char ID;
-  PerfCounterNotFullyEnabled() {}
+  PerfCounterNotFullyEnabled() = default;
 
   void log(raw_ostream &OS) const override;
 
diff --git a/llvm/tools/llvm-exegesis/lib/ParallelSnippetGenerator.cpp b/llvm/tools/llvm-exegesis/lib/ParallelSnippetGenerator.cpp
index 79a585ec52957..aa409138ae71b 100644
--- a/llvm/tools/llvm-exegesis/lib/ParallelSnippetGenerator.cpp
+++ b/llvm/tools/llvm-exegesis/lib/ParallelSnippetGenerator.cpp
@@ -350,7 +350,5 @@ ParallelSnippetGenerator::generateCodeTemplates(
   return Result;
 }
 
-constexpr const size_t ParallelSnippetGenerator::kMinNumDifferentAddresses;
-
 } // namespace exegesis
 } // namespace llvm
diff --git a/llvm/tools/llvm-exegesis/lib/ParallelSnippetGenerator.h b/llvm/tools/llvm-exegesis/lib/ParallelSnippetGenerator.h
index 8a6b8569c5d4c..d3c85c0c303a2 100644
--- a/llvm/tools/llvm-exegesis/lib/ParallelSnippetGenerator.h
+++ b/llvm/tools/llvm-exegesis/lib/ParallelSnippetGenerator.h
@@ -28,7 +28,7 @@ class ParallelSnippetGenerator : public SnippetGenerator {
   generateCodeTemplates(InstructionTemplate Variant,
                         const BitVector &ForbiddenRegisters) const override;
 
-  static constexpr const size_t kMinNumDifferentAddresses = 6;
+  static constexpr size_t kMinNumDifferentAddresses = 6;
 
 private:
   // Instantiates memory operands within a snippet.
diff --git a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp
index 80f5ce4a2f1db..37dcc7c56e5af 100644
--- a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp
+++ b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp
@@ -131,7 +131,7 @@ class LoopSnippetRepetitor : public SnippetRepetitor {
 
 } // namespace
 
-SnippetRepetitor::~SnippetRepetitor() {}
+SnippetRepetitor::~SnippetRepetitor() = default;
 
 std::unique_ptr<const SnippetRepetitor>
 SnippetRepetitor::Create(Benchmark::RepetitionModeE Mode,
diff --git a/llvm/tools/llvm-exegesis/lib/SubprocessMemory.h b/llvm/tools/llvm-exegesis/lib/SubprocessMemory.h
index 572d1085d9cff..52ee980b6defd 100644
--- a/llvm/tools/llvm-exegesis/lib/SubprocessMemory.h
+++ b/llvm/tools/llvm-exegesis/lib/SubprocessMemory.h
@@ -32,8 +32,8 @@ namespace exegesis {
 
 class SubprocessMemory {
 public:
-  static constexpr const size_t AuxiliaryMemoryOffset = 1;
-  static constexpr const size_t AuxiliaryMemorySize = 4096;
+  static constexpr size_t AuxiliaryMemoryOffset = 1;
+  static constexpr size_t AuxiliaryMemorySize = 4096;
 
   // Gets the thread ID for the calling thread.
   static long getCurrentTID();
diff --git a/llvm/tools/llvm-exegesis/lib/Target.cpp b/llvm/tools/llvm-exegesis/lib/Target.cpp
index fc5f82f288ae4..2ad6c5af4de4d 100644
--- a/llvm/tools/llvm-exegesis/lib/Target.cpp
+++ b/llvm/tools/llvm-exegesis/lib/Target.cpp
@@ -23,7 +23,7 @@ cl::OptionCategory Options("llvm-exegesis options");
 cl::OptionCategory BenchmarkOptions("llvm-exegesis benchmark options");
 cl::OptionCategory AnalysisOptions("llvm-exegesis analysis options");
 
-ExegesisTarget::~ExegesisTarget() {} // anchor.
+ExegesisTarget::~ExegesisTarget() = default; // anchor.
 
 static ExegesisTarget *FirstTarget = nullptr;
 
@@ -215,7 +215,7 @@ const PfmCountersInfo &ExegesisTarget::getDummyPfmCounters() const {
   return PfmCountersInfo::Dummy;
 }
 
-ExegesisTarget::SavedState::~SavedState() {} // anchor.
+ExegesisTarget::SavedState::~SavedState() = default; // anchor.
 
 namespace {
 
diff --git a/llvm/tools/llvm-exegesis/lib/UopsBenchmarkRunner.h b/llvm/tools/llvm-exegesis/lib/UopsBenchmarkRunner.h
index ef47b7fe8a655..74a18dab80608 100644
--- a/llvm/tools/llvm-exegesis/lib/UopsBenchmarkRunner.h
+++ b/llvm/tools/llvm-exegesis/lib/UopsBenchmarkRunner.h
@@ -30,7 +30,7 @@ class UopsBenchmarkRunner : public BenchmarkRunner {
                         ExecutionMode, ValCounters) {}
   ~UopsBenchmarkRunner() override;
 
-  static constexpr const size_t kMinNumDifferentAddresses = 6;
+  static constexpr size_t kMinNumDifferentAddresses = 6;
 
 private:
   Expected<std::vector<BenchmarkMeasure>>
diff --git a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
index b4437f798d485..6dc647655d92f 100644
--- a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
+++ b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
@@ -278,9 +278,9 @@ static Expected<std::vector<CodeTemplate>> generateLEATemplatesCommon(
   assert(X86II::getMemoryOperandNo(Instr.Description.TSFlags) == 1 &&
          "invalid LEA");
 
-  constexpr const int kDestOp = 0;
-  constexpr const int kBaseOp = 1;
-  constexpr const int kIndexOp = 3;
+  constexpr int kDestOp = 0;
+  constexpr int kBaseOp = 1;
+  constexpr int kIndexOp = 3;
   auto PossibleDestRegs =
       Instr.Operands[kDestOp].getRegisterAliasing().sourceBits();
   remove(PossibleDestRegs, ForbiddenRegisters);
@@ -548,7 +548,7 @@ struct ConstantInliner {
 
   void initStack(unsigned Bytes);
 
-  static constexpr const unsigned kF80Bytes = 10; // 80 bits.
+  static constexpr unsigned kF80Bytes = 10; // 80 bits.
 
   APInt Constant_;
   std::vector<MCInst> Instructions;
@@ -864,7 +864,7 @@ const MCPhysReg ExegesisX86Target::kUnavailableRegistersSSE[12] = {
 // We're using one of R8-R15 because these registers are never hardcoded in
 // instructions (e.g. MOVS writes to EDI, ESI, EDX), so they have less
 // conflicts.
-constexpr const MCPhysReg kDefaultLoopCounterReg = X86::R8;
+constexpr MCPhysReg kDefaultLoopCounterReg = X86::R8;
 
 } // namespace
 
@@ -1110,9 +1110,9 @@ std::vector<MCInst> ExegesisX86Target::setRegTo(const MCSubtargetInfo &STI,
 #ifdef __linux__
 
 #ifdef __arm__
-static constexpr const uintptr_t VAddressSpaceCeiling = 0xC0000000;
+static constexpr uintptr_t VAddressSpaceCeiling = 0xC0000000;
 #else
-static constexpr const uintptr_t VAddressSpaceCeiling = 0x0000800000000000;
+static constexpr uintptr_t VAddressSpaceCeiling = 0x0000800000000000;
 #endif
 
 void generateRoundToNearestPage(unsigned int Register,
diff --git a/llvm/tools/llvm-libtool-darwin/DependencyInfo.h b/llvm/tools/llvm-libtool-darwin/DependencyInfo.h
index 784ec3f50cd53..80bad8f5fa545 100644
--- a/llvm/tools/llvm-libtool-darwin/DependencyInfo.h
+++ b/llvm/tools/llvm-libtool-darwin/DependencyInfo.h
@@ -18,7 +18,7 @@ class DependencyInfo {
   explicit DependencyInfo(std::string DependencyInfoPath)
       : DependencyInfoPath(DependencyInfoPath) {}
 
-  virtual ~DependencyInfo(){};
+  virtual ~DependencyInfo() = default;
 
   virtual void addMissingInput(llvm::StringRef Path) {
     NotFounds.insert(Path.str());
diff --git a/llvm/tools/llvm-mca/CodeRegionGenerator.cpp b/llvm/tools/llvm-mca/CodeRegionGenerator.cpp
index f7f929e49ead9..146907189929e 100644
--- a/llvm/tools/llvm-mca/CodeRegionGenerator.cpp
+++ b/llvm/tools/llvm-mca/CodeRegionGenerator.cpp
@@ -26,7 +26,7 @@ namespace llvm {
 namespace mca {
 
 // This virtual dtor serves as the anchor for the CodeRegionGenerator class.
-CodeRegionGenerator::~CodeRegionGenerator() {}
+CodeRegionGenerator::~CodeRegionGenerator() = default;
 
 Expected<const CodeRegions &> AsmCodeRegionGenerator::parseCodeRegions(
     const std::unique_ptr<MCInstPrinter> &IP, bool SkipFailures) {
diff --git a/llvm/tools/llvm-mca/CodeRegionGenerator.h b/llvm/tools/llvm-mca/CodeRegionGenerator.h
index a48c67a22f27b..c30f67a53eac0 100644
--- a/llvm/tools/llvm-mca/CodeRegionGenerator.h
+++ b/llvm/tools/llvm-mca/CodeRegionGenerator.h
@@ -151,7 +151,7 @@ class CodeRegionGenerator {
                    bool SkipFailures) = 0;
 
 public:
-  CodeRegionGenerator() {}
+  CodeRegionGenerator() = default;
   virtual ~CodeRegionGenerator();
 };
 
diff --git a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
index 3d7f33cd64bf4..8aa843b6a5155 100644
--- a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
+++ b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
@@ -811,12 +811,12 @@ objcopy::parseObjcopyOptions(ArrayRef<const char *> ArgsArr,
             .Case("boot_application",
                   COFF::IMAGE_SUBSYSTEM_WINDOWS_BOOT_APPLICATION)
             .Case("console", COFF::IMAGE_SUBSYSTEM_WINDOWS_CUI)
-            .Cases("efi_application", "efi-app",
+            .Cases({"efi_application", "efi-app"},
                    COFF::IMAGE_SUBSYSTEM_EFI_APPLICATION)
-            .Cases("efi_boot_service_driver", "efi-bsd",
+            .Cases({"efi_boot_service_driver", "efi-bsd"},
                    COFF::IMAGE_SUBSYSTEM_EFI_BOOT_SERVICE_DRIVER)
             .Case("efi_rom", COFF::IMAGE_SUBSYSTEM_EFI_ROM)
-            .Cases("efi_runtime_driver", "efi-rtd",
+            .Cases({"efi_runtime_driver", "efi-rtd"},
                    COFF::IMAGE_SUBSYSTEM_EFI_RUNTIME_DRIVER)
             .Case("native", COFF::IMAGE_SUBSYSTEM_NATIVE)
             .Case("posix", COFF::IMAGE_SUBSYSTEM_POSIX_CUI)
diff --git a/llvm/tools/llvm-objdump/SourcePrinter.h b/llvm/tools/llvm-objdump/SourcePrinter.h
index 5c131a0eb1fd7..19acc871707aa 100644
--- a/llvm/tools/llvm-objdump/SourcePrinter.h
+++ b/llvm/tools/llvm-objdump/SourcePrinter.h
@@ -34,7 +34,7 @@ class LiveElement {
   LiveElement(const char *Name, DWARFUnit *Unit, const DWARFDie FuncDie)
       : Name(Name), Unit(Unit), FuncDie(FuncDie) {}
 
-  virtual ~LiveElement() {};
+  virtual ~LiveElement() = default;
   const char *getName() const { return Name; }
 
   virtual bool liveAtAddress(object::SectionedAddress Addr) const = 0;
diff --git a/llvm/tools/llvm-objdump/llvm-objdump.h b/llvm/tools/llvm-objdump/llvm-objdump.h
index 3525be9a5314a..bac858929ae9c 100644
--- a/llvm/tools/llvm-objdump/llvm-objdump.h
+++ b/llvm/tools/llvm-objdump/llvm-objdump.h
@@ -84,7 +84,7 @@ class Dumper {
 
 public:
   Dumper(const object::ObjectFile &O);
-  virtual ~Dumper() {}
+  virtual ~Dumper() = default;
 
   void reportUniqueWarning(Error Err);
   void reportUniqueWarning(const Twine &Msg);
diff --git a/llvm/tools/llvm-pdbutil/DumpOutputStyle.cpp b/llvm/tools/llvm-pdbutil/DumpOutputStyle.cpp
index b2362ecb75703..d836d98e1b0d0 100644
--- a/llvm/tools/llvm-pdbutil/DumpOutputStyle.cpp
+++ b/llvm/tools/llvm-pdbutil/DumpOutputStyle.cpp
@@ -68,7 +68,7 @@ DumpOutputStyle::DumpOutputStyle(InputFile &File)
     RefTracker.reset(new TypeReferenceTracker(File));
 }
 
-DumpOutputStyle::~DumpOutputStyle() {}
+DumpOutputStyle::~DumpOutputStyle() = default;
 
 PDBFile &DumpOutputStyle::getPdb() { return File.pdb(); }
 object::COFFObjectFile &DumpOutputStyle::getObj() { return File.obj(); }
diff --git a/llvm/tools/llvm-pdbutil/DumpOutputStyle.h b/llvm/tools/llvm-pdbutil/DumpOutputStyle.h
index 6714a6aa59f17..ea4a47f6f4f06 100644
--- a/llvm/tools/llvm-pdbutil/DumpOutputStyle.h
+++ b/llvm/tools/llvm-pdbutil/DumpOutputStyle.h
@@ -29,7 +29,7 @@ class TypeReferenceTracker;
 
 struct StatCollection {
   struct Stat {
-    Stat() {}
+    Stat() = default;
     Stat(uint32_t Count, uint32_t Size) : Count(Count), Size(Size) {}
     uint32_t Count = 0;
     uint32_t Size = 0;
diff --git a/llvm/tools/llvm-pdbutil/OutputStyle.h b/llvm/tools/llvm-pdbutil/OutputStyle.h
index 8cc9016d79a28..a09fb82f066bb 100644
--- a/llvm/tools/llvm-pdbutil/OutputStyle.h
+++ b/llvm/tools/llvm-pdbutil/OutputStyle.h
@@ -17,7 +17,7 @@ namespace pdb {
 
 class OutputStyle {
 public:
-  virtual ~OutputStyle() {}
+  virtual ~OutputStyle() = default;
 
   virtual Error dump() = 0;
 };
diff --git a/llvm/tools/llvm-pdbutil/StreamUtil.h b/llvm/tools/llvm-pdbutil/StreamUtil.h
index 9d6030c7ba9c4..6b8c13f0c0c25 100644
--- a/llvm/tools/llvm-pdbutil/StreamUtil.h
+++ b/llvm/tools/llvm-pdbutil/StreamUtil.h
@@ -35,7 +35,7 @@ enum class StreamPurpose {
 
 struct StreamInfo {
 public:
-  StreamInfo() {}
+  StreamInfo() = default;
 
   uint32_t getModuleIndex() const { return *ModuleIndex; }
   StreamPurpose getPurpose() const { return Purpose; }
diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.cpp b/llvm/tools/llvm-profgen/ProfiledBinary.cpp
index 94728ce4abffe..96db6a714572a 100644
--- a/llvm/tools/llvm-profgen/ProfiledBinary.cpp
+++ b/llvm/tools/llvm-profgen/ProfiledBinary.cpp
@@ -187,7 +187,7 @@ ProfiledBinary::ProfiledBinary(const StringRef ExeBinPath,
   load();
 }
 
-ProfiledBinary::~ProfiledBinary() {}
+ProfiledBinary::~ProfiledBinary() = default;
 
 void ProfiledBinary::warnNoFuncEntry() {
   uint64_t NoFuncEntryNum = 0;
diff --git a/llvm/tools/llvm-rc/ResourceScriptStmt.h b/llvm/tools/llvm-rc/ResourceScriptStmt.h
index a81e384fda365..84da9beb64be9 100644
--- a/llvm/tools/llvm-rc/ResourceScriptStmt.h
+++ b/llvm/tools/llvm-rc/ResourceScriptStmt.h
@@ -242,9 +242,9 @@ class RCResource {
   virtual raw_ostream &log(raw_ostream &OS) const {
     return OS << "Base statement\n";
   };
-  RCResource() {}
+  RCResource() = default;
   RCResource(uint16_t Flags) : MemoryFlags(Flags) {}
-  virtual ~RCResource() {}
+  virtual ~RCResource() = default;
 
   virtual Error visit(Visitor *) const {
     llvm_unreachable("This is unable to call methods from Visitor base");
@@ -290,7 +290,7 @@ class OptionalStmtList : public OptionalStmt {
   std::vector<std::unique_ptr<OptionalStmt>> Statements;
 
 public:
-  OptionalStmtList() {}
+  OptionalStmtList() = default;
   raw_ostream &log(raw_ostream &OS) const override;
 
   void addStmt(std::unique_ptr<OptionalStmt> Stmt) {
@@ -510,7 +510,7 @@ class MenuDefinition {
   virtual raw_ostream &log(raw_ostream &OS) const {
     return OS << "Base menu definition\n";
   }
-  virtual ~MenuDefinition() {}
+  virtual ~MenuDefinition() = default;
 
   virtual uint16_t getResFlags() const { return 0; }
   virtual MenuDefKind getKind() const { return MkBase; }
@@ -818,7 +818,7 @@ class VersionInfoStmt {
   enum StmtKind { StBase = 0, StBlock = 1, StValue = 2 };
 
   virtual raw_ostream &log(raw_ostream &OS) const { return OS << "VI stmt\n"; }
-  virtual ~VersionInfoStmt() {}
+  virtual ~VersionInfoStmt() = default;
 
   virtual StmtKind getKind() const { return StBase; }
   static bool classof(const VersionInfoStmt *S) {
diff --git a/llvm/tools/llvm-rc/ResourceVisitor.h b/llvm/tools/llvm-rc/ResourceVisitor.h
index a121a0a507c27..1815c6b8a2f52 100644
--- a/llvm/tools/llvm-rc/ResourceVisitor.h
+++ b/llvm/tools/llvm-rc/ResourceVisitor.h
@@ -55,7 +55,7 @@ class Visitor {
   virtual Error visitVersionStmt(const VersionStmt *) = 0;
   virtual Error visitMenuStmt(const MenuStmt *) = 0;
 
-  virtual ~Visitor() {}
+  virtual ~Visitor() = default;
 };
 
 } // namespace rc
diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp
index 423a11fd5b72a..6f09da5a4099f 100644
--- a/llvm/tools/llvm-readobj/ELFDumper.cpp
+++ b/llvm/tools/llvm-readobj/ELFDumper.cpp
@@ -8188,6 +8188,8 @@ void LLVMELFDumper<ELFT>::printBBAddrMaps(bool PrettyPGOAnalysis) {
               } else {
                 W.printNumber("Frequency", PBBE.BlockFreq.getFrequency());
               }
+              if (PAM.FeatEnable.PostLinkCfg)
+                W.printNumber("PostLink Frequency", PBBE.PostLinkBlockFreq);
             }
 
             if (PAM.FeatEnable.BrProb) {
@@ -8200,6 +8202,8 @@ void LLVMELFDumper<ELFT>::printBBAddrMaps(bool PrettyPGOAnalysis) {
                 } else {
                   W.printHex("Probability", Succ.Prob.getNumerator());
                 }
+                if (PAM.FeatEnable.PostLinkCfg)
+                  W.printNumber("PostLink Probability", Succ.PostLinkFreq);
               }
             }
           }
diff --git a/llvm/tools/llvm-readobj/ObjDumper.cpp b/llvm/tools/llvm-readobj/ObjDumper.cpp
index 0b59dd48d4203..20e027aa5a5ef 100644
--- a/llvm/tools/llvm-readobj/ObjDumper.cpp
+++ b/llvm/tools/llvm-readobj/ObjDumper.cpp
@@ -41,7 +41,7 @@ ObjDumper::ObjDumper(ScopedPrinter &Writer, StringRef ObjName) : W(Writer) {
   };
 }
 
-ObjDumper::~ObjDumper() {}
+ObjDumper::~ObjDumper() = default;
 
 void ObjDumper::reportUniqueWarning(Error Err) const {
   reportUniqueWarning(toString(std::move(Err)));
diff --git a/llvm/tools/llvm-readtapi/DiffEngine.h b/llvm/tools/llvm-readtapi/DiffEngine.h
index 7ab57d43e3af2..b350ceef5c6e0 100644
--- a/llvm/tools/llvm-readtapi/DiffEngine.h
+++ b/llvm/tools/llvm-readtapi/DiffEngine.h
@@ -39,7 +39,7 @@ enum DiffAttrKind {
 class AttributeDiff {
 public:
   AttributeDiff(DiffAttrKind Kind) : Kind(Kind){};
-  virtual ~AttributeDiff(){};
+  virtual ~AttributeDiff() = default;
   DiffAttrKind getKind() const { return Kind; }
 
 private:
diff --git a/llvm/tools/llvm-xray/xray-graph.h b/llvm/tools/llvm-xray/xray-graph.h
index fd9644908426f..bf25f8d11c5b9 100644
--- a/llvm/tools/llvm-xray/xray-graph.h
+++ b/llvm/tools/llvm-xray/xray-graph.h
@@ -86,7 +86,7 @@ class GraphRenderer {
   };
 
   GraphT G;
-  using VertexIdentifier = typename decltype(G)::VertexIdentifier;
+  using VertexIdentifier = decltype(G)::VertexIdentifier;
   using EdgeIdentifier = decltype(G)::EdgeIdentifier;
 
   /// Use a Map to store the Function stack for each thread whilst building the
diff --git a/llvm/tools/obj2yaml/elf2yaml.cpp b/llvm/tools/obj2yaml/elf2yaml.cpp
index 68e18f6c79202..4364d15a8b455 100644
--- a/llvm/tools/obj2yaml/elf2yaml.cpp
+++ b/llvm/tools/obj2yaml/elf2yaml.cpp
@@ -895,7 +895,7 @@ ELFDumper<ELFT>::dumpBBAddrMapSection(const Elf_Shdr *Shdr) {
   std::vector<ELFYAML::PGOAnalysisMapEntry> PGOAnalyses;
   DataExtractor::Cursor Cur(0);
   uint8_t Version = 0;
-  uint8_t Feature = 0;
+  uint16_t Feature = 0;
   uint64_t Address = 0;
   while (Cur && Cur.tell() < Content.size()) {
     if (Shdr->sh_type == ELF::SHT_LLVM_BB_ADDR_MAP) {
@@ -905,7 +905,7 @@ ELFDumper<ELFT>::dumpBBAddrMapSection(const Elf_Shdr *Shdr) {
             errc::invalid_argument,
             "invalid SHT_LLVM_BB_ADDR_MAP section version: " +
                 Twine(static_cast<int>(Version)));
-      Feature = Data.getU8(Cur);
+      Feature = Version < 5 ? Data.getU8(Cur) : Data.getU16(Cur);
     }
     uint64_t NumBBRanges = 1;
     uint64_t NumBlocks = 0;
@@ -972,6 +972,8 @@ ELFDumper<ELFT>::dumpBBAddrMapSection(const Elf_Shdr *Shdr) {
           auto &PGOBBEntry = PGOBBEntries.emplace_back();
           if (FeatureOrErr->BBFreq) {
             PGOBBEntry.BBFreq = Data.getULEB128(Cur);
+            if (FeatureOrErr->PostLinkCfg)
+              PGOBBEntry.PostLinkBBFreq = Data.getULEB128(Cur);
             if (!Cur)
               break;
           }
@@ -982,7 +984,10 @@ ELFDumper<ELFT>::dumpBBAddrMapSection(const Elf_Shdr *Shdr) {
             for (uint64_t SuccIdx = 0; Cur && SuccIdx < SuccCount; ++SuccIdx) {
               uint32_t ID = Data.getULEB128(Cur);
               uint32_t BrProb = Data.getULEB128(Cur);
-              SuccEntries.push_back({ID, BrProb});
+              std::optional<uint32_t> PostLinkBrFreq;
+              if (FeatureOrErr->PostLinkCfg)
+                PostLinkBrFreq = Data.getULEB128(Cur);
+              SuccEntries.push_back({ID, BrProb, PostLinkBrFreq});
             }
           }
         }
diff --git a/llvm/tools/opt-viewer/optrecord.py b/llvm/tools/opt-viewer/optrecord.py
index 8014204a64f45..b9244fd1ae739 100644
--- a/llvm/tools/opt-viewer/optrecord.py
+++ b/llvm/tools/opt-viewer/optrecord.py
@@ -19,35 +19,18 @@
 from multiprocessing import Lock
 import os, os.path
 import subprocess
-
-try:
-    # The previously builtin function `intern()` was moved
-    # to the `sys` module in Python 3.
-    from sys import intern
-except:
-    pass
-
+from sys import intern
 import re
 
 import optpmap
 
-try:
-    dict.iteritems
-except AttributeError:
-    # Python 3
-    def itervalues(d):
-        return iter(d.values())
-
-    def iteritems(d):
-        return iter(d.items())
-
-else:
-    # Python 2
-    def itervalues(d):
-        return d.itervalues()
-
-    def iteritems(d):
-        return d.iteritems()
+
+def itervalues(d):
+    return iter(d.values())
+
+
+def iteritems(d):
+    return iter(d.items())
 
 
 def html_file_name(filename):
diff --git a/llvm/unittests/ADT/BreadthFirstIteratorTest.cpp b/llvm/unittests/ADT/BreadthFirstIteratorTest.cpp
index 0cd7fd3f706f0..a737390e79d8d 100644
--- a/llvm/unittests/ADT/BreadthFirstIteratorTest.cpp
+++ b/llvm/unittests/ADT/BreadthFirstIteratorTest.cpp
@@ -78,7 +78,7 @@ TEST(BreadthFristIteratorTest, Cycle) {
 
 static_assert(
     std::is_convertible_v<decltype(*std::declval<bf_iterator<Graph<3>>>()),
-                          typename bf_iterator<Graph<3>>::reference>);
+                          bf_iterator<Graph<3>>::reference>);
 
 // bf_iterator should be (at-least) a forward-iterator
 static_assert(std::is_base_of_v<std::forward_iterator_tag,
diff --git a/llvm/unittests/ADT/ConcurrentHashtableTest.cpp b/llvm/unittests/ADT/ConcurrentHashtableTest.cpp
index ee1ee41f453a3..1b82df1fbffd0 100644
--- a/llvm/unittests/ADT/ConcurrentHashtableTest.cpp
+++ b/llvm/unittests/ADT/ConcurrentHashtableTest.cpp
@@ -21,7 +21,7 @@ using namespace parallel;
 namespace {
 class String {
 public:
-  String() {}
+  String() = default;
   const std::string &getKey() const { return Data; }
 
   template <typename AllocatorTy>
diff --git a/llvm/unittests/ADT/DepthFirstIteratorTest.cpp b/llvm/unittests/ADT/DepthFirstIteratorTest.cpp
index 95923b8100838..f792878004e7a 100644
--- a/llvm/unittests/ADT/DepthFirstIteratorTest.cpp
+++ b/llvm/unittests/ADT/DepthFirstIteratorTest.cpp
@@ -59,7 +59,7 @@ TEST(DepthFirstIteratorTest, ActuallyUpdateIterator) {
 
 static_assert(
     std::is_convertible_v<decltype(*std::declval<df_iterator<Graph<3>>>()),
-                          typename df_iterator<Graph<3>>::reference>);
+                          df_iterator<Graph<3>>::reference>);
 
 // df_iterator should be (at-least) a forward-iterator
 static_assert(std::is_base_of_v<std::forward_iterator_tag,
diff --git a/llvm/unittests/ADT/DirectedGraphTest.cpp b/llvm/unittests/ADT/DirectedGraphTest.cpp
index 49ccf06ddc00c..82a631b4f83f3 100644
--- a/llvm/unittests/ADT/DirectedGraphTest.cpp
+++ b/llvm/unittests/ADT/DirectedGraphTest.cpp
@@ -43,7 +43,7 @@ class DGTestEdge : public DGTestEdgeBase {
 class DGTestGraph : public DGTestBase {
 public:
   DGTestGraph() = default;
-  ~DGTestGraph(){};
+  ~DGTestGraph() = default;
 };
 
 using EdgeListTy = SmallVector<DGTestEdge *, 2>;
diff --git a/llvm/unittests/ADT/IListIteratorBitsTest.cpp b/llvm/unittests/ADT/IListIteratorBitsTest.cpp
index 97c14265e6bcb..b430bcbe1eb0e 100644
--- a/llvm/unittests/ADT/IListIteratorBitsTest.cpp
+++ b/llvm/unittests/ADT/IListIteratorBitsTest.cpp
@@ -93,8 +93,8 @@ TEST(IListIteratorBitsTest, ConsAndAssignment) {
 class dummy {
   // Test that we get an ilist_iterator_w_bits out of the node given that the
   // options are enabled.
-  using node_options = typename ilist_detail::compute_node_options<
-      Node, ilist_iterator_bits<true>>::type;
+  using node_options =
+      ilist_detail::compute_node_options<Node, ilist_iterator_bits<true>>::type;
   static_assert(std::is_same<Node::self_iterator,
                              llvm::ilist_iterator_w_bits<node_options, false,
                                                          false>>::value);
@@ -102,7 +102,7 @@ class dummy {
   // Now test that a plain node, without the option, gets a plain
   // ilist_iterator.
   using plain_node_options =
-      typename ilist_detail::compute_node_options<PlainNode>::type;
+      ilist_detail::compute_node_options<PlainNode>::type;
   static_assert(std::is_same<
                 PlainNode::self_iterator,
                 llvm::ilist_iterator<plain_node_options, false, false>>::value);
diff --git a/llvm/unittests/ADT/IListTest.cpp b/llvm/unittests/ADT/IListTest.cpp
index 2fdc8e12d0fa8..984014f679db6 100644
--- a/llvm/unittests/ADT/IListTest.cpp
+++ b/llvm/unittests/ADT/IListTest.cpp
@@ -19,7 +19,7 @@ namespace {
 struct Node : ilist_node<Node> {
   int Value;
 
-  Node() {}
+  Node() = default;
   Node(int Value) : Value(Value) {}
   Node(const Node&) = default;
   ~Node() { Value = -1; }
diff --git a/llvm/unittests/ADT/IteratorTest.cpp b/llvm/unittests/ADT/IteratorTest.cpp
index 691fbce5080ff..b5d63efd8ccba 100644
--- a/llvm/unittests/ADT/IteratorTest.cpp
+++ b/llvm/unittests/ADT/IteratorTest.cpp
@@ -48,11 +48,10 @@ struct AdaptedIter : iterator_adaptor_base<AdaptedIter, WeirdIter> {};
 
 // Test that iterator_adaptor_base forwards typedefs, if value_type is
 // unchanged.
-static_assert(std::is_same_v<typename AdaptedIter::value_type, Shadow<0>>, "");
-static_assert(std::is_same_v<typename AdaptedIter::difference_type, Shadow<1>>,
-              "");
-static_assert(std::is_same_v<typename AdaptedIter::pointer, Shadow<2>>, "");
-static_assert(std::is_same_v<typename AdaptedIter::reference, Shadow<3>>, "");
+static_assert(std::is_same_v<AdaptedIter::value_type, Shadow<0>>, "");
+static_assert(std::is_same_v<AdaptedIter::difference_type, Shadow<1>>, "");
+static_assert(std::is_same_v<AdaptedIter::pointer, Shadow<2>>, "");
+static_assert(std::is_same_v<AdaptedIter::reference, Shadow<3>>, "");
 
 // Ensure that pointe{e,r}_iterator adaptors correctly forward the category of
 // the underlying iterator.
diff --git a/llvm/unittests/ADT/PostOrderIteratorTest.cpp b/llvm/unittests/ADT/PostOrderIteratorTest.cpp
index 4c2a66e8d5b62..838481f76ed7f 100644
--- a/llvm/unittests/ADT/PostOrderIteratorTest.cpp
+++ b/llvm/unittests/ADT/PostOrderIteratorTest.cpp
@@ -44,7 +44,7 @@ TEST(PostOrderIteratorTest, Compiles) {
 
 static_assert(
     std::is_convertible_v<decltype(*std::declval<po_iterator<Graph<3>>>()),
-                          typename po_iterator<Graph<3>>::reference>);
+                          po_iterator<Graph<3>>::reference>);
 
 // Test post-order and reverse post-order traversals for simple graph type.
 TEST(PostOrderIteratorTest, PostOrderAndReversePostOrderTraverrsal) {
diff --git a/llvm/unittests/ADT/SmallVectorTest.cpp b/llvm/unittests/ADT/SmallVectorTest.cpp
index 1a01f30e8dd35..74fc737f29335 100644
--- a/llvm/unittests/ADT/SmallVectorTest.cpp
+++ b/llvm/unittests/ADT/SmallVectorTest.cpp
@@ -159,7 +159,7 @@ int Constructable::numCopyAssignmentCalls;
 int Constructable::numMoveAssignmentCalls;
 
 struct NonCopyable {
-  NonCopyable() {}
+  NonCopyable() = default;
   NonCopyable(NonCopyable &&) {}
   NonCopyable &operator=(NonCopyable &&) { return *this; }
 private:
diff --git a/llvm/unittests/ADT/StringMapTest.cpp b/llvm/unittests/ADT/StringMapTest.cpp
index 92ae364d45d5e..1d92de4e92325 100644
--- a/llvm/unittests/ADT/StringMapTest.cpp
+++ b/llvm/unittests/ADT/StringMapTest.cpp
@@ -367,7 +367,7 @@ TEST_F(StringMapTest, NonDefaultConstructable) {
 }
 
 struct Immovable {
-  Immovable() {}
+  Immovable() = default;
   Immovable(Immovable &&) = delete; // will disable the other special members
 };
 
diff --git a/llvm/unittests/ADT/TypeSwitchTest.cpp b/llvm/unittests/ADT/TypeSwitchTest.cpp
index a7d934265c5f0..0a9271785d168 100644
--- a/llvm/unittests/ADT/TypeSwitchTest.cpp
+++ b/llvm/unittests/ADT/TypeSwitchTest.cpp
@@ -142,3 +142,44 @@ TEST(TypeSwitchTest, DefaultUnreachableWithVoid) {
   EXPECT_DEATH((void)translate(DerivedD()), "Unhandled type");
 #endif
 }
+
+TEST(TypeSwitchTest, DefaultNullopt) {
+  auto translate = [](auto value) {
+    return TypeSwitch<Base *, std::optional<int>>(&value)
+        .Case([](DerivedA *) { return 0; })
+        .Default(std::nullopt);
+  };
+  EXPECT_EQ(0, translate(DerivedA()));
+  EXPECT_EQ(std::nullopt, translate(DerivedD()));
+}
+
+TEST(TypeSwitchTest, DefaultNullptr) {
+  float foo = 0.0f;
+  auto translate = [&](auto value) {
+    return TypeSwitch<Base *, float *>(&value)
+        .Case([&](DerivedA *) { return &foo; })
+        .Default(nullptr);
+  };
+  EXPECT_EQ(&foo, translate(DerivedA()));
+  EXPECT_EQ(nullptr, translate(DerivedD()));
+}
+
+TEST(TypeSwitchTest, DefaultNullptrForPointerLike) {
+  struct Value {
+    void *ptr;
+    Value(const Value &other) = default;
+    Value(std::nullptr_t) : ptr(nullptr) {}
+    Value() : Value(nullptr) {}
+  };
+
+  float foo = 0.0f;
+  Value fooVal;
+  fooVal.ptr = &foo;
+  auto translate = [&](auto value) {
+    return TypeSwitch<Base *, Value>(&value)
+        .Case([&](DerivedA *) { return fooVal; })
+        .Default(nullptr);
+  };
+  EXPECT_EQ(&foo, translate(DerivedA()).ptr);
+  EXPECT_EQ(nullptr, translate(DerivedD()).ptr);
+}
diff --git a/llvm/unittests/CodeGen/AsmPrinterDwarfTest.cpp b/llvm/unittests/CodeGen/AsmPrinterDwarfTest.cpp
index af2d56df33d38..d0991e6201343 100644
--- a/llvm/unittests/CodeGen/AsmPrinterDwarfTest.cpp
+++ b/llvm/unittests/CodeGen/AsmPrinterDwarfTest.cpp
@@ -383,7 +383,7 @@ class AsmPrinterHandlerTest : public AsmPrinterFixtureBase {
 
   public:
     TestHandler(AsmPrinterHandlerTest &Test) : Test(Test) {}
-    ~TestHandler() override {}
+    ~TestHandler() override = default;
     void setSymbolSize(const MCSymbol *Sym, uint64_t Size) override {}
     void beginModule(Module *M) override { Test.BeginCount++; }
     void endModule() override { Test.EndCount++; }
diff --git a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp
index ff87e7b6a1018..235a53dcc156e 100644
--- a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp
+++ b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp
@@ -1113,7 +1113,7 @@ TEST_F(InstrRefLDVTest, MLocDiamondSpills) {
   // Create a stack location and ensure it's tracked.
   SpillLoc SL = {getRegByName("RSP"), StackOffset::getFixed(-8)};
   SpillLocationNo SpillNo = *MTracker->getOrTrackSpillLoc(SL);
-  ASSERT_EQ(MTracker->getNumLocs(), 13u); // Tracks all possible stack locs.
+  ASSERT_EQ(MTracker->getNumLocs(), 11u); // Tracks all possible stack locs.
   // Locations are: RSP, stack slots from 2^3 bits wide up to 2^9 for zmm regs,
   // then slots for sub_8bit_hi and sub_16bit_hi ({8, 8} and {16, 16}).
   // Finally, one for spilt fp80 registers.
diff --git a/llvm/unittests/CodeGen/MFCommon.inc b/llvm/unittests/CodeGen/MFCommon.inc
index a86a68cb4adf1..0180ba0a6c163 100644
--- a/llvm/unittests/CodeGen/MFCommon.inc
+++ b/llvm/unittests/CodeGen/MFCommon.inc
@@ -86,7 +86,7 @@ public:
       : TargetSubtargetInfo(Triple(""), "", "", "", {}, {}, {}, nullptr,
                             nullptr, nullptr, nullptr, nullptr, nullptr),
         FL(), TL(TM) {}
-  ~BogusSubtarget() override {}
+  ~BogusSubtarget() override = default;
 
   const TargetFrameLowering *getFrameLowering() const override { return &FL; }
 
@@ -117,7 +117,7 @@ public:
             Reloc::Static, CodeModel::Small, CodeGenOptLevel::Default),
         ST(*this) {}
 
-  ~BogusTargetMachine() override {}
+  ~BogusTargetMachine() override = default;
 
   const TargetSubtargetInfo *getSubtargetImpl(const Function &) const override {
     return &ST;
diff --git a/llvm/unittests/CodeGen/MachineOperandTest.cpp b/llvm/unittests/CodeGen/MachineOperandTest.cpp
index 3f3f48fcc7c58..0373c7a0f629b 100644
--- a/llvm/unittests/CodeGen/MachineOperandTest.cpp
+++ b/llvm/unittests/CodeGen/MachineOperandTest.cpp
@@ -424,4 +424,24 @@ TEST(MachineOperandTest, HashValue) {
   ASSERT_TRUE(MO1.isIdenticalTo(MO2));
 }
 
+TEST(MachineOperandTest, RegisterLiveOutHashValue) {
+  LLVMContext Ctx;
+  Module Mod("Module", Ctx);
+  auto MF = createMachineFunction(Ctx, Mod);
+  MachineBasicBlock *MBB = MF->CreateMachineBasicBlock();
+  MCInstrDesc MCID = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  auto *MI1 = MF->CreateMachineInstr(MCID, DebugLoc());
+  auto *MI2 = MF->CreateMachineInstr(MCID, DebugLoc());
+  MBB->insert(MBB->begin(), MI1);
+  MBB->insert(MBB->begin(), MI2);
+  uint32_t Mask1 = 0;
+  uint32_t Mask2 = 0;
+  MI1->addOperand(*MF, MachineOperand::CreateRegLiveOut(&Mask1));
+  MI2->addOperand(*MF, MachineOperand::CreateRegLiveOut(&Mask2));
+  auto MO1 = MI1->getOperand(0);
+  auto MO2 = MI2->getOperand(0);
+  EXPECT_EQ(hash_value(MO1), hash_value(MO2));
+  EXPECT_TRUE(MO1.isIdenticalTo(MO2));
+}
+
 } // end namespace
diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
index aa56aafa2812c..ceaee52a3948b 100644
--- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
+++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
@@ -354,6 +354,76 @@ TEST_F(SelectionDAGPatternMatchTest, matchBinaryOp) {
       sd_match(InsertELT, m_InsertElt(m_Value(), m_Value(), m_SpecificInt(1))));
 }
 
+TEST_F(SelectionDAGPatternMatchTest, matchGenericTernaryOp) {
+  SDLoc DL;
+  auto Float32VT = EVT::getFloatingPointVT(32);
+
+  SDValue Op0 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, Float32VT);
+  SDValue Op1 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 2, Float32VT);
+  SDValue Op2 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 3, Float32VT);
+
+  SDValue FMA = DAG->getNode(ISD::FMA, DL, Float32VT, Op0, Op1, Op2);
+  SDValue FAdd = DAG->getNode(ISD::FADD, DL, Float32VT, Op0, Op1);
+
+  using namespace SDPatternMatch;
+  SDValue A, B, C;
+
+  EXPECT_TRUE(sd_match(FMA, m_TernaryOp(ISD::FMA, m_Specific(Op0),
+                                        m_Specific(Op1), m_Specific(Op2))));
+  EXPECT_FALSE(sd_match(FMA, m_TernaryOp(ISD::FADD, m_Specific(Op0),
+                                         m_Specific(Op1), m_Specific(Op2))));
+  EXPECT_FALSE(
+      sd_match(FAdd, m_TernaryOp(ISD::FMA, m_Value(), m_Value(), m_Value())));
+  EXPECT_FALSE(sd_match(FMA, m_TernaryOp(ISD::FMA, m_Specific(Op1),
+                                         m_Specific(Op0), m_Specific(Op2))));
+
+  EXPECT_TRUE(
+      sd_match(FMA, m_TernaryOp(ISD::FMA, m_Value(A), m_Value(B), m_Value(C))));
+  EXPECT_EQ(A, Op0);
+  EXPECT_EQ(B, Op1);
+  EXPECT_EQ(C, Op2);
+
+  A = B = C = SDValue();
+
+  EXPECT_TRUE(sd_match(FMA, m_c_TernaryOp(ISD::FMA, m_Specific(Op0),
+                                          m_Specific(Op1), m_Specific(Op2))));
+  EXPECT_TRUE(sd_match(FMA, m_c_TernaryOp(ISD::FMA, m_Specific(Op1),
+                                          m_Specific(Op0), m_Specific(Op2))));
+
+  EXPECT_FALSE(sd_match(FMA, m_c_TernaryOp(ISD::FMA, m_Specific(Op2),
+                                           m_Specific(Op1), m_Specific(Op0))));
+  EXPECT_FALSE(sd_match(FMA, m_c_TernaryOp(ISD::FMA, m_Specific(Op2),
+                                           m_Specific(Op0), m_Specific(Op1))));
+
+  EXPECT_FALSE(sd_match(FMA, m_c_TernaryOp(ISD::FMA, m_Specific(Op0),
+                                           m_Specific(Op2), m_Specific(Op1))));
+  EXPECT_FALSE(sd_match(FMA, m_c_TernaryOp(ISD::FMA, m_Specific(Op1),
+                                           m_Specific(Op2), m_Specific(Op0))));
+
+  EXPECT_TRUE(sd_match(
+      FMA, m_c_TernaryOp(ISD::FMA, m_Value(A), m_Value(B), m_Value(C))));
+  EXPECT_EQ(A, Op0);
+  EXPECT_EQ(B, Op1);
+  EXPECT_EQ(C, Op2);
+
+  A = B = C = SDValue();
+  EXPECT_TRUE(sd_match(
+      FMA, m_c_TernaryOp(ISD::FMA, m_Value(B), m_Value(A), m_Value(C))));
+  EXPECT_EQ(A, Op1);
+  EXPECT_EQ(B, Op0);
+  EXPECT_EQ(C, Op2);
+
+  A = B = C = SDValue();
+  EXPECT_TRUE(sd_match(
+      FMA, m_c_TernaryOp(ISD::FMA, m_Value(A), m_Value(B), m_Value(C))));
+  EXPECT_EQ(A, Op0);
+  EXPECT_EQ(B, Op1);
+  EXPECT_EQ(C, Op2);
+
+  EXPECT_FALSE(
+      sd_match(FAdd, m_c_TernaryOp(ISD::FMA, m_Value(), m_Value(), m_Value())));
+}
+
 TEST_F(SelectionDAGPatternMatchTest, matchUnaryOp) {
   SDLoc DL;
   auto Int32VT = EVT::getIntegerVT(Context, 32);
diff --git a/llvm/unittests/DebugInfo/CodeView/RandomAccessVisitorTest.cpp b/llvm/unittests/DebugInfo/CodeView/RandomAccessVisitorTest.cpp
index 5c961998a4157..fab40b963731b 100644
--- a/llvm/unittests/DebugInfo/CodeView/RandomAccessVisitorTest.cpp
+++ b/llvm/unittests/DebugInfo/CodeView/RandomAccessVisitorTest.cpp
@@ -84,7 +84,7 @@ class MockCallbacks : public TypeVisitorCallbacks {
 
 class RandomAccessVisitorTest : public testing::Test {
 public:
-  RandomAccessVisitorTest() {}
+  RandomAccessVisitorTest() = default;
 
   static void SetUpTestCase() {
     GlobalState = std::make_unique<GlobalTestState>();
diff --git a/llvm/unittests/DebugInfo/CodeView/TypeIndexDiscoveryTest.cpp b/llvm/unittests/DebugInfo/CodeView/TypeIndexDiscoveryTest.cpp
index b1f19e9e20891..62b75912814dd 100644
--- a/llvm/unittests/DebugInfo/CodeView/TypeIndexDiscoveryTest.cpp
+++ b/llvm/unittests/DebugInfo/CodeView/TypeIndexDiscoveryTest.cpp
@@ -21,7 +21,7 @@ using namespace llvm::codeview;
 
 class TypeIndexIteratorTest : public testing::Test {
 public:
-  TypeIndexIteratorTest() {}
+  TypeIndexIteratorTest() = default;
 
   void SetUp() override {
     Refs.clear();
diff --git a/llvm/unittests/DebugInfo/DWARF/DWARFDebugLineTest.cpp b/llvm/unittests/DebugInfo/DWARF/DWARFDebugLineTest.cpp
index 2fe52600df923..aa5b2926ab55a 100644
--- a/llvm/unittests/DebugInfo/DWARF/DWARFDebugLineTest.cpp
+++ b/llvm/unittests/DebugInfo/DWARF/DWARFDebugLineTest.cpp
@@ -864,7 +864,7 @@ TEST_F(DebugLineBasicFixture, CallbackUsedForUnterminatedSequence) {
 }
 
 struct AdjustAddressFixtureBase : public CommonFixture {
-  virtual ~AdjustAddressFixtureBase() {}
+  virtual ~AdjustAddressFixtureBase() = default;
 
   // Create and update the prologue as specified by the subclass, then return
   // the length of the table.
diff --git a/llvm/unittests/ExecutionEngine/JITLink/JITLinkTestUtils.h b/llvm/unittests/ExecutionEngine/JITLink/JITLinkTestUtils.h
index f03c82f76c324..a2732e36a8a68 100644
--- a/llvm/unittests/ExecutionEngine/JITLink/JITLinkTestUtils.h
+++ b/llvm/unittests/ExecutionEngine/JITLink/JITLinkTestUtils.h
@@ -19,7 +19,7 @@ class MockJITLinkMemoryManager : public llvm::jitlink::JITLinkMemoryManager {
 public:
   class Alloc {
   public:
-    virtual ~Alloc() {}
+    virtual ~Alloc() = default;
   };
 
   class SimpleAlloc : public Alloc {
diff --git a/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt b/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt
index b06aa2565bb04..7b563d7bcc68c 100644
--- a/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt
+++ b/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt
@@ -26,6 +26,7 @@ add_llvm_unittest(OrcJITTests
   IndirectionUtilsTest.cpp
   JITTargetMachineBuilderTest.cpp
   LazyCallThroughAndReexportsTest.cpp
+  LibraryResolverTest.cpp
   LookupAndRecordAddrsTest.cpp
   MachOPlatformTest.cpp
   MapperJITLinkMemoryManagerTest.cpp
diff --git a/llvm/unittests/ExecutionEngine/Orc/Inputs/A/A_linux.yaml b/llvm/unittests/ExecutionEngine/Orc/Inputs/A/A_linux.yaml
new file mode 100644
index 0000000000000..afd1d9e69448d
--- /dev/null
+++ b/llvm/unittests/ExecutionEngine/Orc/Inputs/A/A_linux.yaml
@@ -0,0 +1,460 @@
+--- !ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_DYN
+  Machine:         EM_X86_64
+ProgramHeaders:
+  - Type:            PT_LOAD
+    Flags:           [ PF_R ]
+    FirstSec:        .note.gnu.property
+    LastSec:         .rela.plt
+    Align:           0x1000
+    Offset:          0x0
+  - Type:            PT_LOAD
+    Flags:           [ PF_X, PF_R ]
+    FirstSec:        .init
+    LastSec:         .fini
+    VAddr:           0x1000
+    Align:           0x1000
+    Offset:          0x1000
+  - Type:            PT_LOAD
+    Flags:           [ PF_R ]
+    FirstSec:        .rodata
+    LastSec:         .eh_frame
+    VAddr:           0x2000
+    Align:           0x1000
+    Offset:          0x2000
+  - Type:            PT_LOAD
+    Flags:           [ PF_W, PF_R ]
+    FirstSec:        .init_array
+    LastSec:         .bss
+    VAddr:           0x3E10
+    Align:           0x1000
+    Offset:          0x2E10
+  - Type:            PT_DYNAMIC
+    Flags:           [ PF_W, PF_R ]
+    FirstSec:        .dynamic
+    LastSec:         .dynamic
+    VAddr:           0x3E20
+    Align:           0x8
+    Offset:          0x2E20
+  - Type:            PT_NOTE
+    Flags:           [ PF_R ]
+    FirstSec:        .note.gnu.property
+    LastSec:         .note.gnu.property
+    VAddr:           0x2A8
+    Align:           0x8
+    Offset:          0x2A8
+  - Type:            PT_NOTE
+    Flags:           [ PF_R ]
+    FirstSec:        .note.gnu.build-id
+    LastSec:         .note.gnu.build-id
+    VAddr:           0x2C8
+    Align:           0x4
+    Offset:          0x2C8
+  - Type:            PT_GNU_PROPERTY
+    Flags:           [ PF_R ]
+    FirstSec:        .note.gnu.property
+    LastSec:         .note.gnu.property
+    VAddr:           0x2A8
+    Align:           0x8
+    Offset:          0x2A8
+  - Type:            PT_GNU_EH_FRAME
+    Flags:           [ PF_R ]
+    FirstSec:        .eh_frame_hdr
+    LastSec:         .eh_frame_hdr
+    VAddr:           0x2010
+    Align:           0x4
+    Offset:          0x2010
+  - Type:            PT_GNU_STACK
+    Flags:           [ PF_W, PF_R ]
+    Align:           0x10
+    Offset:          0x0
+  - Type:            PT_GNU_RELRO
+    Flags:           [ PF_R ]
+    FirstSec:        .init_array
+    LastSec:         .got
+    VAddr:           0x3E10
+    Offset:          0x2E10
+Sections:
+  - Name:            .note.gnu.property
+    Type:            SHT_NOTE
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x2A8
+    AddressAlign:    0x8
+    Notes:
+      - Name:            GNU
+        Desc:            020000C0040000000300000000000000
+        Type:            NT_GNU_PROPERTY_TYPE_0
+  - Name:            .note.gnu.build-id
+    Type:            SHT_NOTE
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x2C8
+    AddressAlign:    0x4
+    Notes:
+      - Name:            GNU
+        Desc:            73604396C95840D5C380A0950F085A778F94EE7C
+        Type:            NT_PRPSINFO
+  - Name:            .gnu.hash
+    Type:            SHT_GNU_HASH
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x2F0
+    Link:            .dynsym
+    AddressAlign:    0x8
+    Header:
+      SymNdx:          0x6
+      Shift2:          0x6
+    BloomFilter:     [ 0x400000080000 ]
+    HashBuckets:     [ 0x0, 0x6 ]
+    HashValues:      [ 0x7C9DCB93 ]
+  - Name:            .dynsym
+    Type:            SHT_DYNSYM
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x318
+    Link:            .dynstr
+    AddressAlign:    0x8
+  - Name:            .dynstr
+    Type:            SHT_STRTAB
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x3C0
+    AddressAlign:    0x1
+  - Name:            .gnu.version
+    Type:            SHT_GNU_versym
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x436
+    Link:            .dynsym
+    AddressAlign:    0x2
+    Entries:         [ 0, 1, 2, 1, 1, 2, 1 ]
+  - Name:            .gnu.version_r
+    Type:            SHT_GNU_verneed
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x448
+    Link:            .dynstr
+    AddressAlign:    0x8
+    Dependencies:
+      - Version:         1
+        File:            libc.so.6
+        Entries:
+          - Name:            GLIBC_2.2.5
+            Hash:            157882997
+            Flags:           0
+            Other:           2
+  - Name:            .rela.dyn
+    Type:            SHT_RELA
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x468
+    Link:            .dynsym
+    AddressAlign:    0x8
+    Relocations:
+      - Offset:          0x3E10
+        Type:            R_X86_64_RELATIVE
+        Addend:          4368
+      - Offset:          0x3E18
+        Type:            R_X86_64_RELATIVE
+        Addend:          4304
+      - Offset:          0x4020
+        Type:            R_X86_64_RELATIVE
+        Addend:          16416
+      - Offset:          0x3FE0
+        Symbol:          _ITM_deregisterTMCloneTable
+        Type:            R_X86_64_GLOB_DAT
+      - Offset:          0x3FE8
+        Symbol:          __gmon_start__
+        Type:            R_X86_64_GLOB_DAT
+      - Offset:          0x3FF0
+        Symbol:          _ITM_registerTMCloneTable
+        Type:            R_X86_64_GLOB_DAT
+      - Offset:          0x3FF8
+        Symbol:          __cxa_finalize
+        Type:            R_X86_64_GLOB_DAT
+  - Name:            .rela.plt
+    Type:            SHT_RELA
+    Flags:           [ SHF_ALLOC, SHF_INFO_LINK ]
+    Address:         0x510
+    Link:            .dynsym
+    AddressAlign:    0x8
+    Info:            .got.plt
+    Relocations:
+      - Offset:          0x4018
+        Symbol:          puts
+        Type:            R_X86_64_JUMP_SLOT
+  - Name:            .init
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x1000
+    AddressAlign:    0x4
+    Offset:          0x1000
+    Content:         F30F1EFA4883EC08488B05D92F00004885C07402FFD04883C408C3
+  - Name:            .plt
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x1020
+    AddressAlign:    0x10
+    EntSize:         0x10
+    Content:         FF35E22F0000F2FF25E32F00000F1F00F30F1EFA6800000000F2E9E1FFFFFF90
+  - Name:            .plt.got
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x1040
+    AddressAlign:    0x10
+    EntSize:         0x10
+    Content:         F30F1EFAF2FF25AD2F00000F1F440000
+  - Name:            .plt.sec
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x1050
+    AddressAlign:    0x10
+    EntSize:         0x10
+    Content:         F30F1EFAF2FF25BD2F00000F1F440000
+  - Name:            .text
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x1060
+    AddressAlign:    0x10
+    Content:         488D3DC12F0000488D05BA2F00004839F87415488B05662F00004885C07409FFE00F1F8000000000C30F1F8000000000488D3D912F0000488D358A2F00004829FE4889F048C1EE3F48C1F8034801C648D1FE7414488B05352F00004885C07408FFE0660F1F440000C30F1F8000000000F30F1EFA803D4D2F000000752B5548833D122F0000004889E5740C488B3D2E2F0000E849FFFFFFE864FFFFFFC605252F0000015DC30F1F00C30F1F8000000000F30F1EFAE977FFFFFFF30F1EFA554889E5488D05D80E00004889C7E820FFFFFF905DC3
+  - Name:            .fini
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x1134
+    AddressAlign:    0x4
+    Content:         F30F1EFA4883EC084883C408C3
+  - Name:            .rodata
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x2000
+    AddressAlign:    0x1
+    Offset:          0x2000
+    Content:         48656C6C6F2066726F6D204100
+  - Name:            .eh_frame_hdr
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x2010
+    AddressAlign:    0x4
+    Content:         011B033B2C0000000400000010F0FFFF4800000030F0FFFF7000000040F0FFFF8800000009F1FFFFA0000000
+  - Name:            .eh_frame
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x2040
+    AddressAlign:    0x8
+    Content:         1400000000000000017A5200017810011B0C070890010000240000001C000000C0EFFFFF20000000000E10460E184A0F0B770880003F1A3A2A332422000000001400000044000000B8EFFFFF100000000000000000000000140000005C000000B0EFFFFF1000000000000000000000001C0000007400000061F0FFFF1A00000000450E108602430D06510C070800000000000000
+  - Name:            .init_array
+    Type:            SHT_INIT_ARRAY
+    Flags:           [ SHF_WRITE, SHF_ALLOC ]
+    Address:         0x3E10
+    AddressAlign:    0x8
+    EntSize:         0x8
+    Offset:          0x2E10
+    Content:         '1011000000000000'
+  - Name:            .fini_array
+    Type:            SHT_FINI_ARRAY
+    Flags:           [ SHF_WRITE, SHF_ALLOC ]
+    Address:         0x3E18
+    AddressAlign:    0x8
+    EntSize:         0x8
+    Content:         D010000000000000
+  - Name:            .dynamic
+    Type:            SHT_DYNAMIC
+    Flags:           [ SHF_WRITE, SHF_ALLOC ]
+    Address:         0x3E20
+    Link:            .dynstr
+    AddressAlign:    0x8
+    Entries:
+      - Tag:             DT_NEEDED
+        Value:           0x5F
+      - Tag:             DT_INIT
+        Value:           0x1000
+      - Tag:             DT_FINI
+        Value:           0x1134
+      - Tag:             DT_INIT_ARRAY
+        Value:           0x3E10
+      - Tag:             DT_INIT_ARRAYSZ
+        Value:           0x8
+      - Tag:             DT_FINI_ARRAY
+        Value:           0x3E18
+      - Tag:             DT_FINI_ARRAYSZ
+        Value:           0x8
+      - Tag:             DT_GNU_HASH
+        Value:           0x2F0
+      - Tag:             DT_STRTAB
+        Value:           0x3C0
+      - Tag:             DT_SYMTAB
+        Value:           0x318
+      - Tag:             DT_STRSZ
+        Value:           0x75
+      - Tag:             DT_SYMENT
+        Value:           0x18
+      - Tag:             DT_PLTGOT
+        Value:           0x4000
+      - Tag:             DT_PLTRELSZ
+        Value:           0x18
+      - Tag:             DT_PLTREL
+        Value:           0x7
+      - Tag:             DT_JMPREL
+        Value:           0x510
+      - Tag:             DT_RELA
+        Value:           0x468
+      - Tag:             DT_RELASZ
+        Value:           0xA8
+      - Tag:             DT_RELAENT
+        Value:           0x18
+      - Tag:             DT_VERNEED
+        Value:           0x448
+      - Tag:             DT_VERNEEDNUM
+        Value:           0x1
+      - Tag:             DT_VERSYM
+        Value:           0x436
+      - Tag:             DT_RELACOUNT
+        Value:           0x3
+      - Tag:             DT_NULL
+        Value:           0x0
+      - Tag:             DT_NULL
+        Value:           0x0
+      - Tag:             DT_NULL
+        Value:           0x0
+      - Tag:             DT_NULL
+        Value:           0x0
+      - Tag:             DT_NULL
+        Value:           0x0
+  - Name:            .got
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_WRITE, SHF_ALLOC ]
+    Address:         0x3FE0
+    AddressAlign:    0x8
+    EntSize:         0x8
+    Content:         '0000000000000000000000000000000000000000000000000000000000000000'
+  - Name:            .got.plt
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_WRITE, SHF_ALLOC ]
+    Address:         0x4000
+    AddressAlign:    0x8
+    EntSize:         0x8
+    Content:         '203E000000000000000000000000000000000000000000003010000000000000'
+  - Name:            .data
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_WRITE, SHF_ALLOC ]
+    Address:         0x4020
+    AddressAlign:    0x8
+    Content:         '2040000000000000'
+  - Name:            .bss
+    Type:            SHT_NOBITS
+    Flags:           [ SHF_WRITE, SHF_ALLOC ]
+    Address:         0x4028
+    AddressAlign:    0x1
+    Size:            0x8
+  - Name:            .comment
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_MERGE, SHF_STRINGS ]
+    AddressAlign:    0x1
+    EntSize:         0x1
+    Content:         4743433A20285562756E74752031312E342E302D317562756E7475317E32322E30342E32292031312E342E3000
+Symbols:
+  - Name:            crtstuff.c
+    Type:            STT_FILE
+    Index:           SHN_ABS
+  - Name:            deregister_tm_clones
+    Type:            STT_FUNC
+    Section:         .text
+    Value:           0x1060
+  - Name:            register_tm_clones
+    Type:            STT_FUNC
+    Section:         .text
+    Value:           0x1090
+  - Name:            __do_global_dtors_aux
+    Type:            STT_FUNC
+    Section:         .text
+    Value:           0x10D0
+  - Name:            completed.0
+    Type:            STT_OBJECT
+    Section:         .bss
+    Value:           0x4028
+    Size:            0x1
+  - Name:            __do_global_dtors_aux_fini_array_entry
+    Type:            STT_OBJECT
+    Section:         .fini_array
+    Value:           0x3E18
+  - Name:            frame_dummy
+    Type:            STT_FUNC
+    Section:         .text
+    Value:           0x1110
+  - Name:            __frame_dummy_init_array_entry
+    Type:            STT_OBJECT
+    Section:         .init_array
+    Value:           0x3E10
+  - Name:            libA.c
+    Type:            STT_FILE
+    Index:           SHN_ABS
+  - Name:            'crtstuff.c (1)'
+    Type:            STT_FILE
+    Index:           SHN_ABS
+  - Name:            __FRAME_END__
+    Type:            STT_OBJECT
+    Section:         .eh_frame
+    Value:           0x20D0
+  - Type:            STT_FILE
+    Index:           SHN_ABS
+  - Name:            _fini
+    Type:            STT_FUNC
+    Section:         .fini
+    Value:           0x1134
+  - Name:            __dso_handle
+    Type:            STT_OBJECT
+    Section:         .data
+    Value:           0x4020
+  - Name:            _DYNAMIC
+    Type:            STT_OBJECT
+    Section:         .dynamic
+    Value:           0x3E20
+  - Name:            __GNU_EH_FRAME_HDR
+    Section:         .eh_frame_hdr
+    Value:           0x2010
+  - Name:            __TMC_END__
+    Type:            STT_OBJECT
+    Section:         .data
+    Value:           0x4028
+  - Name:            _GLOBAL_OFFSET_TABLE_
+    Type:            STT_OBJECT
+    Section:         .got.plt
+    Value:           0x4000
+  - Name:            _init
+    Type:            STT_FUNC
+    Section:         .init
+    Value:           0x1000
+  - Name:            _ITM_deregisterTMCloneTable
+    Binding:         STB_WEAK
+  - Name:            'puts@GLIBC_2.2.5'
+    Type:            STT_FUNC
+    Binding:         STB_GLOBAL
+  - Name:            sayA
+    Type:            STT_FUNC
+    Section:         .text
+    Binding:         STB_GLOBAL
+    Value:           0x1119
+    Size:            0x1A
+  - Name:            __gmon_start__
+    Binding:         STB_WEAK
+  - Name:            _ITM_registerTMCloneTable
+    Binding:         STB_WEAK
+  - Name:            '__cxa_finalize@GLIBC_2.2.5'
+    Type:            STT_FUNC
+    Binding:         STB_WEAK
+DynamicSymbols:
+  - Name:            _ITM_deregisterTMCloneTable
+    Binding:         STB_WEAK
+  - Name:            puts
+    Type:            STT_FUNC
+    Binding:         STB_GLOBAL
+  - Name:            __gmon_start__
+    Binding:         STB_WEAK
+  - Name:            _ITM_registerTMCloneTable
+    Binding:         STB_WEAK
+  - Name:            __cxa_finalize
+    Type:            STT_FUNC
+    Binding:         STB_WEAK
+  - Name:            sayA
+    Type:            STT_FUNC
+    Section:         .text
+    Binding:         STB_GLOBAL
+    Value:           0x1119
+    Size:            0x1A
+...
diff --git a/llvm/unittests/ExecutionEngine/Orc/Inputs/A/A_macho.yaml b/llvm/unittests/ExecutionEngine/Orc/Inputs/A/A_macho.yaml
new file mode 100644
index 0000000000000..2e851a90c21ed
--- /dev/null
+++ b/llvm/unittests/ExecutionEngine/Orc/Inputs/A/A_macho.yaml
@@ -0,0 +1,723 @@
+--- !fat-mach-o
+FatHeader:
+  magic:           0xCAFEBABE
+  nfat_arch:       3
+FatArchs:
+  - cputype:         0x1000007
+    cpusubtype:      0x3
+    offset:          0x1000
+    size:            8376
+    align:           12
+  - cputype:         0x100000C
+    cpusubtype:      0x0
+    offset:          0x4000
+    size:            33376
+    align:           14
+  - cputype:         0x100000C
+    cpusubtype:      0x80000002
+    offset:          0x10000
+    size:            33376
+    align:           14
+Slices:
+  - !mach-o
+    FileHeader:
+      magic:           0xFEEDFACF
+      cputype:         0x1000007
+      cpusubtype:      0x3
+      filetype:        0x6
+      ncmds:           14
+      sizeofcmds:      960
+      flags:           0x100085
+      reserved:        0x0
+    LoadCommands:
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         392
+        segname:         __TEXT
+        vmaddr:          0
+        vmsize:          4096
+        fileoff:         0
+        filesize:        4096
+        maxprot:         5
+        initprot:        5
+        nsects:          4
+        flags:           0
+        Sections:
+          - sectname:        __text
+            segname:         __TEXT
+            addr:            0xF80
+            size:            20
+            offset:          0xF80
+            align:           4
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x80000400
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         554889E5488D3D0F000000B000E8020000005DC3
+          - sectname:        __stubs
+            segname:         __TEXT
+            addr:            0xF94
+            size:            6
+            offset:          0xF94
+            align:           1
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x80000408
+            reserved1:       0x0
+            reserved2:       0x6
+            reserved3:       0x0
+            content:         FF2566000000
+          - sectname:        __cstring
+            segname:         __TEXT
+            addr:            0xF9A
+            size:            14
+            offset:          0xF9A
+            align:           0
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x2
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         48656C6C6F2066726F6D20410A00
+          - sectname:        __unwind_info
+            segname:         __TEXT
+            addr:            0xFA8
+            size:            88
+            offset:          0xFA8
+            align:           2
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x0
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         010000001C000000000000001C000000000000001C00000002000000800F00004000000040000000940F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000100000000
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         152
+        segname:         __DATA_CONST
+        vmaddr:          4096
+        vmsize:          4096
+        fileoff:         4096
+        filesize:        4096
+        maxprot:         3
+        initprot:        3
+        nsects:          1
+        flags:           16
+        Sections:
+          - sectname:        __got
+            segname:         __DATA_CONST
+            addr:            0x1000
+            size:            8
+            offset:          0x1000
+            align:           3
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x6
+            reserved1:       0x1
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         '0000000000000080'
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         72
+        segname:         __LINKEDIT
+        vmaddr:          8192
+        vmsize:          4096
+        fileoff:         8192
+        filesize:        184
+        maxprot:         1
+        initprot:        1
+        nsects:          0
+        flags:           0
+      - cmd:             LC_ID_DYLIB
+        cmdsize:         48
+        dylib:
+          name:            24
+          timestamp:       1
+          current_version: 0
+          compatibility_version: 0
+        Content:         '@rpath/libA.dylib'
+        ZeroPadBytes:    7
+      - cmd:             LC_DYLD_CHAINED_FIXUPS
+        cmdsize:         16
+        dataoff:         8192
+        datasize:        96
+      - cmd:             LC_DYLD_EXPORTS_TRIE
+        cmdsize:         16
+        dataoff:         8288
+        datasize:        24
+      - cmd:             LC_SYMTAB
+        cmdsize:         24
+        symoff:          8320
+        nsyms:           2
+        stroff:          8360
+        strsize:         16
+      - cmd:             LC_DYSYMTAB
+        cmdsize:         80
+        ilocalsym:       0
+        nlocalsym:       0
+        iextdefsym:      0
+        nextdefsym:      1
+        iundefsym:       1
+        nundefsym:       1
+        tocoff:          0
+        ntoc:            0
+        modtaboff:       0
+        nmodtab:         0
+        extrefsymoff:    0
+        nextrefsyms:     0
+        indirectsymoff:  8352
+        nindirectsyms:   2
+        extreloff:       0
+        nextrel:         0
+        locreloff:       0
+        nlocrel:         0
+      - cmd:             LC_UUID
+        cmdsize:         24
+        uuid:            ADFFA141-C3EE-37CD-B1E7-906D69F81BCB
+      - cmd:             LC_BUILD_VERSION
+        cmdsize:         32
+        platform:        1
+        minos:           983040
+        sdk:             983552
+        ntools:          1
+        Tools:
+          - tool:            3
+            version:         73074435
+      - cmd:             LC_SOURCE_VERSION
+        cmdsize:         16
+        version:         0
+      - cmd:             LC_LOAD_DYLIB
+        cmdsize:         56
+        dylib:
+          name:            24
+          timestamp:       2
+          current_version: 88539136
+          compatibility_version: 65536
+        Content:         '/usr/lib/libSystem.B.dylib'
+        ZeroPadBytes:    6
+      - cmd:             LC_FUNCTION_STARTS
+        cmdsize:         16
+        dataoff:         8312
+        datasize:        8
+      - cmd:             LC_DATA_IN_CODE
+        cmdsize:         16
+        dataoff:         8320
+        datasize:        0
+    LinkEditData:
+      ExportTrie:
+        TerminalSize:    0
+        NodeOffset:      0
+        Name:            ''
+        Flags:           0x0
+        Address:         0x0
+        Other:           0x0
+        ImportName:      ''
+        Children:
+          - TerminalSize:    3
+            NodeOffset:      13
+            Name:            _sayA
+            Flags:           0x0
+            Address:         0xF80
+            Other:           0x0
+            ImportName:      ''
+      NameList:
+        - n_strx:          2
+          n_type:          0xF
+          n_sect:          1
+          n_desc:          0
+          n_value:         3968
+        - n_strx:          8
+          n_type:          0x1
+          n_sect:          0
+          n_desc:          256
+          n_value:         0
+      StringTable:
+        - ' '
+        - _sayA
+        - _printf
+      IndirectSymbols: [ 0x1, 0x1 ]
+      FunctionStarts:  [ 0xF80 ]
+      ChainedFixups:   [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48, 
+                         0x0, 0x0, 0x0, 0x50, 0x0, 0x0, 0x0, 0x1, 0x0, 
+                         0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x10, 0x6, 0x0, 
+                         0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0, 
+                         0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x70, 0x72, 
+                         0x69, 0x6E, 0x74, 0x66, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x0 ]
+  - !mach-o
+    FileHeader:
+      magic:           0xFEEDFACF
+      cputype:         0x100000C
+      cpusubtype:      0x0
+      filetype:        0x6
+      ncmds:           15
+      sizeofcmds:      976
+      flags:           0x100085
+      reserved:        0x0
+    LoadCommands:
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         392
+        segname:         __TEXT
+        vmaddr:          0
+        vmsize:          16384
+        fileoff:         0
+        filesize:        16384
+        maxprot:         5
+        initprot:        5
+        nsects:          4
+        flags:           0
+        Sections:
+          - sectname:        __text
+            segname:         __TEXT
+            addr:            0x3F70
+            size:            28
+            offset:          0x3F70
+            align:           2
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x80000400
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         FD7BBFA9FD0300910000009000603E9103000094FD7BC1A8C0035FD6
+          - sectname:        __stubs
+            segname:         __TEXT
+            addr:            0x3F8C
+            size:            12
+            offset:          0x3F8C
+            align:           2
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x80000408
+            reserved1:       0x0
+            reserved2:       0xC
+            reserved3:       0x0
+            content:         100000B0100240F900021FD6
+          - sectname:        __cstring
+            segname:         __TEXT
+            addr:            0x3F98
+            size:            14
+            offset:          0x3F98
+            align:           0
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x2
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         48656C6C6F2066726F6D20410A00
+          - sectname:        __unwind_info
+            segname:         __TEXT
+            addr:            0x3FA8
+            size:            88
+            offset:          0x3FA8
+            align:           2
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x0
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         010000001C000000000000001C000000000000001C00000002000000703F000040000000400000008C3F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000400000000
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         152
+        segname:         __DATA_CONST
+        vmaddr:          16384
+        vmsize:          16384
+        fileoff:         16384
+        filesize:        16384
+        maxprot:         3
+        initprot:        3
+        nsects:          1
+        flags:           16
+        Sections:
+          - sectname:        __got
+            segname:         __DATA_CONST
+            addr:            0x4000
+            size:            8
+            offset:          0x4000
+            align:           3
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x6
+            reserved1:       0x1
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         '0000000000000080'
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         72
+        segname:         __LINKEDIT
+        vmaddr:          32768
+        vmsize:          16384
+        fileoff:         32768
+        filesize:        608
+        maxprot:         1
+        initprot:        1
+        nsects:          0
+        flags:           0
+      - cmd:             LC_ID_DYLIB
+        cmdsize:         48
+        dylib:
+          name:            24
+          timestamp:       1
+          current_version: 0
+          compatibility_version: 0
+        Content:         '@rpath/libA.dylib'
+        ZeroPadBytes:    7
+      - cmd:             LC_DYLD_CHAINED_FIXUPS
+        cmdsize:         16
+        dataoff:         32768
+        datasize:        96
+      - cmd:             LC_DYLD_EXPORTS_TRIE
+        cmdsize:         16
+        dataoff:         32864
+        datasize:        24
+      - cmd:             LC_SYMTAB
+        cmdsize:         24
+        symoff:          32896
+        nsyms:           2
+        stroff:          32936
+        strsize:         16
+      - cmd:             LC_DYSYMTAB
+        cmdsize:         80
+        ilocalsym:       0
+        nlocalsym:       0
+        iextdefsym:      0
+        nextdefsym:      1
+        iundefsym:       1
+        nundefsym:       1
+        tocoff:          0
+        ntoc:            0
+        modtaboff:       0
+        nmodtab:         0
+        extrefsymoff:    0
+        nextrefsyms:     0
+        indirectsymoff:  32928
+        nindirectsyms:   2
+        extreloff:       0
+        nextrel:         0
+        locreloff:       0
+        nlocrel:         0
+      - cmd:             LC_UUID
+        cmdsize:         24
+        uuid:            C45227E0-C6C0-3137-969B-36AABF9D5487
+      - cmd:             LC_BUILD_VERSION
+        cmdsize:         32
+        platform:        1
+        minos:           983040
+        sdk:             983552
+        ntools:          1
+        Tools:
+          - tool:            3
+            version:         73074435
+      - cmd:             LC_SOURCE_VERSION
+        cmdsize:         16
+        version:         0
+      - cmd:             LC_LOAD_DYLIB
+        cmdsize:         56
+        dylib:
+          name:            24
+          timestamp:       2
+          current_version: 88539136
+          compatibility_version: 65536
+        Content:         '/usr/lib/libSystem.B.dylib'
+        ZeroPadBytes:    6
+      - cmd:             LC_FUNCTION_STARTS
+        cmdsize:         16
+        dataoff:         32888
+        datasize:        8
+      - cmd:             LC_DATA_IN_CODE
+        cmdsize:         16
+        dataoff:         32896
+        datasize:        0
+      - cmd:             LC_CODE_SIGNATURE
+        cmdsize:         16
+        dataoff:         32960
+        datasize:        416
+    LinkEditData:
+      ExportTrie:
+        TerminalSize:    0
+        NodeOffset:      0
+        Name:            ''
+        Flags:           0x0
+        Address:         0x0
+        Other:           0x0
+        ImportName:      ''
+        Children:
+          - TerminalSize:    3
+            NodeOffset:      13
+            Name:            _sayA
+            Flags:           0x0
+            Address:         0x3F70
+            Other:           0x0
+            ImportName:      ''
+      NameList:
+        - n_strx:          2
+          n_type:          0xF
+          n_sect:          1
+          n_desc:          0
+          n_value:         16240
+        - n_strx:          8
+          n_type:          0x1
+          n_sect:          0
+          n_desc:          256
+          n_value:         0
+      StringTable:
+        - ' '
+        - _sayA
+        - _printf
+      IndirectSymbols: [ 0x1, 0x1 ]
+      FunctionStarts:  [ 0x3F70 ]
+      ChainedFixups:   [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48, 
+                         0x0, 0x0, 0x0, 0x50, 0x0, 0x0, 0x0, 0x1, 0x0, 
+                         0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x40, 0x6, 0x0, 
+                         0x0, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0, 
+                         0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x70, 0x72, 
+                         0x69, 0x6E, 0x74, 0x66, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x0 ]
+  - !mach-o
+    FileHeader:
+      magic:           0xFEEDFACF
+      cputype:         0x100000C
+      cpusubtype:      0x80000002
+      filetype:        0x6
+      ncmds:           15
+      sizeofcmds:      976
+      flags:           0x100085
+      reserved:        0x0
+    LoadCommands:
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         392
+        segname:         __TEXT
+        vmaddr:          0
+        vmsize:          16384
+        fileoff:         0
+        filesize:        16384
+        maxprot:         5
+        initprot:        5
+        nsects:          4
+        flags:           0
+        Sections:
+          - sectname:        __text
+            segname:         __TEXT
+            addr:            0x3F68
+            size:            32
+            offset:          0x3F68
+            align:           2
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x80000400
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         7F2303D5FD7BBFA9FD0300910000009000603E9103000094FD7BC1A8FF0F5FD6
+          - sectname:        __auth_stubs
+            segname:         __TEXT
+            addr:            0x3F88
+            size:            16
+            offset:          0x3F88
+            align:           2
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x80000408
+            reserved1:       0x0
+            reserved2:       0x10
+            reserved3:       0x0
+            content:         110000B031020091300240F9110A1FD7
+          - sectname:        __cstring
+            segname:         __TEXT
+            addr:            0x3F98
+            size:            14
+            offset:          0x3F98
+            align:           0
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x2
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         48656C6C6F2066726F6D20410A00
+          - sectname:        __unwind_info
+            segname:         __TEXT
+            addr:            0x3FA8
+            size:            88
+            offset:          0x3FA8
+            align:           2
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x0
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         010000001C000000000000001C000000000000001C00000002000000683F00004000000040000000883F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000400000000
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         152
+        segname:         __DATA_CONST
+        vmaddr:          16384
+        vmsize:          16384
+        fileoff:         16384
+        filesize:        16384
+        maxprot:         3
+        initprot:        3
+        nsects:          1
+        flags:           16
+        Sections:
+          - sectname:        __auth_got
+            segname:         __DATA_CONST
+            addr:            0x4000
+            size:            8
+            offset:          0x4000
+            align:           3
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x6
+            reserved1:       0x1
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         00000000000001C0
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         72
+        segname:         __LINKEDIT
+        vmaddr:          32768
+        vmsize:          16384
+        fileoff:         32768
+        filesize:        608
+        maxprot:         1
+        initprot:        1
+        nsects:          0
+        flags:           0
+      - cmd:             LC_ID_DYLIB
+        cmdsize:         48
+        dylib:
+          name:            24
+          timestamp:       1
+          current_version: 0
+          compatibility_version: 0
+        Content:         '@rpath/libA.dylib'
+        ZeroPadBytes:    7
+      - cmd:             LC_DYLD_CHAINED_FIXUPS
+        cmdsize:         16
+        dataoff:         32768
+        datasize:        96
+      - cmd:             LC_DYLD_EXPORTS_TRIE
+        cmdsize:         16
+        dataoff:         32864
+        datasize:        24
+      - cmd:             LC_SYMTAB
+        cmdsize:         24
+        symoff:          32896
+        nsyms:           2
+        stroff:          32936
+        strsize:         16
+      - cmd:             LC_DYSYMTAB
+        cmdsize:         80
+        ilocalsym:       0
+        nlocalsym:       0
+        iextdefsym:      0
+        nextdefsym:      1
+        iundefsym:       1
+        nundefsym:       1
+        tocoff:          0
+        ntoc:            0
+        modtaboff:       0
+        nmodtab:         0
+        extrefsymoff:    0
+        nextrefsyms:     0
+        indirectsymoff:  32928
+        nindirectsyms:   2
+        extreloff:       0
+        nextrel:         0
+        locreloff:       0
+        nlocrel:         0
+      - cmd:             LC_UUID
+        cmdsize:         24
+        uuid:            C9DC00C2-E721-365C-9C2D-E9FDB7C838BB
+      - cmd:             LC_BUILD_VERSION
+        cmdsize:         32
+        platform:        1
+        minos:           983040
+        sdk:             983552
+        ntools:          1
+        Tools:
+          - tool:            3
+            version:         73074435
+      - cmd:             LC_SOURCE_VERSION
+        cmdsize:         16
+        version:         0
+      - cmd:             LC_LOAD_DYLIB
+        cmdsize:         56
+        dylib:
+          name:            24
+          timestamp:       2
+          current_version: 88539136
+          compatibility_version: 65536
+        Content:         '/usr/lib/libSystem.B.dylib'
+        ZeroPadBytes:    6
+      - cmd:             LC_FUNCTION_STARTS
+        cmdsize:         16
+        dataoff:         32888
+        datasize:        8
+      - cmd:             LC_DATA_IN_CODE
+        cmdsize:         16
+        dataoff:         32896
+        datasize:        0
+      - cmd:             LC_CODE_SIGNATURE
+        cmdsize:         16
+        dataoff:         32960
+        datasize:        416
+    LinkEditData:
+      ExportTrie:
+        TerminalSize:    0
+        NodeOffset:      0
+        Name:            ''
+        Flags:           0x0
+        Address:         0x0
+        Other:           0x0
+        ImportName:      ''
+        Children:
+          - TerminalSize:    3
+            NodeOffset:      13
+            Name:            _sayA
+            Flags:           0x0
+            Address:         0x3F68
+            Other:           0x0
+            ImportName:      ''
+      NameList:
+        - n_strx:          2
+          n_type:          0xF
+          n_sect:          1
+          n_desc:          0
+          n_value:         16232
+        - n_strx:          8
+          n_type:          0x1
+          n_sect:          0
+          n_desc:          256
+          n_value:         0
+      StringTable:
+        - ' '
+        - _sayA
+        - _printf
+      IndirectSymbols: [ 0x1, 0x1 ]
+      FunctionStarts:  [ 0x3F68 ]
+      ChainedFixups:   [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48, 
+                         0x0, 0x0, 0x0, 0x50, 0x0, 0x0, 0x0, 0x1, 0x0, 
+                         0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x40, 0xC, 0x0, 
+                         0x0, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0, 
+                         0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x70, 0x72, 
+                         0x69, 0x6E, 0x74, 0x66, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x0 ]
+...
diff --git a/llvm/unittests/ExecutionEngine/Orc/Inputs/B/B_linux.yaml b/llvm/unittests/ExecutionEngine/Orc/Inputs/B/B_linux.yaml
new file mode 100644
index 0000000000000..fe4393e108d96
--- /dev/null
+++ b/llvm/unittests/ExecutionEngine/Orc/Inputs/B/B_linux.yaml
@@ -0,0 +1,460 @@
+--- !ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_DYN
+  Machine:         EM_X86_64
+ProgramHeaders:
+  - Type:            PT_LOAD
+    Flags:           [ PF_R ]
+    FirstSec:        .note.gnu.property
+    LastSec:         .rela.plt
+    Align:           0x1000
+    Offset:          0x0
+  - Type:            PT_LOAD
+    Flags:           [ PF_X, PF_R ]
+    FirstSec:        .init
+    LastSec:         .fini
+    VAddr:           0x1000
+    Align:           0x1000
+    Offset:          0x1000
+  - Type:            PT_LOAD
+    Flags:           [ PF_R ]
+    FirstSec:        .rodata
+    LastSec:         .eh_frame
+    VAddr:           0x2000
+    Align:           0x1000
+    Offset:          0x2000
+  - Type:            PT_LOAD
+    Flags:           [ PF_W, PF_R ]
+    FirstSec:        .init_array
+    LastSec:         .bss
+    VAddr:           0x3E10
+    Align:           0x1000
+    Offset:          0x2E10
+  - Type:            PT_DYNAMIC
+    Flags:           [ PF_W, PF_R ]
+    FirstSec:        .dynamic
+    LastSec:         .dynamic
+    VAddr:           0x3E20
+    Align:           0x8
+    Offset:          0x2E20
+  - Type:            PT_NOTE
+    Flags:           [ PF_R ]
+    FirstSec:        .note.gnu.property
+    LastSec:         .note.gnu.property
+    VAddr:           0x2A8
+    Align:           0x8
+    Offset:          0x2A8
+  - Type:            PT_NOTE
+    Flags:           [ PF_R ]
+    FirstSec:        .note.gnu.build-id
+    LastSec:         .note.gnu.build-id
+    VAddr:           0x2C8
+    Align:           0x4
+    Offset:          0x2C8
+  - Type:            PT_GNU_PROPERTY
+    Flags:           [ PF_R ]
+    FirstSec:        .note.gnu.property
+    LastSec:         .note.gnu.property
+    VAddr:           0x2A8
+    Align:           0x8
+    Offset:          0x2A8
+  - Type:            PT_GNU_EH_FRAME
+    Flags:           [ PF_R ]
+    FirstSec:        .eh_frame_hdr
+    LastSec:         .eh_frame_hdr
+    VAddr:           0x2010
+    Align:           0x4
+    Offset:          0x2010
+  - Type:            PT_GNU_STACK
+    Flags:           [ PF_W, PF_R ]
+    Align:           0x10
+    Offset:          0x0
+  - Type:            PT_GNU_RELRO
+    Flags:           [ PF_R ]
+    FirstSec:        .init_array
+    LastSec:         .got
+    VAddr:           0x3E10
+    Offset:          0x2E10
+Sections:
+  - Name:            .note.gnu.property
+    Type:            SHT_NOTE
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x2A8
+    AddressAlign:    0x8
+    Notes:
+      - Name:            GNU
+        Desc:            020000C0040000000300000000000000
+        Type:            NT_GNU_PROPERTY_TYPE_0
+  - Name:            .note.gnu.build-id
+    Type:            SHT_NOTE
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x2C8
+    AddressAlign:    0x4
+    Notes:
+      - Name:            GNU
+        Desc:            6337F7C1BF21A1DE17630C55602EB4CAC50435BB
+        Type:            NT_PRPSINFO
+  - Name:            .gnu.hash
+    Type:            SHT_GNU_HASH
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x2F0
+    Link:            .dynsym
+    AddressAlign:    0x8
+    Header:
+      SymNdx:          0x6
+      Shift2:          0x6
+    BloomFilter:     [ 0x400000100000 ]
+    HashBuckets:     [ 0x6, 0x0 ]
+    HashValues:      [ 0x7C9DCB95 ]
+  - Name:            .dynsym
+    Type:            SHT_DYNSYM
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x318
+    Link:            .dynstr
+    AddressAlign:    0x8
+  - Name:            .dynstr
+    Type:            SHT_STRTAB
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x3C0
+    AddressAlign:    0x1
+  - Name:            .gnu.version
+    Type:            SHT_GNU_versym
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x436
+    Link:            .dynsym
+    AddressAlign:    0x2
+    Entries:         [ 0, 1, 2, 1, 1, 2, 1 ]
+  - Name:            .gnu.version_r
+    Type:            SHT_GNU_verneed
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x448
+    Link:            .dynstr
+    AddressAlign:    0x8
+    Dependencies:
+      - Version:         1
+        File:            libc.so.6
+        Entries:
+          - Name:            GLIBC_2.2.5
+            Hash:            157882997
+            Flags:           0
+            Other:           2
+  - Name:            .rela.dyn
+    Type:            SHT_RELA
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x468
+    Link:            .dynsym
+    AddressAlign:    0x8
+    Relocations:
+      - Offset:          0x3E10
+        Type:            R_X86_64_RELATIVE
+        Addend:          4368
+      - Offset:          0x3E18
+        Type:            R_X86_64_RELATIVE
+        Addend:          4304
+      - Offset:          0x4020
+        Type:            R_X86_64_RELATIVE
+        Addend:          16416
+      - Offset:          0x3FE0
+        Symbol:          _ITM_deregisterTMCloneTable
+        Type:            R_X86_64_GLOB_DAT
+      - Offset:          0x3FE8
+        Symbol:          __gmon_start__
+        Type:            R_X86_64_GLOB_DAT
+      - Offset:          0x3FF0
+        Symbol:          _ITM_registerTMCloneTable
+        Type:            R_X86_64_GLOB_DAT
+      - Offset:          0x3FF8
+        Symbol:          __cxa_finalize
+        Type:            R_X86_64_GLOB_DAT
+  - Name:            .rela.plt
+    Type:            SHT_RELA
+    Flags:           [ SHF_ALLOC, SHF_INFO_LINK ]
+    Address:         0x510
+    Link:            .dynsym
+    AddressAlign:    0x8
+    Info:            .got.plt
+    Relocations:
+      - Offset:          0x4018
+        Symbol:          puts
+        Type:            R_X86_64_JUMP_SLOT
+  - Name:            .init
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x1000
+    AddressAlign:    0x4
+    Offset:          0x1000
+    Content:         F30F1EFA4883EC08488B05D92F00004885C07402FFD04883C408C3
+  - Name:            .plt
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x1020
+    AddressAlign:    0x10
+    EntSize:         0x10
+    Content:         FF35E22F0000F2FF25E32F00000F1F00F30F1EFA6800000000F2E9E1FFFFFF90
+  - Name:            .plt.got
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x1040
+    AddressAlign:    0x10
+    EntSize:         0x10
+    Content:         F30F1EFAF2FF25AD2F00000F1F440000
+  - Name:            .plt.sec
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x1050
+    AddressAlign:    0x10
+    EntSize:         0x10
+    Content:         F30F1EFAF2FF25BD2F00000F1F440000
+  - Name:            .text
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x1060
+    AddressAlign:    0x10
+    Content:         488D3DC12F0000488D05BA2F00004839F87415488B05662F00004885C07409FFE00F1F8000000000C30F1F8000000000488D3D912F0000488D358A2F00004829FE4889F048C1EE3F48C1F8034801C648D1FE7414488B05352F00004885C07408FFE0660F1F440000C30F1F8000000000F30F1EFA803D4D2F000000752B5548833D122F0000004889E5740C488B3D2E2F0000E849FFFFFFE864FFFFFFC605252F0000015DC30F1F00C30F1F8000000000F30F1EFAE977FFFFFFF30F1EFA554889E5488D05D80E00004889C7E820FFFFFF905DC3
+  - Name:            .fini
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x1134
+    AddressAlign:    0x4
+    Content:         F30F1EFA4883EC084883C408C3
+  - Name:            .rodata
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x2000
+    AddressAlign:    0x1
+    Offset:          0x2000
+    Content:         48656C6C6F2066726F6D204200
+  - Name:            .eh_frame_hdr
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x2010
+    AddressAlign:    0x4
+    Content:         011B033B2C0000000400000010F0FFFF4800000030F0FFFF7000000040F0FFFF8800000009F1FFFFA0000000
+  - Name:            .eh_frame
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x2040
+    AddressAlign:    0x8
+    Content:         1400000000000000017A5200017810011B0C070890010000240000001C000000C0EFFFFF20000000000E10460E184A0F0B770880003F1A3A2A332422000000001400000044000000B8EFFFFF100000000000000000000000140000005C000000B0EFFFFF1000000000000000000000001C0000007400000061F0FFFF1A00000000450E108602430D06510C070800000000000000
+  - Name:            .init_array
+    Type:            SHT_INIT_ARRAY
+    Flags:           [ SHF_WRITE, SHF_ALLOC ]
+    Address:         0x3E10
+    AddressAlign:    0x8
+    EntSize:         0x8
+    Offset:          0x2E10
+    Content:         '1011000000000000'
+  - Name:            .fini_array
+    Type:            SHT_FINI_ARRAY
+    Flags:           [ SHF_WRITE, SHF_ALLOC ]
+    Address:         0x3E18
+    AddressAlign:    0x8
+    EntSize:         0x8
+    Content:         D010000000000000
+  - Name:            .dynamic
+    Type:            SHT_DYNAMIC
+    Flags:           [ SHF_WRITE, SHF_ALLOC ]
+    Address:         0x3E20
+    Link:            .dynstr
+    AddressAlign:    0x8
+    Entries:
+      - Tag:             DT_NEEDED
+        Value:           0x5F
+      - Tag:             DT_INIT
+        Value:           0x1000
+      - Tag:             DT_FINI
+        Value:           0x1134
+      - Tag:             DT_INIT_ARRAY
+        Value:           0x3E10
+      - Tag:             DT_INIT_ARRAYSZ
+        Value:           0x8
+      - Tag:             DT_FINI_ARRAY
+        Value:           0x3E18
+      - Tag:             DT_FINI_ARRAYSZ
+        Value:           0x8
+      - Tag:             DT_GNU_HASH
+        Value:           0x2F0
+      - Tag:             DT_STRTAB
+        Value:           0x3C0
+      - Tag:             DT_SYMTAB
+        Value:           0x318
+      - Tag:             DT_STRSZ
+        Value:           0x75
+      - Tag:             DT_SYMENT
+        Value:           0x18
+      - Tag:             DT_PLTGOT
+        Value:           0x4000
+      - Tag:             DT_PLTRELSZ
+        Value:           0x18
+      - Tag:             DT_PLTREL
+        Value:           0x7
+      - Tag:             DT_JMPREL
+        Value:           0x510
+      - Tag:             DT_RELA
+        Value:           0x468
+      - Tag:             DT_RELASZ
+        Value:           0xA8
+      - Tag:             DT_RELAENT
+        Value:           0x18
+      - Tag:             DT_VERNEED
+        Value:           0x448
+      - Tag:             DT_VERNEEDNUM
+        Value:           0x1
+      - Tag:             DT_VERSYM
+        Value:           0x436
+      - Tag:             DT_RELACOUNT
+        Value:           0x3
+      - Tag:             DT_NULL
+        Value:           0x0
+      - Tag:             DT_NULL
+        Value:           0x0
+      - Tag:             DT_NULL
+        Value:           0x0
+      - Tag:             DT_NULL
+        Value:           0x0
+      - Tag:             DT_NULL
+        Value:           0x0
+  - Name:            .got
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_WRITE, SHF_ALLOC ]
+    Address:         0x3FE0
+    AddressAlign:    0x8
+    EntSize:         0x8
+    Content:         '0000000000000000000000000000000000000000000000000000000000000000'
+  - Name:            .got.plt
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_WRITE, SHF_ALLOC ]
+    Address:         0x4000
+    AddressAlign:    0x8
+    EntSize:         0x8
+    Content:         '203E000000000000000000000000000000000000000000003010000000000000'
+  - Name:            .data
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_WRITE, SHF_ALLOC ]
+    Address:         0x4020
+    AddressAlign:    0x8
+    Content:         '2040000000000000'
+  - Name:            .bss
+    Type:            SHT_NOBITS
+    Flags:           [ SHF_WRITE, SHF_ALLOC ]
+    Address:         0x4028
+    AddressAlign:    0x1
+    Size:            0x8
+  - Name:            .comment
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_MERGE, SHF_STRINGS ]
+    AddressAlign:    0x1
+    EntSize:         0x1
+    Content:         4743433A20285562756E74752031312E342E302D317562756E7475317E32322E30342E32292031312E342E3000
+Symbols:
+  - Name:            crtstuff.c
+    Type:            STT_FILE
+    Index:           SHN_ABS
+  - Name:            deregister_tm_clones
+    Type:            STT_FUNC
+    Section:         .text
+    Value:           0x1060
+  - Name:            register_tm_clones
+    Type:            STT_FUNC
+    Section:         .text
+    Value:           0x1090
+  - Name:            __do_global_dtors_aux
+    Type:            STT_FUNC
+    Section:         .text
+    Value:           0x10D0
+  - Name:            completed.0
+    Type:            STT_OBJECT
+    Section:         .bss
+    Value:           0x4028
+    Size:            0x1
+  - Name:            __do_global_dtors_aux_fini_array_entry
+    Type:            STT_OBJECT
+    Section:         .fini_array
+    Value:           0x3E18
+  - Name:            frame_dummy
+    Type:            STT_FUNC
+    Section:         .text
+    Value:           0x1110
+  - Name:            __frame_dummy_init_array_entry
+    Type:            STT_OBJECT
+    Section:         .init_array
+    Value:           0x3E10
+  - Name:            libB.c
+    Type:            STT_FILE
+    Index:           SHN_ABS
+  - Name:            'crtstuff.c (1)'
+    Type:            STT_FILE
+    Index:           SHN_ABS
+  - Name:            __FRAME_END__
+    Type:            STT_OBJECT
+    Section:         .eh_frame
+    Value:           0x20D0
+  - Type:            STT_FILE
+    Index:           SHN_ABS
+  - Name:            _fini
+    Type:            STT_FUNC
+    Section:         .fini
+    Value:           0x1134
+  - Name:            __dso_handle
+    Type:            STT_OBJECT
+    Section:         .data
+    Value:           0x4020
+  - Name:            _DYNAMIC
+    Type:            STT_OBJECT
+    Section:         .dynamic
+    Value:           0x3E20
+  - Name:            __GNU_EH_FRAME_HDR
+    Section:         .eh_frame_hdr
+    Value:           0x2010
+  - Name:            __TMC_END__
+    Type:            STT_OBJECT
+    Section:         .data
+    Value:           0x4028
+  - Name:            _GLOBAL_OFFSET_TABLE_
+    Type:            STT_OBJECT
+    Section:         .got.plt
+    Value:           0x4000
+  - Name:            _init
+    Type:            STT_FUNC
+    Section:         .init
+    Value:           0x1000
+  - Name:            _ITM_deregisterTMCloneTable
+    Binding:         STB_WEAK
+  - Name:            'puts@GLIBC_2.2.5'
+    Type:            STT_FUNC
+    Binding:         STB_GLOBAL
+  - Name:            __gmon_start__
+    Binding:         STB_WEAK
+  - Name:            sayB
+    Type:            STT_FUNC
+    Section:         .text
+    Binding:         STB_GLOBAL
+    Value:           0x1119
+    Size:            0x1A
+  - Name:            _ITM_registerTMCloneTable
+    Binding:         STB_WEAK
+  - Name:            '__cxa_finalize@GLIBC_2.2.5'
+    Type:            STT_FUNC
+    Binding:         STB_WEAK
+DynamicSymbols:
+  - Name:            _ITM_deregisterTMCloneTable
+    Binding:         STB_WEAK
+  - Name:            puts
+    Type:            STT_FUNC
+    Binding:         STB_GLOBAL
+  - Name:            __gmon_start__
+    Binding:         STB_WEAK
+  - Name:            _ITM_registerTMCloneTable
+    Binding:         STB_WEAK
+  - Name:            __cxa_finalize
+    Type:            STT_FUNC
+    Binding:         STB_WEAK
+  - Name:            sayB
+    Type:            STT_FUNC
+    Section:         .text
+    Binding:         STB_GLOBAL
+    Value:           0x1119
+    Size:            0x1A
+...
diff --git a/llvm/unittests/ExecutionEngine/Orc/Inputs/B/B_macho.yaml b/llvm/unittests/ExecutionEngine/Orc/Inputs/B/B_macho.yaml
new file mode 100644
index 0000000000000..3d57c4f9271c6
--- /dev/null
+++ b/llvm/unittests/ExecutionEngine/Orc/Inputs/B/B_macho.yaml
@@ -0,0 +1,723 @@
+--- !fat-mach-o
+FatHeader:
+  magic:           0xCAFEBABE
+  nfat_arch:       3
+FatArchs:
+  - cputype:         0x1000007
+    cpusubtype:      0x3
+    offset:          0x1000
+    size:            8376
+    align:           12
+  - cputype:         0x100000C
+    cpusubtype:      0x0
+    offset:          0x4000
+    size:            33376
+    align:           14
+  - cputype:         0x100000C
+    cpusubtype:      0x80000002
+    offset:          0x10000
+    size:            33376
+    align:           14
+Slices:
+  - !mach-o
+    FileHeader:
+      magic:           0xFEEDFACF
+      cputype:         0x1000007
+      cpusubtype:      0x3
+      filetype:        0x6
+      ncmds:           14
+      sizeofcmds:      960
+      flags:           0x100085
+      reserved:        0x0
+    LoadCommands:
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         392
+        segname:         __TEXT
+        vmaddr:          0
+        vmsize:          4096
+        fileoff:         0
+        filesize:        4096
+        maxprot:         5
+        initprot:        5
+        nsects:          4
+        flags:           0
+        Sections:
+          - sectname:        __text
+            segname:         __TEXT
+            addr:            0xF80
+            size:            20
+            offset:          0xF80
+            align:           4
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x80000400
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         554889E5488D3D0F000000B000E8020000005DC3
+          - sectname:        __stubs
+            segname:         __TEXT
+            addr:            0xF94
+            size:            6
+            offset:          0xF94
+            align:           1
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x80000408
+            reserved1:       0x0
+            reserved2:       0x6
+            reserved3:       0x0
+            content:         FF2566000000
+          - sectname:        __cstring
+            segname:         __TEXT
+            addr:            0xF9A
+            size:            14
+            offset:          0xF9A
+            align:           0
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x2
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         48656C6C6F2066726F6D20420A00
+          - sectname:        __unwind_info
+            segname:         __TEXT
+            addr:            0xFA8
+            size:            88
+            offset:          0xFA8
+            align:           2
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x0
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         010000001C000000000000001C000000000000001C00000002000000800F00004000000040000000940F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000100000000
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         152
+        segname:         __DATA_CONST
+        vmaddr:          4096
+        vmsize:          4096
+        fileoff:         4096
+        filesize:        4096
+        maxprot:         3
+        initprot:        3
+        nsects:          1
+        flags:           16
+        Sections:
+          - sectname:        __got
+            segname:         __DATA_CONST
+            addr:            0x1000
+            size:            8
+            offset:          0x1000
+            align:           3
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x6
+            reserved1:       0x1
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         '0000000000000080'
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         72
+        segname:         __LINKEDIT
+        vmaddr:          8192
+        vmsize:          4096
+        fileoff:         8192
+        filesize:        184
+        maxprot:         1
+        initprot:        1
+        nsects:          0
+        flags:           0
+      - cmd:             LC_ID_DYLIB
+        cmdsize:         48
+        dylib:
+          name:            24
+          timestamp:       1
+          current_version: 0
+          compatibility_version: 0
+        Content:         '@rpath/libB.dylib'
+        ZeroPadBytes:    7
+      - cmd:             LC_DYLD_CHAINED_FIXUPS
+        cmdsize:         16
+        dataoff:         8192
+        datasize:        96
+      - cmd:             LC_DYLD_EXPORTS_TRIE
+        cmdsize:         16
+        dataoff:         8288
+        datasize:        24
+      - cmd:             LC_SYMTAB
+        cmdsize:         24
+        symoff:          8320
+        nsyms:           2
+        stroff:          8360
+        strsize:         16
+      - cmd:             LC_DYSYMTAB
+        cmdsize:         80
+        ilocalsym:       0
+        nlocalsym:       0
+        iextdefsym:      0
+        nextdefsym:      1
+        iundefsym:       1
+        nundefsym:       1
+        tocoff:          0
+        ntoc:            0
+        modtaboff:       0
+        nmodtab:         0
+        extrefsymoff:    0
+        nextrefsyms:     0
+        indirectsymoff:  8352
+        nindirectsyms:   2
+        extreloff:       0
+        nextrel:         0
+        locreloff:       0
+        nlocrel:         0
+      - cmd:             LC_UUID
+        cmdsize:         24
+        uuid:            88B60B3C-13D3-3D7E-AEED-5F3E991FDF08
+      - cmd:             LC_BUILD_VERSION
+        cmdsize:         32
+        platform:        1
+        minos:           983040
+        sdk:             983552
+        ntools:          1
+        Tools:
+          - tool:            3
+            version:         73074435
+      - cmd:             LC_SOURCE_VERSION
+        cmdsize:         16
+        version:         0
+      - cmd:             LC_LOAD_DYLIB
+        cmdsize:         56
+        dylib:
+          name:            24
+          timestamp:       2
+          current_version: 88539136
+          compatibility_version: 65536
+        Content:         '/usr/lib/libSystem.B.dylib'
+        ZeroPadBytes:    6
+      - cmd:             LC_FUNCTION_STARTS
+        cmdsize:         16
+        dataoff:         8312
+        datasize:        8
+      - cmd:             LC_DATA_IN_CODE
+        cmdsize:         16
+        dataoff:         8320
+        datasize:        0
+    LinkEditData:
+      ExportTrie:
+        TerminalSize:    0
+        NodeOffset:      0
+        Name:            ''
+        Flags:           0x0
+        Address:         0x0
+        Other:           0x0
+        ImportName:      ''
+        Children:
+          - TerminalSize:    3
+            NodeOffset:      13
+            Name:            _sayB
+            Flags:           0x0
+            Address:         0xF80
+            Other:           0x0
+            ImportName:      ''
+      NameList:
+        - n_strx:          2
+          n_type:          0xF
+          n_sect:          1
+          n_desc:          0
+          n_value:         3968
+        - n_strx:          8
+          n_type:          0x1
+          n_sect:          0
+          n_desc:          256
+          n_value:         0
+      StringTable:
+        - ' '
+        - _sayB
+        - _printf
+      IndirectSymbols: [ 0x1, 0x1 ]
+      FunctionStarts:  [ 0xF80 ]
+      ChainedFixups:   [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48, 
+                         0x0, 0x0, 0x0, 0x50, 0x0, 0x0, 0x0, 0x1, 0x0, 
+                         0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x10, 0x6, 0x0, 
+                         0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0, 
+                         0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x70, 0x72, 
+                         0x69, 0x6E, 0x74, 0x66, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x0 ]
+  - !mach-o
+    FileHeader:
+      magic:           0xFEEDFACF
+      cputype:         0x100000C
+      cpusubtype:      0x0
+      filetype:        0x6
+      ncmds:           15
+      sizeofcmds:      976
+      flags:           0x100085
+      reserved:        0x0
+    LoadCommands:
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         392
+        segname:         __TEXT
+        vmaddr:          0
+        vmsize:          16384
+        fileoff:         0
+        filesize:        16384
+        maxprot:         5
+        initprot:        5
+        nsects:          4
+        flags:           0
+        Sections:
+          - sectname:        __text
+            segname:         __TEXT
+            addr:            0x3F70
+            size:            28
+            offset:          0x3F70
+            align:           2
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x80000400
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         FD7BBFA9FD0300910000009000603E9103000094FD7BC1A8C0035FD6
+          - sectname:        __stubs
+            segname:         __TEXT
+            addr:            0x3F8C
+            size:            12
+            offset:          0x3F8C
+            align:           2
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x80000408
+            reserved1:       0x0
+            reserved2:       0xC
+            reserved3:       0x0
+            content:         100000B0100240F900021FD6
+          - sectname:        __cstring
+            segname:         __TEXT
+            addr:            0x3F98
+            size:            14
+            offset:          0x3F98
+            align:           0
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x2
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         48656C6C6F2066726F6D20420A00
+          - sectname:        __unwind_info
+            segname:         __TEXT
+            addr:            0x3FA8
+            size:            88
+            offset:          0x3FA8
+            align:           2
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x0
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         010000001C000000000000001C000000000000001C00000002000000703F000040000000400000008C3F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000400000000
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         152
+        segname:         __DATA_CONST
+        vmaddr:          16384
+        vmsize:          16384
+        fileoff:         16384
+        filesize:        16384
+        maxprot:         3
+        initprot:        3
+        nsects:          1
+        flags:           16
+        Sections:
+          - sectname:        __got
+            segname:         __DATA_CONST
+            addr:            0x4000
+            size:            8
+            offset:          0x4000
+            align:           3
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x6
+            reserved1:       0x1
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         '0000000000000080'
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         72
+        segname:         __LINKEDIT
+        vmaddr:          32768
+        vmsize:          16384
+        fileoff:         32768
+        filesize:        608
+        maxprot:         1
+        initprot:        1
+        nsects:          0
+        flags:           0
+      - cmd:             LC_ID_DYLIB
+        cmdsize:         48
+        dylib:
+          name:            24
+          timestamp:       1
+          current_version: 0
+          compatibility_version: 0
+        Content:         '@rpath/libB.dylib'
+        ZeroPadBytes:    7
+      - cmd:             LC_DYLD_CHAINED_FIXUPS
+        cmdsize:         16
+        dataoff:         32768
+        datasize:        96
+      - cmd:             LC_DYLD_EXPORTS_TRIE
+        cmdsize:         16
+        dataoff:         32864
+        datasize:        24
+      - cmd:             LC_SYMTAB
+        cmdsize:         24
+        symoff:          32896
+        nsyms:           2
+        stroff:          32936
+        strsize:         16
+      - cmd:             LC_DYSYMTAB
+        cmdsize:         80
+        ilocalsym:       0
+        nlocalsym:       0
+        iextdefsym:      0
+        nextdefsym:      1
+        iundefsym:       1
+        nundefsym:       1
+        tocoff:          0
+        ntoc:            0
+        modtaboff:       0
+        nmodtab:         0
+        extrefsymoff:    0
+        nextrefsyms:     0
+        indirectsymoff:  32928
+        nindirectsyms:   2
+        extreloff:       0
+        nextrel:         0
+        locreloff:       0
+        nlocrel:         0
+      - cmd:             LC_UUID
+        cmdsize:         24
+        uuid:            90C3787A-22E1-35AE-9284-97A4842F88AF
+      - cmd:             LC_BUILD_VERSION
+        cmdsize:         32
+        platform:        1
+        minos:           983040
+        sdk:             983552
+        ntools:          1
+        Tools:
+          - tool:            3
+            version:         73074435
+      - cmd:             LC_SOURCE_VERSION
+        cmdsize:         16
+        version:         0
+      - cmd:             LC_LOAD_DYLIB
+        cmdsize:         56
+        dylib:
+          name:            24
+          timestamp:       2
+          current_version: 88539136
+          compatibility_version: 65536
+        Content:         '/usr/lib/libSystem.B.dylib'
+        ZeroPadBytes:    6
+      - cmd:             LC_FUNCTION_STARTS
+        cmdsize:         16
+        dataoff:         32888
+        datasize:        8
+      - cmd:             LC_DATA_IN_CODE
+        cmdsize:         16
+        dataoff:         32896
+        datasize:        0
+      - cmd:             LC_CODE_SIGNATURE
+        cmdsize:         16
+        dataoff:         32960
+        datasize:        416
+    LinkEditData:
+      ExportTrie:
+        TerminalSize:    0
+        NodeOffset:      0
+        Name:            ''
+        Flags:           0x0
+        Address:         0x0
+        Other:           0x0
+        ImportName:      ''
+        Children:
+          - TerminalSize:    3
+            NodeOffset:      13
+            Name:            _sayB
+            Flags:           0x0
+            Address:         0x3F70
+            Other:           0x0
+            ImportName:      ''
+      NameList:
+        - n_strx:          2
+          n_type:          0xF
+          n_sect:          1
+          n_desc:          0
+          n_value:         16240
+        - n_strx:          8
+          n_type:          0x1
+          n_sect:          0
+          n_desc:          256
+          n_value:         0
+      StringTable:
+        - ' '
+        - _sayB
+        - _printf
+      IndirectSymbols: [ 0x1, 0x1 ]
+      FunctionStarts:  [ 0x3F70 ]
+      ChainedFixups:   [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48, 
+                         0x0, 0x0, 0x0, 0x50, 0x0, 0x0, 0x0, 0x1, 0x0, 
+                         0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x40, 0x6, 0x0, 
+                         0x0, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0, 
+                         0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x70, 0x72, 
+                         0x69, 0x6E, 0x74, 0x66, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x0 ]
+  - !mach-o
+    FileHeader:
+      magic:           0xFEEDFACF
+      cputype:         0x100000C
+      cpusubtype:      0x80000002
+      filetype:        0x6
+      ncmds:           15
+      sizeofcmds:      976
+      flags:           0x100085
+      reserved:        0x0
+    LoadCommands:
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         392
+        segname:         __TEXT
+        vmaddr:          0
+        vmsize:          16384
+        fileoff:         0
+        filesize:        16384
+        maxprot:         5
+        initprot:        5
+        nsects:          4
+        flags:           0
+        Sections:
+          - sectname:        __text
+            segname:         __TEXT
+            addr:            0x3F68
+            size:            32
+            offset:          0x3F68
+            align:           2
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x80000400
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         7F2303D5FD7BBFA9FD0300910000009000603E9103000094FD7BC1A8FF0F5FD6
+          - sectname:        __auth_stubs
+            segname:         __TEXT
+            addr:            0x3F88
+            size:            16
+            offset:          0x3F88
+            align:           2
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x80000408
+            reserved1:       0x0
+            reserved2:       0x10
+            reserved3:       0x0
+            content:         110000B031020091300240F9110A1FD7
+          - sectname:        __cstring
+            segname:         __TEXT
+            addr:            0x3F98
+            size:            14
+            offset:          0x3F98
+            align:           0
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x2
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         48656C6C6F2066726F6D20420A00
+          - sectname:        __unwind_info
+            segname:         __TEXT
+            addr:            0x3FA8
+            size:            88
+            offset:          0x3FA8
+            align:           2
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x0
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         010000001C000000000000001C000000000000001C00000002000000683F00004000000040000000883F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000400000000
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         152
+        segname:         __DATA_CONST
+        vmaddr:          16384
+        vmsize:          16384
+        fileoff:         16384
+        filesize:        16384
+        maxprot:         3
+        initprot:        3
+        nsects:          1
+        flags:           16
+        Sections:
+          - sectname:        __auth_got
+            segname:         __DATA_CONST
+            addr:            0x4000
+            size:            8
+            offset:          0x4000
+            align:           3
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x6
+            reserved1:       0x1
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         00000000000001C0
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         72
+        segname:         __LINKEDIT
+        vmaddr:          32768
+        vmsize:          16384
+        fileoff:         32768
+        filesize:        608
+        maxprot:         1
+        initprot:        1
+        nsects:          0
+        flags:           0
+      - cmd:             LC_ID_DYLIB
+        cmdsize:         48
+        dylib:
+          name:            24
+          timestamp:       1
+          current_version: 0
+          compatibility_version: 0
+        Content:         '@rpath/libB.dylib'
+        ZeroPadBytes:    7
+      - cmd:             LC_DYLD_CHAINED_FIXUPS
+        cmdsize:         16
+        dataoff:         32768
+        datasize:        96
+      - cmd:             LC_DYLD_EXPORTS_TRIE
+        cmdsize:         16
+        dataoff:         32864
+        datasize:        24
+      - cmd:             LC_SYMTAB
+        cmdsize:         24
+        symoff:          32896
+        nsyms:           2
+        stroff:          32936
+        strsize:         16
+      - cmd:             LC_DYSYMTAB
+        cmdsize:         80
+        ilocalsym:       0
+        nlocalsym:       0
+        iextdefsym:      0
+        nextdefsym:      1
+        iundefsym:       1
+        nundefsym:       1
+        tocoff:          0
+        ntoc:            0
+        modtaboff:       0
+        nmodtab:         0
+        extrefsymoff:    0
+        nextrefsyms:     0
+        indirectsymoff:  32928
+        nindirectsyms:   2
+        extreloff:       0
+        nextrel:         0
+        locreloff:       0
+        nlocrel:         0
+      - cmd:             LC_UUID
+        cmdsize:         24
+        uuid:            76B41B3A-00EC-388B-A432-478A96772CC4
+      - cmd:             LC_BUILD_VERSION
+        cmdsize:         32
+        platform:        1
+        minos:           983040
+        sdk:             983552
+        ntools:          1
+        Tools:
+          - tool:            3
+            version:         73074435
+      - cmd:             LC_SOURCE_VERSION
+        cmdsize:         16
+        version:         0
+      - cmd:             LC_LOAD_DYLIB
+        cmdsize:         56
+        dylib:
+          name:            24
+          timestamp:       2
+          current_version: 88539136
+          compatibility_version: 65536
+        Content:         '/usr/lib/libSystem.B.dylib'
+        ZeroPadBytes:    6
+      - cmd:             LC_FUNCTION_STARTS
+        cmdsize:         16
+        dataoff:         32888
+        datasize:        8
+      - cmd:             LC_DATA_IN_CODE
+        cmdsize:         16
+        dataoff:         32896
+        datasize:        0
+      - cmd:             LC_CODE_SIGNATURE
+        cmdsize:         16
+        dataoff:         32960
+        datasize:        416
+    LinkEditData:
+      ExportTrie:
+        TerminalSize:    0
+        NodeOffset:      0
+        Name:            ''
+        Flags:           0x0
+        Address:         0x0
+        Other:           0x0
+        ImportName:      ''
+        Children:
+          - TerminalSize:    3
+            NodeOffset:      13
+            Name:            _sayB
+            Flags:           0x0
+            Address:         0x3F68
+            Other:           0x0
+            ImportName:      ''
+      NameList:
+        - n_strx:          2
+          n_type:          0xF
+          n_sect:          1
+          n_desc:          0
+          n_value:         16232
+        - n_strx:          8
+          n_type:          0x1
+          n_sect:          0
+          n_desc:          256
+          n_value:         0
+      StringTable:
+        - ' '
+        - _sayB
+        - _printf
+      IndirectSymbols: [ 0x1, 0x1 ]
+      FunctionStarts:  [ 0x3F68 ]
+      ChainedFixups:   [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48, 
+                         0x0, 0x0, 0x0, 0x50, 0x0, 0x0, 0x0, 0x1, 0x0, 
+                         0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x40, 0xC, 0x0, 
+                         0x0, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0, 
+                         0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x70, 0x72, 
+                         0x69, 0x6E, 0x74, 0x66, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x0 ]
+...
diff --git a/llvm/unittests/ExecutionEngine/Orc/Inputs/C/C_linux.yaml b/llvm/unittests/ExecutionEngine/Orc/Inputs/C/C_linux.yaml
new file mode 100644
index 0000000000000..3fabf9a62e336
--- /dev/null
+++ b/llvm/unittests/ExecutionEngine/Orc/Inputs/C/C_linux.yaml
@@ -0,0 +1,450 @@
+--- !ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_DYN
+  Machine:         EM_X86_64
+ProgramHeaders:
+  - Type:            PT_LOAD
+    Flags:           [ PF_R ]
+    FirstSec:        .note.gnu.property
+    LastSec:         .rela.plt
+    Align:           0x1000
+    Offset:          0x0
+  - Type:            PT_LOAD
+    Flags:           [ PF_X, PF_R ]
+    FirstSec:        .init
+    LastSec:         .fini
+    VAddr:           0x1000
+    Align:           0x1000
+    Offset:          0x1000
+  - Type:            PT_LOAD
+    Flags:           [ PF_R ]
+    FirstSec:        .eh_frame_hdr
+    LastSec:         .eh_frame
+    VAddr:           0x2000
+    Align:           0x1000
+    Offset:          0x2000
+  - Type:            PT_LOAD
+    Flags:           [ PF_W, PF_R ]
+    FirstSec:        .init_array
+    LastSec:         .bss
+    VAddr:           0x3E10
+    Align:           0x1000
+    Offset:          0x2E10
+  - Type:            PT_DYNAMIC
+    Flags:           [ PF_W, PF_R ]
+    FirstSec:        .dynamic
+    LastSec:         .dynamic
+    VAddr:           0x3E20
+    Align:           0x8
+    Offset:          0x2E20
+  - Type:            PT_NOTE
+    Flags:           [ PF_R ]
+    FirstSec:        .note.gnu.property
+    LastSec:         .note.gnu.property
+    VAddr:           0x2A8
+    Align:           0x8
+    Offset:          0x2A8
+  - Type:            PT_NOTE
+    Flags:           [ PF_R ]
+    FirstSec:        .note.gnu.build-id
+    LastSec:         .note.gnu.build-id
+    VAddr:           0x2C8
+    Align:           0x4
+    Offset:          0x2C8
+  - Type:            PT_GNU_PROPERTY
+    Flags:           [ PF_R ]
+    FirstSec:        .note.gnu.property
+    LastSec:         .note.gnu.property
+    VAddr:           0x2A8
+    Align:           0x8
+    Offset:          0x2A8
+  - Type:            PT_GNU_EH_FRAME
+    Flags:           [ PF_R ]
+    FirstSec:        .eh_frame_hdr
+    LastSec:         .eh_frame_hdr
+    VAddr:           0x2000
+    Align:           0x4
+    Offset:          0x2000
+  - Type:            PT_GNU_STACK
+    Flags:           [ PF_W, PF_R ]
+    Align:           0x10
+    Offset:          0x0
+  - Type:            PT_GNU_RELRO
+    Flags:           [ PF_R ]
+    FirstSec:        .init_array
+    LastSec:         .got
+    VAddr:           0x3E10
+    Offset:          0x2E10
+Sections:
+  - Name:            .note.gnu.property
+    Type:            SHT_NOTE
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x2A8
+    AddressAlign:    0x8
+    Notes:
+      - Name:            GNU
+        Desc:            020000C0040000000300000000000000
+        Type:            NT_GNU_PROPERTY_TYPE_0
+  - Name:            .note.gnu.build-id
+    Type:            SHT_NOTE
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x2C8
+    AddressAlign:    0x4
+    Notes:
+      - Name:            GNU
+        Desc:            0318D63E46BF31CEFF90D5C7F0475D9F78676EC8
+        Type:            NT_PRPSINFO
+  - Name:            .gnu.hash
+    Type:            SHT_GNU_HASH
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x2F0
+    Link:            .dynsym
+    AddressAlign:    0x8
+    Header:
+      SymNdx:          0x8
+      Shift2:          0x6
+    BloomFilter:     [ 0x400000200000 ]
+    HashBuckets:     [ 0x0, 0x8 ]
+    HashValues:      [ 0x7C9DCB95 ]
+  - Name:            .dynsym
+    Type:            SHT_DYNSYM
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x318
+    Link:            .dynstr
+    AddressAlign:    0x8
+  - Name:            .dynstr
+    Type:            SHT_STRTAB
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x3F0
+    AddressAlign:    0x1
+    Content:         "6C6962412E736F006C6962422E736F006C69625A2E736F00244F524947494E2F2E2E2F413A244F524947494E2F2E2E2F423A244F524947494E2F2E2E2F5A"
+  - Name:            .rela.dyn
+    Type:            SHT_RELA
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x498
+    Link:            .dynsym
+    AddressAlign:    0x8
+    Relocations:
+      - Offset:          0x3E10
+        Type:            R_X86_64_RELATIVE
+        Addend:          4432
+      - Offset:          0x3E18
+        Type:            R_X86_64_RELATIVE
+        Addend:          4368
+      - Offset:          0x4030
+        Type:            R_X86_64_RELATIVE
+        Addend:          16432
+      - Offset:          0x3FE0
+        Symbol:          __cxa_finalize
+        Type:            R_X86_64_GLOB_DAT
+      - Offset:          0x3FE8
+        Symbol:          _ITM_registerTMCloneTable
+        Type:            R_X86_64_GLOB_DAT
+      - Offset:          0x3FF0
+        Symbol:          _ITM_deregisterTMCloneTable
+        Type:            R_X86_64_GLOB_DAT
+      - Offset:          0x3FF8
+        Symbol:          __gmon_start__
+        Type:            R_X86_64_GLOB_DAT
+  - Name:            .rela.plt
+    Type:            SHT_RELA
+    Flags:           [ SHF_ALLOC, SHF_INFO_LINK ]
+    Address:         0x540
+    Link:            .dynsym
+    AddressAlign:    0x8
+    Info:            .got.plt
+    Relocations:
+      - Offset:          0x4018
+        Symbol:          sayA
+        Type:            R_X86_64_JUMP_SLOT
+      - Offset:          0x4020
+        Symbol:          sayB
+        Type:            R_X86_64_JUMP_SLOT
+      - Offset:          0x4028
+        Symbol:          sayZ
+        Type:            R_X86_64_JUMP_SLOT
+  - Name:            .init
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x1000
+    AddressAlign:    0x4
+    Offset:          0x1000
+    Content:         F30F1EFA4883EC08488B05E92F00004885C07402FFD04883C408C3
+  - Name:            .plt
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x1020
+    AddressAlign:    0x10
+    EntSize:         0x10
+    Content:         FF35E22F0000F2FF25E32F00000F1F00F30F1EFA6800000000F2E9E1FFFFFF90F30F1EFA6801000000F2E9D1FFFFFF90F30F1EFA6802000000F2E9C1FFFFFF90
+  - Name:            .plt.got
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x1060
+    AddressAlign:    0x10
+    EntSize:         0x10
+    Content:         F30F1EFAF2FF25752F00000F1F440000
+  - Name:            .plt.sec
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x1070
+    AddressAlign:    0x10
+    EntSize:         0x10
+    Content:         F30F1EFAF2FF259D2F00000F1F440000F30F1EFAF2FF25952F00000F1F440000F30F1EFAF2FF258D2F00000F1F440000
+  - Name:            .text
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x10A0
+    AddressAlign:    0x10
+    Content:         488D3D912F0000488D058A2F00004839F87415488B05362F00004885C07409FFE00F1F8000000000C30F1F8000000000488D3D612F0000488D355A2F00004829FE4889F048C1EE3F48C1F8034801C648D1FE7414488B05ED2E00004885C07408FFE0660F1F440000C30F1F8000000000F30F1EFA803D1D2F000000752B5548833DBA2E0000004889E5740C488B3DFE2E0000E829FFFFFFE864FFFFFFC605F52E0000015DC30F1F00C30F1F8000000000F30F1EFAE977FFFFFFF30F1EFA554889E5B800000000E805FFFFFFB800000000E80BFFFFFFB800000000E811FFFFFF905DC3
+  - Name:            .fini
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x1184
+    AddressAlign:    0x4
+    Content:         F30F1EFA4883EC084883C408C3
+  - Name:            .eh_frame_hdr
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x2000
+    AddressAlign:    0x4
+    Offset:          0x2000
+    Content:         011B033B2C0000000400000020F0FFFF4800000060F0FFFF7000000070F0FFFF8800000059F1FFFFA0000000
+  - Name:            .eh_frame
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x2030
+    AddressAlign:    0x8
+    Content:         1400000000000000017A5200017810011B0C070890010000240000001C000000D0EFFFFF40000000000E10460E184A0F0B770880003F1A3A2A332422000000001400000044000000E8EFFFFF100000000000000000000000140000005C000000E0EFFFFF3000000000000000000000001C00000074000000B1F0FFFF2900000000450E108602430D06600C070800000000000000
+  - Name:            .init_array
+    Type:            SHT_INIT_ARRAY
+    Flags:           [ SHF_WRITE, SHF_ALLOC ]
+    Address:         0x3E10
+    AddressAlign:    0x8
+    EntSize:         0x8
+    Offset:          0x2E10
+    Content:         '5011000000000000'
+  - Name:            .fini_array
+    Type:            SHT_FINI_ARRAY
+    Flags:           [ SHF_WRITE, SHF_ALLOC ]
+    Address:         0x3E18
+    AddressAlign:    0x8
+    EntSize:         0x8
+    Content:         '1011000000000000'
+  - Name:            .dynamic
+    Type:            SHT_DYNAMIC
+    Flags:           [ SHF_WRITE, SHF_ALLOC ]
+    Address:         0x3E20
+    Link:            .dynstr
+    AddressAlign:    0x8
+    Entries:
+      - Tag:             DT_NEEDED
+        Value:           0x0
+      - Tag:             DT_NEEDED
+        Value:           0x8
+      - Tag:             DT_NEEDED
+        Value:           0x10
+      - Tag:             DT_RUNPATH
+        Value:           0x18
+      - Tag:             DT_INIT
+        Value:           0x1000
+      - Tag:             DT_FINI
+        Value:           0x1184
+      - Tag:             DT_INIT_ARRAY
+        Value:           0x3E10
+      - Tag:             DT_INIT_ARRAYSZ
+        Value:           0x8
+      - Tag:             DT_FINI_ARRAY
+        Value:           0x3E18
+      - Tag:             DT_FINI_ARRAYSZ
+        Value:           0x8
+      - Tag:             DT_GNU_HASH
+        Value:           0x2F0
+      - Tag:             DT_STRTAB
+        Value:           0x3F0
+      - Tag:             DT_SYMTAB
+        Value:           0x318
+      - Tag:             DT_STRSZ
+        Value:           0xA8
+      - Tag:             DT_SYMENT
+        Value:           0x18
+      - Tag:             DT_PLTGOT
+        Value:           0x4000
+      - Tag:             DT_PLTRELSZ
+        Value:           0x48
+      - Tag:             DT_PLTREL
+        Value:           0x7
+      - Tag:             DT_JMPREL
+        Value:           0x540
+      - Tag:             DT_RELA
+        Value:           0x498
+      - Tag:             DT_RELASZ
+        Value:           0xA8
+      - Tag:             DT_RELAENT
+        Value:           0x18
+      - Tag:             DT_RELACOUNT
+        Value:           0x3
+      - Tag:             DT_NULL
+        Value:           0x0
+      - Tag:             DT_NULL
+        Value:           0x0
+      - Tag:             DT_NULL
+        Value:           0x0
+      - Tag:             DT_NULL
+        Value:           0x0
+      - Tag:             DT_NULL
+        Value:           0x0
+  - Name:            .got
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_WRITE, SHF_ALLOC ]
+    Address:         0x3FE0
+    AddressAlign:    0x8
+    EntSize:         0x8
+    Content:         '0000000000000000000000000000000000000000000000000000000000000000'
+  - Name:            .got.plt
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_WRITE, SHF_ALLOC ]
+    Address:         0x4000
+    AddressAlign:    0x8
+    EntSize:         0x8
+    Content:         '203E00000000000000000000000000000000000000000000301000000000000040100000000000005010000000000000'
+  - Name:            .data
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_WRITE, SHF_ALLOC ]
+    Address:         0x4030
+    AddressAlign:    0x8
+    Content:         '3040000000000000'
+  - Name:            .bss
+    Type:            SHT_NOBITS
+    Flags:           [ SHF_WRITE, SHF_ALLOC ]
+    Address:         0x4038
+    AddressAlign:    0x1
+    Size:            0x8
+  - Name:            .comment
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_MERGE, SHF_STRINGS ]
+    AddressAlign:    0x1
+    EntSize:         0x1
+    Content:         4743433A20285562756E74752031312E342E302D317562756E7475317E32322E30342E32292031312E342E3000
+Symbols:
+  - Name:            crtstuff.c
+    Type:            STT_FILE
+    Index:           SHN_ABS
+  - Name:            deregister_tm_clones
+    Type:            STT_FUNC
+    Section:         .text
+    Value:           0x10A0
+  - Name:            register_tm_clones
+    Type:            STT_FUNC
+    Section:         .text
+    Value:           0x10D0
+  - Name:            __do_global_dtors_aux
+    Type:            STT_FUNC
+    Section:         .text
+    Value:           0x1110
+  - Name:            completed.0
+    Type:            STT_OBJECT
+    Section:         .bss
+    Value:           0x4038
+    Size:            0x1
+  - Name:            __do_global_dtors_aux_fini_array_entry
+    Type:            STT_OBJECT
+    Section:         .fini_array
+    Value:           0x3E18
+  - Name:            frame_dummy
+    Type:            STT_FUNC
+    Section:         .text
+    Value:           0x1150
+  - Name:            __frame_dummy_init_array_entry
+    Type:            STT_OBJECT
+    Section:         .init_array
+    Value:           0x3E10
+  - Name:            libC.c
+    Type:            STT_FILE
+    Index:           SHN_ABS
+  - Name:            'crtstuff.c (1)'
+    Type:            STT_FILE
+    Index:           SHN_ABS
+  - Name:            __FRAME_END__
+    Type:            STT_OBJECT
+    Section:         .eh_frame
+    Value:           0x20C0
+  - Type:            STT_FILE
+    Index:           SHN_ABS
+  - Name:            _DYNAMIC
+    Type:            STT_OBJECT
+    Section:         .dynamic
+    Value:           0x3E20
+  - Name:            __TMC_END__
+    Type:            STT_OBJECT
+    Section:         .data
+    Value:           0x4038
+  - Name:            __dso_handle
+    Type:            STT_OBJECT
+    Section:         .data
+    Value:           0x4030
+  - Name:            _init
+    Type:            STT_FUNC
+    Section:         .init
+    Value:           0x1000
+  - Name:            __GNU_EH_FRAME_HDR
+    Section:         .eh_frame_hdr
+    Value:           0x2000
+  - Name:            _fini
+    Type:            STT_FUNC
+    Section:         .fini
+    Value:           0x1184
+  - Name:            _GLOBAL_OFFSET_TABLE_
+    Type:            STT_OBJECT
+    Section:         .got.plt
+    Value:           0x4000
+  - Name:            __cxa_finalize
+    Binding:         STB_WEAK
+  - Name:            sayC
+    Type:            STT_FUNC
+    Section:         .text
+    Binding:         STB_GLOBAL
+    Value:           0x1159
+    Size:            0x29
+  - Name:            _ITM_registerTMCloneTable
+    Binding:         STB_WEAK
+  - Name:            _ITM_deregisterTMCloneTable
+    Binding:         STB_WEAK
+  - Name:            sayA
+    Type:            STT_FUNC
+    Binding:         STB_GLOBAL
+  - Name:            sayB
+    Type:            STT_FUNC
+    Binding:         STB_GLOBAL
+  - Name:            sayZ
+    Type:            STT_FUNC
+    Binding:         STB_GLOBAL
+  - Name:            __gmon_start__
+    Binding:         STB_WEAK
+DynamicSymbols:
+  - Name:            __cxa_finalize
+    Binding:         STB_WEAK
+  - Name:            _ITM_registerTMCloneTable
+    Binding:         STB_WEAK
+  - Name:            _ITM_deregisterTMCloneTable
+    Binding:         STB_WEAK
+  - Name:            sayA
+    Type:            STT_FUNC
+    Binding:         STB_GLOBAL
+  - Name:            sayB
+    Type:            STT_FUNC
+    Binding:         STB_GLOBAL
+  - Name:            sayZ
+    Type:            STT_FUNC
+    Binding:         STB_GLOBAL
+  - Name:            __gmon_start__
+    Binding:         STB_WEAK
+  - Name:            sayC
+    Type:            STT_FUNC
+    Section:         .text
+    Binding:         STB_GLOBAL
+    Value:           0x1159
+    Size:            0x29
+...
diff --git a/llvm/unittests/ExecutionEngine/Orc/Inputs/C/C_macho.yaml b/llvm/unittests/ExecutionEngine/Orc/Inputs/C/C_macho.yaml
new file mode 100644
index 0000000000000..ba33483c5122f
--- /dev/null
+++ b/llvm/unittests/ExecutionEngine/Orc/Inputs/C/C_macho.yaml
@@ -0,0 +1,870 @@
+--- !fat-mach-o
+FatHeader:
+  magic:           0xCAFEBABE
+  nfat_arch:       3
+FatArchs:
+  - cputype:         0x1000007
+    cpusubtype:      0x3
+    offset:          0x1000
+    size:            8456
+    align:           12
+  - cputype:         0x100000C
+    cpusubtype:      0x0
+    offset:          0x4000
+    size:            33456
+    align:           14
+  - cputype:         0x100000C
+    cpusubtype:      0x80000002
+    offset:          0x10000
+    size:            33456
+    align:           14
+Slices:
+  - !mach-o
+    FileHeader:
+      magic:           0xFEEDFACF
+      cputype:         0x1000007
+      cpusubtype:      0x3
+      filetype:        0x6
+      ncmds:           20
+      sizeofcmds:      1120
+      flags:           0x100085
+      reserved:        0x0
+    LoadCommands:
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         312
+        segname:         __TEXT
+        vmaddr:          0
+        vmsize:          4096
+        fileoff:         0
+        filesize:        4096
+        maxprot:         5
+        initprot:        5
+        nsects:          3
+        flags:           0
+        Sections:
+          - sectname:        __text
+            segname:         __TEXT
+            addr:            0xF70
+            size:            27
+            offset:          0xF70
+            align:           4
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x80000400
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         554889E5B000E811000000B000E810000000B000E80F0000005DC3
+          - sectname:        __stubs
+            segname:         __TEXT
+            addr:            0xF8C
+            size:            18
+            offset:          0xF8C
+            align:           1
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x80000408
+            reserved1:       0x0
+            reserved2:       0x6
+            reserved3:       0x0
+            content:         FF256E000000FF2570000000FF2572000000
+          - sectname:        __unwind_info
+            segname:         __TEXT
+            addr:            0xFA0
+            size:            88
+            offset:          0xFA0
+            align:           2
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x0
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         010000001C000000000000001C000000000000001C00000002000000700F000040000000400000008B0F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000100000000
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         152
+        segname:         __DATA_CONST
+        vmaddr:          4096
+        vmsize:          4096
+        fileoff:         4096
+        filesize:        4096
+        maxprot:         3
+        initprot:        3
+        nsects:          1
+        flags:           16
+        Sections:
+          - sectname:        __got
+            segname:         __DATA_CONST
+            addr:            0x1000
+            size:            24
+            offset:          0x1000
+            align:           3
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x6
+            reserved1:       0x3
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         '000000000000108001000000000010800200000000000080'
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         72
+        segname:         __LINKEDIT
+        vmaddr:          8192
+        vmsize:          4096
+        fileoff:         8192
+        filesize:        264
+        maxprot:         1
+        initprot:        1
+        nsects:          0
+        flags:           0
+      - cmd:             LC_ID_DYLIB
+        cmdsize:         48
+        dylib:
+          name:            24
+          timestamp:       1
+          current_version: 0
+          compatibility_version: 0
+        Content:         '@rpath/libC.dylib'
+        ZeroPadBytes:    7
+      - cmd:             LC_DYLD_CHAINED_FIXUPS
+        cmdsize:         16
+        dataoff:         8192
+        datasize:        112
+      - cmd:             LC_DYLD_EXPORTS_TRIE
+        cmdsize:         16
+        dataoff:         8304
+        datasize:        24
+      - cmd:             LC_SYMTAB
+        cmdsize:         24
+        symoff:          8336
+        nsyms:           4
+        stroff:          8424
+        strsize:         32
+      - cmd:             LC_DYSYMTAB
+        cmdsize:         80
+        ilocalsym:       0
+        nlocalsym:       0
+        iextdefsym:      0
+        nextdefsym:      1
+        iundefsym:       1
+        nundefsym:       3
+        tocoff:          0
+        ntoc:            0
+        modtaboff:       0
+        nmodtab:         0
+        extrefsymoff:    0
+        nextrefsyms:     0
+        indirectsymoff:  8400
+        nindirectsyms:   6
+        extreloff:       0
+        nextrel:         0
+        locreloff:       0
+        nlocrel:         0
+      - cmd:             LC_UUID
+        cmdsize:         24
+        uuid:            2AA1F9E9-F250-366F-B382-51A91DE06BED
+      - cmd:             LC_BUILD_VERSION
+        cmdsize:         32
+        platform:        1
+        minos:           983040
+        sdk:             983552
+        ntools:          1
+        Tools:
+          - tool:            3
+            version:         73074435
+      - cmd:             LC_SOURCE_VERSION
+        cmdsize:         16
+        version:         0
+      - cmd:             LC_LOAD_DYLIB
+        cmdsize:         48
+        dylib:
+          name:            24
+          timestamp:       2
+          current_version: 0
+          compatibility_version: 0
+        Content:         '@rpath/libA.dylib'
+        ZeroPadBytes:    7
+      - cmd:             LC_LOAD_DYLIB
+        cmdsize:         48
+        dylib:
+          name:            24
+          timestamp:       2
+          current_version: 0
+          compatibility_version: 0
+        Content:         '@rpath/libB.dylib'
+        ZeroPadBytes:    7
+      - cmd:             LC_LOAD_DYLIB
+        cmdsize:         48
+        dylib:
+          name:            24
+          timestamp:       2
+          current_version: 0
+          compatibility_version: 0
+        Content:         '@rpath/libZ.dylib'
+        ZeroPadBytes:    7
+      - cmd:             LC_LOAD_DYLIB
+        cmdsize:         56
+        dylib:
+          name:            24
+          timestamp:       2
+          current_version: 88539136
+          compatibility_version: 65536
+        Content:         '/usr/lib/libSystem.B.dylib'
+        ZeroPadBytes:    6
+      - cmd:             LC_RPATH
+        cmdsize:         32
+        path:            12
+        Content:         '@loader_path/../A'
+        ZeroPadBytes:    3
+      - cmd:             LC_RPATH
+        cmdsize:         32
+        path:            12
+        Content:         '@loader_path/../B'
+        ZeroPadBytes:    3
+      - cmd:             LC_RPATH
+        cmdsize:         32
+        path:            12
+        Content:         '@loader_path/../Z'
+        ZeroPadBytes:    3
+      - cmd:             LC_FUNCTION_STARTS
+        cmdsize:         16
+        dataoff:         8328
+        datasize:        8
+      - cmd:             LC_DATA_IN_CODE
+        cmdsize:         16
+        dataoff:         8336
+        datasize:        0
+    LinkEditData:
+      ExportTrie:
+        TerminalSize:    0
+        NodeOffset:      0
+        Name:            ''
+        Flags:           0x0
+        Address:         0x0
+        Other:           0x0
+        ImportName:      ''
+        Children:
+          - TerminalSize:    3
+            NodeOffset:      13
+            Name:            _sayC
+            Flags:           0x0
+            Address:         0xF70
+            Other:           0x0
+            ImportName:      ''
+      NameList:
+        - n_strx:          2
+          n_type:          0xF
+          n_sect:          1
+          n_desc:          0
+          n_value:         3952
+        - n_strx:          8
+          n_type:          0x1
+          n_sect:          0
+          n_desc:          256
+          n_value:         0
+        - n_strx:          14
+          n_type:          0x1
+          n_sect:          0
+          n_desc:          512
+          n_value:         0
+        - n_strx:          20
+          n_type:          0x1
+          n_sect:          0
+          n_desc:          768
+          n_value:         0
+      StringTable:
+        - ' '
+        - _sayC
+        - _sayA
+        - _sayB
+        - _sayZ
+        - ''
+        - ''
+        - ''
+        - ''
+        - ''
+        - ''
+      IndirectSymbols: [ 0x1, 0x2, 0x3, 0x1, 0x2, 0x3 ]
+      FunctionStarts:  [ 0xF70 ]
+      ChainedFixups:   [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48, 
+                         0x0, 0x0, 0x0, 0x58, 0x0, 0x0, 0x0, 0x3, 0x0, 
+                         0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x10, 0x6, 0x0, 
+                         0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0, 
+                         0x0, 0x2, 0xE, 0x0, 0x0, 0x3, 0x1A, 0x0, 0x0, 
+                         0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x73, 0x61, 0x79, 
+                         0x41, 0x0, 0x5F, 0x73, 0x61, 0x79, 0x42, 0x0, 
+                         0x5F, 0x73, 0x61, 0x79, 0x5A, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0 ]
+  - !mach-o
+    FileHeader:
+      magic:           0xFEEDFACF
+      cputype:         0x100000C
+      cpusubtype:      0x0
+      filetype:        0x6
+      ncmds:           21
+      sizeofcmds:      1136
+      flags:           0x100085
+      reserved:        0x0
+    LoadCommands:
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         312
+        segname:         __TEXT
+        vmaddr:          0
+        vmsize:          16384
+        fileoff:         0
+        filesize:        16384
+        maxprot:         5
+        initprot:        5
+        nsects:          3
+        flags:           0
+        Sections:
+          - sectname:        __text
+            segname:         __TEXT
+            addr:            0x3F68
+            size:            28
+            offset:          0x3F68
+            align:           2
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x80000400
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         FD7BBFA9FD030091050000940700009409000094FD7BC1A8C0035FD6
+          - sectname:        __stubs
+            segname:         __TEXT
+            addr:            0x3F84
+            size:            36
+            offset:          0x3F84
+            align:           2
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x80000408
+            reserved1:       0x0
+            reserved2:       0xC
+            reserved3:       0x0
+            content:         100000B0100240F900021FD6100000B0100640F900021FD6100000B0100A40F900021FD6
+          - sectname:        __unwind_info
+            segname:         __TEXT
+            addr:            0x3FA8
+            size:            88
+            offset:          0x3FA8
+            align:           2
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x0
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         010000001C000000000000001C000000000000001C00000002000000683F00004000000040000000843F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000400000000
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         152
+        segname:         __DATA_CONST
+        vmaddr:          16384
+        vmsize:          16384
+        fileoff:         16384
+        filesize:        16384
+        maxprot:         3
+        initprot:        3
+        nsects:          1
+        flags:           16
+        Sections:
+          - sectname:        __got
+            segname:         __DATA_CONST
+            addr:            0x4000
+            size:            24
+            offset:          0x4000
+            align:           3
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x6
+            reserved1:       0x3
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         '000000000000108001000000000010800200000000000080'
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         72
+        segname:         __LINKEDIT
+        vmaddr:          32768
+        vmsize:          16384
+        fileoff:         32768
+        filesize:        688
+        maxprot:         1
+        initprot:        1
+        nsects:          0
+        flags:           0
+      - cmd:             LC_ID_DYLIB
+        cmdsize:         48
+        dylib:
+          name:            24
+          timestamp:       1
+          current_version: 0
+          compatibility_version: 0
+        Content:         '@rpath/libC.dylib'
+        ZeroPadBytes:    7
+      - cmd:             LC_DYLD_CHAINED_FIXUPS
+        cmdsize:         16
+        dataoff:         32768
+        datasize:        112
+      - cmd:             LC_DYLD_EXPORTS_TRIE
+        cmdsize:         16
+        dataoff:         32880
+        datasize:        24
+      - cmd:             LC_SYMTAB
+        cmdsize:         24
+        symoff:          32912
+        nsyms:           4
+        stroff:          33000
+        strsize:         32
+      - cmd:             LC_DYSYMTAB
+        cmdsize:         80
+        ilocalsym:       0
+        nlocalsym:       0
+        iextdefsym:      0
+        nextdefsym:      1
+        iundefsym:       1
+        nundefsym:       3
+        tocoff:          0
+        ntoc:            0
+        modtaboff:       0
+        nmodtab:         0
+        extrefsymoff:    0
+        nextrefsyms:     0
+        indirectsymoff:  32976
+        nindirectsyms:   6
+        extreloff:       0
+        nextrel:         0
+        locreloff:       0
+        nlocrel:         0
+      - cmd:             LC_UUID
+        cmdsize:         24
+        uuid:            02B69690-925D-35EE-A8AB-6D99813D2A16
+      - cmd:             LC_BUILD_VERSION
+        cmdsize:         32
+        platform:        1
+        minos:           983040
+        sdk:             983552
+        ntools:          1
+        Tools:
+          - tool:            3
+            version:         73074435
+      - cmd:             LC_SOURCE_VERSION
+        cmdsize:         16
+        version:         0
+      - cmd:             LC_LOAD_DYLIB
+        cmdsize:         48
+        dylib:
+          name:            24
+          timestamp:       2
+          current_version: 0
+          compatibility_version: 0
+        Content:         '@rpath/libA.dylib'
+        ZeroPadBytes:    7
+      - cmd:             LC_LOAD_DYLIB
+        cmdsize:         48
+        dylib:
+          name:            24
+          timestamp:       2
+          current_version: 0
+          compatibility_version: 0
+        Content:         '@rpath/libB.dylib'
+        ZeroPadBytes:    7
+      - cmd:             LC_LOAD_DYLIB
+        cmdsize:         48
+        dylib:
+          name:            24
+          timestamp:       2
+          current_version: 0
+          compatibility_version: 0
+        Content:         '@rpath/libZ.dylib'
+        ZeroPadBytes:    7
+      - cmd:             LC_LOAD_DYLIB
+        cmdsize:         56
+        dylib:
+          name:            24
+          timestamp:       2
+          current_version: 88539136
+          compatibility_version: 65536
+        Content:         '/usr/lib/libSystem.B.dylib'
+        ZeroPadBytes:    6
+      - cmd:             LC_RPATH
+        cmdsize:         32
+        path:            12
+        Content:         '@loader_path/../A'
+        ZeroPadBytes:    3
+      - cmd:             LC_RPATH
+        cmdsize:         32
+        path:            12
+        Content:         '@loader_path/../B'
+        ZeroPadBytes:    3
+      - cmd:             LC_RPATH
+        cmdsize:         32
+        path:            12
+        Content:         '@loader_path/../Z'
+        ZeroPadBytes:    3
+      - cmd:             LC_FUNCTION_STARTS
+        cmdsize:         16
+        dataoff:         32904
+        datasize:        8
+      - cmd:             LC_DATA_IN_CODE
+        cmdsize:         16
+        dataoff:         32912
+        datasize:        0
+      - cmd:             LC_CODE_SIGNATURE
+        cmdsize:         16
+        dataoff:         33040
+        datasize:        416
+    LinkEditData:
+      ExportTrie:
+        TerminalSize:    0
+        NodeOffset:      0
+        Name:            ''
+        Flags:           0x0
+        Address:         0x0
+        Other:           0x0
+        ImportName:      ''
+        Children:
+          - TerminalSize:    3
+            NodeOffset:      13
+            Name:            _sayC
+            Flags:           0x0
+            Address:         0x3F68
+            Other:           0x0
+            ImportName:      ''
+      NameList:
+        - n_strx:          2
+          n_type:          0xF
+          n_sect:          1
+          n_desc:          0
+          n_value:         16232
+        - n_strx:          8
+          n_type:          0x1
+          n_sect:          0
+          n_desc:          256
+          n_value:         0
+        - n_strx:          14
+          n_type:          0x1
+          n_sect:          0
+          n_desc:          512
+          n_value:         0
+        - n_strx:          20
+          n_type:          0x1
+          n_sect:          0
+          n_desc:          768
+          n_value:         0
+      StringTable:
+        - ' '
+        - _sayC
+        - _sayA
+        - _sayB
+        - _sayZ
+        - ''
+        - ''
+        - ''
+        - ''
+        - ''
+        - ''
+      IndirectSymbols: [ 0x1, 0x2, 0x3, 0x1, 0x2, 0x3 ]
+      FunctionStarts:  [ 0x3F68 ]
+      ChainedFixups:   [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48, 
+                         0x0, 0x0, 0x0, 0x58, 0x0, 0x0, 0x0, 0x3, 0x0, 
+                         0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x40, 0x6, 0x0, 
+                         0x0, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0, 
+                         0x0, 0x2, 0xE, 0x0, 0x0, 0x3, 0x1A, 0x0, 0x0, 
+                         0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x73, 0x61, 0x79, 
+                         0x41, 0x0, 0x5F, 0x73, 0x61, 0x79, 0x42, 0x0, 
+                         0x5F, 0x73, 0x61, 0x79, 0x5A, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0 ]
+  - !mach-o
+    FileHeader:
+      magic:           0xFEEDFACF
+      cputype:         0x100000C
+      cpusubtype:      0x80000002
+      filetype:        0x6
+      ncmds:           21
+      sizeofcmds:      1136
+      flags:           0x100085
+      reserved:        0x0
+    LoadCommands:
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         312
+        segname:         __TEXT
+        vmaddr:          0
+        vmsize:          16384
+        fileoff:         0
+        filesize:        16384
+        maxprot:         5
+        initprot:        5
+        nsects:          3
+        flags:           0
+        Sections:
+          - sectname:        __text
+            segname:         __TEXT
+            addr:            0x3F58
+            size:            32
+            offset:          0x3F58
+            align:           2
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x80000400
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         7F2303D5FD7BBFA9FD03009105000094080000940B000094FD7BC1A8FF0F5FD6
+          - sectname:        __auth_stubs
+            segname:         __TEXT
+            addr:            0x3F78
+            size:            48
+            offset:          0x3F78
+            align:           2
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x80000408
+            reserved1:       0x0
+            reserved2:       0x10
+            reserved3:       0x0
+            content:         110000B031020091300240F9110A1FD7110000B031220091300240F9110A1FD7110000B031420091300240F9110A1FD7
+          - sectname:        __unwind_info
+            segname:         __TEXT
+            addr:            0x3FA8
+            size:            88
+            offset:          0x3FA8
+            align:           2
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x0
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         010000001C000000000000001C000000000000001C00000002000000583F00004000000040000000783F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000400000000
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         152
+        segname:         __DATA_CONST
+        vmaddr:          16384
+        vmsize:          16384
+        fileoff:         16384
+        filesize:        16384
+        maxprot:         3
+        initprot:        3
+        nsects:          1
+        flags:           16
+        Sections:
+          - sectname:        __auth_got
+            segname:         __DATA_CONST
+            addr:            0x4000
+            size:            24
+            offset:          0x4000
+            align:           3
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x6
+            reserved1:       0x3
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         00000000000009C001000000000009C002000000000001C0
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         72
+        segname:         __LINKEDIT
+        vmaddr:          32768
+        vmsize:          16384
+        fileoff:         32768
+        filesize:        688
+        maxprot:         1
+        initprot:        1
+        nsects:          0
+        flags:           0
+      - cmd:             LC_ID_DYLIB
+        cmdsize:         48
+        dylib:
+          name:            24
+          timestamp:       1
+          current_version: 0
+          compatibility_version: 0
+        Content:         '@rpath/libC.dylib'
+        ZeroPadBytes:    7
+      - cmd:             LC_DYLD_CHAINED_FIXUPS
+        cmdsize:         16
+        dataoff:         32768
+        datasize:        112
+      - cmd:             LC_DYLD_EXPORTS_TRIE
+        cmdsize:         16
+        dataoff:         32880
+        datasize:        24
+      - cmd:             LC_SYMTAB
+        cmdsize:         24
+        symoff:          32912
+        nsyms:           4
+        stroff:          33000
+        strsize:         32
+      - cmd:             LC_DYSYMTAB
+        cmdsize:         80
+        ilocalsym:       0
+        nlocalsym:       0
+        iextdefsym:      0
+        nextdefsym:      1
+        iundefsym:       1
+        nundefsym:       3
+        tocoff:          0
+        ntoc:            0
+        modtaboff:       0
+        nmodtab:         0
+        extrefsymoff:    0
+        nextrefsyms:     0
+        indirectsymoff:  32976
+        nindirectsyms:   6
+        extreloff:       0
+        nextrel:         0
+        locreloff:       0
+        nlocrel:         0
+      - cmd:             LC_UUID
+        cmdsize:         24
+        uuid:            F54076AA-8888-3DED-8BDF-BC7FB3E6FE8A
+      - cmd:             LC_BUILD_VERSION
+        cmdsize:         32
+        platform:        1
+        minos:           983040
+        sdk:             983552
+        ntools:          1
+        Tools:
+          - tool:            3
+            version:         73074435
+      - cmd:             LC_SOURCE_VERSION
+        cmdsize:         16
+        version:         0
+      - cmd:             LC_LOAD_DYLIB
+        cmdsize:         48
+        dylib:
+          name:            24
+          timestamp:       2
+          current_version: 0
+          compatibility_version: 0
+        Content:         '@rpath/libA.dylib'
+        ZeroPadBytes:    7
+      - cmd:             LC_LOAD_DYLIB
+        cmdsize:         48
+        dylib:
+          name:            24
+          timestamp:       2
+          current_version: 0
+          compatibility_version: 0
+        Content:         '@rpath/libB.dylib'
+        ZeroPadBytes:    7
+      - cmd:             LC_LOAD_DYLIB
+        cmdsize:         48
+        dylib:
+          name:            24
+          timestamp:       2
+          current_version: 0
+          compatibility_version: 0
+        Content:         '@rpath/libZ.dylib'
+        ZeroPadBytes:    7
+      - cmd:             LC_LOAD_DYLIB
+        cmdsize:         56
+        dylib:
+          name:            24
+          timestamp:       2
+          current_version: 88539136
+          compatibility_version: 65536
+        Content:         '/usr/lib/libSystem.B.dylib'
+        ZeroPadBytes:    6
+      - cmd:             LC_RPATH
+        cmdsize:         32
+        path:            12
+        Content:         '@loader_path/../A'
+        ZeroPadBytes:    3
+      - cmd:             LC_RPATH
+        cmdsize:         32
+        path:            12
+        Content:         '@loader_path/../B'
+        ZeroPadBytes:    3
+      - cmd:             LC_RPATH
+        cmdsize:         32
+        path:            12
+        Content:         '@loader_path/../Z'
+        ZeroPadBytes:    3
+      - cmd:             LC_FUNCTION_STARTS
+        cmdsize:         16
+        dataoff:         32904
+        datasize:        8
+      - cmd:             LC_DATA_IN_CODE
+        cmdsize:         16
+        dataoff:         32912
+        datasize:        0
+      - cmd:             LC_CODE_SIGNATURE
+        cmdsize:         16
+        dataoff:         33040
+        datasize:        416
+    LinkEditData:
+      ExportTrie:
+        TerminalSize:    0
+        NodeOffset:      0
+        Name:            ''
+        Flags:           0x0
+        Address:         0x0
+        Other:           0x0
+        ImportName:      ''
+        Children:
+          - TerminalSize:    3
+            NodeOffset:      13
+            Name:            _sayC
+            Flags:           0x0
+            Address:         0x3F58
+            Other:           0x0
+            ImportName:      ''
+      NameList:
+        - n_strx:          2
+          n_type:          0xF
+          n_sect:          1
+          n_desc:          0
+          n_value:         16216
+        - n_strx:          8
+          n_type:          0x1
+          n_sect:          0
+          n_desc:          256
+          n_value:         0
+        - n_strx:          14
+          n_type:          0x1
+          n_sect:          0
+          n_desc:          512
+          n_value:         0
+        - n_strx:          20
+          n_type:          0x1
+          n_sect:          0
+          n_desc:          768
+          n_value:         0
+      StringTable:
+        - ' '
+        - _sayC
+        - _sayA
+        - _sayB
+        - _sayZ
+        - ''
+        - ''
+        - ''
+        - ''
+        - ''
+        - ''
+      IndirectSymbols: [ 0x1, 0x2, 0x3, 0x1, 0x2, 0x3 ]
+      FunctionStarts:  [ 0x3F58 ]
+      ChainedFixups:   [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48, 
+                         0x0, 0x0, 0x0, 0x58, 0x0, 0x0, 0x0, 0x3, 0x0, 
+                         0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x40, 0xC, 0x0, 
+                         0x0, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0, 
+                         0x0, 0x2, 0xE, 0x0, 0x0, 0x3, 0x1A, 0x0, 0x0, 
+                         0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x73, 0x61, 0x79, 
+                         0x41, 0x0, 0x5F, 0x73, 0x61, 0x79, 0x42, 0x0, 
+                         0x5F, 0x73, 0x61, 0x79, 0x5A, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0 ]
+...
diff --git a/llvm/unittests/ExecutionEngine/Orc/Inputs/Z/Z_linux.yaml b/llvm/unittests/ExecutionEngine/Orc/Inputs/Z/Z_linux.yaml
new file mode 100644
index 0000000000000..5561f29a93602
--- /dev/null
+++ b/llvm/unittests/ExecutionEngine/Orc/Inputs/Z/Z_linux.yaml
@@ -0,0 +1,460 @@
+--- !ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_DYN
+  Machine:         EM_X86_64
+ProgramHeaders:
+  - Type:            PT_LOAD
+    Flags:           [ PF_R ]
+    FirstSec:        .note.gnu.property
+    LastSec:         .rela.plt
+    Align:           0x1000
+    Offset:          0x0
+  - Type:            PT_LOAD
+    Flags:           [ PF_X, PF_R ]
+    FirstSec:        .init
+    LastSec:         .fini
+    VAddr:           0x1000
+    Align:           0x1000
+    Offset:          0x1000
+  - Type:            PT_LOAD
+    Flags:           [ PF_R ]
+    FirstSec:        .rodata
+    LastSec:         .eh_frame
+    VAddr:           0x2000
+    Align:           0x1000
+    Offset:          0x2000
+  - Type:            PT_LOAD
+    Flags:           [ PF_W, PF_R ]
+    FirstSec:        .init_array
+    LastSec:         .bss
+    VAddr:           0x3E10
+    Align:           0x1000
+    Offset:          0x2E10
+  - Type:            PT_DYNAMIC
+    Flags:           [ PF_W, PF_R ]
+    FirstSec:        .dynamic
+    LastSec:         .dynamic
+    VAddr:           0x3E20
+    Align:           0x8
+    Offset:          0x2E20
+  - Type:            PT_NOTE
+    Flags:           [ PF_R ]
+    FirstSec:        .note.gnu.property
+    LastSec:         .note.gnu.property
+    VAddr:           0x2A8
+    Align:           0x8
+    Offset:          0x2A8
+  - Type:            PT_NOTE
+    Flags:           [ PF_R ]
+    FirstSec:        .note.gnu.build-id
+    LastSec:         .note.gnu.build-id
+    VAddr:           0x2C8
+    Align:           0x4
+    Offset:          0x2C8
+  - Type:            PT_GNU_PROPERTY
+    Flags:           [ PF_R ]
+    FirstSec:        .note.gnu.property
+    LastSec:         .note.gnu.property
+    VAddr:           0x2A8
+    Align:           0x8
+    Offset:          0x2A8
+  - Type:            PT_GNU_EH_FRAME
+    Flags:           [ PF_R ]
+    FirstSec:        .eh_frame_hdr
+    LastSec:         .eh_frame_hdr
+    VAddr:           0x2010
+    Align:           0x4
+    Offset:          0x2010
+  - Type:            PT_GNU_STACK
+    Flags:           [ PF_W, PF_R ]
+    Align:           0x10
+    Offset:          0x0
+  - Type:            PT_GNU_RELRO
+    Flags:           [ PF_R ]
+    FirstSec:        .init_array
+    LastSec:         .got
+    VAddr:           0x3E10
+    Offset:          0x2E10
+Sections:
+  - Name:            .note.gnu.property
+    Type:            SHT_NOTE
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x2A8
+    AddressAlign:    0x8
+    Notes:
+      - Name:            GNU
+        Desc:            020000C0040000000300000000000000
+        Type:            NT_GNU_PROPERTY_TYPE_0
+  - Name:            .note.gnu.build-id
+    Type:            SHT_NOTE
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x2C8
+    AddressAlign:    0x4
+    Notes:
+      - Name:            GNU
+        Desc:            640A4A3AC0DF6BA3DAC3B51CCD727245117E0B30
+        Type:            NT_PRPSINFO
+  - Name:            .gnu.hash
+    Type:            SHT_GNU_HASH
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x2F0
+    Link:            .dynsym
+    AddressAlign:    0x8
+    Header:
+      SymNdx:          0x6
+      Shift2:          0x6
+    BloomFilter:     [ 0x500000000000 ]
+    HashBuckets:     [ 0x6, 0x0 ]
+    HashValues:      [ 0x7C9DCBAD ]
+  - Name:            .dynsym
+    Type:            SHT_DYNSYM
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x318
+    Link:            .dynstr
+    AddressAlign:    0x8
+  - Name:            .dynstr
+    Type:            SHT_STRTAB
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x3C0
+    AddressAlign:    0x1
+  - Name:            .gnu.version
+    Type:            SHT_GNU_versym
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x436
+    Link:            .dynsym
+    AddressAlign:    0x2
+    Entries:         [ 0, 1, 2, 1, 1, 2, 1 ]
+  - Name:            .gnu.version_r
+    Type:            SHT_GNU_verneed
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x448
+    Link:            .dynstr
+    AddressAlign:    0x8
+    Dependencies:
+      - Version:         1
+        File:            libc.so.6
+        Entries:
+          - Name:            GLIBC_2.2.5
+            Hash:            157882997
+            Flags:           0
+            Other:           2
+  - Name:            .rela.dyn
+    Type:            SHT_RELA
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x468
+    Link:            .dynsym
+    AddressAlign:    0x8
+    Relocations:
+      - Offset:          0x3E10
+        Type:            R_X86_64_RELATIVE
+        Addend:          4368
+      - Offset:          0x3E18
+        Type:            R_X86_64_RELATIVE
+        Addend:          4304
+      - Offset:          0x4020
+        Type:            R_X86_64_RELATIVE
+        Addend:          16416
+      - Offset:          0x3FE0
+        Symbol:          _ITM_deregisterTMCloneTable
+        Type:            R_X86_64_GLOB_DAT
+      - Offset:          0x3FE8
+        Symbol:          __gmon_start__
+        Type:            R_X86_64_GLOB_DAT
+      - Offset:          0x3FF0
+        Symbol:          _ITM_registerTMCloneTable
+        Type:            R_X86_64_GLOB_DAT
+      - Offset:          0x3FF8
+        Symbol:          __cxa_finalize
+        Type:            R_X86_64_GLOB_DAT
+  - Name:            .rela.plt
+    Type:            SHT_RELA
+    Flags:           [ SHF_ALLOC, SHF_INFO_LINK ]
+    Address:         0x510
+    Link:            .dynsym
+    AddressAlign:    0x8
+    Info:            .got.plt
+    Relocations:
+      - Offset:          0x4018
+        Symbol:          puts
+        Type:            R_X86_64_JUMP_SLOT
+  - Name:            .init
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x1000
+    AddressAlign:    0x4
+    Offset:          0x1000
+    Content:         F30F1EFA4883EC08488B05D92F00004885C07402FFD04883C408C3
+  - Name:            .plt
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x1020
+    AddressAlign:    0x10
+    EntSize:         0x10
+    Content:         FF35E22F0000F2FF25E32F00000F1F00F30F1EFA6800000000F2E9E1FFFFFF90
+  - Name:            .plt.got
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x1040
+    AddressAlign:    0x10
+    EntSize:         0x10
+    Content:         F30F1EFAF2FF25AD2F00000F1F440000
+  - Name:            .plt.sec
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x1050
+    AddressAlign:    0x10
+    EntSize:         0x10
+    Content:         F30F1EFAF2FF25BD2F00000F1F440000
+  - Name:            .text
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x1060
+    AddressAlign:    0x10
+    Content:         488D3DC12F0000488D05BA2F00004839F87415488B05662F00004885C07409FFE00F1F8000000000C30F1F8000000000488D3D912F0000488D358A2F00004829FE4889F048C1EE3F48C1F8034801C648D1FE7414488B05352F00004885C07408FFE0660F1F440000C30F1F8000000000F30F1EFA803D4D2F000000752B5548833D122F0000004889E5740C488B3D2E2F0000E849FFFFFFE864FFFFFFC605252F0000015DC30F1F00C30F1F8000000000F30F1EFAE977FFFFFFF30F1EFA554889E5488D05D80E00004889C7E820FFFFFF905DC3
+  - Name:            .fini
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x1134
+    AddressAlign:    0x4
+    Content:         F30F1EFA4883EC084883C408C3
+  - Name:            .rodata
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x2000
+    AddressAlign:    0x1
+    Offset:          0x2000
+    Content:         48656C6C6F2066726F6D205A00
+  - Name:            .eh_frame_hdr
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x2010
+    AddressAlign:    0x4
+    Content:         011B033B2C0000000400000010F0FFFF4800000030F0FFFF7000000040F0FFFF8800000009F1FFFFA0000000
+  - Name:            .eh_frame
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x2040
+    AddressAlign:    0x8
+    Content:         1400000000000000017A5200017810011B0C070890010000240000001C000000C0EFFFFF20000000000E10460E184A0F0B770880003F1A3A2A332422000000001400000044000000B8EFFFFF100000000000000000000000140000005C000000B0EFFFFF1000000000000000000000001C0000007400000061F0FFFF1A00000000450E108602430D06510C070800000000000000
+  - Name:            .init_array
+    Type:            SHT_INIT_ARRAY
+    Flags:           [ SHF_WRITE, SHF_ALLOC ]
+    Address:         0x3E10
+    AddressAlign:    0x8
+    EntSize:         0x8
+    Offset:          0x2E10
+    Content:         '1011000000000000'
+  - Name:            .fini_array
+    Type:            SHT_FINI_ARRAY
+    Flags:           [ SHF_WRITE, SHF_ALLOC ]
+    Address:         0x3E18
+    AddressAlign:    0x8
+    EntSize:         0x8
+    Content:         D010000000000000
+  - Name:            .dynamic
+    Type:            SHT_DYNAMIC
+    Flags:           [ SHF_WRITE, SHF_ALLOC ]
+    Address:         0x3E20
+    Link:            .dynstr
+    AddressAlign:    0x8
+    Entries:
+      - Tag:             DT_NEEDED
+        Value:           0x5F
+      - Tag:             DT_INIT
+        Value:           0x1000
+      - Tag:             DT_FINI
+        Value:           0x1134
+      - Tag:             DT_INIT_ARRAY
+        Value:           0x3E10
+      - Tag:             DT_INIT_ARRAYSZ
+        Value:           0x8
+      - Tag:             DT_FINI_ARRAY
+        Value:           0x3E18
+      - Tag:             DT_FINI_ARRAYSZ
+        Value:           0x8
+      - Tag:             DT_GNU_HASH
+        Value:           0x2F0
+      - Tag:             DT_STRTAB
+        Value:           0x3C0
+      - Tag:             DT_SYMTAB
+        Value:           0x318
+      - Tag:             DT_STRSZ
+        Value:           0x75
+      - Tag:             DT_SYMENT
+        Value:           0x18
+      - Tag:             DT_PLTGOT
+        Value:           0x4000
+      - Tag:             DT_PLTRELSZ
+        Value:           0x18
+      - Tag:             DT_PLTREL
+        Value:           0x7
+      - Tag:             DT_JMPREL
+        Value:           0x510
+      - Tag:             DT_RELA
+        Value:           0x468
+      - Tag:             DT_RELASZ
+        Value:           0xA8
+      - Tag:             DT_RELAENT
+        Value:           0x18
+      - Tag:             DT_VERNEED
+        Value:           0x448
+      - Tag:             DT_VERNEEDNUM
+        Value:           0x1
+      - Tag:             DT_VERSYM
+        Value:           0x436
+      - Tag:             DT_RELACOUNT
+        Value:           0x3
+      - Tag:             DT_NULL
+        Value:           0x0
+      - Tag:             DT_NULL
+        Value:           0x0
+      - Tag:             DT_NULL
+        Value:           0x0
+      - Tag:             DT_NULL
+        Value:           0x0
+      - Tag:             DT_NULL
+        Value:           0x0
+  - Name:            .got
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_WRITE, SHF_ALLOC ]
+    Address:         0x3FE0
+    AddressAlign:    0x8
+    EntSize:         0x8
+    Content:         '0000000000000000000000000000000000000000000000000000000000000000'
+  - Name:            .got.plt
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_WRITE, SHF_ALLOC ]
+    Address:         0x4000
+    AddressAlign:    0x8
+    EntSize:         0x8
+    Content:         '203E000000000000000000000000000000000000000000003010000000000000'
+  - Name:            .data
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_WRITE, SHF_ALLOC ]
+    Address:         0x4020
+    AddressAlign:    0x8
+    Content:         '2040000000000000'
+  - Name:            .bss
+    Type:            SHT_NOBITS
+    Flags:           [ SHF_WRITE, SHF_ALLOC ]
+    Address:         0x4028
+    AddressAlign:    0x1
+    Size:            0x8
+  - Name:            .comment
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_MERGE, SHF_STRINGS ]
+    AddressAlign:    0x1
+    EntSize:         0x1
+    Content:         4743433A20285562756E74752031312E342E302D317562756E7475317E32322E30342E32292031312E342E3000
+Symbols:
+  - Name:            crtstuff.c
+    Type:            STT_FILE
+    Index:           SHN_ABS
+  - Name:            deregister_tm_clones
+    Type:            STT_FUNC
+    Section:         .text
+    Value:           0x1060
+  - Name:            register_tm_clones
+    Type:            STT_FUNC
+    Section:         .text
+    Value:           0x1090
+  - Name:            __do_global_dtors_aux
+    Type:            STT_FUNC
+    Section:         .text
+    Value:           0x10D0
+  - Name:            completed.0
+    Type:            STT_OBJECT
+    Section:         .bss
+    Value:           0x4028
+    Size:            0x1
+  - Name:            __do_global_dtors_aux_fini_array_entry
+    Type:            STT_OBJECT
+    Section:         .fini_array
+    Value:           0x3E18
+  - Name:            frame_dummy
+    Type:            STT_FUNC
+    Section:         .text
+    Value:           0x1110
+  - Name:            __frame_dummy_init_array_entry
+    Type:            STT_OBJECT
+    Section:         .init_array
+    Value:           0x3E10
+  - Name:            libZ.c
+    Type:            STT_FILE
+    Index:           SHN_ABS
+  - Name:            'crtstuff.c (1)'
+    Type:            STT_FILE
+    Index:           SHN_ABS
+  - Name:            __FRAME_END__
+    Type:            STT_OBJECT
+    Section:         .eh_frame
+    Value:           0x20D0
+  - Type:            STT_FILE
+    Index:           SHN_ABS
+  - Name:            _fini
+    Type:            STT_FUNC
+    Section:         .fini
+    Value:           0x1134
+  - Name:            __dso_handle
+    Type:            STT_OBJECT
+    Section:         .data
+    Value:           0x4020
+  - Name:            _DYNAMIC
+    Type:            STT_OBJECT
+    Section:         .dynamic
+    Value:           0x3E20
+  - Name:            __GNU_EH_FRAME_HDR
+    Section:         .eh_frame_hdr
+    Value:           0x2010
+  - Name:            __TMC_END__
+    Type:            STT_OBJECT
+    Section:         .data
+    Value:           0x4028
+  - Name:            _GLOBAL_OFFSET_TABLE_
+    Type:            STT_OBJECT
+    Section:         .got.plt
+    Value:           0x4000
+  - Name:            _init
+    Type:            STT_FUNC
+    Section:         .init
+    Value:           0x1000
+  - Name:            _ITM_deregisterTMCloneTable
+    Binding:         STB_WEAK
+  - Name:            'puts@GLIBC_2.2.5'
+    Type:            STT_FUNC
+    Binding:         STB_GLOBAL
+  - Name:            __gmon_start__
+    Binding:         STB_WEAK
+  - Name:            sayZ
+    Type:            STT_FUNC
+    Section:         .text
+    Binding:         STB_GLOBAL
+    Value:           0x1119
+    Size:            0x1A
+  - Name:            _ITM_registerTMCloneTable
+    Binding:         STB_WEAK
+  - Name:            '__cxa_finalize@GLIBC_2.2.5'
+    Type:            STT_FUNC
+    Binding:         STB_WEAK
+DynamicSymbols:
+  - Name:            _ITM_deregisterTMCloneTable
+    Binding:         STB_WEAK
+  - Name:            puts
+    Type:            STT_FUNC
+    Binding:         STB_GLOBAL
+  - Name:            __gmon_start__
+    Binding:         STB_WEAK
+  - Name:            _ITM_registerTMCloneTable
+    Binding:         STB_WEAK
+  - Name:            __cxa_finalize
+    Type:            STT_FUNC
+    Binding:         STB_WEAK
+  - Name:            sayZ
+    Type:            STT_FUNC
+    Section:         .text
+    Binding:         STB_GLOBAL
+    Value:           0x1119
+    Size:            0x1A
+...
diff --git a/llvm/unittests/ExecutionEngine/Orc/Inputs/Z/Z_macho.yaml b/llvm/unittests/ExecutionEngine/Orc/Inputs/Z/Z_macho.yaml
new file mode 100644
index 0000000000000..c0c18265ab667
--- /dev/null
+++ b/llvm/unittests/ExecutionEngine/Orc/Inputs/Z/Z_macho.yaml
@@ -0,0 +1,723 @@
+--- !fat-mach-o
+FatHeader:
+  magic:           0xCAFEBABE
+  nfat_arch:       3
+FatArchs:
+  - cputype:         0x1000007
+    cpusubtype:      0x3
+    offset:          0x1000
+    size:            8376
+    align:           12
+  - cputype:         0x100000C
+    cpusubtype:      0x0
+    offset:          0x4000
+    size:            33376
+    align:           14
+  - cputype:         0x100000C
+    cpusubtype:      0x80000002
+    offset:          0x10000
+    size:            33376
+    align:           14
+Slices:
+  - !mach-o
+    FileHeader:
+      magic:           0xFEEDFACF
+      cputype:         0x1000007
+      cpusubtype:      0x3
+      filetype:        0x6
+      ncmds:           14
+      sizeofcmds:      960
+      flags:           0x100085
+      reserved:        0x0
+    LoadCommands:
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         392
+        segname:         __TEXT
+        vmaddr:          0
+        vmsize:          4096
+        fileoff:         0
+        filesize:        4096
+        maxprot:         5
+        initprot:        5
+        nsects:          4
+        flags:           0
+        Sections:
+          - sectname:        __text
+            segname:         __TEXT
+            addr:            0xF80
+            size:            20
+            offset:          0xF80
+            align:           4
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x80000400
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         554889E5488D3D0F000000B000E8020000005DC3
+          - sectname:        __stubs
+            segname:         __TEXT
+            addr:            0xF94
+            size:            6
+            offset:          0xF94
+            align:           1
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x80000408
+            reserved1:       0x0
+            reserved2:       0x6
+            reserved3:       0x0
+            content:         FF2566000000
+          - sectname:        __cstring
+            segname:         __TEXT
+            addr:            0xF9A
+            size:            14
+            offset:          0xF9A
+            align:           0
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x2
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         48656C6C6F2066726F6D205A0A00
+          - sectname:        __unwind_info
+            segname:         __TEXT
+            addr:            0xFA8
+            size:            88
+            offset:          0xFA8
+            align:           2
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x0
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         010000001C000000000000001C000000000000001C00000002000000800F00004000000040000000940F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000100000000
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         152
+        segname:         __DATA_CONST
+        vmaddr:          4096
+        vmsize:          4096
+        fileoff:         4096
+        filesize:        4096
+        maxprot:         3
+        initprot:        3
+        nsects:          1
+        flags:           16
+        Sections:
+          - sectname:        __got
+            segname:         __DATA_CONST
+            addr:            0x1000
+            size:            8
+            offset:          0x1000
+            align:           3
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x6
+            reserved1:       0x1
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         '0000000000000080'
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         72
+        segname:         __LINKEDIT
+        vmaddr:          8192
+        vmsize:          4096
+        fileoff:         8192
+        filesize:        184
+        maxprot:         1
+        initprot:        1
+        nsects:          0
+        flags:           0
+      - cmd:             LC_ID_DYLIB
+        cmdsize:         48
+        dylib:
+          name:            24
+          timestamp:       1
+          current_version: 0
+          compatibility_version: 0
+        Content:         '@rpath/libZ.dylib'
+        ZeroPadBytes:    7
+      - cmd:             LC_DYLD_CHAINED_FIXUPS
+        cmdsize:         16
+        dataoff:         8192
+        datasize:        96
+      - cmd:             LC_DYLD_EXPORTS_TRIE
+        cmdsize:         16
+        dataoff:         8288
+        datasize:        24
+      - cmd:             LC_SYMTAB
+        cmdsize:         24
+        symoff:          8320
+        nsyms:           2
+        stroff:          8360
+        strsize:         16
+      - cmd:             LC_DYSYMTAB
+        cmdsize:         80
+        ilocalsym:       0
+        nlocalsym:       0
+        iextdefsym:      0
+        nextdefsym:      1
+        iundefsym:       1
+        nundefsym:       1
+        tocoff:          0
+        ntoc:            0
+        modtaboff:       0
+        nmodtab:         0
+        extrefsymoff:    0
+        nextrefsyms:     0
+        indirectsymoff:  8352
+        nindirectsyms:   2
+        extreloff:       0
+        nextrel:         0
+        locreloff:       0
+        nlocrel:         0
+      - cmd:             LC_UUID
+        cmdsize:         24
+        uuid:            399E203C-FF9A-3B80-872C-85F3A759A78B
+      - cmd:             LC_BUILD_VERSION
+        cmdsize:         32
+        platform:        1
+        minos:           983040
+        sdk:             983552
+        ntools:          1
+        Tools:
+          - tool:            3
+            version:         73074435
+      - cmd:             LC_SOURCE_VERSION
+        cmdsize:         16
+        version:         0
+      - cmd:             LC_LOAD_DYLIB
+        cmdsize:         56
+        dylib:
+          name:            24
+          timestamp:       2
+          current_version: 88539136
+          compatibility_version: 65536
+        Content:         '/usr/lib/libSystem.B.dylib'
+        ZeroPadBytes:    6
+      - cmd:             LC_FUNCTION_STARTS
+        cmdsize:         16
+        dataoff:         8312
+        datasize:        8
+      - cmd:             LC_DATA_IN_CODE
+        cmdsize:         16
+        dataoff:         8320
+        datasize:        0
+    LinkEditData:
+      ExportTrie:
+        TerminalSize:    0
+        NodeOffset:      0
+        Name:            ''
+        Flags:           0x0
+        Address:         0x0
+        Other:           0x0
+        ImportName:      ''
+        Children:
+          - TerminalSize:    3
+            NodeOffset:      13
+            Name:            _sayZ
+            Flags:           0x0
+            Address:         0xF80
+            Other:           0x0
+            ImportName:      ''
+      NameList:
+        - n_strx:          2
+          n_type:          0xF
+          n_sect:          1
+          n_desc:          0
+          n_value:         3968
+        - n_strx:          8
+          n_type:          0x1
+          n_sect:          0
+          n_desc:          256
+          n_value:         0
+      StringTable:
+        - ' '
+        - _sayZ
+        - _printf
+      IndirectSymbols: [ 0x1, 0x1 ]
+      FunctionStarts:  [ 0xF80 ]
+      ChainedFixups:   [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48, 
+                         0x0, 0x0, 0x0, 0x50, 0x0, 0x0, 0x0, 0x1, 0x0, 
+                         0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x10, 0x6, 0x0, 
+                         0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0, 
+                         0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x70, 0x72, 
+                         0x69, 0x6E, 0x74, 0x66, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x0 ]
+  - !mach-o
+    FileHeader:
+      magic:           0xFEEDFACF
+      cputype:         0x100000C
+      cpusubtype:      0x0
+      filetype:        0x6
+      ncmds:           15
+      sizeofcmds:      976
+      flags:           0x100085
+      reserved:        0x0
+    LoadCommands:
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         392
+        segname:         __TEXT
+        vmaddr:          0
+        vmsize:          16384
+        fileoff:         0
+        filesize:        16384
+        maxprot:         5
+        initprot:        5
+        nsects:          4
+        flags:           0
+        Sections:
+          - sectname:        __text
+            segname:         __TEXT
+            addr:            0x3F70
+            size:            28
+            offset:          0x3F70
+            align:           2
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x80000400
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         FD7BBFA9FD0300910000009000603E9103000094FD7BC1A8C0035FD6
+          - sectname:        __stubs
+            segname:         __TEXT
+            addr:            0x3F8C
+            size:            12
+            offset:          0x3F8C
+            align:           2
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x80000408
+            reserved1:       0x0
+            reserved2:       0xC
+            reserved3:       0x0
+            content:         100000B0100240F900021FD6
+          - sectname:        __cstring
+            segname:         __TEXT
+            addr:            0x3F98
+            size:            14
+            offset:          0x3F98
+            align:           0
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x2
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         48656C6C6F2066726F6D205A0A00
+          - sectname:        __unwind_info
+            segname:         __TEXT
+            addr:            0x3FA8
+            size:            88
+            offset:          0x3FA8
+            align:           2
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x0
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         010000001C000000000000001C000000000000001C00000002000000703F000040000000400000008C3F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000400000000
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         152
+        segname:         __DATA_CONST
+        vmaddr:          16384
+        vmsize:          16384
+        fileoff:         16384
+        filesize:        16384
+        maxprot:         3
+        initprot:        3
+        nsects:          1
+        flags:           16
+        Sections:
+          - sectname:        __got
+            segname:         __DATA_CONST
+            addr:            0x4000
+            size:            8
+            offset:          0x4000
+            align:           3
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x6
+            reserved1:       0x1
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         '0000000000000080'
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         72
+        segname:         __LINKEDIT
+        vmaddr:          32768
+        vmsize:          16384
+        fileoff:         32768
+        filesize:        608
+        maxprot:         1
+        initprot:        1
+        nsects:          0
+        flags:           0
+      - cmd:             LC_ID_DYLIB
+        cmdsize:         48
+        dylib:
+          name:            24
+          timestamp:       1
+          current_version: 0
+          compatibility_version: 0
+        Content:         '@rpath/libZ.dylib'
+        ZeroPadBytes:    7
+      - cmd:             LC_DYLD_CHAINED_FIXUPS
+        cmdsize:         16
+        dataoff:         32768
+        datasize:        96
+      - cmd:             LC_DYLD_EXPORTS_TRIE
+        cmdsize:         16
+        dataoff:         32864
+        datasize:        24
+      - cmd:             LC_SYMTAB
+        cmdsize:         24
+        symoff:          32896
+        nsyms:           2
+        stroff:          32936
+        strsize:         16
+      - cmd:             LC_DYSYMTAB
+        cmdsize:         80
+        ilocalsym:       0
+        nlocalsym:       0
+        iextdefsym:      0
+        nextdefsym:      1
+        iundefsym:       1
+        nundefsym:       1
+        tocoff:          0
+        ntoc:            0
+        modtaboff:       0
+        nmodtab:         0
+        extrefsymoff:    0
+        nextrefsyms:     0
+        indirectsymoff:  32928
+        nindirectsyms:   2
+        extreloff:       0
+        nextrel:         0
+        locreloff:       0
+        nlocrel:         0
+      - cmd:             LC_UUID
+        cmdsize:         24
+        uuid:            6E8E78AF-EDB2-3830-BE1E-013390302CC5
+      - cmd:             LC_BUILD_VERSION
+        cmdsize:         32
+        platform:        1
+        minos:           983040
+        sdk:             983552
+        ntools:          1
+        Tools:
+          - tool:            3
+            version:         73074435
+      - cmd:             LC_SOURCE_VERSION
+        cmdsize:         16
+        version:         0
+      - cmd:             LC_LOAD_DYLIB
+        cmdsize:         56
+        dylib:
+          name:            24
+          timestamp:       2
+          current_version: 88539136
+          compatibility_version: 65536
+        Content:         '/usr/lib/libSystem.B.dylib'
+        ZeroPadBytes:    6
+      - cmd:             LC_FUNCTION_STARTS
+        cmdsize:         16
+        dataoff:         32888
+        datasize:        8
+      - cmd:             LC_DATA_IN_CODE
+        cmdsize:         16
+        dataoff:         32896
+        datasize:        0
+      - cmd:             LC_CODE_SIGNATURE
+        cmdsize:         16
+        dataoff:         32960
+        datasize:        416
+    LinkEditData:
+      ExportTrie:
+        TerminalSize:    0
+        NodeOffset:      0
+        Name:            ''
+        Flags:           0x0
+        Address:         0x0
+        Other:           0x0
+        ImportName:      ''
+        Children:
+          - TerminalSize:    3
+            NodeOffset:      13
+            Name:            _sayZ
+            Flags:           0x0
+            Address:         0x3F70
+            Other:           0x0
+            ImportName:      ''
+      NameList:
+        - n_strx:          2
+          n_type:          0xF
+          n_sect:          1
+          n_desc:          0
+          n_value:         16240
+        - n_strx:          8
+          n_type:          0x1
+          n_sect:          0
+          n_desc:          256
+          n_value:         0
+      StringTable:
+        - ' '
+        - _sayZ
+        - _printf
+      IndirectSymbols: [ 0x1, 0x1 ]
+      FunctionStarts:  [ 0x3F70 ]
+      ChainedFixups:   [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48, 
+                         0x0, 0x0, 0x0, 0x50, 0x0, 0x0, 0x0, 0x1, 0x0, 
+                         0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x40, 0x6, 0x0, 
+                         0x0, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0, 
+                         0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x70, 0x72, 
+                         0x69, 0x6E, 0x74, 0x66, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x0 ]
+  - !mach-o
+    FileHeader:
+      magic:           0xFEEDFACF
+      cputype:         0x100000C
+      cpusubtype:      0x80000002
+      filetype:        0x6
+      ncmds:           15
+      sizeofcmds:      976
+      flags:           0x100085
+      reserved:        0x0
+    LoadCommands:
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         392
+        segname:         __TEXT
+        vmaddr:          0
+        vmsize:          16384
+        fileoff:         0
+        filesize:        16384
+        maxprot:         5
+        initprot:        5
+        nsects:          4
+        flags:           0
+        Sections:
+          - sectname:        __text
+            segname:         __TEXT
+            addr:            0x3F68
+            size:            32
+            offset:          0x3F68
+            align:           2
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x80000400
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         7F2303D5FD7BBFA9FD0300910000009000603E9103000094FD7BC1A8FF0F5FD6
+          - sectname:        __auth_stubs
+            segname:         __TEXT
+            addr:            0x3F88
+            size:            16
+            offset:          0x3F88
+            align:           2
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x80000408
+            reserved1:       0x0
+            reserved2:       0x10
+            reserved3:       0x0
+            content:         110000B031020091300240F9110A1FD7
+          - sectname:        __cstring
+            segname:         __TEXT
+            addr:            0x3F98
+            size:            14
+            offset:          0x3F98
+            align:           0
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x2
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         48656C6C6F2066726F6D205A0A00
+          - sectname:        __unwind_info
+            segname:         __TEXT
+            addr:            0x3FA8
+            size:            88
+            offset:          0x3FA8
+            align:           2
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x0
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         010000001C000000000000001C000000000000001C00000002000000683F00004000000040000000883F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000400000000
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         152
+        segname:         __DATA_CONST
+        vmaddr:          16384
+        vmsize:          16384
+        fileoff:         16384
+        filesize:        16384
+        maxprot:         3
+        initprot:        3
+        nsects:          1
+        flags:           16
+        Sections:
+          - sectname:        __auth_got
+            segname:         __DATA_CONST
+            addr:            0x4000
+            size:            8
+            offset:          0x4000
+            align:           3
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x6
+            reserved1:       0x1
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         00000000000001C0
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         72
+        segname:         __LINKEDIT
+        vmaddr:          32768
+        vmsize:          16384
+        fileoff:         32768
+        filesize:        608
+        maxprot:         1
+        initprot:        1
+        nsects:          0
+        flags:           0
+      - cmd:             LC_ID_DYLIB
+        cmdsize:         48
+        dylib:
+          name:            24
+          timestamp:       1
+          current_version: 0
+          compatibility_version: 0
+        Content:         '@rpath/libZ.dylib'
+        ZeroPadBytes:    7
+      - cmd:             LC_DYLD_CHAINED_FIXUPS
+        cmdsize:         16
+        dataoff:         32768
+        datasize:        96
+      - cmd:             LC_DYLD_EXPORTS_TRIE
+        cmdsize:         16
+        dataoff:         32864
+        datasize:        24
+      - cmd:             LC_SYMTAB
+        cmdsize:         24
+        symoff:          32896
+        nsyms:           2
+        stroff:          32936
+        strsize:         16
+      - cmd:             LC_DYSYMTAB
+        cmdsize:         80
+        ilocalsym:       0
+        nlocalsym:       0
+        iextdefsym:      0
+        nextdefsym:      1
+        iundefsym:       1
+        nundefsym:       1
+        tocoff:          0
+        ntoc:            0
+        modtaboff:       0
+        nmodtab:         0
+        extrefsymoff:    0
+        nextrefsyms:     0
+        indirectsymoff:  32928
+        nindirectsyms:   2
+        extreloff:       0
+        nextrel:         0
+        locreloff:       0
+        nlocrel:         0
+      - cmd:             LC_UUID
+        cmdsize:         24
+        uuid:            E74F368D-238F-31FA-BF40-FA2964FED986
+      - cmd:             LC_BUILD_VERSION
+        cmdsize:         32
+        platform:        1
+        minos:           983040
+        sdk:             983552
+        ntools:          1
+        Tools:
+          - tool:            3
+            version:         73074435
+      - cmd:             LC_SOURCE_VERSION
+        cmdsize:         16
+        version:         0
+      - cmd:             LC_LOAD_DYLIB
+        cmdsize:         56
+        dylib:
+          name:            24
+          timestamp:       2
+          current_version: 88539136
+          compatibility_version: 65536
+        Content:         '/usr/lib/libSystem.B.dylib'
+        ZeroPadBytes:    6
+      - cmd:             LC_FUNCTION_STARTS
+        cmdsize:         16
+        dataoff:         32888
+        datasize:        8
+      - cmd:             LC_DATA_IN_CODE
+        cmdsize:         16
+        dataoff:         32896
+        datasize:        0
+      - cmd:             LC_CODE_SIGNATURE
+        cmdsize:         16
+        dataoff:         32960
+        datasize:        416
+    LinkEditData:
+      ExportTrie:
+        TerminalSize:    0
+        NodeOffset:      0
+        Name:            ''
+        Flags:           0x0
+        Address:         0x0
+        Other:           0x0
+        ImportName:      ''
+        Children:
+          - TerminalSize:    3
+            NodeOffset:      13
+            Name:            _sayZ
+            Flags:           0x0
+            Address:         0x3F68
+            Other:           0x0
+            ImportName:      ''
+      NameList:
+        - n_strx:          2
+          n_type:          0xF
+          n_sect:          1
+          n_desc:          0
+          n_value:         16232
+        - n_strx:          8
+          n_type:          0x1
+          n_sect:          0
+          n_desc:          256
+          n_value:         0
+      StringTable:
+        - ' '
+        - _sayZ
+        - _printf
+      IndirectSymbols: [ 0x1, 0x1 ]
+      FunctionStarts:  [ 0x3F68 ]
+      ChainedFixups:   [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48, 
+                         0x0, 0x0, 0x0, 0x50, 0x0, 0x0, 0x0, 0x1, 0x0, 
+                         0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x40, 0xC, 0x0, 
+                         0x0, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0, 
+                         0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x70, 0x72, 
+                         0x69, 0x6E, 0x74, 0x66, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x0 ]
+...
diff --git a/llvm/unittests/ExecutionEngine/Orc/LibraryResolverTest.cpp b/llvm/unittests/ExecutionEngine/Orc/LibraryResolverTest.cpp
new file mode 100644
index 0000000000000..2a396da397eb5
--- /dev/null
+++ b/llvm/unittests/ExecutionEngine/Orc/LibraryResolverTest.cpp
@@ -0,0 +1,764 @@
+//===- LibraryResolverTest.cpp - Unit tests for LibraryResolver -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h"
+#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"
+#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h"
+#include "llvm/ObjectYAML/MachOYAML.h"
+#include "llvm/ObjectYAML/yaml2obj.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/YAMLParser.h"
+#include "llvm/Support/YAMLTraits.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include "llvm/Testing/Support/SupportHelpers.h"
+
+#include "gtest/gtest.h"
+
+#include <algorithm>
+#include <optional>
+#include <string>
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::orc;
+
+// Disabled due to test setup issue — YAML to shared library creation seems
+// invalid on some build bots. (PR #165360) Not related to code logic.
+#if 0
+// TODO: Add COFF (Windows) support for these tests.
+// this facility also works correctly on Windows (COFF),
+// so we should eventually enable and run these tests for that platform as well.
+namespace {
+
+#if defined(__APPLE__)
+constexpr const char *ext = ".dylib";
+#elif defined(_WIN32)
+constexpr const char *ext = ".dll";
+#else
+constexpr const char *ext = ".so";
+#endif
+
+bool EnvReady = false;
+
+Triple getTargetTriple() {
+  auto JTMB = JITTargetMachineBuilder::detectHost();
+  if (!JTMB) {
+    consumeError(JTMB.takeError());
+    return Triple();
+  }
+  return JTMB->getTargetTriple();
+}
+
+static bool CheckHostSupport() {
+  auto Triple = getTargetTriple();
+  // TODO: Extend support to COFF (Windows) once test setup and YAML conversion
+  // are verified.
+  if (!Triple.isOSBinFormatMachO() &&
+      !(Triple.isOSBinFormatELF() && Triple.getArch() == Triple::x86_64))
+    return false;
+
+  return true;
+}
+
+std::string getYamlFilePlatformExt() {
+  auto Triple = getTargetTriple();
+  if (Triple.isOSBinFormatMachO())
+    return "_macho";
+  else if (Triple.isOSBinFormatELF())
+    return "_linux";
+
+  return "";
+}
+
+unsigned getYamlDocNum() {
+  // auto Triple = getTargetTriple();
+  // if (Triple.isOSBinFormatELF())
+  //   return 1;
+
+  return 1;
+}
+
+class LibraryTestEnvironment : public ::testing::Environment {
+  std::vector<std::string> CreatedDylibsDir;
+  std::vector<std::string> CreatedDylibs;
+  SmallVector<char, 128> DirPath;
+
+public:
+  void SetUp() override {
+    if (!CheckHostSupport()) {
+      EnvReady = false;
+      return;
+    }
+
+    StringRef ThisFile = __FILE__;
+    SmallVector<char, 128> InputDirPath(ThisFile.begin(), ThisFile.end());
+    sys::path::remove_filename(InputDirPath);
+    sys::path::append(InputDirPath, "Inputs");
+    if (!sys::fs::exists(InputDirPath))
+      return;
+
+    SmallString<128> UniqueDir;
+    sys::path::append(UniqueDir, InputDirPath);
+    std::error_code EC = sys::fs::createUniqueDirectory(UniqueDir, DirPath);
+
+    if (EC)
+      return;
+
+    // given yamlPath + DylibPath, validate + convert
+    auto processYamlToDylib = [&](const SmallVector<char, 128> &YamlPath,
+                                  const SmallVector<char, 128> &DylibPath,
+                                  unsigned DocNum) -> bool {
+      if (!sys::fs::exists(YamlPath)) {
+        errs() << "YAML file missing: "
+               << StringRef(YamlPath.data(), YamlPath.size()) << "\n";
+        EnvReady = false;
+        return false;
+      }
+
+      auto BufOrErr = MemoryBuffer::getFile(YamlPath);
+      if (!BufOrErr) {
+        errs() << "Failed to read "
+               << StringRef(YamlPath.data(), YamlPath.size()) << ": "
+               << BufOrErr.getError().message() << "\n";
+        EnvReady = false;
+        return false;
+      }
+
+      yaml::Input yin(BufOrErr->get()->getBuffer());
+      std::error_code EC;
+      raw_fd_ostream outFile(StringRef(DylibPath.data(), DylibPath.size()), EC,
+                             sys::fs::OF_None);
+
+      if (EC) {
+        errs() << "Failed to open "
+               << StringRef(DylibPath.data(), DylibPath.size())
+               << " for writing: " << EC.message() << "\n";
+        EnvReady = false;
+        return false;
+      }
+
+      if (!yaml::convertYAML(
+              yin, outFile,
+              [](const Twine &M) {
+                // Handle or ignore errors here
+                errs() << "Yaml Error :" << M << "\n";
+              },
+              DocNum)) {
+        errs() << "Failed to convert "
+               << StringRef(YamlPath.data(), YamlPath.size()) << " to "
+               << StringRef(DylibPath.data(), DylibPath.size()) << "\n";
+        EnvReady = false;
+        return false;
+      }
+
+      CreatedDylibsDir.push_back(std::string(sys::path::parent_path(
+          StringRef(DylibPath.data(), DylibPath.size()))));
+      CreatedDylibs.push_back(std::string(DylibPath.begin(), DylibPath.end()));
+      return true;
+    };
+
+    std::vector<const char *> LibDirs = {"Z", "A", "B", "C"};
+
+    unsigned DocNum = getYamlDocNum();
+    std::string YamlPltExt = getYamlFilePlatformExt();
+    for (const auto &LibdirName : LibDirs) {
+      // YAML path
+      SmallVector<char, 128> YamlPath(InputDirPath.begin(), InputDirPath.end());
+      SmallVector<char, 128> YamlFileName;
+      YamlFileName.append(LibdirName, LibdirName + strlen(LibdirName));
+      YamlFileName.append(YamlPltExt.begin(), YamlPltExt.end());
+      sys::path::append(YamlPath, LibdirName, YamlFileName);
+      sys::path::replace_extension(YamlPath, ".yaml");
+
+      // dylib path
+      SmallVector<char, 128> DylibPath(DirPath.begin(), DirPath.end());
+      SmallVector<char, 128> DylibFileName;
+      StringRef prefix("lib");
+      DylibFileName.append(prefix.begin(), prefix.end());
+      DylibFileName.append(LibdirName, LibdirName + strlen(LibdirName));
+
+      sys::path::append(DylibPath, LibdirName);
+      if (!sys::fs::exists(DylibPath)) {
+        auto EC = sys::fs::create_directory(DylibPath);
+        if (EC)
+          return;
+      }
+      sys::path::append(DylibPath, DylibFileName);
+      sys::path::replace_extension(DylibPath, ext);
+      if (!processYamlToDylib(YamlPath, DylibPath, DocNum))
+        return;
+    }
+
+    EnvReady = true;
+  }
+
+  void TearDown() override { sys::fs::remove_directories(DirPath); }
+
+  std::string getBaseDir() const {
+    return std::string(DirPath.begin(), DirPath.end());
+  }
+
+  std::vector<std::string> getDylibPaths() const { return CreatedDylibs; }
+};
+
+static LibraryTestEnvironment *GlobalEnv =
+    static_cast<LibraryTestEnvironment *>(
+        ::testing::AddGlobalTestEnvironment(new LibraryTestEnvironment()));
+
+inline std::string libPath(const std::string &BaseDir,
+                           const std::string &name) {
+#if defined(__APPLE__)
+  return BaseDir + "/" + name + ".dylib";
+#elif defined(_WIN32)
+  return BaseDir + "/" + name + ".dll";
+#else
+  return BaseDir + "/" + name + ".so";
+#endif
+}
+
+inline std::string withext(const std::string &lib) {
+  SmallString<128> P(lib);
+  sys::path::replace_extension(P, ext);
+  return P.str().str();
+}
+
+inline std::string platformSymbolName(const std::string &name) {
+#if defined(__APPLE__)
+  return "_" + name; // macOS prepends underscore
+#else
+  return name;
+#endif
+}
+
+struct TestLibrary {
+  std::string path;
+  std::vector<std::string> Syms;
+};
+
+class LibraryResolverIT : public ::testing::Test {
+protected:
+  std::string BaseDir;
+  std::unordered_map<std::string, TestLibrary> libs;
+
+  void addLib(const std::string &name) {
+    SmallString<512> path;
+    std::error_code EC =
+        sys::fs::real_path(libPath(BaseDir, name + "/lib" + name), path);
+    if (EC || path.empty() || !sys::fs::exists(path))
+      GTEST_SKIP();
+    libs[name] = {path.str().str(), {platformSymbolName("say" + name)}};
+  }
+
+  void SetUp() override {
+    if (!EnvReady || GlobalEnv == nullptr)
+      GTEST_SKIP() << "Skipping test: environment setup failed.";
+
+    {
+      SmallString<512> path;
+      std::error_code EC = sys::fs::real_path(GlobalEnv->getBaseDir(), path);
+      if (path.empty() || EC)
+        GTEST_SKIP() << "Base directory resolution failed: " << EC.message();
+      BaseDir = path.str().str();
+    }
+
+    for (const auto &P : GlobalEnv->getDylibPaths()) {
+      if (!sys::fs::exists(P))
+        GTEST_SKIP() << "Missing dylib path: " << P;
+    }
+
+    const std::vector<std::string> libNames = {"A", "B", "C", "Z"};
+    for (const auto &name : libNames)
+      addLib(name);
+
+    if (!EnvReady)
+      GTEST_SKIP() << "Skipping test: environment setup failed.";
+  }
+
+  const std::vector<std::string> &sym(const std::string &key) {
+    return libs[key].Syms;
+  }
+  const std::string &lib(const std::string &key) { return libs[key].path; }
+  const std::string libdir(const std::string &key) {
+    SmallString<512> P(libs[key].path);
+    sys::path::remove_filename(P);
+    return P.str().str();
+  }
+  const std::string libname(const std::string &key) {
+    return sys::path::filename(libs[key].path).str();
+  }
+};
+
+// Helper: allow either "sayA" or "_sayA" depending on how your
+// SymbolEnumerator reports.
+static bool matchesEitherUnderscore(const std::string &got,
+                                    const std::string &bare) {
+  return got == bare || got == ("_" + bare);
+}
+
+// Helper: normalize path ending check (we only care that it resolved to the
+// right dylib)
+static bool endsWith(const std::string &s, const std::string &suffix) {
+  if (s.size() < suffix.size())
+    return false;
+  return std::equal(suffix.rbegin(), suffix.rend(), s.rbegin());
+}
+
+TEST_F(LibraryResolverIT, EnumerateSymbols_ExportsOnly_DefaultFlags) {
+  const std::string libC = lib("C");
+  SymbolEnumeratorOptions Opts = SymbolEnumeratorOptions::defaultOptions();
+
+  std::vector<std::string> seen;
+  auto onEach = [&](llvm::StringRef sym) -> EnumerateResult {
+    seen.emplace_back(sym.str());
+    return EnumerateResult::Continue;
+  };
+
+  ASSERT_TRUE(SymbolEnumerator::enumerateSymbols(libC, onEach, Opts));
+
+  // sayC is exported, others are undefined → only sayC expected
+  EXPECT_TRUE(any_of(seen, [&](const std::string &s) {
+    return matchesEitherUnderscore(s, "sayC");
+  }));
+  EXPECT_FALSE(any_of(seen, [&](const std::string &s) {
+    return matchesEitherUnderscore(s, "sayA");
+  }));
+  EXPECT_FALSE(any_of(seen, [&](const std::string &s) {
+    return matchesEitherUnderscore(s, "sayB");
+  }));
+}
+
+TEST_F(LibraryResolverIT, EnumerateSymbols_IncludesUndefineds) {
+  const std::string libC = lib("C");
+
+  SymbolEnumeratorOptions Opts;
+  Opts.FilterFlags =
+      SymbolEnumeratorOptions::IgnoreWeak |
+      SymbolEnumeratorOptions::IgnoreIndirect; // no IgnoreUndefined
+
+  std::vector<std::string> seen;
+  auto onEach = [&](llvm::StringRef sym) -> EnumerateResult {
+    seen.emplace_back(sym.str());
+    return EnumerateResult::Continue;
+  };
+
+  ASSERT_TRUE(SymbolEnumerator::enumerateSymbols(libC, onEach, Opts));
+
+  // Now we should see both sayC (export) and the undefined refs sayA, sayB,
+  // sayZ
+  EXPECT_TRUE(any_of(seen, [&](const std::string &s) {
+    return matchesEitherUnderscore(s, "sayC");
+  }));
+  EXPECT_TRUE(any_of(seen, [&](const std::string &s) {
+    return matchesEitherUnderscore(s, "sayA");
+  }));
+  EXPECT_TRUE(any_of(seen, [&](const std::string &s) {
+    return matchesEitherUnderscore(s, "sayB");
+  }));
+}
+
+// Full resolution via LibraryResolutionDriver/LibraryResolver ---
+TEST_F(LibraryResolverIT, DriverResolvesSymbolsToCorrectLibraries) {
+  // Create the resolver from real base paths (our fixtures dir)
+  auto Stup = LibraryResolver::Setup::create({BaseDir});
+
+  // Full system behavior: no mocks
+  auto Driver = LibraryResolutionDriver::create(Stup);
+  ASSERT_NE(Driver, nullptr);
+
+  // Tell the Driver about the scan path kinds (User/System) as your
+  // production code expects.
+  Driver->addScanPath(libdir("A"), PathType::User);
+  Driver->addScanPath(libdir("B"), PathType::User);
+  Driver->addScanPath(libdir("Z"), PathType::User);
+
+  // Symbols to resolve (bare names; class handles underscore differences
+  // internally)
+  std::vector<std::string> Syms = {platformSymbolName("sayA"),
+                                   platformSymbolName("sayB"),
+                                   platformSymbolName("sayZ")};
+
+  bool CallbackRan = false;
+  Driver->resolveSymbols(Syms, [&](SymbolQuery &Q) {
+    CallbackRan = true;
+
+    // sayA should resolve to A.dylib
+    {
+      auto lib = Q.getResolvedLib(platformSymbolName("sayA"));
+      ASSERT_TRUE(lib.has_value()) << "sayA should be resolved";
+      EXPECT_TRUE(endsWith(lib->str(), libname("A")))
+          << "sayA resolved to: " << lib->str();
+    }
+
+    // sayB should resolve to B.dylib
+    {
+      auto lib = Q.getResolvedLib(platformSymbolName("sayB"));
+      ASSERT_TRUE(lib.has_value()) << "sayB should be resolved";
+      EXPECT_TRUE(endsWith(lib->str(), libname("B")))
+          << "sayB resolved to: " << lib->str();
+    }
+
+    // sayZ should resolve to B.dylib
+    {
+      auto lib = Q.getResolvedLib(platformSymbolName("sayZ"));
+      ASSERT_TRUE(lib.has_value()) << "sayZ should be resolved";
+      EXPECT_TRUE(endsWith(lib->str(), libname("Z")))
+          << "sayZ resolved to: " << lib->str();
+    }
+
+    EXPECT_TRUE(Q.allResolved());
+  });
+
+  EXPECT_TRUE(CallbackRan);
+}
+
+// stress SymbolQuery with the real resolve flow
+// And resolve libC dependency libA, libB, libZ ---
+TEST_F(LibraryResolverIT, ResolveManySymbols) {
+  auto Stup = LibraryResolver::Setup::create({BaseDir});
+  auto Driver = LibraryResolutionDriver::create(Stup);
+  ASSERT_NE(Driver, nullptr);
+  Driver->addScanPath(libdir("C"), PathType::User);
+
+  // Many duplicates to provoke concurrent updates inside SymbolQuery
+  std::vector<std::string> Syms = {
+      platformSymbolName("sayA"), platformSymbolName("sayB"),
+      platformSymbolName("sayA"), platformSymbolName("sayB"),
+      platformSymbolName("sayZ"), platformSymbolName("sayZ"),
+      platformSymbolName("sayZ"), platformSymbolName("sayZ"),
+      platformSymbolName("sayA"), platformSymbolName("sayB"),
+      platformSymbolName("sayA"), platformSymbolName("sayB")};
+
+  bool CallbackRan = false;
+  Driver->resolveSymbols(Syms, [&](SymbolQuery &Q) {
+    CallbackRan = true;
+    EXPECT_TRUE(Q.isResolved(platformSymbolName("sayA")));
+    EXPECT_TRUE(Q.isResolved(platformSymbolName("sayB")));
+    EXPECT_TRUE(Q.isResolved(platformSymbolName("sayZ")));
+
+    auto A = Q.getResolvedLib(platformSymbolName("sayA"));
+    auto B = Q.getResolvedLib(platformSymbolName("sayB"));
+    auto Z = Q.getResolvedLib(platformSymbolName("sayZ"));
+    ASSERT_TRUE(A.has_value());
+    ASSERT_TRUE(B.has_value());
+    ASSERT_TRUE(Z.has_value());
+    EXPECT_TRUE(endsWith(A->str(), libname("A")));
+    EXPECT_TRUE(endsWith(B->str(), libname("B")));
+    EXPECT_TRUE(endsWith(Z->str(), libname("Z")));
+    EXPECT_TRUE(Q.allResolved());
+  });
+
+  EXPECT_TRUE(CallbackRan);
+}
+
+TEST_F(LibraryResolverIT, ScanAndResolveDependencyGraph) {
+  auto LibPathCache = std::make_shared<LibraryPathCache>();
+  auto PResolver = std::make_shared<PathResolver>(LibPathCache);
+  LibraryScanHelper ScanH({}, LibPathCache, PResolver);
+
+  ScanH.addBasePath(libdir("C"), PathType::User);
+
+  LibraryManager LibMgr;
+  LibraryScanner Scanner(ScanH, LibMgr);
+
+  Scanner.scanNext(PathType::User, 0);
+
+  size_t numLibs = 0;
+  LibMgr.forEachLibrary([&](const LibraryInfo &L) {
+    numLibs++;
+    return true;
+  });
+
+  EXPECT_GT(numLibs, 0u) << "Expected at least one library scanned";
+
+  // Validate that each scanned library path is resolvable
+  std::error_code EC;
+  LibMgr.forEachLibrary([&](const LibraryInfo &L) {
+    auto R = PResolver->resolve(L.getFullPath(), EC);
+    EXPECT_TRUE(R.has_value());
+    EXPECT_FALSE(EC);
+    return true;
+  });
+}
+
+TEST_F(LibraryResolverIT, ScanEmptyPath) {
+  auto LibPathCache = std::make_shared<LibraryPathCache>();
+  auto PResolver = std::make_shared<PathResolver>(LibPathCache);
+  LibraryScanHelper ScanH({}, LibPathCache, PResolver);
+
+  ScanH.addBasePath("/tmp/empty", PathType::User);
+
+  LibraryManager LibMgr;
+  LibraryScanner Scanner(ScanH, LibMgr);
+
+  Scanner.scanNext(PathType::User, 0);
+
+  size_t count = 0;
+  LibMgr.forEachLibrary([&](const LibraryInfo &) {
+    count++;
+    return true;
+  });
+  EXPECT_EQ(count, 0u);
+}
+
+TEST_F(LibraryResolverIT, PathResolverResolvesKnownPaths) {
+  auto LibPathCache = std::make_shared<LibraryPathCache>();
+  auto PResolver = std::make_shared<PathResolver>(LibPathCache);
+
+  std::error_code EC;
+  auto Missing = PResolver->resolve("temp/foo/bar", EC);
+  EXPECT_FALSE(Missing.has_value()) << "Unexpectedly resolved a bogus path";
+  EXPECT_TRUE(EC) << "Expected error resolving path";
+
+  auto DirPath = PResolver->resolve(BaseDir, EC);
+  ASSERT_TRUE(DirPath.has_value());
+  EXPECT_FALSE(EC) << "Expected no error resolving path";
+  EXPECT_EQ(*DirPath, BaseDir);
+
+  auto DylibPath = PResolver->resolve(lib("C"), EC);
+  ASSERT_TRUE(DylibPath.has_value());
+  EXPECT_FALSE(EC) << "Expected no error resolving path";
+  EXPECT_EQ(*DylibPath, lib("C"));
+}
+
+TEST_F(LibraryResolverIT, PathResolverNormalizesDotAndDotDot) {
+  auto LibPathCache = std::make_shared<LibraryPathCache>();
+  auto PResolver = std::make_shared<PathResolver>(LibPathCache);
+
+  std::error_code EC;
+
+  // e.g. BaseDir + "/./C/../C/C.dylib" → BaseDir + "/C.dylib"
+  std::string Messy = BaseDir + "/C/./../C/./libC" + ext;
+  auto Resolved = PResolver->resolve(Messy, EC);
+  ASSERT_TRUE(Resolved.has_value());
+  EXPECT_FALSE(EC);
+  EXPECT_EQ(*Resolved, lib("C")) << "Expected realpath to collapse . and ..";
+}
+
+#if !defined(_WIN32)
+TEST_F(LibraryResolverIT, PathResolverFollowsSymlinks) {
+  auto LibPathCache = std::make_shared<LibraryPathCache>();
+  auto PResolver = std::make_shared<PathResolver>(LibPathCache);
+
+  std::error_code EC;
+
+  // Create a symlink temp -> BaseDir (only if filesystem allows it)
+  std::string linkName = BaseDir + withext("/link_to_C");
+  std::string target = lib("C");
+  if (::symlink(target.c_str(), linkName.c_str()) != 0)
+    GTEST_SKIP() << "Failed to create symlink: " << strerror(errno);
+
+  auto resolved = PResolver->resolve(linkName, EC);
+  ASSERT_TRUE(resolved.has_value());
+  EXPECT_FALSE(EC);
+  EXPECT_EQ(*resolved, target);
+
+  (void)::unlink(linkName.c_str()); // cleanup
+}
+
+TEST_F(LibraryResolverIT, PathResolverCachesResults) {
+  auto LibPathCache = std::make_shared<LibraryPathCache>();
+  auto PResolver = std::make_shared<PathResolver>(LibPathCache);
+
+  SmallString<128> TmpDylib;
+  std::error_code EC;
+  EC = sys::fs::createUniqueFile(withext("A-copy"), TmpDylib);
+  if (EC)
+    GTEST_SKIP() << "Failed to create temp dylib" << EC.message();
+
+  EC = sys::fs::copy_file(lib("A"), TmpDylib);
+  if (EC)
+    GTEST_SKIP() << "Failed to copy libA: " << EC.message();
+  EC.clear();
+
+  // First resolve -> should populate LibPathCache
+  auto first = PResolver->resolve(TmpDylib, EC);
+  ASSERT_TRUE(first.has_value());
+
+  // Forcefully remove the file from disk
+  (void)::unlink(TmpDylib.c_str());
+
+  // Second resolve -> should still succeed from LibPathCache
+  auto second = PResolver->resolve(TmpDylib, EC);
+  EXPECT_TRUE(second.has_value());
+  EXPECT_EQ(*second, *first);
+}
+#endif
+
+TEST_F(LibraryResolverIT, LoaderPathSubstitutionAndResolve) {
+  auto LibPathCache = std::make_shared<LibraryPathCache>();
+  auto PResolver = std::make_shared<PathResolver>(LibPathCache);
+
+  DylibSubstitutor substitutor;
+  substitutor.configure(libdir("C"));
+#if defined(__APPLE__)
+  // Substitute @loader_path with BaseDir
+  std::string substituted =
+      substitutor.substitute(withext("@loader_path/libC"));
+#elif defined(__linux__)
+  // Substitute $origin with BaseDir
+  std::string substituted = substitutor.substitute(withext("$ORIGIN/libC"));
+#endif
+  ASSERT_FALSE(substituted.empty());
+  EXPECT_EQ(substituted, lib("C"));
+
+  // Now try resolving the substituted path
+  std::error_code EC;
+  auto resolved = PResolver->resolve(substituted, EC);
+  ASSERT_TRUE(resolved.has_value()) << "Expected to resolve substituted dylib";
+  EXPECT_EQ(*resolved, lib("C"));
+  EXPECT_FALSE(EC) << "Expected no error resolving substituted dylib";
+}
+
+TEST_F(LibraryResolverIT, ResolveFromUsrOrSystemPaths) {
+  auto LibPathCache = std::make_shared<LibraryPathCache>();
+  auto PResolver = std::make_shared<PathResolver>(LibPathCache);
+
+  DylibPathValidator validator(*PResolver);
+
+  std::vector<std::string> Paths = {"/foo/bar/", "temp/foo",  libdir("C"),
+                                    libdir("A"), libdir("B"), libdir("Z")};
+
+  SmallVector<StringRef> P(Paths.begin(), Paths.end());
+
+  DylibResolver Resolver(validator);
+  Resolver.configure("", {{P, SearchPathType::UsrOrSys}});
+
+  // Check "C"
+  auto ValOptC = Resolver.resolve("libC", true);
+  EXPECT_TRUE(ValOptC.has_value());
+  EXPECT_EQ(*ValOptC, lib("C"));
+
+  auto ValOptCdylib = Resolver.resolve(withext("libC"));
+  EXPECT_TRUE(ValOptCdylib.has_value());
+  EXPECT_EQ(*ValOptCdylib, lib("C"));
+
+  // Check "A"
+  auto ValOptA = Resolver.resolve("libA", true);
+  EXPECT_TRUE(ValOptA.has_value());
+  EXPECT_EQ(*ValOptA, lib("A"));
+
+  auto ValOptAdylib = Resolver.resolve(withext("libA"));
+  EXPECT_TRUE(ValOptAdylib.has_value());
+  EXPECT_EQ(*ValOptAdylib, lib("A"));
+
+  // Check "B"
+  auto ValOptB = Resolver.resolve("libB", true);
+  EXPECT_TRUE(ValOptB.has_value());
+  EXPECT_EQ(*ValOptB, lib("B"));
+
+  auto ValOptBdylib = Resolver.resolve(withext("libB"));
+  EXPECT_TRUE(ValOptBdylib.has_value());
+  EXPECT_EQ(*ValOptBdylib, lib("B"));
+
+  // Check "Z"
+  auto ValOptZ = Resolver.resolve("libZ", true);
+  EXPECT_TRUE(ValOptZ.has_value());
+  EXPECT_EQ(*ValOptZ, lib("Z"));
+
+  auto ValOptZdylib = Resolver.resolve(withext("libZ"));
+  EXPECT_TRUE(ValOptZdylib.has_value());
+  EXPECT_EQ(*ValOptZdylib, lib("Z"));
+}
+
+#if defined(__APPLE__)
+TEST_F(LibraryResolverIT, ResolveViaLoaderPathAndRPathSubstitution) {
+  auto LibPathCache = std::make_shared<LibraryPathCache>();
+  auto PResolver = std::make_shared<PathResolver>(LibPathCache);
+
+  DylibPathValidator validator(*PResolver);
+
+  std::vector<std::string> Paths = {"@loader_path/../A", "@loader_path/../B",
+                                    "@loader_path/../C", "@loader_path/../Z"};
+
+  SmallVector<StringRef> P(Paths.begin(), Paths.end());
+
+  DylibResolver Resolver(validator);
+
+  // Use only RPath config
+  Resolver.configure(lib("C"), {{P, SearchPathType::RPath}});
+
+  // --- Check A ---
+  auto ValOptA = Resolver.resolve("@rpath/libA", true);
+  EXPECT_TRUE(ValOptA.has_value());
+  EXPECT_EQ(*ValOptA, lib("A"));
+
+  auto ValOptAdylib = Resolver.resolve(withext("@rpath/libA"));
+  EXPECT_TRUE(ValOptAdylib.has_value());
+  EXPECT_EQ(*ValOptAdylib, lib("A"));
+
+  // --- Check B ---
+  auto ValOptB = Resolver.resolve("@rpath/libB", true);
+  EXPECT_TRUE(ValOptB.has_value());
+  EXPECT_EQ(*ValOptB, lib("B"));
+
+  auto ValOptBdylib = Resolver.resolve(withext("@rpath/libB"));
+  EXPECT_TRUE(ValOptBdylib.has_value());
+  EXPECT_EQ(*ValOptBdylib, lib("B"));
+
+  // --- Check Z ---
+  auto ValOptZ = Resolver.resolve("@rpath/libZ", true);
+  EXPECT_TRUE(ValOptZ.has_value());
+  EXPECT_EQ(*ValOptZ, lib("Z"));
+
+  auto ValOptZdylib = Resolver.resolve(withext("@rpath/libZ"));
+  EXPECT_TRUE(ValOptZdylib.has_value());
+  EXPECT_EQ(*ValOptZdylib, lib("Z"));
+}
+#endif
+
+#if defined(__linux__)
+TEST_F(LibraryResolverIT, ResolveViaOriginAndRPathSubstitution) {
+  auto LibPathCache = std::make_shared<LibraryPathCache>();
+  auto PResolver = std::make_shared<PathResolver>(LibPathCache);
+
+  DylibPathValidator validator(*PResolver);
+
+  // On Linux, $ORIGIN works like @loader_path
+  std::vector<std::string> Paths = {"$ORIGIN/../A", "$ORIGIN/../B",
+                                    "$ORIGIN/../C", "$ORIGIN/../Z"};
+
+  SmallVector<StringRef> P(Paths.begin(), Paths.end());
+
+  DylibResolver Resolver(validator);
+
+  // Use only RPath config
+  Resolver.configure(lib("C"), {{P, SearchPathType::RunPath}});
+
+  // --- Check A ---
+  auto ValOptA = Resolver.resolve("libA", true);
+  EXPECT_TRUE(ValOptA.has_value());
+  EXPECT_EQ(*ValOptA, lib("A"));
+
+  auto valOptASO = Resolver.resolve(withext("libA"));
+  EXPECT_TRUE(valOptASO.has_value());
+  EXPECT_EQ(*valOptASO, lib("A"));
+
+  // --- Check B ---
+  auto ValOptB = Resolver.resolve("libB", true);
+  EXPECT_TRUE(ValOptB.has_value());
+  EXPECT_EQ(*ValOptB, lib("B"));
+
+  auto valOptBSO = Resolver.resolve(withext("libB"));
+  EXPECT_TRUE(valOptBSO.has_value());
+  EXPECT_EQ(*valOptBSO, lib("B"));
+
+  // --- Check Z ---
+  auto ValOptZ = Resolver.resolve("libZ", true);
+  EXPECT_TRUE(ValOptZ.has_value());
+  EXPECT_EQ(*ValOptZ, lib("Z"));
+
+  auto valOptZSO = Resolver.resolve(withext("libZ"));
+  EXPECT_TRUE(valOptZSO.has_value());
+  EXPECT_EQ(*valOptZSO, lib("Z"));
+}
+#endif
+} // namespace
+#endif // defined(__APPLE__)
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index e56872320b4ac..0b3ae643e1494 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -4534,6 +4534,85 @@ TEST_F(OpenMPIRBuilderTest, OMPAtomicCompareCapture) {
   EXPECT_FALSE(verifyModule(*M, &errs()));
 }
 
+TEST_F(OpenMPIRBuilderTest, OMPAtomicRWStructType) {
+  // Test for issue #165184: atomic read/write on struct types should use
+  // element type size, not pointer size.
+  OpenMPIRBuilder OMPBuilder(*M);
+  OMPBuilder.initialize();
+  F->setName("func");
+  IRBuilder<> Builder(BB);
+
+  OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL});
+  BasicBlock *EntryBB = BB;
+  OpenMPIRBuilder::InsertPointTy AllocaIP(EntryBB,
+                                          EntryBB->getFirstInsertionPt());
+
+  LLVMContext &Ctx = M->getContext();
+
+  // Create a struct type {double, double} to simulate complex(8) - 16 bytes
+  StructType *Complex8Ty = StructType::create(
+      Ctx, {Type::getDoubleTy(Ctx), Type::getDoubleTy(Ctx)}, "complex");
+
+  AllocaInst *XVal = Builder.CreateAlloca(Complex8Ty);
+  XVal->setName("AtomicVar");
+  OpenMPIRBuilder::AtomicOpValue X = {XVal, Complex8Ty, false, false};
+  AtomicOrdering AO = AtomicOrdering::SequentiallyConsistent;
+
+  // Create value to write: {1.0, 1.0}
+  Constant *Real = ConstantFP::get(Type::getDoubleTy(Ctx), 1.0);
+  Constant *Imag = ConstantFP::get(Type::getDoubleTy(Ctx), 1.0);
+  Constant *ValToWrite = ConstantStruct::get(Complex8Ty, {Real, Imag});
+
+  // Test atomic write
+  Builder.restoreIP(
+      OMPBuilder.createAtomicWrite(Loc, X, ValToWrite, AO, AllocaIP));
+
+  // Test atomic read
+  AllocaInst *VVal = Builder.CreateAlloca(Complex8Ty);
+  VVal->setName("ReadDest");
+  OpenMPIRBuilder::AtomicOpValue V = {VVal, Complex8Ty, false, false};
+
+  Builder.restoreIP(OMPBuilder.createAtomicRead(Loc, X, V, AO, AllocaIP));
+
+  Builder.CreateRetVoid();
+  OMPBuilder.finalize();
+  EXPECT_FALSE(verifyModule(*M, &errs()));
+
+  // Verify that __atomic_store and __atomic_load are called with size 16
+  bool FoundAtomicStore = false;
+  bool FoundAtomicLoad = false;
+
+  for (Function &Fn : *M) {
+    if (Fn.getName().starts_with("__atomic_store")) {
+      // Check that first call to __atomic_store has size argument = 16
+      for (User *U : Fn.users()) {
+        if (auto *CB = dyn_cast<CallBase>(U)) {
+          if (auto *SizeArg = dyn_cast<ConstantInt>(CB->getArgOperand(0))) {
+            EXPECT_EQ(SizeArg->getZExtValue(), 16U);
+            FoundAtomicStore = true;
+            break;
+          }
+        }
+      }
+    }
+    if (Fn.getName().starts_with("__atomic_load")) {
+      // Check that first call to __atomic_load has size argument = 16
+      for (User *U : Fn.users()) {
+        if (auto *CB = dyn_cast<CallBase>(U)) {
+          if (auto *SizeArg = dyn_cast<ConstantInt>(CB->getArgOperand(0))) {
+            EXPECT_EQ(SizeArg->getZExtValue(), 16U);
+            FoundAtomicLoad = true;
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  EXPECT_TRUE(FoundAtomicStore) << "Did not find __atomic_store call";
+  EXPECT_TRUE(FoundAtomicLoad) << "Did not find __atomic_load call";
+}
+
 TEST_F(OpenMPIRBuilderTest, CreateTeams) {
   using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
   OpenMPIRBuilder OMPBuilder(*M);
@@ -7576,8 +7655,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskgroup) {
   // Checking the general structure of the IR generated is same as expected.
   Instruction *GeneratedStoreInst = TaskgroupCall->getNextNode();
   EXPECT_EQ(GeneratedStoreInst, InternalStoreInst);
-  Instruction *GeneratedLoad32 =
-      GeneratedStoreInst->getNextNode();
+  Instruction *GeneratedLoad32 = GeneratedStoreInst->getNextNode();
   EXPECT_EQ(GeneratedLoad32, InternalLoad32);
   Instruction *GeneratedLoad128 = GeneratedLoad32->getNextNode();
   EXPECT_EQ(GeneratedLoad128, InternalLoad128);
diff --git a/llvm/unittests/IR/AbstractCallSiteTest.cpp b/llvm/unittests/IR/AbstractCallSiteTest.cpp
index ddb10911ad028..623d1b36e1c03 100644
--- a/llvm/unittests/IR/AbstractCallSiteTest.cpp
+++ b/llvm/unittests/IR/AbstractCallSiteTest.cpp
@@ -6,8 +6,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/AsmParser/Parser.h"
 #include "llvm/IR/AbstractCallSite.h"
+#include "llvm/AsmParser/Parser.h"
+#include "llvm/IR/Argument.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/SourceMgr.h"
@@ -51,5 +52,96 @@ TEST(AbstractCallSite, CallbackCall) {
   EXPECT_TRUE(ACS);
   EXPECT_TRUE(ACS.isCallbackCall());
   EXPECT_TRUE(ACS.isCallee(CallbackUse));
+  EXPECT_EQ(ACS.getCalleeUseForCallback(), *CallbackUse);
   EXPECT_EQ(ACS.getCalledFunction(), Callback);
+
+  // The callback metadata {CallbackNo, Arg0No, ..., isVarArg} = {1, -1, true}
+  EXPECT_EQ(ACS.getCallArgOperandNoForCallee(), 1);
+  // Though the callback metadata only specifies ONE unfixed argument No, the
+  // callback callee is vararg, hence the third arg is also considered as
+  // another arg for the callback.
+  EXPECT_EQ(ACS.getNumArgOperands(), 2u);
+  Argument *Param0 = Callback->getArg(0), *Param1 = Callback->getArg(1);
+  ASSERT_TRUE(Param0 && Param1);
+  EXPECT_EQ(ACS.getCallArgOperandNo(*Param0), -1);
+  EXPECT_EQ(ACS.getCallArgOperandNo(*Param1), 2);
+}
+
+TEST(AbstractCallSite, DirectCall) {
+  LLVMContext C;
+
+  const char *IR = "declare void @bar(i32 %x, i32 %y)\n"
+                   "define void @foo() {\n"
+                   "  call void @bar(i32 1, i32 2)\n"
+                   "  ret void\n"
+                   "}\n";
+
+  std::unique_ptr<Module> M = parseIR(C, IR);
+  ASSERT_TRUE(M);
+
+  Function *Callee = M->getFunction("bar");
+  ASSERT_NE(Callee, nullptr);
+
+  const Use *DirectCallUse = Callee->getSingleUndroppableUse();
+  ASSERT_NE(DirectCallUse, nullptr);
+
+  AbstractCallSite ACS(DirectCallUse);
+  EXPECT_TRUE(ACS);
+  EXPECT_TRUE(ACS.isDirectCall());
+  EXPECT_TRUE(ACS.isCallee(DirectCallUse));
+  EXPECT_EQ(ACS.getCalledFunction(), Callee);
+  EXPECT_EQ(ACS.getNumArgOperands(), 2u);
+  Argument *ArgX = Callee->getArg(0);
+  ASSERT_NE(ArgX, nullptr);
+  Value *CAO1 = ACS.getCallArgOperand(*ArgX);
+  Value *CAO2 = ACS.getCallArgOperand(0);
+  ASSERT_NE(CAO2, nullptr);
+  // The two call arg operands should be the same object, since they are both
+  // the first argument of the call.
+  EXPECT_EQ(CAO2, CAO1);
+
+  ConstantInt *FirstArgInt = dyn_cast<ConstantInt>(CAO2);
+  ASSERT_NE(FirstArgInt, nullptr);
+  EXPECT_EQ(FirstArgInt->getZExtValue(), 1ull);
+
+  EXPECT_EQ(ACS.getCallArgOperandNo(*ArgX), 0);
+  EXPECT_EQ(ACS.getCallArgOperandNo(0), 0);
+  EXPECT_EQ(ACS.getCallArgOperandNo(1), 1);
+}
+
+TEST(AbstractCallSite, IndirectCall) {
+  LLVMContext C;
+
+  const char *IR = "define void @foo(ptr %0) {\n"
+                   "  call void %0(i32 1, i32 2)\n"
+                   "  ret void\n"
+                   "}\n";
+
+  std::unique_ptr<Module> M = parseIR(C, IR);
+  ASSERT_TRUE(M);
+
+  Function *Fun = M->getFunction("foo");
+  ASSERT_NE(Fun, nullptr);
+
+  Argument *ArgAsCallee = Fun->getArg(0);
+  ASSERT_NE(ArgAsCallee, nullptr);
+
+  const Use *IndCallUse = ArgAsCallee->getSingleUndroppableUse();
+  ASSERT_NE(IndCallUse, nullptr);
+
+  AbstractCallSite ACS(IndCallUse);
+  EXPECT_TRUE(ACS);
+  EXPECT_TRUE(ACS.isIndirectCall());
+  EXPECT_TRUE(ACS.isCallee(IndCallUse));
+  EXPECT_EQ(ACS.getCalledFunction(), nullptr);
+  EXPECT_EQ(ACS.getCalledOperand(), ArgAsCallee);
+  EXPECT_EQ(ACS.getNumArgOperands(), 2u);
+  Value *CalledOperand = ACS.getCallArgOperand(0);
+  ASSERT_NE(CalledOperand, nullptr);
+  ConstantInt *FirstArgInt = dyn_cast<ConstantInt>(CalledOperand);
+  ASSERT_NE(FirstArgInt, nullptr);
+  EXPECT_EQ(FirstArgInt->getZExtValue(), 1ull);
+
+  EXPECT_EQ(ACS.getCallArgOperandNo(0), 0);
+  EXPECT_EQ(ACS.getCallArgOperandNo(1), 1);
 }
diff --git a/llvm/unittests/MC/SystemZ/SystemZMCDisassemblerTest.cpp b/llvm/unittests/MC/SystemZ/SystemZMCDisassemblerTest.cpp
index 87fad37635320..25c22d1b4ba73 100644
--- a/llvm/unittests/MC/SystemZ/SystemZMCDisassemblerTest.cpp
+++ b/llvm/unittests/MC/SystemZ/SystemZMCDisassemblerTest.cpp
@@ -61,7 +61,7 @@ Context &getContext() {
 class SystemZMCSymbolizerTest : public MCSymbolizer {
 public:
   SystemZMCSymbolizerTest(MCContext &MC) : MCSymbolizer(MC, nullptr) {}
-  ~SystemZMCSymbolizerTest() override {}
+  ~SystemZMCSymbolizerTest() override = default;
 
   bool tryAddingSymbolicOperand([[maybe_unused]] MCInst &Inst,
                                 [[maybe_unused]] raw_ostream &CStream,
diff --git a/llvm/unittests/MC/X86/X86MCDisassemblerTest.cpp b/llvm/unittests/MC/X86/X86MCDisassemblerTest.cpp
index 286528fdd21c3..6d44151b378b2 100644
--- a/llvm/unittests/MC/X86/X86MCDisassemblerTest.cpp
+++ b/llvm/unittests/MC/X86/X86MCDisassemblerTest.cpp
@@ -62,7 +62,7 @@ Context &getContext() {
 class X86MCSymbolizerTest : public MCSymbolizer {
 public:
   X86MCSymbolizerTest(MCContext &MC) : MCSymbolizer(MC, nullptr) {}
-  ~X86MCSymbolizerTest() override {}
+  ~X86MCSymbolizerTest() override = default;
 
   struct OpInfo {
     int64_t Value = 0;
diff --git a/llvm/unittests/MIR/MachineMetadata.cpp b/llvm/unittests/MIR/MachineMetadata.cpp
index 0f038d9dc2234..587551246c4f4 100644
--- a/llvm/unittests/MIR/MachineMetadata.cpp
+++ b/llvm/unittests/MIR/MachineMetadata.cpp
@@ -33,7 +33,7 @@ using namespace llvm;
 
 class MachineMetadataTest : public testing::Test {
 public:
-  MachineMetadataTest() {}
+  MachineMetadataTest() = default;
 
 protected:
   LLVMContext Context;
diff --git a/llvm/unittests/MIR/MachineStableHashTest.cpp b/llvm/unittests/MIR/MachineStableHashTest.cpp
index ea0de1a73da62..bedecb17df51f 100644
--- a/llvm/unittests/MIR/MachineStableHashTest.cpp
+++ b/llvm/unittests/MIR/MachineStableHashTest.cpp
@@ -22,7 +22,7 @@ using namespace llvm;
 
 class MachineStableHashTest : public testing::Test {
 public:
-  MachineStableHashTest() {}
+  MachineStableHashTest() = default;
 
 protected:
   LLVMContext Context;
diff --git a/llvm/unittests/Object/ELFObjectFileTest.cpp b/llvm/unittests/Object/ELFObjectFileTest.cpp
index d6a3ca53b2154..1e2955ae40a66 100644
--- a/llvm/unittests/Object/ELFObjectFileTest.cpp
+++ b/llvm/unittests/Object/ELFObjectFileTest.cpp
@@ -531,7 +531,7 @@ TEST(ELFObjectFileTest, InvalidDecodeBBAddrMap) {
   // Check that we can detect unsupported versions.
   SmallString<128> UnsupportedVersionYamlString(CommonYamlString);
   UnsupportedVersionYamlString += R"(
-      - Version: 5
+      - Version: 6
         BBRanges:
           - BaseAddress: 0x11111
             BBEntries:
@@ -543,7 +543,7 @@ TEST(ELFObjectFileTest, InvalidDecodeBBAddrMap) {
   {
     SCOPED_TRACE("unsupported version");
     DoCheck(UnsupportedVersionYamlString,
-            "unsupported SHT_LLVM_BB_ADDR_MAP version: 5");
+            "unsupported SHT_LLVM_BB_ADDR_MAP version: 6");
   }
 
   SmallString<128> ZeroBBRangesYamlString(CommonYamlString);
@@ -1181,8 +1181,8 @@ TEST(ELFObjectFileTest, ReadPGOAnalysisMap) {
     Type: SHT_LLVM_BB_ADDR_MAP
   # Link: 0 (by default, can be overriden)
     Entries:
-      - Version: 2
-        Feature: 0x7
+      - Version: 5
+        Feature: 0x87
         BBRanges:
           - BaseAddress: 0x44444
             BBEntries:
@@ -1205,7 +1205,8 @@ TEST(ELFObjectFileTest, ReadPGOAnalysisMap) {
     PGOAnalyses:
       - FuncEntryCount: 1000
         PGOBBEntries:
-          - BBFreq:         1000
+          - BBFreq:          1000
+            PostLinkBBFreq:  50
             Successors:
             - ID:          1
               BrProb:      0x22222222
@@ -1243,8 +1244,8 @@ TEST(ELFObjectFileTest, ReadPGOAnalysisMap) {
     Type: SHT_LLVM_BB_ADDR_MAP
   # Link: 0 (by default, can be overriden)
     Entries:
-      - Version: 2
-        Feature: 0xc
+      - Version: 5
+        Feature: 0x8c
         BBRanges:
           - BaseAddress: 0x66666
             BBEntries:
@@ -1265,8 +1266,9 @@ TEST(ELFObjectFileTest, ReadPGOAnalysisMap) {
     PGOAnalyses:
       - PGOBBEntries:
          - Successors:
-            - ID:          1
-              BrProb:      0x22222222
+            - ID:              1
+              BrProb:          0x22222222
+              PostLinkBrFreq:  7
             - ID:          2
               BrProb:      0xcccccccc
          - Successors:
@@ -1278,59 +1280,66 @@ TEST(ELFObjectFileTest, ReadPGOAnalysisMap) {
   BBAddrMap E1 = {
       {{0x11111, {{1, 0x0, 0x1, {false, true, false, false, false}, {}, 0}}}}};
   PGOAnalysisMap P1 = {
-      892, {}, {true, false, false, false, false, false, false}};
+      892, {}, {true, false, false, false, false, false, false, false}};
   BBAddrMap E2 = {
       {{0x22222, {{2, 0x0, 0x2, {false, false, true, false, false}, {}, 0}}}}};
   PGOAnalysisMap P2 = {{},
-                       {{BlockFrequency(343), {}}},
-                       {false, true, false, false, false, false, false}};
+                       {{BlockFrequency(343), 0, {}}},
+                       {false, true, false, false, false, false, false, false}};
   BBAddrMap E3 = {
       {{0x33333,
         {{0, 0x0, 0x3, {false, true, true, false, false}, {}, 0},
          {1, 0x3, 0x3, {false, false, true, false, false}, {}, 0},
          {2, 0x6, 0x3, {false, false, false, false, false}, {}, 0}}}}};
-  PGOAnalysisMap P3 = {{},
-                       {{{},
-                         {{1, BranchProbability::getRaw(0x1111'1111)},
-                          {2, BranchProbability::getRaw(0xeeee'eeee)}}},
-                        {{}, {{2, BranchProbability::getRaw(0xffff'ffff)}}},
-                        {{}, {}}},
-                       {false, false, true, false, false, false, false}};
+  PGOAnalysisMap P3 = {
+      {},
+      {{{},
+        0,
+        {{1, BranchProbability::getRaw(0x1111'1111), 0},
+         {2, BranchProbability::getRaw(0xeeee'eeee), 0}}},
+       {{}, 0, {{2, BranchProbability::getRaw(0xffff'ffff), 0}}},
+       {{}, 0, {}}},
+      {false, false, true, false, false, false, false, false}};
   BBAddrMap E4 = {
       {{0x44444,
         {{0, 0x0, 0x4, {false, false, false, true, true}, {}, 0},
          {1, 0x4, 0x4, {false, false, false, false, false}, {}, 0},
          {2, 0x8, 0x4, {false, false, false, false, false}, {}, 0},
          {3, 0xc, 0x4, {false, false, false, false, false}, {}, 0}}}}};
-  PGOAnalysisMap P4 = {
-      1000,
-      {{BlockFrequency(1000),
-        {{1, BranchProbability::getRaw(0x2222'2222)},
-         {2, BranchProbability::getRaw(0x3333'3333)},
-         {3, BranchProbability::getRaw(0xaaaa'aaaa)}}},
-       {BlockFrequency(133),
-        {{2, BranchProbability::getRaw(0x1111'1111)},
-         {3, BranchProbability::getRaw(0xeeee'eeee)}}},
-       {BlockFrequency(18), {{3, BranchProbability::getRaw(0xffff'ffff)}}},
-       {BlockFrequency(1000), {}}},
-      {true, true, true, false, false, false, false}};
+  PGOAnalysisMap P4 = {1000,
+                       {{BlockFrequency(1000),
+                         50,
+                         {{1, BranchProbability::getRaw(0x2222'2222), 0},
+                          {2, BranchProbability::getRaw(0x3333'3333), 0},
+                          {3, BranchProbability::getRaw(0xaaaa'aaaa), 0}}},
+                        {BlockFrequency(133),
+                         0,
+                         {{2, BranchProbability::getRaw(0x1111'1111), 0},
+                          {3, BranchProbability::getRaw(0xeeee'eeee), 0}}},
+                        {BlockFrequency(18),
+                         0,
+                         {{3, BranchProbability::getRaw(0xffff'ffff), 0}}},
+                        {BlockFrequency(1000), 0, {}}},
+                       {true, true, true, false, false, false, false, true}};
   BBAddrMap E5 = {
       {{0x55555, {{2, 0x0, 0x2, {false, false, true, false, false}, {}, 0}}}}};
   PGOAnalysisMap P5 = {
-      {}, {}, {false, false, false, false, false, false, false}};
+      {}, {}, {false, false, false, false, false, false, false, false}};
   BBAddrMap E6 = {
       {{0x66666,
         {{0, 0x0, 0x6, {false, true, true, false, false}, {}, 0},
          {1, 0x6, 0x6, {false, false, true, false, false}, {}, 0}}},
        {0x666661,
         {{2, 0x0, 0x6, {false, false, false, false, false}, {}, 0}}}}};
-  PGOAnalysisMap P6 = {{},
-                       {{{},
-                         {{1, BranchProbability::getRaw(0x2222'2222)},
-                          {2, BranchProbability::getRaw(0xcccc'cccc)}}},
-                        {{}, {{2, BranchProbability::getRaw(0x8888'8888)}}},
-                        {{}, {}}},
-                       {false, false, true, true, false, false, false}};
+  PGOAnalysisMap P6 = {
+      {},
+      {{{},
+        0,
+        {{1, BranchProbability::getRaw(0x2222'2222), 7},
+         {2, BranchProbability::getRaw(0xcccc'cccc), 0}}},
+       {{}, 0, {{2, BranchProbability::getRaw(0x8888'8888), 0}}},
+       {{}, 0, {}}},
+      {false, false, true, true, false, false, false, true}};
 
   std::vector<BBAddrMap> Section0BBAddrMaps = {E4, E5, E6};
   std::vector<BBAddrMap> Section1BBAddrMaps = {E3};
@@ -1465,7 +1474,7 @@ TEST(ELFObjectFileTest, ReadPGOAnalysisMap) {
     DoCheckFails(
         TruncatedYamlString, /*TextSectionIndex=*/std::nullopt,
         "unable to read SHT_LLVM_BB_ADDR_MAP section with index 6: "
-        "unexpected end of data at offset 0xa while reading [0x3, 0xb)");
+        "unexpected end of data at offset 0xa while reading [0x4, 0xc)");
     // Check that we can read the other section's bb-address-maps which are
     // valid.
     DoCheckSucceeds(TruncatedYamlString, /*TextSectionIndex=*/2,
diff --git a/llvm/unittests/Object/ELFTypesTest.cpp b/llvm/unittests/Object/ELFTypesTest.cpp
index 1765e15003963..9e99b4a6d7bf3 100644
--- a/llvm/unittests/Object/ELFTypesTest.cpp
+++ b/llvm/unittests/Object/ELFTypesTest.cpp
@@ -101,22 +101,24 @@ static_assert(
     "PGOAnalysisMap should use the same type for basic block ID as BBAddrMap");
 
 TEST(ELFTypesTest, BBAddrMapFeaturesEncodingTest) {
-  const std::array<BBAddrMap::Features, 12> Decoded = {
-      {{false, false, false, false, false, false, false},
-       {true, false, false, false, false, false, false},
-       {false, true, false, false, false, false, false},
-       {false, false, true, false, false, false, false},
-       {false, false, false, true, false, false, false},
-       {true, true, false, false, false, false, false},
-       {false, true, true, false, false, false, false},
-       {false, true, true, true, false, false, false},
-       {true, true, true, true, false, false, false},
-       {false, false, false, false, true, false, false},
-       {false, false, false, false, false, true, false},
-       {false, false, false, false, false, false, true}}};
-  const std::array<uint8_t, 12> Encoded = {
+  const std::array<BBAddrMap::Features, 14> Decoded = {
+      {{false, false, false, false, false, false, false, false},
+       {true, false, false, false, false, false, false, false},
+       {false, true, false, false, false, false, false, false},
+       {false, false, true, false, false, false, false, false},
+       {false, false, false, true, false, false, false, false},
+       {true, true, false, false, false, false, false, false},
+       {false, true, true, false, false, false, false, false},
+       {false, true, true, true, false, false, false, false},
+       {true, true, true, true, false, false, false, false},
+       {false, false, false, false, true, false, false, false},
+       {false, false, false, false, false, true, false, false},
+       {false, false, false, false, false, false, true, false},
+       {false, false, false, false, false, false, false, true},
+       {false, false, false, false, false, false, true, true}}};
+  const std::array<uint16_t, 14> Encoded = {
       {0b0000, 0b0001, 0b0010, 0b0100, 0b1000, 0b0011, 0b0110, 0b1110, 0b1111,
-       0b1'0000, 0b10'0000, 0b100'0000}};
+       0b1'0000, 0b10'0000, 0b100'0000, 0b1000'0000, 0b1100'0000}};
   for (const auto &[Feat, EncodedVal] : llvm::zip(Decoded, Encoded))
     EXPECT_EQ(Feat.encode(), EncodedVal);
   for (const auto &[Feat, EncodedVal] : llvm::zip(Decoded, Encoded)) {
@@ -129,9 +131,9 @@ TEST(ELFTypesTest, BBAddrMapFeaturesEncodingTest) {
 
 TEST(ELFTypesTest, BBAddrMapFeaturesInvalidEncodingTest) {
   const std::array<std::string, 2> Errors = {
-      "invalid encoding for BBAddrMap::Features: 0x80",
-      "invalid encoding for BBAddrMap::Features: 0xf0"};
-  const std::array<uint8_t, 2> Values = {{0b1000'0000, 0b1111'0000}};
+      "invalid encoding for BBAddrMap::Features: 0x100",
+      "invalid encoding for BBAddrMap::Features: 0x1000"};
+  const std::array<uint16_t, 2> Values = {{0b1'0000'0000, 0b1'0000'0000'0000}};
   for (const auto &[Val, Error] : llvm::zip(Values, Errors)) {
     EXPECT_THAT_ERROR(BBAddrMap::Features::decode(Val).takeError(),
                       FailedWithMessage(Error));
diff --git a/llvm/unittests/Object/XCOFFObjectFileTest.cpp b/llvm/unittests/Object/XCOFFObjectFileTest.cpp
index f696cde99e0ec..10217f6c2c396 100644
--- a/llvm/unittests/Object/XCOFFObjectFileTest.cpp
+++ b/llvm/unittests/Object/XCOFFObjectFileTest.cpp
@@ -18,10 +18,10 @@ using namespace llvm::XCOFF;
 TEST(XCOFFObjectFileTest, XCOFFObjectType) {
   // Create an arbitrary object of a non-XCOFF type and test that
   // dyn_cast<XCOFFObjectFile> returns null for it.
-  char Buf[sizeof(typename ELF64LE::Ehdr)] = {};
+  char Buf[sizeof(ELF64LE::Ehdr)] = {};
   memcpy(Buf, "\177ELF", 4);
 
-  auto *EHdr = reinterpret_cast<typename ELF64LE::Ehdr *>(Buf);
+  auto *EHdr = reinterpret_cast<ELF64LE::Ehdr *>(Buf);
   EHdr->e_ident[llvm::ELF::EI_CLASS] = llvm::ELF::ELFCLASS64;
   EHdr->e_ident[llvm::ELF::EI_DATA] = llvm::ELF::ELFDATA2LSB;
 
diff --git a/llvm/unittests/ProfileData/InstrProfTest.cpp b/llvm/unittests/ProfileData/InstrProfTest.cpp
index dd17844aef8a6..8641b939dd35d 100644
--- a/llvm/unittests/ProfileData/InstrProfTest.cpp
+++ b/llvm/unittests/ProfileData/InstrProfTest.cpp
@@ -914,7 +914,7 @@ TEST_P(MaybeSparseInstrProfTest, annotate_vp_data) {
   ASSERT_THAT(ValueData, SizeIs(0));
 
   // Remove the MD_prof metadata
-  Inst->setMetadata(LLVMContext::MD_prof, 0);
+  Inst->setMetadata(LLVMContext::MD_prof, nullptr);
   // Annotate 5 records this time.
   annotateValueSite(*M, *Inst, R.get(), IPVK_IndirectCallTarget, 0, 5);
   ValueData = getValueProfDataFromInst(*Inst, IPVK_IndirectCallTarget, 5, T);
@@ -932,7 +932,7 @@ TEST_P(MaybeSparseInstrProfTest, annotate_vp_data) {
   ASSERT_EQ(2U, ValueData[4].Count);
 
   // Remove the MD_prof metadata
-  Inst->setMetadata(LLVMContext::MD_prof, 0);
+  Inst->setMetadata(LLVMContext::MD_prof, nullptr);
   // Annotate with 4 records.
   InstrProfValueData VD0Sorted[] = {{1000, 6}, {2000, 5}, {3000, 4}, {4000, 3},
                               {5000, 2}, {6000, 1}};
diff --git a/llvm/unittests/Support/AlignOfTest.cpp b/llvm/unittests/Support/AlignOfTest.cpp
index 979f2cf18cc15..53358a2815daa 100644
--- a/llvm/unittests/Support/AlignOfTest.cpp
+++ b/llvm/unittests/Support/AlignOfTest.cpp
@@ -79,14 +79,14 @@ struct V8 : V5, virtual V6, V7 { double zz;
 
 double S6::f() { return 0.0; }
 float D2::g() { return 0.0f; }
-V1::~V1() {}
-V2::~V2() {}
-V3::~V3() {}
-V4::~V4() {}
-V5::~V5() {}
-V6::~V6() {}
-V7::~V7() {}
-V8::~V8() {}
+V1::~V1() = default;
+V2::~V2() = default;
+V3::~V3() = default;
+V4::~V4() = default;
+V5::~V5() = default;
+V6::~V6() = default;
+V7::~V7() = default;
+V8::~V8() = default;
 
 template <typename M> struct T { M m; };
 
diff --git a/llvm/unittests/Support/AllocatorTest.cpp b/llvm/unittests/Support/AllocatorTest.cpp
index 1069e436d0a16..2337f34143bad 100644
--- a/llvm/unittests/Support/AllocatorTest.cpp
+++ b/llvm/unittests/Support/AllocatorTest.cpp
@@ -235,7 +235,7 @@ class MockSlabAllocator {
   static size_t LastSlabSize;
 
 public:
-  ~MockSlabAllocator() { }
+  ~MockSlabAllocator() = default;
 
   void *Allocate(size_t Size, size_t /*Alignment*/) {
     // Allocate space for the alignment, the slab, and a void* that goes right
diff --git a/llvm/unittests/Support/BinaryStreamTest.cpp b/llvm/unittests/Support/BinaryStreamTest.cpp
index 70cd4036fb2a6..06ed12b28f597 100644
--- a/llvm/unittests/Support/BinaryStreamTest.cpp
+++ b/llvm/unittests/Support/BinaryStreamTest.cpp
@@ -110,7 +110,7 @@ constexpr uint32_t NumStreams = 2 * NumEndians;
 class BinaryStreamTest : public testing::Test {
 
 public:
-  BinaryStreamTest() {}
+  BinaryStreamTest() = default;
 
   void SetUp() override {
     Streams.clear();
diff --git a/llvm/unittests/Support/Casting.cpp b/llvm/unittests/Support/Casting.cpp
index 18327f6dd1675..790675083614b 100644
--- a/llvm/unittests/Support/Casting.cpp
+++ b/llvm/unittests/Support/Casting.cpp
@@ -23,7 +23,7 @@ template <typename T> IllegalCast *cast(...) { return nullptr; }
 // with conversion facility
 //
 struct bar {
-  bar() {}
+  bar() = default;
   bar(const bar &) = delete;
   struct foo *baz();
   struct foo *caz();
@@ -36,7 +36,7 @@ struct foo {
 };
 
 struct base {
-  virtual ~base() {}
+  virtual ~base() = default;
 };
 
 struct derived : public base {
@@ -375,12 +375,12 @@ namespace inferred_upcasting {
 class Base {
 public:
   // No classof. We are testing that the upcast is inferred.
-  Base() {}
+  Base() = default;
 };
 
 class Derived : public Base {
 public:
-  Derived() {}
+  Derived() = default;
 };
 
 // Even with no explicit classof() in Base, we should still be able to cast
@@ -529,7 +529,7 @@ TEST(CastingTest, smart_dyn_cast_or_null) {
 #ifndef NDEBUG
 namespace assertion_checks {
 struct Base {
-  virtual ~Base() {}
+  virtual ~Base() = default;
 };
 
 struct Derived : public Base {
diff --git a/llvm/unittests/Support/InstructionCostTest.cpp b/llvm/unittests/Support/InstructionCostTest.cpp
index efe838897a684..5392689131071 100644
--- a/llvm/unittests/Support/InstructionCostTest.cpp
+++ b/llvm/unittests/Support/InstructionCostTest.cpp
@@ -14,7 +14,7 @@ using namespace llvm;
 namespace {
 
 struct CostTest : public testing::Test {
-  CostTest() {}
+  CostTest() = default;
 };
 
 } // namespace
diff --git a/llvm/unittests/Support/OptimizedStructLayoutTest.cpp b/llvm/unittests/Support/OptimizedStructLayoutTest.cpp
index e8cd5f4285e52..0bcae0dcd5603 100644
--- a/llvm/unittests/Support/OptimizedStructLayoutTest.cpp
+++ b/llvm/unittests/Support/OptimizedStructLayoutTest.cpp
@@ -25,7 +25,7 @@ class LayoutTest {
   bool Verified = false;
 
 public:
-  LayoutTest() {}
+  LayoutTest() = default;
   LayoutTest(const LayoutTest &) = delete;
   LayoutTest &operator=(const LayoutTest &) = delete;
   ~LayoutTest() { assert(Verified); }
diff --git a/llvm/unittests/Support/YAMLIOTest.cpp b/llvm/unittests/Support/YAMLIOTest.cpp
index 283e5f829ba46..7446c07ccb9a8 100644
--- a/llvm/unittests/Support/YAMLIOTest.cpp
+++ b/llvm/unittests/Support/YAMLIOTest.cpp
@@ -3221,12 +3221,12 @@ template <> struct TaggedScalarTraits<Scalar> {
 
 template <> struct CustomMappingTraits<Map> {
   static void inputOne(IO &IO, StringRef Key, Map &M) {
-    IO.mapRequired(Key.str().c_str(), M[Key]);
+    IO.mapRequired(Key, M[Key]);
   }
 
   static void output(IO &IO, Map &M) {
     for (auto &N : M)
-      IO.mapRequired(N.getKey().str().c_str(), N.getValue());
+      IO.mapRequired(N.getKey(), N.getValue());
   }
 };
 
diff --git a/llvm/unittests/Support/raw_ostream_proxy_test.cpp b/llvm/unittests/Support/raw_ostream_proxy_test.cpp
index 864dda712aac8..446e64aa1ff59 100644
--- a/llvm/unittests/Support/raw_ostream_proxy_test.cpp
+++ b/llvm/unittests/Support/raw_ostream_proxy_test.cpp
@@ -40,8 +40,6 @@ class BufferedNoPwriteSmallVectorStream : public raw_ostream {
   bool IsDisplayed = false;
 };
 
-constexpr size_t BufferedNoPwriteSmallVectorStream::PreferredBufferSize;
-
 TEST(raw_ostream_proxyTest, write) {
   // Besides confirming that "write" works, this test confirms that the proxy
   // takes on the buffer from the stream it's proxying, such that writes to the
diff --git a/llvm/unittests/Target/AArch64/AArch64InstPrinterTest.cpp b/llvm/unittests/Target/AArch64/AArch64InstPrinterTest.cpp
index 4dfc0bcb0dc4c..a835a34fea58b 100644
--- a/llvm/unittests/Target/AArch64/AArch64InstPrinterTest.cpp
+++ b/llvm/unittests/Target/AArch64/AArch64InstPrinterTest.cpp
@@ -36,10 +36,8 @@ static std::string AArch64InstPrinterTestPrintAlignedLabel(uint64_t value) {
   MCAsmInfo MAI;
   MCInstrInfo MII;
   MCRegisterInfo MRI;
-  MCSubtargetInfo STI(Triple(""), "", "", "", {},
-                      ArrayRef((SubtargetFeatureKV *)NULL, (size_t)0),
-                      ArrayRef((SubtargetSubTypeKV *)NULL, (size_t)0), NULL,
-                      NULL, NULL, NULL, NULL, NULL);
+  MCSubtargetInfo STI(Triple(""), "", "", "", {}, {}, {}, nullptr, nullptr,
+                      nullptr, nullptr, nullptr, nullptr);
   MCContext Ctx(Triple(""), &MAI, &MRI, &STI);
   MCInst MI;
 
diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp
index 759109a3f6cbd..0e5d40ad3c7b1 100644
--- a/llvm/unittests/TargetParser/TargetParserTest.cpp
+++ b/llvm/unittests/TargetParser/TargetParserTest.cpp
@@ -737,8 +737,7 @@ TEST(TargetParserTest, ARMFPUNeonSupportLevel) {
   for (ARM::FPUKind FK = static_cast<ARM::FPUKind>(0);
        FK <= ARM::FPUKind::FK_LAST;
        FK = static_cast<ARM::FPUKind>(static_cast<unsigned>(FK) + 1))
-    if (FK == ARM::FK_LAST ||
-        ARM::getFPUName(FK).find("neon") == std::string::npos)
+    if (FK == ARM::FK_LAST || !ARM::getFPUName(FK).contains("neon"))
       EXPECT_EQ(ARM::NeonSupportLevel::None, ARM::getFPUNeonSupportLevel(FK));
     else
       EXPECT_NE(ARM::NeonSupportLevel::None, ARM::getFPUNeonSupportLevel(FK));
@@ -748,9 +747,8 @@ TEST(TargetParserTest, ARMFPURestriction) {
   for (ARM::FPUKind FK = static_cast<ARM::FPUKind>(0);
        FK <= ARM::FPUKind::FK_LAST;
        FK = static_cast<ARM::FPUKind>(static_cast<unsigned>(FK) + 1)) {
-    if (FK == ARM::FK_LAST ||
-        (ARM::getFPUName(FK).find("d16") == std::string::npos &&
-         ARM::getFPUName(FK).find("vfpv3xd") == std::string::npos))
+    if (FK == ARM::FK_LAST || (!ARM::getFPUName(FK).contains("d16") &&
+                               !ARM::getFPUName(FK).contains("vfpv3xd")))
       EXPECT_EQ(ARM::FPURestriction::None, ARM::getFPURestriction(FK));
     else
       EXPECT_NE(ARM::FPURestriction::None, ARM::getFPURestriction(FK));
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp
index 50ad4d5fa61ff..46802826fe090 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp
@@ -21,7 +21,7 @@ using VPVerifierTest = VPlanTestBase;
 namespace {
 TEST_F(VPVerifierTest, VPInstructionUseBeforeDefSameBB) {
   VPlan &Plan = getPlan();
-  VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0));
+  VPValue *Zero = Plan.getConstantInt(32, 0);
   VPInstruction *DefI = new VPInstruction(Instruction::Add, {Zero});
   VPInstruction *UseI = new VPInstruction(Instruction::Sub, {DefI});
   auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {});
@@ -56,7 +56,7 @@ TEST_F(VPVerifierTest, VPInstructionUseBeforeDefSameBB) {
 
 TEST_F(VPVerifierTest, VPInstructionUseBeforeDefDifferentBB) {
   VPlan &Plan = getPlan();
-  VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0));
+  VPValue *Zero = Plan.getConstantInt(32, 0);
   VPInstruction *DefI = new VPInstruction(Instruction::Add, {Zero});
   VPInstruction *UseI = new VPInstruction(Instruction::Sub, {DefI});
   auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {});
@@ -184,7 +184,7 @@ TEST_F(VPVerifierTest, VPPhiIncomingValueDoesntDominateIncomingBlock) {
 
 TEST_F(VPVerifierTest, DuplicateSuccessorsOutsideRegion) {
   VPlan &Plan = getPlan();
-  VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0));
+  VPValue *Zero = Plan.getConstantInt(32, 0);
   VPInstruction *I1 = new VPInstruction(Instruction::Add, {Zero});
   auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {});
   VPInstruction *BranchOnCond =
@@ -218,7 +218,7 @@ TEST_F(VPVerifierTest, DuplicateSuccessorsOutsideRegion) {
 
 TEST_F(VPVerifierTest, DuplicateSuccessorsInsideRegion) {
   VPlan &Plan = getPlan();
-  VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0));
+  VPValue *Zero = Plan.getConstantInt(32, 0);
   VPInstruction *I1 = new VPInstruction(Instruction::Add, {Zero});
   auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {});
   VPInstruction *BranchOnCond =
@@ -259,7 +259,7 @@ TEST_F(VPVerifierTest, BlockOutsideRegionWithParent) {
   VPBasicBlock *VPBB1 = Plan.getEntry();
   VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
 
-  VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0));
+  VPValue *Zero = Plan.getConstantInt(32, 0);
   auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {});
   VPBB2->appendRecipe(CanIV);
 
@@ -288,7 +288,7 @@ TEST_F(VPVerifierTest, BlockOutsideRegionWithParent) {
 
 TEST_F(VPVerifierTest, NonHeaderPHIInHeader) {
   VPlan &Plan = getPlan();
-  VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0));
+  VPValue *Zero = Plan.getConstantInt(32, 0);
   auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {});
   auto *BranchOnCond = new VPInstruction(VPInstruction::BranchOnCond, {CanIV});
 
@@ -351,8 +351,7 @@ TEST_F(VPIRVerifierTest, testVerifyIRPhi) {
   BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
   auto Plan = buildVPlan(LoopHeader);
 
-  Plan->getExitBlocks()[0]->front().addOperand(
-      Plan->getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(*Ctx), 0)));
+  Plan->getExitBlocks()[0]->front().addOperand(Plan->getConstantInt(32, 0));
 
 #if GTEST_HAS_STREAM_REDIRECTION
   ::testing::internal::CaptureStderr();
diff --git a/llvm/unittests/XRay/GraphTest.cpp b/llvm/unittests/XRay/GraphTest.cpp
index 37f07cc721c61..0d46a3d9ad377 100644
--- a/llvm/unittests/XRay/GraphTest.cpp
+++ b/llvm/unittests/XRay/GraphTest.cpp
@@ -23,8 +23,8 @@ struct EAttr {
   unsigned EA;
 };
 typedef Graph<VAttr, EAttr, unsigned> GraphT;
-typedef typename GraphT::VertexIdentifier VI;
-typedef typename GraphT::EdgeIdentifier EI;
+typedef GraphT::VertexIdentifier VI;
+typedef GraphT::EdgeIdentifier EI;
 
 // Test Fixture
 template <typename T> class GraphTest : public testing::Test {
@@ -56,8 +56,8 @@ template <typename T> class GraphTest : public testing::Test {
 
 typedef ::testing::Types<GraphT, const GraphT> GraphTestTypes;
 
-using VVT = typename GraphT::VertexValueType;
-using EVT = typename GraphT::EdgeValueType;
+using VVT = GraphT::VertexValueType;
+using EVT = GraphT::EdgeValueType;
 
 TYPED_TEST_SUITE(GraphTest, GraphTestTypes, );
 
diff --git a/llvm/unittests/tools/llvm-exegesis/AArch64/TargetTest.cpp b/llvm/unittests/tools/llvm-exegesis/AArch64/TargetTest.cpp
index 7a66117b7080c..5bc489bd3df65 100644
--- a/llvm/unittests/tools/llvm-exegesis/AArch64/TargetTest.cpp
+++ b/llvm/unittests/tools/llvm-exegesis/AArch64/TargetTest.cpp
@@ -28,7 +28,7 @@ using testing::IsEmpty;
 using testing::Not;
 using testing::NotNull;
 
-constexpr const char kTriple[] = "aarch64-unknown-linux";
+constexpr char kTriple[] = "aarch64-unknown-linux";
 
 class AArch64TargetTest : public ::testing::Test {
 protected:
diff --git a/llvm/unittests/tools/llvm-exegesis/PowerPC/TargetTest.cpp b/llvm/unittests/tools/llvm-exegesis/PowerPC/TargetTest.cpp
index 3708f18369eaa..0e90654ab0b4a 100644
--- a/llvm/unittests/tools/llvm-exegesis/PowerPC/TargetTest.cpp
+++ b/llvm/unittests/tools/llvm-exegesis/PowerPC/TargetTest.cpp
@@ -20,16 +20,13 @@
 
 namespace llvm{
 namespace exegesis {
-
-void InitializePowerPCExegesisTarget();
-
 namespace {
 
 using testing::NotNull;
 using testing::IsEmpty;
 using testing::Not;
 
-constexpr const char kTriple[] = "powerpc64le-unknown-linux";
+constexpr char kTriple[] = "powerpc64le-unknown-linux";
 
 class PowerPCTargetTest : public PPCTestBase {
 protected:
diff --git a/llvm/unittests/tools/llvm-exegesis/RISCV/TargetTest.cpp b/llvm/unittests/tools/llvm-exegesis/RISCV/TargetTest.cpp
index c86a4363e7b42..13a1e5a22228e 100644
--- a/llvm/unittests/tools/llvm-exegesis/RISCV/TargetTest.cpp
+++ b/llvm/unittests/tools/llvm-exegesis/RISCV/TargetTest.cpp
@@ -20,9 +20,6 @@
 
 namespace llvm {
 namespace exegesis {
-
-void InitializeRISCVExegesisTarget();
-
 namespace {
 
 using testing::IsEmpty;
diff --git a/llvm/unittests/tools/llvm-exegesis/X86/SnippetFileTest.cpp b/llvm/unittests/tools/llvm-exegesis/X86/SnippetFileTest.cpp
index de883ab750d20..755a74811eb69 100644
--- a/llvm/unittests/tools/llvm-exegesis/X86/SnippetFileTest.cpp
+++ b/llvm/unittests/tools/llvm-exegesis/X86/SnippetFileTest.cpp
@@ -23,9 +23,6 @@
 
 namespace llvm {
 namespace exegesis {
-
-void InitializeX86ExegesisTarget();
-
 namespace {
 
 using testing::ElementsAre;
diff --git a/llvm/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp b/llvm/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp
index 60c726212062d..5953f4e6df04a 100644
--- a/llvm/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp
+++ b/llvm/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp
@@ -20,9 +20,6 @@
 
 namespace llvm {
 namespace exegesis {
-
-void InitializeX86ExegesisTarget();
-
 namespace {
 
 using testing::AnyOf;
diff --git a/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp b/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp
index 41ee4028051bb..7a40901de9291 100644
--- a/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp
+++ b/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp
@@ -16,9 +16,6 @@
 
 namespace llvm {
 namespace exegesis {
-
-void InitializeX86ExegesisTarget();
-
 namespace {
 
 using testing::ElementsAre;
@@ -52,8 +49,8 @@ class X86SnippetRepetitorTest : public X86TestBase {
     Fill(Sink);
   }
 
-  static constexpr const unsigned kMinInstructions = 3;
-  static constexpr const unsigned kLoopBodySize = 5;
+  static constexpr unsigned kMinInstructions = 3;
+  static constexpr unsigned kLoopBodySize = 5;
 
   std::unique_ptr<TargetMachine> TM;
   std::unique_ptr<LLVMContext> Context;
diff --git a/llvm/unittests/tools/llvm-exegesis/X86/SubprocessMemoryTest.cpp b/llvm/unittests/tools/llvm-exegesis/X86/SubprocessMemoryTest.cpp
index a0cad289e978f..08c18e4ed11e2 100644
--- a/llvm/unittests/tools/llvm-exegesis/X86/SubprocessMemoryTest.cpp
+++ b/llvm/unittests/tools/llvm-exegesis/X86/SubprocessMemoryTest.cpp
@@ -29,7 +29,7 @@ namespace exegesis {
 
 // This needs to be updated anytime a test is added or removed from the test
 // suite.
-static constexpr const size_t TestCount = 4;
+static constexpr size_t TestCount = 4;
 
 class SubprocessMemoryTest : public X86TestBase {
 protected:
diff --git a/llvm/unittests/tools/llvm-exegesis/X86/TargetTest.cpp b/llvm/unittests/tools/llvm-exegesis/X86/TargetTest.cpp
index 846729c6f85ee..5a21a6929f640 100644
--- a/llvm/unittests/tools/llvm-exegesis/X86/TargetTest.cpp
+++ b/llvm/unittests/tools/llvm-exegesis/X86/TargetTest.cpp
@@ -53,9 +53,6 @@ bool operator==(const MCInst &a, const MCInst &b) {
 
 namespace llvm {
 namespace exegesis {
-
-void InitializeX86ExegesisTarget();
-
 namespace {
 
 using testing::AllOf;
@@ -585,7 +582,7 @@ TEST_F(X86Core2TargetTest, SetRegToDf0) {
 TEST_F(X86Core2Avx512TargetTest, FillMemoryOperands_ADD64rm) {
   const Instruction &I = getInstr(X86::ADD64rm);
   InstructionTemplate IT(&I);
-  constexpr const int kOffset = 42;
+  constexpr int kOffset = 42;
   State.getExegesisTarget().fillMemoryOperands(IT, X86::RDI, kOffset);
   // Memory is operands 2-6.
   EXPECT_THAT(IT.getValueFor(I.Operands[2]), IsReg(X86::RDI));
@@ -598,7 +595,7 @@ TEST_F(X86Core2Avx512TargetTest, FillMemoryOperands_ADD64rm) {
 TEST_F(X86Core2Avx512TargetTest, FillMemoryOperands_VGATHERDPSZ128rm) {
   const Instruction &I = getInstr(X86::VGATHERDPSZ128rm);
   InstructionTemplate IT(&I);
-  constexpr const int kOffset = 42;
+  constexpr int kOffset = 42;
   State.getExegesisTarget().fillMemoryOperands(IT, X86::RDI, kOffset);
   // Memory is operands 4-8.
   EXPECT_THAT(IT.getValueFor(I.Operands[4]), IsReg(X86::RDI));
@@ -628,9 +625,9 @@ TEST_F(X86Core2TargetTest, GenerateLowerMunmapTest) {
 }
 
 #ifdef __arm__
-static constexpr const uintptr_t VAddressSpaceCeiling = 0xC0000000;
+static constexpr uintptr_t VAddressSpaceCeiling = 0xC0000000;
 #else
-static constexpr const uintptr_t VAddressSpaceCeiling = 0x0000800000000000;
+static constexpr uintptr_t VAddressSpaceCeiling = 0x0000800000000000;
 #endif
 
 TEST_F(X86Core2TargetTest, GenerateUpperMunmapTest) {
diff --git a/llvm/unittests/tools/llvm-exegesis/X86/TestBase.h b/llvm/unittests/tools/llvm-exegesis/X86/TestBase.h
index 4122726aef94a..b4c84d178c1f9 100644
--- a/llvm/unittests/tools/llvm-exegesis/X86/TestBase.h
+++ b/llvm/unittests/tools/llvm-exegesis/X86/TestBase.h
@@ -22,7 +22,7 @@ namespace exegesis {
 
 void InitializeX86ExegesisTarget();
 
-constexpr const char kTriple[] = "x86_64-unknown-linux";
+constexpr char kTriple[] = "x86_64-unknown-linux";
 
 class X86TestBase : public ::testing::Test {
 protected:
diff --git a/llvm/utils/FileCheck/FileCheck.cpp b/llvm/utils/FileCheck/FileCheck.cpp
index 305c28b4c7257..a5473f951492e 100644
--- a/llvm/utils/FileCheck/FileCheck.cpp
+++ b/llvm/utils/FileCheck/FileCheck.cpp
@@ -193,7 +193,7 @@ struct MarkerStyle {
   std::string Note;
   /// Does this marker indicate inclusion by -dump-input-filter=error?
   bool FiltersAsError;
-  MarkerStyle() {}
+  MarkerStyle() = default;
   MarkerStyle(char Lead, raw_ostream::Colors Color,
               const std::string &Note = "", bool FiltersAsError = false)
       : Lead(Lead), Color(Color), Note(Note), FiltersAsError(FiltersAsError) {
diff --git a/llvm/utils/TableGen/AsmMatcherEmitter.cpp b/llvm/utils/TableGen/AsmMatcherEmitter.cpp
index e1f2f06d755f1..9f18a11c236c0 100644
--- a/llvm/utils/TableGen/AsmMatcherEmitter.cpp
+++ b/llvm/utils/TableGen/AsmMatcherEmitter.cpp
@@ -4164,7 +4164,7 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
     OS << "        MII.getDeprecatedInfo(Inst, getSTI(), Info)) {\n";
     OS << "      SMLoc Loc = ((" << Target.getName()
        << "Operand &)*Operands[0]).getStartLoc();\n";
-    OS << "      getParser().Warning(Loc, Info, std::nullopt);\n";
+    OS << "      getParser().Warning(Loc, Info, {});\n";
     OS << "    }\n";
   }
 
diff --git a/llvm/utils/TableGen/Basic/ARMTargetDefEmitter.cpp b/llvm/utils/TableGen/Basic/ARMTargetDefEmitter.cpp
index 3f284ee1b1032..b63ce3671f922 100644
--- a/llvm/utils/TableGen/Basic/ARMTargetDefEmitter.cpp
+++ b/llvm/utils/TableGen/Basic/ARMTargetDefEmitter.cpp
@@ -220,7 +220,7 @@ static void emitARMTargetDef(const RecordKeeper &RK, raw_ostream &OS) {
                           ProfileLower + "'");
 
     // Name of the object in C++
-    const std::string CppSpelling = ArchInfoName(Major, Minor, ProfileUpper);
+    std::string CppSpelling = ArchInfoName(Major, Minor, ProfileUpper);
     OS << "inline constexpr ArchInfo " << CppSpelling << " = {\n";
     CppSpellings.push_back(std::move(CppSpelling));
 
diff --git a/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp b/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp
index ed802e20477d3..6a36f471678bf 100644
--- a/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp
+++ b/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp
@@ -154,7 +154,7 @@ class RuntimeLibcallImpl {
       Provides = ProvideMap.lookup(ProvidesDef);
   }
 
-  ~RuntimeLibcallImpl() {}
+  ~RuntimeLibcallImpl() = default;
 
   const Record *getDef() const { return TheDef; }
 
diff --git a/llvm/utils/TableGen/Basic/TargetFeaturesEmitter.h b/llvm/utils/TableGen/Basic/TargetFeaturesEmitter.h
index 99e4820c614c2..412f323d04821 100644
--- a/llvm/utils/TableGen/Basic/TargetFeaturesEmitter.h
+++ b/llvm/utils/TableGen/Basic/TargetFeaturesEmitter.h
@@ -43,7 +43,7 @@ class TargetFeaturesEmitter {
   void printFeatureKeyValues(raw_ostream &OS, const FeatureMapTy &FeatureMap);
   void printCPUKeyValues(raw_ostream &OS, const FeatureMapTy &FeatureMap);
   virtual void run(raw_ostream &O);
-  virtual ~TargetFeaturesEmitter() {};
+  virtual ~TargetFeaturesEmitter() = default;
 };
 } // namespace llvm
 #endif
diff --git a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp
index 8d0ec9abd23ae..2eb94b7e92674 100644
--- a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp
@@ -1651,8 +1651,7 @@ template <> struct llvm::GraphTraits<SubRegIndexCompositionGraph> {
   struct ChildIteratorType
       : public iterator_adaptor_base<
             ChildIteratorType, CompMapIt,
-            typename std::iterator_traits<CompMapIt>::iterator_category,
-            NodeRef> {
+            std::iterator_traits<CompMapIt>::iterator_category, NodeRef> {
     ChildIteratorType(CompMapIt I)
         : ChildIteratorType::iterator_adaptor_base(I) {}
 
diff --git a/llvm/utils/TableGen/Common/CodeGenTarget.cpp b/llvm/utils/TableGen/Common/CodeGenTarget.cpp
index 3db0d07eec88f..1e9378845854e 100644
--- a/llvm/utils/TableGen/Common/CodeGenTarget.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenTarget.cpp
@@ -80,7 +80,7 @@ CodeGenTarget::CodeGenTarget(const RecordKeeper &records)
   MacroFusions = Records.getAllDerivedDefinitions("Fusion");
 }
 
-CodeGenTarget::~CodeGenTarget() {}
+CodeGenTarget::~CodeGenTarget() = default;
 
 StringRef CodeGenTarget::getName() const { return TargetRec->getName(); }
 
diff --git a/llvm/utils/TableGen/Common/DAGISelMatcher.h b/llvm/utils/TableGen/Common/DAGISelMatcher.h
index f87de757f4f8b..a19f4442f5f4d 100644
--- a/llvm/utils/TableGen/Common/DAGISelMatcher.h
+++ b/llvm/utils/TableGen/Common/DAGISelMatcher.h
@@ -105,7 +105,7 @@ class Matcher {
   Matcher(KindTy K) : Kind(K) {}
 
 public:
-  virtual ~Matcher() {}
+  virtual ~Matcher() = default;
 
   unsigned getSize() const { return Size; }
   void setSize(unsigned sz) { Size = sz; }
diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
index 5d49715879280..7af757c037612 100644
--- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
+++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
@@ -457,7 +457,7 @@ std::optional<LLTCodeGen> llvm::gi::MVTToLLT(MVT::SimpleValueType SVT) {
 
 void Matcher::optimize() {}
 
-Matcher::~Matcher() {}
+Matcher::~Matcher() = default;
 
 //===- GroupMatcher -------------------------------------------------------===//
 
@@ -1150,11 +1150,11 @@ void RuleMatcher::insnmatchers_pop_front() { Matchers.erase(Matchers.begin()); }
 
 //===- PredicateMatcher ---------------------------------------------------===//
 
-PredicateMatcher::~PredicateMatcher() {}
+PredicateMatcher::~PredicateMatcher() = default;
 
 //===- OperandPredicateMatcher --------------------------------------------===//
 
-OperandPredicateMatcher::~OperandPredicateMatcher() {}
+OperandPredicateMatcher::~OperandPredicateMatcher() = default;
 
 bool OperandPredicateMatcher::isHigherPriorityThan(
     const OperandPredicateMatcher &B) const {
@@ -1941,7 +1941,7 @@ bool InstructionOperandMatcher::isHigherPriorityThan(
 
 //===- OperandRenderer ----------------------------------------------------===//
 
-OperandRenderer::~OperandRenderer() {}
+OperandRenderer::~OperandRenderer() = default;
 
 //===- CopyRenderer -------------------------------------------------------===//
 
diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h
index 0f1241eb4d63f..84dfca46dfbfa 100644
--- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h
+++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h
@@ -621,7 +621,7 @@ class RuleMatcher : public Matcher {
   DefinedInsnVariablesMap::const_iterator defined_insn_vars_end() const {
     return InsnVariableIDs.end();
   }
-  iterator_range<typename DefinedInsnVariablesMap::const_iterator>
+  iterator_range<DefinedInsnVariablesMap::const_iterator>
   defined_insn_vars() const {
     return make_range(defined_insn_vars_begin(), defined_insn_vars_end());
   }
@@ -632,8 +632,7 @@ class RuleMatcher : public Matcher {
   MutatableInsnSet::const_iterator mutatable_insns_end() const {
     return MutatableInsns.end();
   }
-  iterator_range<typename MutatableInsnSet::const_iterator>
-  mutatable_insns() const {
+  iterator_range<MutatableInsnSet::const_iterator> mutatable_insns() const {
     return make_range(mutatable_insns_begin(), mutatable_insns_end());
   }
   void reserveInsnMatcherForMutation(InstructionMatcher *InsnMatcher) {
@@ -1375,7 +1374,7 @@ class InstructionPredicateMatcher : public PredicateMatcher {
 public:
   InstructionPredicateMatcher(PredicateKind Kind, unsigned InsnVarID)
       : PredicateMatcher(Kind, InsnVarID) {}
-  ~InstructionPredicateMatcher() override {}
+  ~InstructionPredicateMatcher() override = default;
 
   /// Compare the priority of this object and B.
   ///
@@ -2319,7 +2318,7 @@ class MatchAction {
 
   ActionKind getKind() const { return Kind; }
 
-  virtual ~MatchAction() {}
+  virtual ~MatchAction() = default;
 
   // Some actions may need to add extra predicates to ensure they can run.
   virtual void emitAdditionalPredicates(MatchTable &Table,
diff --git a/llvm/utils/TableGen/Common/InfoByHwMode.cpp b/llvm/utils/TableGen/Common/InfoByHwMode.cpp
index 4c8197dc60c14..2b3155cace9f3 100644
--- a/llvm/utils/TableGen/Common/InfoByHwMode.cpp
+++ b/llvm/utils/TableGen/Common/InfoByHwMode.cpp
@@ -174,7 +174,7 @@ bool RegSizeInfoByHwMode::hasStricterSpillThan(
 }
 
 void RegSizeInfoByHwMode::writeToStream(raw_ostream &OS) const {
-  typedef typename decltype(Map)::value_type PairType;
+  typedef decltype(Map)::value_type PairType;
   std::vector<const PairType *> Pairs;
   for (const auto &P : Map)
     Pairs.push_back(&P);
diff --git a/llvm/utils/TableGen/FastISelEmitter.cpp b/llvm/utils/TableGen/FastISelEmitter.cpp
index e0be104c883c5..c4dbb148c72c1 100644
--- a/llvm/utils/TableGen/FastISelEmitter.cpp
+++ b/llvm/utils/TableGen/FastISelEmitter.cpp
@@ -85,7 +85,7 @@ struct OperandsSignature {
     char Repr = OK_Invalid;
 
   public:
-    OpKind() {}
+    OpKind() = default;
 
     bool operator<(OpKind RHS) const { return Repr < RHS.Repr; }
     bool operator==(OpKind RHS) const { return Repr == RHS.Repr; }
diff --git a/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp b/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp
index 043bc6286146c..50e63a4bdc462 100644
--- a/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp
+++ b/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp
@@ -2441,7 +2441,7 @@ class GICombinerEmitter final : public GlobalISelMatchTableExecutorEmitter {
   explicit GICombinerEmitter(const RecordKeeper &RK,
                              const CodeGenTarget &Target, StringRef Name,
                              const Record *Combiner);
-  ~GICombinerEmitter() override {}
+  ~GICombinerEmitter() override = default;
 
   void run(raw_ostream &OS);
 };
diff --git a/llvm/utils/TableGen/RegisterBankEmitter.cpp b/llvm/utils/TableGen/RegisterBankEmitter.cpp
index 61b0b661c0f32..60aa2d80aee3e 100644
--- a/llvm/utils/TableGen/RegisterBankEmitter.cpp
+++ b/llvm/utils/TableGen/RegisterBankEmitter.cpp
@@ -100,8 +100,7 @@ class RegisterBank {
     return RCsWithLargestRegSize[HwMode];
   }
 
-  iterator_range<typename RegisterClassesTy::const_iterator>
-  register_classes() const {
+  iterator_range<RegisterClassesTy::const_iterator> register_classes() const {
     return RCs;
   }
 };
diff --git a/llvm/utils/TableGen/X86DisassemblerTables.cpp b/llvm/utils/TableGen/X86DisassemblerTables.cpp
index 3414190b9c9b4..b8c3c02a9eb3f 100644
--- a/llvm/utils/TableGen/X86DisassemblerTables.cpp
+++ b/llvm/utils/TableGen/X86DisassemblerTables.cpp
@@ -708,7 +708,7 @@ DisassemblerTables::DisassemblerTables() {
   HasConflicts = false;
 }
 
-DisassemblerTables::~DisassemblerTables() {}
+DisassemblerTables::~DisassemblerTables() = default;
 
 void DisassemblerTables::emitModRMDecision(raw_ostream &o1, raw_ostream &o2,
                                            unsigned &i1, unsigned &i2,
diff --git a/llvm/utils/TableGen/X86ModRMFilters.h b/llvm/utils/TableGen/X86ModRMFilters.h
index 7bf111ffa1d50..4eb57b0a4623b 100644
--- a/llvm/utils/TableGen/X86ModRMFilters.h
+++ b/llvm/utils/TableGen/X86ModRMFilters.h
@@ -28,7 +28,7 @@ class ModRMFilter {
 
 public:
   /// Destructor    - Override as necessary.
-  virtual ~ModRMFilter() {}
+  virtual ~ModRMFilter() = default;
 
   /// isDumb        - Indicates whether this filter returns the same value for
   ///                 any value of the ModR/M byte.
diff --git a/llvm/utils/TableGen/X86RecognizableInstr.cpp b/llvm/utils/TableGen/X86RecognizableInstr.cpp
index a006888a2352c..44b76ae7e8487 100644
--- a/llvm/utils/TableGen/X86RecognizableInstr.cpp
+++ b/llvm/utils/TableGen/X86RecognizableInstr.cpp
@@ -1141,7 +1141,6 @@ OperandType RecognizableInstr::typeFromString(StringRef Str, bool hasREX_W,
           .Case("vz64mem", TYPE_MVSIBZ)
           .Case("BNDR", TYPE_BNDR)
           .Case("TILE", TYPE_TMM)
-          .Case("TILEPair", TYPE_TMM_PAIR)
           .Default(TYPE_NONE);
   // clang-format on
 
diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py
index a5e3c39bfdecd..2dad16a8eebb7 100644
--- a/llvm/utils/UpdateTestChecks/common.py
+++ b/llvm/utils/UpdateTestChecks/common.py
@@ -29,6 +29,7 @@
    'none' and 'all'. 'smart' is the default.
 5: Basic block labels are matched by FileCheck expressions
 6: The semantics of TBAA checks has been incorporated in the check lines.
+7: Indent switch-cases correctly; CHECK-EMPTY instead of skipping blank lines.
 """
 DEFAULT_VERSION = 6
 
@@ -606,6 +607,7 @@ def invoke_tool(exe, cmd_args, ir, preprocess_cmd=None, verbose=False):
 DEBUG_ONLY_ARG_RE = re.compile(r"-debug-only[= ]([^ ]+)")
 
 IS_DEBUG_RECORD_RE = re.compile(r"^(\s+)#dbg_")
+IS_SWITCH_CASE_RE = re.compile(r"^\s+i\d+ \d+, label %\S+")
 
 SCRUB_LEADING_WHITESPACE_RE = re.compile(r"^(\s+)")
 SCRUB_WHITESPACE_RE = re.compile(r"(?!^(|  \w))[ \t]+", flags=re.M)
@@ -1121,6 +1123,8 @@ def processed_prefixes(self, prefixes):
 ##### Generator of LLVM IR CHECK lines
 
 SCRUB_IR_COMMENT_RE = re.compile(r"\s*;.*")
+# Comments to indicate the predecessors of a block in the IR.
+SCRUB_PRED_COMMENT_RE = re.compile(r"\s*; preds = .*")
 SCRUB_IR_FUNC_META_RE = re.compile(r"((?:\!(?!dbg\b)[a-zA-Z_]\w*(?:\s+![0-9]+)?)\s*)+")
 
 # TODO: We should also derive check lines for global, debug, loop declarations, etc..
@@ -1359,7 +1363,7 @@ def make_ir_generalizer(version, no_meta_details):
     ]
 
     prefix = r"(\s*)"
-    suffix = r"([,\s\(\)\}]|\Z)"
+    suffix = r"([,\s\(\)\}\]]|\Z)"
 
     # values = [
     #     nameless_value
@@ -1875,6 +1879,7 @@ def generalize_check_lines(
     *,
     unstable_globals_only=False,
     no_meta_details=False,
+    ignore_all_comments=True,  # If False, only ignore comments of predecessors
 ):
     if unstable_globals_only:
         regexp = ginfo.get_unstable_globals_regexp()
@@ -1902,8 +1907,12 @@ def escape_braces(match_obj):
                         line,
                     )
                     break
-            # Ignore any comments, since the check lines will too.
-            scrubbed_line = SCRUB_IR_COMMENT_RE.sub(r"", line)
+            if ignore_all_comments:
+                # Ignore any comments, since the check lines will too.
+                scrubbed_line = SCRUB_IR_COMMENT_RE.sub(r"", line)
+            else:
+                # Ignore comments of predecessors only.
+                scrubbed_line = SCRUB_PRED_COMMENT_RE.sub(r"", line)
             # Ignore the metadata details if check global is none
             if no_meta_details:
                 scrubbed_line = SCRUB_IR_FUNC_META_RE.sub(r"{{.*}}", scrubbed_line)
@@ -2081,6 +2090,7 @@ def add_checks(
     global_tbaa_records_for_prefixes={},
     preserve_names=False,
     original_check_lines: Mapping[str, List[str]] = {},
+    check_inst_comments=True,
 ):
     # prefix_exclusions are prefixes we cannot use to print the function because it doesn't exist in run lines that use these prefixes as well.
     prefix_exclusions = set()
@@ -2270,6 +2280,14 @@ def add_checks(
             # For IR output, change all defs to FileCheck variables, so we're immune
             # to variable naming fashions.
             else:
+                if ginfo.get_version() >= 7:
+                    # Record the indices of blank lines in the function body preemptively.
+                    blank_line_indices = {
+                        i for i, line in enumerate(func_body) if line.strip() == ""
+                    }
+                else:
+                    blank_line_indices = set()
+
                 func_body = generalize_check_lines(
                     func_body,
                     ginfo,
@@ -2278,6 +2296,8 @@ def add_checks(
                     global_tbaa_records,
                     preserve_names,
                     original_check_lines=original_check_lines.get(checkprefix),
+                    # IR output might require comments checks, e.g., print-predicate-info, print<memssa>
+                    ignore_all_comments=not check_inst_comments,
                 )
 
                 # This could be selectively enabled with an optional invocation argument.
@@ -2293,12 +2313,22 @@ def add_checks(
 
                 is_blank_line = False
 
-                for func_line in func_body:
+                for idx, func_line in enumerate(func_body):
                     if func_line.strip() == "":
-                        is_blank_line = True
+                        # We should distinguish if the line is a 'fake' blank line generated by
+                        # generalize_check_lines removing comments.
+                        # Fortunately, generalize_check_lines does not change the index of each line,
+                        # we can record the indices of blank lines preemptively.
+                        if idx in blank_line_indices:
+                            output_lines.append(
+                                "{} {}-EMPTY:".format(comment_marker, checkprefix)
+                            )
+                        else:
+                            is_blank_line = True
                         continue
-                    # Do not waste time checking IR comments.
-                    func_line = SCRUB_IR_COMMENT_RE.sub(r"", func_line)
+                    if not check_inst_comments:
+                        # Do not waste time checking IR comments unless necessary.
+                        func_line = SCRUB_IR_COMMENT_RE.sub(r"", func_line)
 
                     # Skip blank lines instead of checking them.
                     if is_blank_line:
@@ -2340,6 +2370,7 @@ def add_ir_checks(
     global_vars_seen_dict,
     global_tbaa_records_for_prefixes,
     is_filtered,
+    check_inst_comments=False,
     original_check_lines={},
 ):
     assert ginfo.is_ir()
@@ -2366,6 +2397,7 @@ def add_ir_checks(
         global_tbaa_records_for_prefixes,
         preserve_names,
         original_check_lines=original_check_lines,
+        check_inst_comments=check_inst_comments,
     )
 
 
@@ -2394,244 +2426,6 @@ def add_analyze_checks(
     )
 
 
-IR_FUNC_NAME_RE = re.compile(
-    r"^\s*define\s+(?:internal\s+)?[^@]*@(?P<func>[A-Za-z0-9_.]+)\s*\("
-)
-IR_PREFIX_DATA_RE = re.compile(r"^ *(;|$)")
-MIR_FUNC_NAME_RE = re.compile(r" *name: *(?P<func>[A-Za-z0-9_.-]+)")
-MIR_BODY_BEGIN_RE = re.compile(r" *body: *\|")
-MIR_BASIC_BLOCK_RE = re.compile(r" *bb\.[0-9]+.*:$")
-MIR_PREFIX_DATA_RE = re.compile(r"^ *(;|bb.[0-9].*: *$|[a-z]+:( |$)|$)")
-
-
-def find_mir_functions_with_one_bb(lines, verbose=False):
-    result = []
-    cur_func = None
-    bbs = 0
-    for line in lines:
-        m = MIR_FUNC_NAME_RE.match(line)
-        if m:
-            if bbs == 1:
-                result.append(cur_func)
-            cur_func = m.group("func")
-            bbs = 0
-        m = MIR_BASIC_BLOCK_RE.match(line)
-        if m:
-            bbs += 1
-    if bbs == 1:
-        result.append(cur_func)
-    return result
-
-
-def add_mir_checks_for_function(
-    test,
-    output_lines,
-    run_list,
-    func_dict,
-    func_name,
-    single_bb,
-    print_fixed_stack,
-    first_check_is_next,
-    at_the_function_name,
-):
-    printed_prefixes = set()
-    for run in run_list:
-        for prefix in run[0]:
-            if prefix in printed_prefixes:
-                break
-            if not func_dict[prefix][func_name]:
-                continue
-            if printed_prefixes:
-                # Add some space between different check prefixes.
-                indent = len(output_lines[-1]) - len(output_lines[-1].lstrip(" "))
-                output_lines.append(" " * indent + ";")
-            printed_prefixes.add(prefix)
-            add_mir_check_lines(
-                test,
-                output_lines,
-                prefix,
-                ("@" if at_the_function_name else "") + func_name,
-                single_bb,
-                func_dict[prefix][func_name],
-                print_fixed_stack,
-                first_check_is_next,
-            )
-            break
-        else:
-            warn(
-                "Found conflicting asm for function: {}".format(func_name),
-                test_file=test,
-            )
-    return output_lines
-
-
-def add_mir_check_lines(
-    test,
-    output_lines,
-    prefix,
-    func_name,
-    single_bb,
-    func_info,
-    print_fixed_stack,
-    first_check_is_next,
-):
-    func_body = str(func_info).splitlines()
-    if single_bb:
-        # Don't bother checking the basic block label for a single BB
-        func_body.pop(0)
-
-    if not func_body:
-        warn(
-            "Function has no instructions to check: {}".format(func_name),
-            test_file=test,
-        )
-        return
-
-    first_line = func_body[0]
-    indent = len(first_line) - len(first_line.lstrip(" "))
-    # A check comment, indented the appropriate amount
-    check = "{:>{}}; {}".format("", indent, prefix)
-
-    output_lines.append("{}-LABEL: name: {}".format(check, func_name))
-
-    if print_fixed_stack:
-        output_lines.append("{}: fixedStack:".format(check))
-        for stack_line in func_info.extrascrub.splitlines():
-            filecheck_directive = check + "-NEXT"
-            output_lines.append("{}: {}".format(filecheck_directive, stack_line))
-
-    first_check = not first_check_is_next
-    for func_line in func_body:
-        if not func_line.strip():
-            # The mir printer prints leading whitespace so we can't use CHECK-EMPTY:
-            output_lines.append(check + "-NEXT: {{" + func_line + "$}}")
-            continue
-        filecheck_directive = check if first_check else check + "-NEXT"
-        first_check = False
-        check_line = "{}: {}".format(filecheck_directive, func_line[indent:]).rstrip()
-        output_lines.append(check_line)
-
-
-def should_add_mir_line_to_output(input_line, prefix_set):
-    # Skip any check lines that we're handling as well as comments
-    m = CHECK_RE.match(input_line)
-    if (m and m.group(1) in prefix_set) or input_line.strip() == ";":
-        return False
-    return True
-
-
-def add_mir_checks(
-    input_lines,
-    prefix_set,
-    autogenerated_note,
-    test,
-    run_list,
-    func_dict,
-    print_fixed_stack,
-    first_check_is_next,
-    at_the_function_name,
-):
-    simple_functions = find_mir_functions_with_one_bb(input_lines)
-
-    output_lines = []
-    output_lines.append(autogenerated_note)
-
-    func_name = None
-    state = "toplevel"
-    for input_line in input_lines:
-        if input_line == autogenerated_note:
-            continue
-
-        if state == "toplevel":
-            m = IR_FUNC_NAME_RE.match(input_line)
-            if m:
-                state = "ir function prefix"
-                func_name = m.group("func")
-            if input_line.rstrip("| \r\n") == "---":
-                state = "document"
-            output_lines.append(input_line)
-        elif state == "document":
-            m = MIR_FUNC_NAME_RE.match(input_line)
-            if m:
-                state = "mir function metadata"
-                func_name = m.group("func")
-            if input_line.strip() == "...":
-                state = "toplevel"
-                func_name = None
-            if should_add_mir_line_to_output(input_line, prefix_set):
-                output_lines.append(input_line)
-        elif state == "mir function metadata":
-            if should_add_mir_line_to_output(input_line, prefix_set):
-                output_lines.append(input_line)
-            m = MIR_BODY_BEGIN_RE.match(input_line)
-            if m:
-                if func_name in simple_functions:
-                    # If there's only one block, put the checks inside it
-                    state = "mir function prefix"
-                    continue
-                state = "mir function body"
-                add_mir_checks_for_function(
-                    test,
-                    output_lines,
-                    run_list,
-                    func_dict,
-                    func_name,
-                    single_bb=False,
-                    print_fixed_stack=print_fixed_stack,
-                    first_check_is_next=first_check_is_next,
-                    at_the_function_name=at_the_function_name,
-                )
-        elif state == "mir function prefix":
-            m = MIR_PREFIX_DATA_RE.match(input_line)
-            if not m:
-                state = "mir function body"
-                add_mir_checks_for_function(
-                    test,
-                    output_lines,
-                    run_list,
-                    func_dict,
-                    func_name,
-                    single_bb=True,
-                    print_fixed_stack=print_fixed_stack,
-                    first_check_is_next=first_check_is_next,
-                    at_the_function_name=at_the_function_name,
-                )
-
-            if should_add_mir_line_to_output(input_line, prefix_set):
-                output_lines.append(input_line)
-        elif state == "mir function body":
-            if input_line.strip() == "...":
-                state = "toplevel"
-                func_name = None
-            if should_add_mir_line_to_output(input_line, prefix_set):
-                output_lines.append(input_line)
-        elif state == "ir function prefix":
-            m = IR_PREFIX_DATA_RE.match(input_line)
-            if not m:
-                state = "ir function body"
-                add_mir_checks_for_function(
-                    test,
-                    output_lines,
-                    run_list,
-                    func_dict,
-                    func_name,
-                    single_bb=False,
-                    print_fixed_stack=print_fixed_stack,
-                    first_check_is_next=first_check_is_next,
-                    at_the_function_name=at_the_function_name,
-                )
-
-            if should_add_mir_line_to_output(input_line, prefix_set):
-                output_lines.append(input_line)
-        elif state == "ir function body":
-            if input_line.strip() == "}":
-                state = "toplevel"
-                func_name = None
-            if should_add_mir_line_to_output(input_line, prefix_set):
-                output_lines.append(input_line)
-    return output_lines
-
-
 def build_global_values_dictionary(glob_val_dict, raw_tool_output, prefixes, ginfo):
     for nameless_value in ginfo.get_nameless_values():
         if nameless_value.global_ir_rhs_regexp is None:
diff --git a/llvm/utils/UpdateTestChecks/mir.py b/llvm/utils/UpdateTestChecks/mir.py
new file mode 100644
index 0000000000000..24bb8b341d335
--- /dev/null
+++ b/llvm/utils/UpdateTestChecks/mir.py
@@ -0,0 +1,362 @@
+"""MIR test utility functions for UpdateTestChecks scripts."""
+
+import re
+import sys
+from UpdateTestChecks import common
+from UpdateTestChecks.common import (
+    CHECK_RE,
+    warn,
+)
+
+IR_FUNC_NAME_RE = re.compile(
+    r"^\s*define\s+(?:internal\s+)?[^@]*@(?P<func>[A-Za-z0-9_.]+)\s*\("
+)
+IR_PREFIX_DATA_RE = re.compile(r"^ *(;|$)")
+MIR_FUNC_NAME_RE = re.compile(r" *name: *(?P<func>[A-Za-z0-9_.-]+)")
+MIR_BODY_BEGIN_RE = re.compile(r" *body: *\|")
+MIR_BASIC_BLOCK_RE = re.compile(r" *bb\.[0-9]+.*:$")
+MIR_PREFIX_DATA_RE = re.compile(r"^ *(;|bb.[0-9].*: *$|[a-z]+:( |$)|$)")
+
+VREG_RE = re.compile(r"(%[0-9]+)(?:\.[a-z0-9_]+)?(?::[a-z0-9_]+)?(?:\([<>a-z0-9 ]+\))?")
+MI_FLAGS_STR = (
+    r"(frame-setup |frame-destroy |nnan |ninf |nsz |arcp |contract |afn "
+    r"|reassoc |nuw |nsw |exact |nofpexcept |nomerge |unpredictable "
+    r"|noconvergent |nneg |disjoint |nusw |samesign |inbounds )*"
+)
+VREG_DEF_FLAGS_STR = r"(?:dead |undef )*"
+
+# Pattern to match the defined vregs and the opcode of an instruction that
+# defines vregs. Opcodes starting with a lower-case 't' are allowed to match
+# ARM's thumb instructions, like tADDi8 and t2ADDri.
+VREG_DEF_RE = re.compile(
+    r"^ *(?P<vregs>{2}{0}(?:, {2}{0})*) = "
+    r"{1}(?P<opcode>[A-Zt][A-Za-z0-9_]+)".format(
+        VREG_RE.pattern, MI_FLAGS_STR, VREG_DEF_FLAGS_STR
+    )
+)
+
+MIR_FUNC_RE = re.compile(
+    r"^---$"
+    r"\n"
+    r"^ *name: *(?P<func>[A-Za-z0-9_.-]+)$"
+    r".*?"
+    r"(?:^ *fixedStack: *(\[\])? *\n"
+    r"(?P<fixedStack>.*?)\n?"
+    r"^ *stack:"
+    r".*?)?"
+    r"^ *body: *\|\n"
+    r"(?P<body>.*?)\n"
+    r"^\.\.\.$",
+    flags=(re.M | re.S),
+)
+
+
+def build_function_info_dictionary(
+    test, raw_tool_output, triple, prefixes, func_dict, verbose
+):
+    for m in MIR_FUNC_RE.finditer(raw_tool_output):
+        func = m.group("func")
+        fixedStack = m.group("fixedStack")
+        body = m.group("body")
+        if verbose:
+            print("Processing function: {}".format(func), file=sys.stderr)
+            for l in body.splitlines():
+                print("  {}".format(l), file=sys.stderr)
+
+        # Vreg mangling
+        mangled = []
+        vreg_map = {}
+        for func_line in body.splitlines(keepends=True):
+            m = VREG_DEF_RE.match(func_line)
+            if m:
+                for vreg in VREG_RE.finditer(m.group("vregs")):
+                    if vreg.group(1) in vreg_map:
+                        name = vreg_map[vreg.group(1)]
+                    else:
+                        name = mangle_vreg(m.group("opcode"), vreg_map.values())
+                        vreg_map[vreg.group(1)] = name
+                    func_line = func_line.replace(
+                        vreg.group(1), "[[{}:%[0-9]+]]".format(name), 1
+                    )
+            for number, name in vreg_map.items():
+                func_line = re.sub(
+                    r"{}\b".format(number), "[[{}]]".format(name), func_line
+                )
+            mangled.append(func_line)
+        body = "".join(mangled)
+
+        for prefix in prefixes:
+            info = common.function_body(
+                body, fixedStack, None, None, None, None, ginfo=None
+            )
+            if func in func_dict[prefix]:
+                if (
+                    not func_dict[prefix][func]
+                    or func_dict[prefix][func].scrub != info.scrub
+                    or func_dict[prefix][func].extrascrub != info.extrascrub
+                ):
+                    func_dict[prefix][func] = None
+            else:
+                func_dict[prefix][func] = info
+
+
+def mangle_vreg(opcode, current_names):
+    base = opcode
+    # Simplify some common prefixes and suffixes
+    if opcode.startswith("G_"):
+        base = base[len("G_") :]
+    if opcode.endswith("_PSEUDO"):
+        base = base[: len("_PSEUDO")]
+    # Shorten some common opcodes with long-ish names
+    base = dict(
+        IMPLICIT_DEF="DEF",
+        GLOBAL_VALUE="GV",
+        CONSTANT="C",
+        FCONSTANT="C",
+        MERGE_VALUES="MV",
+        UNMERGE_VALUES="UV",
+        INTRINSIC="INT",
+        INTRINSIC_W_SIDE_EFFECTS="INT",
+        INSERT_VECTOR_ELT="IVEC",
+        EXTRACT_VECTOR_ELT="EVEC",
+        SHUFFLE_VECTOR="SHUF",
+    ).get(base, base)
+    # Avoid ambiguity when opcodes end in numbers
+    if len(base.rstrip("0123456789")) < len(base):
+        base += "_"
+
+    i = 0
+    for name in current_names:
+        if name.rstrip("0123456789") == base:
+            i += 1
+    if i:
+        return "{}{}".format(base, i)
+    return base
+
+
+def find_mir_functions_with_one_bb(lines, verbose=False):
+    result = []
+    cur_func = None
+    bbs = 0
+    for line in lines:
+        m = MIR_FUNC_NAME_RE.match(line)
+        if m:
+            if bbs == 1:
+                result.append(cur_func)
+            cur_func = m.group("func")
+            bbs = 0
+        m = MIR_BASIC_BLOCK_RE.match(line)
+        if m:
+            bbs += 1
+    if bbs == 1:
+        result.append(cur_func)
+    return result
+
+
+def add_mir_checks_for_function(
+    test,
+    output_lines,
+    run_list,
+    func_dict,
+    func_name,
+    single_bb,
+    print_fixed_stack,
+    first_check_is_next,
+    at_the_function_name,
+):
+    printed_prefixes = set()
+    for run in run_list:
+        for prefix in run[0]:
+            if prefix in printed_prefixes:
+                break
+            if not func_dict[prefix][func_name]:
+                continue
+            if printed_prefixes:
+                # Add some space between different check prefixes.
+                indent = len(output_lines[-1]) - len(output_lines[-1].lstrip(" "))
+                output_lines.append(" " * indent + ";")
+            printed_prefixes.add(prefix)
+            add_mir_check_lines(
+                test,
+                output_lines,
+                prefix,
+                ("@" if at_the_function_name else "") + func_name,
+                single_bb,
+                func_dict[prefix][func_name],
+                print_fixed_stack,
+                first_check_is_next,
+            )
+            break
+        else:
+            warn(
+                "Found conflicting asm for function: {}".format(func_name),
+                test_file=test,
+            )
+    return output_lines
+
+
+def add_mir_check_lines(
+    test,
+    output_lines,
+    prefix,
+    func_name,
+    single_bb,
+    func_info,
+    print_fixed_stack,
+    first_check_is_next,
+):
+    func_body = str(func_info).splitlines()
+    if single_bb:
+        # Don't bother checking the basic block label for a single BB
+        func_body.pop(0)
+
+    if not func_body:
+        warn(
+            "Function has no instructions to check: {}".format(func_name),
+            test_file=test,
+        )
+        return
+
+    first_line = func_body[0]
+    indent = len(first_line) - len(first_line.lstrip(" "))
+    # A check comment, indented the appropriate amount
+    check = "{:>{}}; {}".format("", indent, prefix)
+
+    output_lines.append("{}-LABEL: name: {}".format(check, func_name))
+
+    if print_fixed_stack:
+        output_lines.append("{}: fixedStack:".format(check))
+        for stack_line in func_info.extrascrub.splitlines():
+            filecheck_directive = check + "-NEXT"
+            output_lines.append("{}: {}".format(filecheck_directive, stack_line))
+
+    first_check = not first_check_is_next
+    for func_line in func_body:
+        if not func_line.strip():
+            # The mir printer prints leading whitespace so we can't use CHECK-EMPTY:
+            output_lines.append(check + "-NEXT: {{" + func_line + "$}}")
+            continue
+        filecheck_directive = check if first_check else check + "-NEXT"
+        first_check = False
+        check_line = "{}: {}".format(filecheck_directive, func_line[indent:]).rstrip()
+        output_lines.append(check_line)
+
+
+def should_add_mir_line_to_output(input_line, prefix_set):
+    # Skip any check lines that we're handling as well as comments
+    m = CHECK_RE.match(input_line)
+    if (m and m.group(1) in prefix_set) or input_line.strip() == ";":
+        return False
+    return True
+
+
+def add_mir_checks(
+    input_lines,
+    prefix_set,
+    autogenerated_note,
+    test,
+    run_list,
+    func_dict,
+    print_fixed_stack,
+    first_check_is_next,
+    at_the_function_name,
+):
+    simple_functions = find_mir_functions_with_one_bb(input_lines)
+
+    output_lines = []
+    output_lines.append(autogenerated_note)
+
+    func_name = None
+    state = "toplevel"
+    for input_line in input_lines:
+        if input_line == autogenerated_note:
+            continue
+
+        if state == "toplevel":
+            m = IR_FUNC_NAME_RE.match(input_line)
+            if m:
+                state = "ir function prefix"
+                func_name = m.group("func")
+            if input_line.rstrip("| \r\n") == "---":
+                state = "document"
+            output_lines.append(input_line)
+        elif state == "document":
+            m = MIR_FUNC_NAME_RE.match(input_line)
+            if m:
+                state = "mir function metadata"
+                func_name = m.group("func")
+            if input_line.strip() == "...":
+                state = "toplevel"
+                func_name = None
+            if should_add_mir_line_to_output(input_line, prefix_set):
+                output_lines.append(input_line)
+        elif state == "mir function metadata":
+            if should_add_mir_line_to_output(input_line, prefix_set):
+                output_lines.append(input_line)
+            m = MIR_BODY_BEGIN_RE.match(input_line)
+            if m:
+                if func_name in simple_functions:
+                    # If there's only one block, put the checks inside it
+                    state = "mir function prefix"
+                    continue
+                state = "mir function body"
+                add_mir_checks_for_function(
+                    test,
+                    output_lines,
+                    run_list,
+                    func_dict,
+                    func_name,
+                    single_bb=False,
+                    print_fixed_stack=print_fixed_stack,
+                    first_check_is_next=first_check_is_next,
+                    at_the_function_name=at_the_function_name,
+                )
+        elif state == "mir function prefix":
+            m = MIR_PREFIX_DATA_RE.match(input_line)
+            if not m:
+                state = "mir function body"
+                add_mir_checks_for_function(
+                    test,
+                    output_lines,
+                    run_list,
+                    func_dict,
+                    func_name,
+                    single_bb=True,
+                    print_fixed_stack=print_fixed_stack,
+                    first_check_is_next=first_check_is_next,
+                    at_the_function_name=at_the_function_name,
+                )
+
+            if should_add_mir_line_to_output(input_line, prefix_set):
+                output_lines.append(input_line)
+        elif state == "mir function body":
+            if input_line.strip() == "...":
+                state = "toplevel"
+                func_name = None
+            if should_add_mir_line_to_output(input_line, prefix_set):
+                output_lines.append(input_line)
+        elif state == "ir function prefix":
+            m = IR_PREFIX_DATA_RE.match(input_line)
+            if not m:
+                state = "ir function body"
+                add_mir_checks_for_function(
+                    test,
+                    output_lines,
+                    run_list,
+                    func_dict,
+                    func_name,
+                    single_bb=False,
+                    print_fixed_stack=print_fixed_stack,
+                    first_check_is_next=first_check_is_next,
+                    at_the_function_name=at_the_function_name,
+                )
+
+            if should_add_mir_line_to_output(input_line, prefix_set):
+                output_lines.append(input_line)
+        elif state == "ir function body":
+            if input_line.strip() == "}":
+                state = "toplevel"
+                func_name = None
+            if should_add_mir_line_to_output(input_line, prefix_set):
+                output_lines.append(input_line)
+    return output_lines
diff --git a/llvm/utils/git/code-format-helper.py b/llvm/utils/git/code-format-helper.py
index 406a72817acb8..f6b28f480b8a2 100755
--- a/llvm/utils/git/code-format-helper.py
+++ b/llvm/utils/git/code-format-helper.py
@@ -486,8 +486,6 @@ def hook_main():
         if fmt.has_tool():
             if not fmt.run(args.changed_files, args):
                 failed_fmts.append(fmt.name)
-            if fmt.comment:
-                comments.append(fmt.comment)
         else:
             print(f"Couldn't find {fmt.name}, can't check " + fmt.friendly_name.lower())
 
@@ -508,7 +506,7 @@ def hook_main():
 
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        "--token", type=str, required=True, help="GitHub authentiation token"
+        "--token", type=str, required=True, help="GitHub authentication token"
     )
     parser.add_argument(
         "--repo",
diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn
index 036123371d24c..f280f695cd3ab 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn
@@ -69,6 +69,7 @@ static_library("bugprone") {
     "ParentVirtualCallCheck.cpp",
     "PointerArithmeticOnPolymorphicObjectCheck.cpp",
     "PosixReturnCheck.cpp",
+    "RawMemoryCallOnNonTrivialTypeCheck.cpp",
     "RedundantBranchConditionCheck.cpp",
     "ReservedIdentifierCheck.cpp",
     "ReturnConstRefFromParameterCheck.cpp",
diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn
index b097e139b9c7f..3ad0a83a8fb23 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn
@@ -21,7 +21,6 @@ static_library("cert") {
     "FloatLoopCounter.cpp",
     "LimitedRandomnessCheck.cpp",
     "MutatingCopyCheck.cpp",
-    "NonTrivialTypesLibcMemoryCallsCheck.cpp",
     "ProperlySeededRandomGeneratorCheck.cpp",
     "ThrownExceptionTypeCheck.cpp",
   ]
diff --git a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
index 3c523aeada6cb..03e5294b03860 100644
--- a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
@@ -132,18 +132,12 @@ copy("Headers") {
     "amdgpuintrin.h",
     "ammintrin.h",
     "amxavx512intrin.h",
-    "amxbf16transposeintrin.h",
     "amxcomplexintrin.h",
-    "amxcomplextransposeintrin.h",
     "amxfp16intrin.h",
-    "amxfp16transposeintrin.h",
     "amxfp8intrin.h",
     "amxintrin.h",
     "amxmovrsintrin.h",
-    "amxmovrstransposeintrin.h",
     "amxtf32intrin.h",
-    "amxtf32transposeintrin.h",
-    "amxtransposeintrin.h",
     "andes_vector.h",
     "arm64intr.h",
     "arm_acle.h",
diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
index 278c29c766ddb..9e0b9513a9a1a 100644
--- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
@@ -1175,6 +1175,7 @@ if (current_toolchain == default_toolchain) {
       "__locale_dir/support/freebsd.h",
       "__locale_dir/support/fuchsia.h",
       "__locale_dir/support/linux.h",
+      "__locale_dir/support/netbsd.h",
       "__locale_dir/support/no_locale/characters.h",
       "__locale_dir/support/no_locale/strtonum.h",
       "__locale_dir/support/windows.h",
@@ -1523,6 +1524,7 @@ if (current_toolchain == default_toolchain) {
       "__type_traits/is_valid_expansion.h",
       "__type_traits/is_void.h",
       "__type_traits/is_volatile.h",
+      "__type_traits/is_within_lifetime.h",
       "__type_traits/lazy.h",
       "__type_traits/make_32_64_or_128_bit.h",
       "__type_traits/make_const_lvalue_ref.h",
diff --git a/llvm/utils/gn/secondary/lldb/tools/lldb-dap/tool/BUILD.gn b/llvm/utils/gn/secondary/lldb/tools/lldb-dap/tool/BUILD.gn
index 8b764843ac82f..1773fe3c73f57 100644
--- a/llvm/utils/gn/secondary/lldb/tools/lldb-dap/tool/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/tools/lldb-dap/tool/BUILD.gn
@@ -5,7 +5,7 @@ import("//llvm/version.gni")
 tablegen("Options") {
   visibility = [ ":lldb-dap" ]
   args = [ "-gen-opt-parser-defs" ]
-  td_file = "../Options.td"
+  td_file = "Options.td"
 }
 
 if (host_os == "mac") {
diff --git a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/TargetProcess/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/TargetProcess/BUILD.gn
index c4ce9906a7021..937e81b476967 100644
--- a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/TargetProcess/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/TargetProcess/BUILD.gn
@@ -12,6 +12,8 @@ static_library("TargetProcess") {
     "JITLoaderGDB.cpp",
     "JITLoaderPerf.cpp",
     "JITLoaderVTune.cpp",
+    "LibraryResolver.cpp",
+    "LibraryScanner.cpp",
     "OrcRTBootstrap.cpp",
     "RegisterEHFrames.cpp",
     "SimpleExecutorDylibManager.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/unittests/ExecutionEngine/Orc/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/ExecutionEngine/Orc/BUILD.gn
index dfe6d6da005dd..111e4c997de92 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/ExecutionEngine/Orc/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/ExecutionEngine/Orc/BUILD.gn
@@ -24,6 +24,7 @@ unittest("OrcJITTests") {
     "JITLinkRedirectionManagerTest.cpp",
     "JITTargetMachineBuilderTest.cpp",
     "LazyCallThroughAndReexportsTest.cpp",
+    "LibraryResolverTest.cpp",
     "LookupAndRecordAddrsTest.cpp",
     "MachOPlatformTest.cpp",
     "MapperJITLinkMemoryManagerTest.cpp",
diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py
index f88314547bb3f..3176b1a257434 100644
--- a/llvm/utils/lit/lit/TestRunner.py
+++ b/llvm/utils/lit/lit/TestRunner.py
@@ -600,18 +600,33 @@ def executeBuiltinUmask(cmd, shenv):
 
 def executeBuiltinUlimit(cmd, shenv):
     """executeBuiltinUlimit - Change the current limits."""
-    if os.name != "posix":
+    try:
+        # Try importing the resource module (available on POSIX systems) and
+        # emit an error where it does not exist (e.g., Windows).
+        import resource
+    except ImportError:
         raise InternalShellError(cmd, "'ulimit' not supported on this system")
     if len(cmd.args) != 3:
         raise InternalShellError(cmd, "'ulimit' requires two arguments")
     try:
-        new_limit = int(cmd.args[2])
+        if cmd.args[2] == "unlimited":
+            new_limit = resource.RLIM_INFINITY
+        else:
+            new_limit = int(cmd.args[2])
     except ValueError as err:
         raise InternalShellError(cmd, "Error: 'ulimit': %s" % str(err))
     if cmd.args[1] == "-v":
-        shenv.ulimit["RLIMIT_AS"] = new_limit * 1024
+        if new_limit != resource.RLIM_INFINITY:
+            new_limit = new_limit * 1024
+        shenv.ulimit["RLIMIT_AS"] = new_limit
     elif cmd.args[1] == "-n":
         shenv.ulimit["RLIMIT_NOFILE"] = new_limit
+    elif cmd.args[1] == "-s":
+        if new_limit != resource.RLIM_INFINITY:
+            new_limit = new_limit * 1024
+        shenv.ulimit["RLIMIT_STACK"] = new_limit
+    elif cmd.args[1] == "-f":
+        shenv.ulimit["RLIMIT_FSIZE"] = new_limit
     else:
         raise InternalShellError(
             cmd, "'ulimit' does not support option: %s" % cmd.args[1]
@@ -811,6 +826,10 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper):
         not_args = []
         not_count = 0
         not_crash = False
+
+        # Expand all late substitutions.
+        args = _expandLateSubstitutions(j, args, cmd_shenv.cwd)
+
         while True:
             if args[0] == "env":
                 # Create a copy of the global environment and modify it for
@@ -860,9 +879,6 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper):
         # Ensure args[0] is hashable.
         args[0] = expand_glob(args[0], cmd_shenv.cwd)[0]
 
-        # Expand all late substitutions.
-        args = _expandLateSubstitutions(j, args, cmd_shenv.cwd)
-
         inproc_builtin = inproc_builtins.get(args[0], None)
         if inproc_builtin and (args[0] != "echo" or len(cmd.commands) == 1):
             # env calling an in-process builtin is useless, so we take the safe
@@ -945,7 +961,7 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper):
             path = (
                 cmd_shenv.env["PATH"] if "PATH" in cmd_shenv.env else shenv.env["PATH"]
             )
-            executable = lit.util.which(args[0], shenv.env["PATH"])
+            executable = lit.util.which(args[0], path)
         if not executable:
             raise InternalShellError(j, "%r: command not found" % args[0])
 
diff --git a/llvm/utils/lit/lit/builtin_commands/_launch_with_limit.py b/llvm/utils/lit/lit/builtin_commands/_launch_with_limit.py
index 33d2d59ff0dbe..a9dc2595497e7 100644
--- a/llvm/utils/lit/lit/builtin_commands/_launch_with_limit.py
+++ b/llvm/utils/lit/lit/builtin_commands/_launch_with_limit.py
@@ -17,6 +17,10 @@ def main(argv):
                 resource.setrlimit(resource.RLIMIT_AS, limit)
             elif limit_str == "RLIMIT_NOFILE":
                 resource.setrlimit(resource.RLIMIT_NOFILE, limit)
+            elif limit_str == "RLIMIT_STACK":
+                resource.setrlimit(resource.RLIMIT_STACK, limit)
+            elif limit_str == "RLIMIT_FSIZE":
+                resource.setrlimit(resource.RLIMIT_FSIZE, limit)
     process_output = subprocess.run(command_args)
     sys.exit(process_output.returncode)
 
diff --git a/llvm/utils/lit/tests/Inputs/shtest-env-path/lit.cfg b/llvm/utils/lit/tests/Inputs/shtest-env-path/lit.cfg
new file mode 100644
index 0000000000000..36517f998530b
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/shtest-env-path/lit.cfg
@@ -0,0 +1,8 @@
+import lit.formats
+
+config.name = "shtest-env-path"
+config.suffixes = [".txt"]
+config.test_format = lit.formats.ShTest()
+config.test_source_root = None
+config.test_exec_root = None
+config.substitutions.append(("%{python}", '"%s"' % (sys.executable)))
diff --git a/llvm/utils/lit/tests/Inputs/shtest-env-path/path.txt b/llvm/utils/lit/tests/Inputs/shtest-env-path/path.txt
new file mode 100644
index 0000000000000..b36e861ec5632
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/shtest-env-path/path.txt
@@ -0,0 +1,8 @@
+## Tests env command for setting the PATH variable.
+
+## Check that test.sh can be found using the configured PATH.
+#
+# RUN: env PATH=%S test.sh | FileCheck --check-prefix=CHECK %s
+#
+
+# CHECK: TEST-ENV-PATH-123
diff --git a/llvm/utils/lit/tests/Inputs/shtest-env-path/test.sh b/llvm/utils/lit/tests/Inputs/shtest-env-path/test.sh
new file mode 100755
index 0000000000000..a1e46fc210d49
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/shtest-env-path/test.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+
+echo "TEST-ENV-PATH-123"
+
diff --git a/llvm/utils/lit/tests/Inputs/shtest-readfile/env.txt b/llvm/utils/lit/tests/Inputs/shtest-readfile/env.txt
new file mode 100644
index 0000000000000..3e19373754976
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/shtest-readfile/env.txt
@@ -0,0 +1,6 @@
+## Tests that readfile works with the env builtin.
+# RUN: echo -n "hello" > %t.1
+# RUN: env TEST=%{readfile:%t.1} %{python} -c "import os; print(os.environ['TEST'])"
+
+## Fail the test so we can assert on the output.
+# RUN: not echo return
\ No newline at end of file
diff --git a/llvm/utils/lit/tests/Inputs/shtest-readfile/lit.cfg b/llvm/utils/lit/tests/Inputs/shtest-readfile/lit.cfg
index ee496674fdb62..80af27f57d35c 100644
--- a/llvm/utils/lit/tests/Inputs/shtest-readfile/lit.cfg
+++ b/llvm/utils/lit/tests/Inputs/shtest-readfile/lit.cfg
@@ -10,6 +10,7 @@ use_lit_shell = lit.util.pythonize_bool(lit_shell_env)
 config.test_format = lit.formats.ShTest(execute_external=not use_lit_shell)
 config.test_source_root = None
 config.test_exec_root = None
+config.substitutions.append(("%{python}", '"%s"' % (sys.executable)))
 
 # If we are testing with the external shell, remove the fake-externals from
 # PATH so that we use mkdir in the tests.
diff --git a/llvm/utils/lit/tests/Inputs/shtest-ulimit-nondarwin/ulimit_okay.txt b/llvm/utils/lit/tests/Inputs/shtest-ulimit-nondarwin/ulimit_okay.txt
index dbdd0037e70a7..a5fac7b1d126d 100644
--- a/llvm/utils/lit/tests/Inputs/shtest-ulimit-nondarwin/ulimit_okay.txt
+++ b/llvm/utils/lit/tests/Inputs/shtest-ulimit-nondarwin/ulimit_okay.txt
@@ -1,4 +1,5 @@
 # RUN: ulimit -v 1048576
+# RUN: ulimit -s 256
 # RUN: %{python} %S/../shtest-ulimit/print_limits.py
 # Fail the test so that we can assert on the output.
 # RUN: not echo return
diff --git a/llvm/utils/lit/tests/Inputs/shtest-ulimit-nondarwin/ulimit_unlimited.txt b/llvm/utils/lit/tests/Inputs/shtest-ulimit-nondarwin/ulimit_unlimited.txt
new file mode 100644
index 0000000000000..4c687e3061869
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/shtest-ulimit-nondarwin/ulimit_unlimited.txt
@@ -0,0 +1,6 @@
+# RUN: ulimit -f 5
+# RUN: %{python} %S/../shtest-ulimit/print_limits.py
+# RUN: ulimit -f unlimited
+# RUN: %{python} %S/../shtest-ulimit/print_limits.py
+# Fail the test so that we can assert on the output.
+# RUN: not echo return
diff --git a/llvm/utils/lit/tests/Inputs/shtest-ulimit/print_limits.py b/llvm/utils/lit/tests/Inputs/shtest-ulimit/print_limits.py
index 632f954fa8fde..c732c0429e661 100644
--- a/llvm/utils/lit/tests/Inputs/shtest-ulimit/print_limits.py
+++ b/llvm/utils/lit/tests/Inputs/shtest-ulimit/print_limits.py
@@ -2,3 +2,5 @@
 
 print("RLIMIT_AS=" + str(resource.getrlimit(resource.RLIMIT_AS)[0]))
 print("RLIMIT_NOFILE=" + str(resource.getrlimit(resource.RLIMIT_NOFILE)[0]))
+print("RLIMIT_STACK=" + str(resource.getrlimit(resource.RLIMIT_STACK)[0]))
+print("RLIMIT_FSIZE=" + str(resource.getrlimit(resource.RLIMIT_FSIZE)[0]))
diff --git a/llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_okay.txt b/llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_okay.txt
index 4edf1c303a092..b1f2396b35d69 100644
--- a/llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_okay.txt
+++ b/llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_okay.txt
@@ -1,4 +1,5 @@
 # RUN: ulimit -n 50
+# RUN: ulimit -f 5
 # RUN: %{python} %S/print_limits.py
 # Fail the test so that we can assert on the output.
 # RUN: not echo return
diff --git a/llvm/utils/lit/tests/Inputs/test-data-micro/dummy_format.py b/llvm/utils/lit/tests/Inputs/test-data-micro/dummy_format.py
index b400083a0d967..27b738edf8e14 100644
--- a/llvm/utils/lit/tests/Inputs/test-data-micro/dummy_format.py
+++ b/llvm/utils/lit/tests/Inputs/test-data-micro/dummy_format.py
@@ -1,9 +1,5 @@
 import os
-
-try:
-    import ConfigParser
-except ImportError:
-    import configparser as ConfigParser
+import configparser
 
 import lit.formats
 import lit.Test
@@ -16,7 +12,7 @@ def execute(self, test, lit_config):
 
         source_path = test.getSourcePath()
 
-        cfg = ConfigParser.ConfigParser()
+        cfg = configparser.ConfigParser()
         cfg.read(source_path)
 
         # Create the basic test result.
diff --git a/llvm/utils/lit/tests/Inputs/test-data/dummy_format.py b/llvm/utils/lit/tests/Inputs/test-data/dummy_format.py
index 30bd1814a6a42..b4c1b92637d01 100644
--- a/llvm/utils/lit/tests/Inputs/test-data/dummy_format.py
+++ b/llvm/utils/lit/tests/Inputs/test-data/dummy_format.py
@@ -1,9 +1,5 @@
 import os
-
-try:
-    import ConfigParser
-except ImportError:
-    import configparser as ConfigParser
+import configparser
 
 import lit.formats
 import lit.Test
@@ -16,7 +12,7 @@ def execute(self, test, lit_config):
 
         source_path = test.getSourcePath()
 
-        cfg = ConfigParser.ConfigParser()
+        cfg = configparser.ConfigParser()
         cfg.read(source_path)
 
         # Create the basic test result.
diff --git a/llvm/utils/lit/tests/Inputs/xunit-output/dummy_format.py b/llvm/utils/lit/tests/Inputs/xunit-output/dummy_format.py
index efac0b561c44b..43da0973df614 100644
--- a/llvm/utils/lit/tests/Inputs/xunit-output/dummy_format.py
+++ b/llvm/utils/lit/tests/Inputs/xunit-output/dummy_format.py
@@ -1,9 +1,5 @@
 import os
-
-try:
-    import ConfigParser
-except ImportError:
-    import configparser as ConfigParser
+import configparser
 
 import lit.formats
 import lit.Test
@@ -16,7 +12,7 @@ def execute(self, test, lit_config):
 
         source_path = test.getSourcePath()
 
-        cfg = ConfigParser.ConfigParser()
+        cfg = configparser.ConfigParser()
         cfg.read(source_path)
 
         # Create the basic test result.
diff --git a/llvm/utils/lit/tests/shtest-env-path.py b/llvm/utils/lit/tests/shtest-env-path.py
new file mode 100644
index 0000000000000..bf459ae53fbc0
--- /dev/null
+++ b/llvm/utils/lit/tests/shtest-env-path.py
@@ -0,0 +1,13 @@
+## Tests env command for setting the PATH variable.
+
+# The test is using /bin/sh. Limit to system known to have /bin/sh.
+# REQUIRES: system-linux
+
+# RUN: %{lit} -a -v %{inputs}/shtest-env-path/path.txt \
+# RUN:   | FileCheck -match-full-lines %s
+#
+# END.
+
+# CHECK: -- Testing: 1 tests{{.*}}
+# CHECK: PASS: shtest-env-path :: path.txt (1 of 1)
+# CHECK: --
diff --git a/llvm/utils/lit/tests/shtest-readfile-external.py b/llvm/utils/lit/tests/shtest-readfile-external.py
index c00bff45c8703..6fe1088efd674 100644
--- a/llvm/utils/lit/tests/shtest-readfile-external.py
+++ b/llvm/utils/lit/tests/shtest-readfile-external.py
@@ -6,7 +6,7 @@
 # UNSUPPORTED: system-windows
 # RUN: env LIT_USE_INTERNAL_SHELL=0 not %{lit} -a -v %{inputs}/shtest-readfile | FileCheck -match-full-lines -DTEMP_PATH=%S/Inputs/shtest-readfile/Output %s
 
-# CHECK: -- Testing: 4 tests{{.*}}
+# CHECK: -- Testing: 5 tests{{.*}}
 
 # CHECK-LABEL: FAIL: shtest-readfile :: absolute-paths.txt ({{[^)]*}})
 # CHECK: echo $(cat [[TEMP_PATH]]/absolute-paths.txt.tmp) && test -e [[TEMP_PATH]]/absolute-paths.txt.tmp {{.*}}
diff --git a/llvm/utils/lit/tests/shtest-readfile.py b/llvm/utils/lit/tests/shtest-readfile.py
index 66e3a042bf787..218da2257bcff 100644
--- a/llvm/utils/lit/tests/shtest-readfile.py
+++ b/llvm/utils/lit/tests/shtest-readfile.py
@@ -5,12 +5,16 @@
 
 # RUN: env LIT_USE_INTERNAL_SHELL=1  not %{lit} -a -v %{inputs}/shtest-readfile | FileCheck -match-full-lines -DTEMP_PATH=%S%{fs-sep}Inputs%{fs-sep}shtest-readfile%{fs-sep}Output %s
 
-# CHECK: -- Testing: 4 tests{{.*}}
+# CHECK: -- Testing: 5 tests{{.*}}
 
 # CHECK-LABEL: FAIL: shtest-readfile :: absolute-paths.txt ({{[^)]*}})
 # CHECK: echo hello
 # CHECK: # executed command: echo '%{readfile:[[TEMP_PATH]]{{[\\\/]}}absolute-paths.txt.tmp}'
 
+# CHECK-LABEL: FAIL: shtest-readfile :: env.txt ({{[^)]*}})
+# CHECK: env TEST=hello {{.*}} -c "import os; print(os.environ['TEST'])"
+# CHECK: # | hello
+
 # CHECK-LABEL: FAIL: shtest-readfile :: file-does-not-exist.txt ({{[^)]*}})
 # CHECK: # executed command: @echo 'echo %{readfile:/file/does/not/exist}'
 # CHECK: # | File specified in readfile substitution does not exist: {{.*}}/file/does/not/exist
diff --git a/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py b/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py
index 2d96feae5b58e..286fd3d7e173e 100644
--- a/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py
+++ b/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py
@@ -2,12 +2,20 @@
 
 # ulimit does not work on non-POSIX platforms.
 # These tests are specific to options that Darwin does not support.
-# UNSUPPORTED: system-windows, system-darwin, system-aix
+# UNSUPPORTED: system-windows, system-darwin, system-aix, system-solaris
 
 # RUN: not %{lit} -a -v %{inputs}/shtest-ulimit-nondarwin | FileCheck %s
 
-# CHECK: -- Testing: 1 tests{{.*}}
+# CHECK: -- Testing: 2 tests{{.*}}
 
 # CHECK-LABEL: FAIL: shtest-ulimit :: ulimit_okay.txt ({{[^)]*}})
 # CHECK: ulimit -v 1048576
+# CHECK: ulimit -s 256
 # CHECK: RLIMIT_AS=1073741824
+# CHECK: RLIMIT_STACK=262144
+
+# CHECK-LABEL: FAIL: shtest-ulimit :: ulimit_unlimited.txt ({{[^)]*}})
+# CHECK: ulimit -f 5
+# CHECK: RLIMIT_FSIZE=5
+# CHECK: ulimit -f unlimited
+# CHECK: RLIMIT_FSIZE=-1
diff --git a/llvm/utils/lit/tests/shtest-ulimit.py b/llvm/utils/lit/tests/shtest-ulimit.py
index 09cd475b737c1..21e5a5e2491d1 100644
--- a/llvm/utils/lit/tests/shtest-ulimit.py
+++ b/llvm/utils/lit/tests/shtest-ulimit.py
@@ -19,7 +19,9 @@
 
 # CHECK-LABEL: FAIL: shtest-ulimit :: ulimit_okay.txt ({{[^)]*}})
 # CHECK: ulimit -n 50
+# CHECK: ulimit -f 5
 # CHECK: RLIMIT_NOFILE=50
+# CHECK: RLIMIT_FSIZE=5
 
 # CHECK-LABEL: FAIL: shtest-ulimit :: ulimit_reset.txt ({{[^)]*}})
 # CHECK: RLIMIT_NOFILE=[[BASE_NOFILE_LIMIT]]
diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt
index 3d07b16cac661..380b162d8c58c 100644
--- a/llvm/utils/profcheck-xfail.txt
+++ b/llvm/utils/profcheck-xfail.txt
@@ -517,8 +517,11 @@ Instrumentation/TypeSanitizer/alloca-only.ll
 Instrumentation/TypeSanitizer/anon.ll
 Instrumentation/TypeSanitizer/basic.ll
 Instrumentation/TypeSanitizer/basic-nosan.ll
+Instrumentation/TypeSanitizer/basic_outlined.ll
+Instrumentation/TypeSanitizer/basic_verify_outlined.ll
 Instrumentation/TypeSanitizer/byval.ll
 Instrumentation/TypeSanitizer/globals.ll
+Instrumentation/TypeSanitizer/globals_outlined.ll
 Instrumentation/TypeSanitizer/invalid-metadata.ll
 Instrumentation/TypeSanitizer/memintrinsics.ll
 Instrumentation/TypeSanitizer/nosanitize.ll
@@ -550,6 +553,7 @@ tools/UpdateTestChecks/update_test_checks/stable_ir_values5.test
 tools/UpdateTestChecks/update_test_checks/stable_ir_values6.test
 tools/UpdateTestChecks/update_test_checks/stable_ir_values_funcs.test
 tools/UpdateTestChecks/update_test_checks/stable_ir_values.test
+tools/UpdateTestChecks/update_test_checks/switch_case.test
 tools/UpdateTestChecks/update_test_checks/tbaa-semantics-checks.test
 tools/UpdateTestChecks/update_test_checks/various_ir_values_dbgrecords.test
 Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll
@@ -728,6 +732,7 @@ Transforms/ExpandVariadics/expand-va-intrinsic-split-simple.ll
 Transforms/ExpandVariadics/intrinsics.ll
 Transforms/FixIrreducible/basic.ll
 Transforms/FixIrreducible/bug45623.ll
+Transforms/FixIrreducible/callbr.ll
 Transforms/FixIrreducible/nested.ll
 Transforms/FixIrreducible/switch.ll
 Transforms/GCOVProfiling/atomic-counter.ll
@@ -1105,6 +1110,7 @@ Transforms/LoopSimplifyCFG/update_parents.ll
 Transforms/LoopUnroll/peel-last-iteration-expansion-cost.ll
 Transforms/LoopUnroll/peel-last-iteration-with-guards.ll
 Transforms/LoopUnroll/peel-last-iteration-with-variable-trip-count.ll
+Transforms/LoopUnroll/runtime-loop-multiple-exits.ll
 Transforms/LoopVersioning/add-phi-update-users.ll
 Transforms/LoopVersioning/basic.ll
 Transforms/LoopVersioning/bound-check-partially-known.ll
@@ -1316,8 +1322,6 @@ Transforms/SimpleLoopUnswitch/pr60736.ll
 Transforms/SimpleLoopUnswitch/trivial-unswitch-freeze-individual-conditions.ll
 Transforms/SimpleLoopUnswitch/trivial-unswitch.ll
 Transforms/SimpleLoopUnswitch/trivial-unswitch-logical-and-or.ll
-Transforms/SimplifyCFG/RISCV/switch-of-powers-of-two.ll
-Transforms/SimplifyCFG/X86/switch-of-powers-of-two.ll
 Transforms/StackProtector/cross-dso-cfi-stack-chk-fail.ll
 Transforms/StructurizeCFG/AMDGPU/uniform-regions.ll
 Transforms/StructurizeCFG/hoist-zerocost.ll
diff --git a/llvm/utils/update_givaluetracking_test_checks.py b/llvm/utils/update_givaluetracking_test_checks.py
index 49b068ac7bef0..9ad0f3ec9ad1c 100755
--- a/llvm/utils/update_givaluetracking_test_checks.py
+++ b/llvm/utils/update_givaluetracking_test_checks.py
@@ -19,6 +19,7 @@
 import sys
 
 from UpdateTestChecks import common
+from UpdateTestChecks import mir
 
 VT_FUNCTION_RE = re.compile(
     r"\s*name:\s*@(?P<func>[A-Za-z0-9_-]+)"
@@ -92,7 +93,7 @@ def update_test(ti: common.TestInfo):
     func_dict = builder.finish_and_get_func_dict()
     prefix_set = set([prefix for p in run_list for prefix in p[0]])
     common.debug("Rewriting FileCheck prefixes:", str(prefix_set))
-    output_lines = common.add_mir_checks(
+    output_lines = mir.add_mir_checks(
         ti.input_lines,
         prefix_set,
         ti.test_autogenerated_note,
diff --git a/llvm/utils/update_mir_test_checks.py b/llvm/utils/update_mir_test_checks.py
index c4ee0523a6469..ba70249db28e6 100755
--- a/llvm/utils/update_mir_test_checks.py
+++ b/llvm/utils/update_mir_test_checks.py
@@ -31,39 +31,7 @@
 import sys
 
 from UpdateTestChecks import common
-
-VREG_RE = re.compile(r"(%[0-9]+)(?:\.[a-z0-9_]+)?(?::[a-z0-9_]+)?(?:\([<>a-z0-9 ]+\))?")
-MI_FLAGS_STR = (
-    r"(frame-setup |frame-destroy |nnan |ninf |nsz |arcp |contract |afn "
-    r"|reassoc |nuw |nsw |exact |nofpexcept |nomerge |unpredictable "
-    r"|noconvergent |nneg |disjoint |nusw |samesign |inbounds )*"
-)
-VREG_DEF_FLAGS_STR = r"(?:dead |undef )*"
-
-# Pattern to match the defined vregs and the opcode of an instruction that
-# defines vregs. Opcodes starting with a lower-case 't' are allowed to match
-# ARM's thumb instructions, like tADDi8 and t2ADDri.
-VREG_DEF_RE = re.compile(
-    r"^ *(?P<vregs>{2}{0}(?:, {2}{0})*) = "
-    r"{1}(?P<opcode>[A-Zt][A-Za-z0-9_]+)".format(
-        VREG_RE.pattern, MI_FLAGS_STR, VREG_DEF_FLAGS_STR
-    )
-)
-
-MIR_FUNC_RE = re.compile(
-    r"^---$"
-    r"\n"
-    r"^ *name: *(?P<func>[A-Za-z0-9_.-]+)$"
-    r".*?"
-    r"(?:^ *fixedStack: *(\[\])? *\n"
-    r"(?P<fixedStack>.*?)\n?"
-    r"^ *stack:"
-    r".*?)?"
-    r"^ *body: *\|\n"
-    r"(?P<body>.*?)\n"
-    r"^\.\.\.$",
-    flags=(re.M | re.S),
-)
+from UpdateTestChecks import mir
 
 
 class LLC:
@@ -143,89 +111,6 @@ def build_run_list(test, run_lines, verbose=False):
     return run_list
 
 
-def build_function_info_dictionary(
-    test, raw_tool_output, triple, prefixes, func_dict, verbose
-):
-    for m in MIR_FUNC_RE.finditer(raw_tool_output):
-        func = m.group("func")
-        fixedStack = m.group("fixedStack")
-        body = m.group("body")
-        if verbose:
-            log("Processing function: {}".format(func))
-            for l in body.splitlines():
-                log("  {}".format(l))
-
-        # Vreg mangling
-        mangled = []
-        vreg_map = {}
-        for func_line in body.splitlines(keepends=True):
-            m = VREG_DEF_RE.match(func_line)
-            if m:
-                for vreg in VREG_RE.finditer(m.group("vregs")):
-                    if vreg.group(1) in vreg_map:
-                        name = vreg_map[vreg.group(1)]
-                    else:
-                        name = mangle_vreg(m.group("opcode"), vreg_map.values())
-                        vreg_map[vreg.group(1)] = name
-                    func_line = func_line.replace(
-                        vreg.group(1), "[[{}:%[0-9]+]]".format(name), 1
-                    )
-            for number, name in vreg_map.items():
-                func_line = re.sub(
-                    r"{}\b".format(number), "[[{}]]".format(name), func_line
-                )
-            mangled.append(func_line)
-        body = "".join(mangled)
-
-        for prefix in prefixes:
-            info = common.function_body(
-                body, fixedStack, None, None, None, None, ginfo=None
-            )
-            if func in func_dict[prefix]:
-                if (
-                    not func_dict[prefix][func]
-                    or func_dict[prefix][func].scrub != info.scrub
-                    or func_dict[prefix][func].extrascrub != info.extrascrub
-                ):
-                    func_dict[prefix][func] = None
-            else:
-                func_dict[prefix][func] = info
-
-
-def mangle_vreg(opcode, current_names):
-    base = opcode
-    # Simplify some common prefixes and suffixes
-    if opcode.startswith("G_"):
-        base = base[len("G_") :]
-    if opcode.endswith("_PSEUDO"):
-        base = base[: len("_PSEUDO")]
-    # Shorten some common opcodes with long-ish names
-    base = dict(
-        IMPLICIT_DEF="DEF",
-        GLOBAL_VALUE="GV",
-        CONSTANT="C",
-        FCONSTANT="C",
-        MERGE_VALUES="MV",
-        UNMERGE_VALUES="UV",
-        INTRINSIC="INT",
-        INTRINSIC_W_SIDE_EFFECTS="INT",
-        INSERT_VECTOR_ELT="IVEC",
-        EXTRACT_VECTOR_ELT="EVEC",
-        SHUFFLE_VECTOR="SHUF",
-    ).get(base, base)
-    # Avoid ambiguity when opcodes end in numbers
-    if len(base.rstrip("0123456789")) < len(base):
-        base += "_"
-
-    i = 0
-    for name in current_names:
-        if name.rstrip("0123456789") == base:
-            i += 1
-    if i:
-        return "{}{}".format(base, i)
-    return base
-
-
 def update_test_file(args, test, autogenerated_note):
     with open(test) as fd:
         input_lines = [l.rstrip() for l in fd]
@@ -247,7 +132,7 @@ def update_test_file(args, test, autogenerated_note):
             common.warn("No triple found: skipping file", test_file=test)
             return
 
-        build_function_info_dictionary(
+        mir.build_function_info_dictionary(
             test,
             raw_tool_output,
             triple_in_cmd or triple_in_ir,
@@ -259,7 +144,7 @@ def update_test_file(args, test, autogenerated_note):
     prefix_set = set([prefix for run in run_list for prefix in run[0]])
     log("Rewriting FileCheck prefixes: {}".format(prefix_set), args.verbose)
 
-    output_lines = common.add_mir_checks(
+    output_lines = mir.add_mir_checks(
         input_lines,
         prefix_set,
         autogenerated_note,
diff --git a/llvm/utils/update_test_checks.py b/llvm/utils/update_test_checks.py
index 3b562fbc54f78..74e87787fd5b8 100755
--- a/llvm/utils/update_test_checks.py
+++ b/llvm/utils/update_test_checks.py
@@ -197,6 +197,7 @@ def update_test(ti: common.TestInfo):
                     global_tbaa_records_for_prefixes,
                     is_filtered=builder.is_filtered(),
                     original_check_lines=original_check_lines.get(func, {}),
+                    check_inst_comments=args.check_inst_comments,
                 ),
             )
         )
@@ -230,6 +231,7 @@ def update_test(ti: common.TestInfo):
                         global_tbaa_records_for_prefixes,
                         is_filtered=builder.is_filtered(),
                         original_check_lines=original_check_lines.get(func_name, {}),
+                        check_inst_comments=args.check_inst_comments,
                     )
                 )
                 is_in_function_start = False
@@ -260,9 +262,17 @@ def update_test(ti: common.TestInfo):
                 skip_same_checks=dropped_previous_line,
             ):
                 # This input line of the function body will go as-is into the output.
-                # Except make leading whitespace uniform: 2 spaces. 4 for debug records.
+                # Except make leading whitespace uniform: 2 spaces. 4 for debug records/switch cases.
                 indent = (
-                    "  " if not common.IS_DEBUG_RECORD_RE.match(input_line) else "    "
+                    " " * 4
+                    if (
+                        common.IS_DEBUG_RECORD_RE.match(input_line)
+                        or (
+                            ti.args.version > 6
+                            and common.IS_SWITCH_CASE_RE.match(input_line)
+                        )
+                    )
+                    else " " * 2
                 )
                 input_line = common.SCRUB_LEADING_WHITESPACE_RE.sub(indent, input_line)
                 output_lines.append(input_line)
@@ -354,6 +364,12 @@ def main():
         choices=["none", "smart", "all"],
         help="Check global entries (global variables, metadata, attribute sets, ...) for functions",
     )
+    parser.add_argument(
+        "--check-inst-comments",
+        action="store_true",
+        default=False,
+        help="Check the generated comments describing instructions (e.g., -print-predicate-info/print<memssa>)",
+    )
     parser.add_argument(
         "--reset-variable-names",
         action="store_true",
diff --git a/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h b/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h
index 8bcfe51ad7cd1..3c87c453a4cf0 100644
--- a/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h
+++ b/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h
@@ -397,7 +397,7 @@ class AbstractDenseBackwardDataFlowAnalysis : public DataFlowAnalysis {
   /// itself.
   virtual void visitRegionBranchControlFlowTransfer(
       RegionBranchOpInterface branch, RegionBranchPoint regionFrom,
-      RegionBranchPoint regionTo, const AbstractDenseLattice &after,
+      RegionSuccessor regionTo, const AbstractDenseLattice &after,
       AbstractDenseLattice *before) {
     meet(before, after);
   }
@@ -526,7 +526,7 @@ class DenseBackwardDataFlowAnalysis
   /// and "to" regions.
   virtual void visitRegionBranchControlFlowTransfer(
       RegionBranchOpInterface branch, RegionBranchPoint regionFrom,
-      RegionBranchPoint regionTo, const LatticeT &after, LatticeT *before) {
+      RegionSuccessor regionTo, const LatticeT &after, LatticeT *before) {
     AbstractDenseBackwardDataFlowAnalysis::visitRegionBranchControlFlowTransfer(
         branch, regionFrom, regionTo, after, before);
   }
@@ -571,7 +571,7 @@ class DenseBackwardDataFlowAnalysis
   }
   void visitRegionBranchControlFlowTransfer(
       RegionBranchOpInterface branch, RegionBranchPoint regionForm,
-      RegionBranchPoint regionTo, const AbstractDenseLattice &after,
+      RegionSuccessor regionTo, const AbstractDenseLattice &after,
       AbstractDenseLattice *before) final {
     visitRegionBranchControlFlowTransfer(branch, regionForm, regionTo,
                                          static_cast<const LatticeT &>(after),
diff --git a/mlir/include/mlir/Analysis/DataFlow/SparseAnalysis.h b/mlir/include/mlir/Analysis/DataFlow/SparseAnalysis.h
index 1a33ecf8b5aa9..985573476ab78 100644
--- a/mlir/include/mlir/Analysis/DataFlow/SparseAnalysis.h
+++ b/mlir/include/mlir/Analysis/DataFlow/SparseAnalysis.h
@@ -286,7 +286,7 @@ class AbstractSparseForwardDataFlowAnalysis : public DataFlowAnalysis {
   /// and propagating therefrom.
   virtual void
   visitRegionSuccessors(ProgramPoint *point, RegionBranchOpInterface branch,
-                        RegionBranchPoint successor,
+                        RegionSuccessor successor,
                         ArrayRef<AbstractSparseLattice *> lattices);
 };
 
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 37db096f1ba75..45cb67f0eee4a 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -912,9 +912,10 @@ def ScaledMFMAInTypes : AnyTypeOf<[VectorOfLengthAndType<[32], [F8E5M2, F8E4M3FN
                                    VectorOfLengthAndType<[32], [F6E2M3FN, F6E3M2FN, F4E2M1FN]>]>;
 def ScaledMFMAOutTypes : AnyTypeOf<[VectorOfLengthAndType<[4, 16], [F32]>]>;
 // wmma
-def WMMAInTypes : AnyTypeOf<[VectorOfLengthAndType<[4, 8, 16], [F16, BF16]>,
-                             VectorOfLengthAndType<[4, 8, 16], [I8, SI8, UI8]>,
-                             VectorOfLengthAndType<[4, 8], [F8E4M3FN, F8E5M2]>,
+def WMMAInTypes : AnyTypeOf<[VectorOfLengthAndType<[2], [F32]>,
+                             VectorOfLengthAndType<[4, 8, 16], [F16, BF16]>,
+                             VectorOfLengthAndType<[4, 8, 16, 32], [I8, SI8, UI8]>,
+                             VectorOfLengthAndType<[4, 8, 32, 64], [F8E4M3FN, F8E5M2]>,
                              VectorOfLengthAndType<[4, 8, 16], [I<4>, SI<4>, UI<4>]>]>;
 def WMMAOutTypes : AnyTypeOf<[VectorOfLengthAndType<[4, 8], [F32, I32]>,
                               VectorOfLengthAndType<[4, 8, 16], [F16, BF16]>]>;
@@ -992,7 +993,7 @@ def AMDGPU_WMMAOp :
     Arguments<(ins
                    ConfinedAttr<I32Attr, [IntIsOneOf<[16]>]>:$m,
                    ConfinedAttr<I32Attr, [IntIsOneOf<[16]>]>:$n,
-                   ConfinedAttr<I32Attr, [IntIsOneOf<[16, 32]>]>:$k,
+                   ConfinedAttr<I32Attr, [IntIsOneOf<[4, 16, 32, 64, 128]>]>:$k,
                    WMMAInTypes:$sourceA,
                    WMMAInTypes:$sourceB,
                    WMMAOutTypes:$destC,
@@ -1005,8 +1006,14 @@ def AMDGPU_WMMAOp :
   let description = [{
     The `amdgpu.wmma` op is an MLIR wrapper around intrinsics for various `wmma`
     instructions in the AMDGPU architecture, which perform matrix multiplication.
-    Note that all wmma intrinsics have M=N=16 dimensions but vary by in allowed K
-    dimensions.
+
+    On gfx11/RDNA3, wmma intrinsics have M=N=K=16 dimensions.
+
+    On gfx12/RDNA4, wmma intrinsics have M=N=16 dimensions and support K=16 for
+    all element types, and K=32 for i4 sources.
+
+    On gfx1250, wmma intrinsics have M=N=16 and K dimensions of 4, 32, 64, or 128,
+    depending on the element types.
 
     On gfx11/RDNA3, emitting f16->f16 (or bf16->bf16) wmma the output is a 16xf16
     (or 16xbf16) vector containing only 8 valid values:
@@ -1022,7 +1029,13 @@ def AMDGPU_WMMAOp :
 
     Example:
     ```mlir
-      %0 = amdgpu.wmma 16x16x16 %matA * %matB + %matC : vector<16xf16>, vector<16xf16>, vector<8xf16>
+      %0 = amdgpu.wmma 16x16x16 %matA * %matB + %matC : vector<8xf16>, vector<8xf16>, vector<8xf16>
+
+      %1 = amdgpu.wmma 16x16x64 %matD * %matE + %matF : vector<32xi8>, vector<8xf32>, vector<8xf32>
+
+      %2 = amdgpu.wmma 16x16x128 %matG * %matH + %matI : vector<64xf4E2M1FN>, vector<64xf4E2M1FN>, vector<8xf32>
+
+      %3 = amdgpu.wmma 16x16x4 %matJ * %matK + %matL : vector<2xf32>, vector<2xf32>, vector<8xf32>
     ```
   }];
   let assemblyFormat = [{
diff --git a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
index 12a79358d42f1..409bd05292e0d 100644
--- a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
+++ b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
@@ -714,7 +714,7 @@ def AffineParallelOp : Affine_Op<"parallel",
     operand_range getUpperBoundsOperands();
     AffineValueMap getUpperBoundsValueMap();
 
-    /// Sets elements fo the loop upper bound.
+    /// Sets elements of the loop upper bound.
     void setUpperBounds(ValueRange operands, AffineMap map);
 
     void setSteps(ArrayRef<int64_t> newSteps);
@@ -999,7 +999,7 @@ def AffineVectorStoreOp : AffineStoreOpBase<"vector_store"> {
     elemental type, supplied as its second operand.
     The index for each memref dimension is an affine expression of loop
     induction variables and symbols. These indices determine the start position
-    of the write within the memref. The shape of th input vector determines the
+    of the write within the memref. The shape of the input vector determines the
     shape of the slice written to the memref. This slice is contiguous along the
     respective dimensions of the shape. Strided vector stores will be supported
     in the future.
@@ -1188,7 +1188,7 @@ def AffineLinearizeIndexOp : Affine_Op<"linearize_index",
     If all `N` basis elements are provided, the linearize_index operation is said to
     "have an outer bound".
 
-    As a convenience, and for symmetry with `getPaddedBasis()`, ifg the first
+    As a convenience, and for symmetry with `getPaddedBasis()`, if the first
     element of a set of `OpFoldResult`s passed to the builders of this operation is
     `nullptr`, that element is ignored.
 
diff --git a/mlir/include/mlir/Dialect/ControlFlow/Transforms/StructuralTypeConversions.h b/mlir/include/mlir/Dialect/ControlFlow/Transforms/StructuralTypeConversions.h
new file mode 100644
index 0000000000000..a32d9e2025c76
--- /dev/null
+++ b/mlir/include/mlir/Dialect/ControlFlow/Transforms/StructuralTypeConversions.h
@@ -0,0 +1,48 @@
+//===- StructuralTypeConversions.h - CF Type Conversions --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_CONTROL_FLOW_TRANSFORMS_STRUCTURAL_TYPE_CONVERSIONS_H
+#define MLIR_DIALECT_CONTROL_FLOW_TRANSFORMS_STRUCTURAL_TYPE_CONVERSIONS_H
+
+#include "mlir/IR/PatternMatch.h"
+
+namespace mlir {
+
+class ConversionTarget;
+class TypeConverter;
+
+namespace cf {
+
+/// Populates patterns for CF structural type conversions and sets up the
+/// provided ConversionTarget with the appropriate legality configuration for
+/// the ops to get converted properly.
+///
+/// A "structural" type conversion is one where the underlying ops are
+/// completely agnostic to the actual types involved and simply need to update
+/// their types. An example of this is cf.br -- the cf.br op needs to update
+/// its types accordingly to the TypeConverter, but otherwise does not care
+/// what type conversions are happening.
+void populateCFStructuralTypeConversionsAndLegality(
+    const TypeConverter &typeConverter, RewritePatternSet &patterns,
+    ConversionTarget &target, PatternBenefit benefit = 1);
+
+/// Similar to `populateCFStructuralTypeConversionsAndLegality` but does not
+/// populate the conversion target.
+void populateCFStructuralTypeConversions(const TypeConverter &typeConverter,
+                                         RewritePatternSet &patterns,
+                                         PatternBenefit benefit = 1);
+
+/// Updates the ConversionTarget with dynamic legality of CF operations based
+/// on the provided type converter.
+void populateCFStructuralTypeConversionTarget(
+    const TypeConverter &typeConverter, ConversionTarget &target);
+
+} // namespace cf
+} // namespace mlir
+
+#endif // MLIR_DIALECT_CONTROL_FLOW_TRANSFORMS_STRUCTURAL_TYPE_CONVERSIONS_H
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
index 7b53594a1c8e2..a9886d1f21ca0 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
@@ -178,8 +178,8 @@ enum class SparseHandleKind { SpMat, DnTensor, SpGEMMOp };
 class SparseDnTensorHandleType
     : public Type::TypeBase<SparseDnTensorHandleType, Type, TypeStorage> {
 public:
-  using Base = typename Type::TypeBase<SparseDnTensorHandleType, Type,
-                                       TypeStorage>::Base;
+  using Base =
+      Type::TypeBase<SparseDnTensorHandleType, Type, TypeStorage>::Base;
   using Base::Base;
 
   static constexpr StringLiteral name = "gpu.sparse.dntensor_handle";
@@ -188,8 +188,7 @@ class SparseDnTensorHandleType
 class SparseSpMatHandleType
     : public Type::TypeBase<SparseSpMatHandleType, Type, TypeStorage> {
 public:
-  using Base =
-      typename Type::TypeBase<SparseSpMatHandleType, Type, TypeStorage>::Base;
+  using Base = Type::TypeBase<SparseSpMatHandleType, Type, TypeStorage>::Base;
   using Base::Base;
 
   static constexpr StringLiteral name = "gpu.sparse.spmat_handle";
@@ -198,8 +197,8 @@ class SparseSpMatHandleType
 class SparseSpGEMMOpHandleType
     : public Type::TypeBase<SparseSpGEMMOpHandleType, Type, TypeStorage> {
 public:
-  using Base = typename Type::TypeBase<SparseSpGEMMOpHandleType, Type,
-                                       TypeStorage>::Base;
+  using Base =
+      Type::TypeBase<SparseSpGEMMOpHandleType, Type, TypeStorage>::Base;
   using Base::Base;
 
   static constexpr StringLiteral name = "gpu.sparse.spgemmop_handle";
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 4f483859ac18d..ba5e48e4ec9ba 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -579,7 +579,8 @@ def NVVM_PMEventOp : NVVM_PTXBuilder_Op<"pmevent">,
 
 /// mbarrier.init instruction with generic pointer type
 def NVVM_MBarrierInitOp : NVVM_PTXBuilder_Op<"mbarrier.init">,
-  Arguments<(ins LLVM_AnyPointer:$addr, I32:$count, PtxPredicate:$predicate)> {
+  Arguments<(ins AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr,
+                 I32:$count, PtxPredicate:$predicate)> {
   let summary = "MBarrier Initialization Op";
   let description = [{
     The `nvvm.mbarrier.init` operation initializes an *mbarrier object* at the specified 
@@ -592,48 +593,35 @@ def NVVM_MBarrierInitOp : NVVM_PTXBuilder_Op<"mbarrier.init">,
     - Transaction count (tx-count): 0
 
     The operation takes the following operands:
-    - `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic 
-      addressing, but the address must still be in the shared memory space.
+    - `addr`: A pointer to the memory location of the *mbarrier object*. The `addr`
+      must be a pointer to generic or shared::cta memory. When it is generic, the
+      underlying address must be within the shared::cta memory space; otherwise
+      the behavior is undefined.
     - `count`: Integer specifying the number of threads that will participate in barrier
       synchronization. Must be in the range [1, 2²⁰ - 1].
     - `predicate`: Optional predicate for conditional execution.
 
     [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init)
   }];
-  string llvmBuilder = [{
-      createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_init, {$addr, $count});
-  }];
   let assemblyFormat = "$addr `,` $count (`,` `predicate` `=` $predicate^)? attr-dict `:` type(operands)";
+
   let extraClassDeclaration = [{
     bool hasIntrinsic() { if(getPredicate()) return false; return true; }
-  }];
-  let extraClassDefinition = [{
-    std::string $cppClass::getPtx() { return std::string("mbarrier.init.b64 [%0], %1;"); }
-  }];
-}
-
-/// mbarrier.init instruction with shared pointer type
-def NVVM_MBarrierInitSharedOp : NVVM_PTXBuilder_Op<"mbarrier.init.shared", [NVVMRequiresSM<80>, DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>]>,
-  Arguments<(ins LLVM_PointerShared:$addr, I32:$count, PtxPredicate:$predicate)> {
-  let summary = "Shared MBarrier Initialization Op";
-  let description = [{
-    This Op is the same as `nvvm.mbarrier.init` except that the *mbarrier object*
-    should be accessed using a shared-memory pointer instead of a generic-memory pointer.
 
-    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init)
+    static mlir::NVVM::IDArgPair
+    getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
+                          llvm::IRBuilderBase& builder);
   }];
+
   string llvmBuilder = [{
-      createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_init_shared, {$addr, $count});
-  }];
-  let assemblyFormat = "$addr `,` $count (`,` `predicate` `=` $predicate^)? attr-dict `:` type(operands)";
-  let extraClassDeclaration = "bool hasIntrinsic() { return !getPredicate(); }";
-  let extraClassDefinition = [{
-    std::string $cppClass::getPtx() { return std::string("mbarrier.init.shared.b64 [%0], %1;"); }
+    auto [id, args] = NVVM::MBarrierInitOp::getIntrinsicIDAndArgs(
+                      *op, moduleTranslation, builder);
+    createIntrinsicCall(builder, id, args);
   }];
 }
 
 def NVVM_MBarrierInvalOp : NVVM_Op<"mbarrier.inval">,
-  Arguments<(ins LLVM_AnyPointer:$addr)> {
+  Arguments<(ins AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr)> {
   let summary = "MBarrier Invalidation Operation";
   let description = [{
     The `nvvm.mbarrier.inval` operation invalidates an *mbarrier object* at the 
@@ -644,30 +632,27 @@ def NVVM_MBarrierInvalOp : NVVM_Op<"mbarrier.inval">,
     It is undefined behavior if the *mbarrier object* is already invalid.
     
     The operation takes the following operand:
-    - `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic 
-      addressing, but the address must still be in the shared memory space.
+    - `addr`: A pointer to the memory location of the *mbarrier object*. The `addr`
+      must be a pointer to generic or shared::cta memory. When it is generic, the
+      underlying address must be within the shared::cta memory space; otherwise
+      the behavior is undefined.
 
     [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-inval)
   }];
-  string llvmBuilder = [{
-      createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_inval, {$addr});
-  }];
-  let assemblyFormat = "$addr attr-dict `:` type(operands)";
-}
 
-def NVVM_MBarrierInvalSharedOp : NVVM_Op<"mbarrier.inval.shared">,
-  Arguments<(ins LLVM_PointerShared:$addr)> {
-  let summary = "Shared MBarrier Invalidation Operation";
-  let description = [{
-    This Op is the same as `nvvm.mbarrier.inval` except that the *mbarrier object*
-    should be accessed using a shared-memory pointer instead of a generic-memory pointer.
+  let assemblyFormat = "$addr attr-dict `:` type(operands)";
 
-    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-inval)
+  let extraClassDeclaration = [{
+    static mlir::NVVM::IDArgPair
+    getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
+                          llvm::IRBuilderBase& builder);
   }];
+
   string llvmBuilder = [{
-      createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_inval_shared, {$addr});
+    auto [id, args] = NVVM::MBarrierInvalOp::getIntrinsicIDAndArgs(
+                      *op, moduleTranslation, builder);
+    createIntrinsicCall(builder, id, args);
   }];
-  let assemblyFormat = "$addr attr-dict `:` type(operands)";
 }
 
 def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive">,
@@ -2014,6 +1999,9 @@ class MMA_LDST_OPS<list<GEOM> Geom, list<string> Frags, list<string> Types> {
 // llvm supports and can be extended as needed.
 class NVVM_MMA_OPS {
   // "wmma" operations
+  list<list<WMMA_REGS>> fp64_wmma_ops = MMA_OPS<
+            [GEOM<8, 8, 4>],
+            ["f64"], [], ["f64"], []>.ret;
   list<list<WMMA_REGS>> tf32_wmma_ops = MMA_OPS<
             [GEOM<16, 16, 8>],
             ["tf32"], [], ["f32"], []>.ret;
@@ -2024,6 +2012,7 @@ class NVVM_MMA_OPS {
             [GEOM<16, 16, 16>, GEOM<32, 8, 16>, GEOM<8, 32, 16>],
             ["s8","u8"], [], ["s32"], []>.ret;
   list<list<WMMA_REGS>> all_wmma_ops = !listconcat(
+            fp64_wmma_ops,
             tf32_wmma_ops,
             fp_wmma_ops,
             i8_wmma_ops);
@@ -2040,9 +2029,17 @@ class NVVM_MMA_OPS {
   list<WMMA_REGS> ldst_tf32_cd_ops = MMA_LDST_OPS<
             [GEOM<16, 16, 8>],
             ["c", "d"], ["f32"]>.ret;
+  list<WMMA_REGS> ldst_f64_ab_ops = MMA_LDST_OPS<
+            [GEOM<8, 8, 4>],
+            ["a", "b"], ["f64"]>.ret;
+  list<WMMA_REGS> ldst_f64_cd_ops = MMA_LDST_OPS<
+            [GEOM<8, 8, 4>],
+            ["c", "d"], ["f64"]>.ret;
   list<WMMA_REGS> all_ldst_ops = !listconcat(ldst_ab_ops, ldst_cd_ops,
                                              ldst_tf32_ab_ops,
-                                             ldst_tf32_cd_ops);
+                                             ldst_tf32_cd_ops, 
+                                             ldst_f64_ab_ops,
+                                             ldst_f64_cd_ops);
   // Separate A/B/C fragments (loads) from D (stores).
   list<WMMA_REGS> all_ld_ops = !filter(op, all_ldst_ops, !ne(op.frag, "d"));
   list<WMMA_REGS> all_st_ops = !filter(op, all_ldst_ops, !eq(op.frag, "d"));
@@ -2349,7 +2346,7 @@ def MMAFragAttr : EnumAttr<NVVM_Dialect, MMAFrag, "mma_frag"> {
 }
 
 def NVVM_WMMALoadOp: NVVM_Op<"wmma.load">,
-  Results<(outs LLVM_AnyStruct:$res)>,
+  Results<(outs AnyTypeOf<[LLVM_AnyStruct, F64]>:$res)>,
   Arguments<(ins LLVM_AnyPointer: $ptr, I32: $stride, I32Attr:$m,
              I32Attr:$n, I32Attr:$k, MMALayoutAttr:$layout,
              MMATypesAttr:$eltype, MMAFragAttr:$frag)> {
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index d2df244eb9363..5241f9a6f2b43 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -146,6 +146,35 @@ class ROCDL_DimGetterFunctionOp<string mnemonic, string device_function,
   ];
 }
 
+//===----------------------------------------------------------------------===//
+// ROCDL vector types definitions
+//===----------------------------------------------------------------------===//
+
+class ROCDL_ConcreteVector<Type elem, int length> :
+  FixedVectorOfLengthAndType<[length], [elem]>,
+  BuildableType<
+    "::mlir::VectorType::get({" # length # "} ,"
+      # elem.builderCall # ")">;
+
+def ROCDL_V2I16Type : ROCDL_ConcreteVector<I16, 2>;
+def ROCDL_V2F16Type : ROCDL_ConcreteVector<F16, 2>;
+def ROCDL_V2I32Type : ROCDL_ConcreteVector<I32, 2>;
+def ROCDL_V2BF16Type : ROCDL_ConcreteVector<BF16, 2>;
+def ROCDL_V2F32Type : ROCDL_ConcreteVector<F32, 2>;
+def ROCDL_V3I32Type : ROCDL_ConcreteVector<I32, 3>;
+def ROCDL_V4I32Type : ROCDL_ConcreteVector<I32, 4>;
+def ROCDL_V6I32Type : ROCDL_ConcreteVector<I32, 6>;
+def ROCDL_V8I32Type : ROCDL_ConcreteVector<I32, 8>;
+def ROCDL_V8BF16Type : ROCDL_ConcreteVector<BF16, 8>;
+def ROCDL_V8F16Type : ROCDL_ConcreteVector<F16, 8>;
+def ROCDL_V8F32Type : ROCDL_ConcreteVector<F32, 8>;
+def ROCDL_V16BF16Type : ROCDL_ConcreteVector<BF16, 16>;
+def ROCDL_V16F16Type : ROCDL_ConcreteVector<F16, 16>;
+def ROCDL_V16F32Type : ROCDL_ConcreteVector<F32, 16>;
+def ROCDL_V32F16Type : ROCDL_ConcreteVector<F16, 32>;
+def ROCDL_V32BF16Type : ROCDL_ConcreteVector<BF16, 32>;
+def ROCDL_V32F32Type : ROCDL_ConcreteVector<F32, 32>;
+
 //===----------------------------------------------------------------------===//
 // Wave-level primitives
 //===----------------------------------------------------------------------===//
@@ -663,6 +692,68 @@ def ROCDL_GlobalLoadLDSOp :
   }];
 }
 
+//===---------------------------------------------------------------------===//
+// Tensor load/store intrinsics (available in GFX1250)
+//===---------------------------------------------------------------------===//
+
+// Base class for tensor load/store operations with 4 descriptor groups.
+class ROCDL_TensorLDSIntrOp<string mnemonic> :
+  ROCDL_IntrOp<mnemonic, [], [], [], 0, 0, 1, 0, [4], ["cachePolicy"]> {
+  dag args = (ins ROCDL_V4I32Type:$dgroup0, ROCDL_V8I32Type:$dgroup1,
+                  ROCDL_V4I32Type:$dgroup2, ROCDL_V4I32Type:$dgroup3,
+                  I32Attr:$cachePolicy);
+  let arguments = !con(args, baseArgs);
+  let summary = "Base class for ROCDL tensor load/store to/from LDS.";
+  let description = [{
+    Moves tiles of tensor data between global memory and LDS. The tile is
+    described by the $dgroup descriptors. 4 $dgroup descriptors allows for
+    movement of up to 5D tensors. $cachePolicy describes the memory scope and an
+    indicator of expected data re-use.
+
+    This op is for gfx1250+ architectures.
+  }];
+  let assemblyFormat = [{
+    attr-dict operands `cachepolicy` $cachePolicy `:` type($dgroup0) `,` type($dgroup1)
+  }];
+  let extraClassDefinition = [{
+    SmallVector<Value> $cppClass::getAccessedOperands() {
+      return {getDgroup0(), getDgroup1(), getDgroup2(), getDgroup3()};
+    }
+  }];
+}
+
+// Base class for tensor load/store operations with 2 descriptor groups
+// (D2 variant).
+class ROCDL_TensorLDSIntrD2Op<string mnemonic> :
+  ROCDL_IntrOp<mnemonic, [], [], [], 0, 0, 1, 0, [2], ["cachePolicy"]> {
+  dag args = (ins ROCDL_V4I32Type:$dgroup0, ROCDL_V8I32Type:$dgroup1,
+                  I32Attr:$cachePolicy);
+  let arguments = !con(args, baseArgs);
+  let summary = "Base class for ROCDL tensor load/store to/from LDS (D2 variant).";
+  let description = [{
+    Moves tiles of tensor data between global memory and LDS. The tile is
+    described by the $dgroup descriptors. 2 $dgroup descriptors allows for
+    movement of up to 2D tensors. $cachePolicy describes the memory scope and an
+    indicator of expected data re-use.
+
+    This op is for gfx1250+ architectures.
+  }];
+  let assemblyFormat = [{
+    attr-dict operands `cachepolicy` $cachePolicy `:` type($dgroup0) `,` type($dgroup1)
+  }];
+  let extraClassDefinition = [{
+    SmallVector<Value> $cppClass::getAccessedOperands() {
+      return {getDgroup0(), getDgroup1()};
+    }
+  }];
+}
+
+// Tensor load and store operations
+def ROCDL_TensorLoadToLDSOp : ROCDL_TensorLDSIntrOp<"tensor.load.to.lds">;
+def ROCDL_TensorStoreFromLDSOp : ROCDL_TensorLDSIntrOp<"tensor.store.from.lds">;
+def ROCDL_TensorLoadToLDSD2Op : ROCDL_TensorLDSIntrD2Op<"tensor.load.to.lds.d2">;
+def ROCDL_TensorStoreFromLDSD2Op : ROCDL_TensorLDSIntrD2Op<"tensor.store.from.lds.d2">;
+
 //===---------------------------------------------------------------------===//
 // Operations on raw buffer resources (stride of 0, bounds checks either off or in
 // raw buffer mode).
@@ -932,30 +1023,6 @@ def ROCDL_Permlane32SwapOp : ROCDL_IntrOp<"permlane32.swap", [], [],
   }];
 }
 
-class ROCDL_ConcreteVector<Type elem, int length> :
-  FixedVectorOfLengthAndType<[length], [elem]>,
-  BuildableType<
-    "::mlir::VectorType::get({" # length # "} ,"
-      # elem.builderCall # ")">;
-
-def ROCDL_V2I16Type : ROCDL_ConcreteVector<I16, 2>;
-def ROCDL_V2F16Type : ROCDL_ConcreteVector<F16, 2>;
-def ROCDL_V2I32Type : ROCDL_ConcreteVector<I32, 2>;
-def ROCDL_V2BF16Type : ROCDL_ConcreteVector<BF16, 2>;
-def ROCDL_V2F32Type : ROCDL_ConcreteVector<F32, 2>;
-def ROCDL_V3I32Type : ROCDL_ConcreteVector<I32, 3>;
-def ROCDL_V6I32Type : ROCDL_ConcreteVector<I32, 6>;
-def ROCDL_V8I32Type : ROCDL_ConcreteVector<I32, 8>;
-def ROCDL_V8BF16Type : ROCDL_ConcreteVector<BF16, 8>;
-def ROCDL_V8F16Type : ROCDL_ConcreteVector<F16, 8>;
-def ROCDL_V8F32Type : ROCDL_ConcreteVector<F32, 8>;
-def ROCDL_V16BF16Type : ROCDL_ConcreteVector<BF16, 16>;
-def ROCDL_V16F16Type : ROCDL_ConcreteVector<F16, 16>;
-def ROCDL_V16F32Type : ROCDL_ConcreteVector<F32, 16>;
-def ROCDL_V32F16Type : ROCDL_ConcreteVector<F16, 32>;
-def ROCDL_V32BF16Type : ROCDL_ConcreteVector<BF16, 32>;
-def ROCDL_V32F32Type : ROCDL_ConcreteVector<F32, 32>;
-
 //===---------------------------------------------------------------------===//
 // 16-bit float intrinsics
 //===---------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
index f3674c3eecfe6..ecd036d452b27 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
@@ -293,10 +293,6 @@ def MapOp : LinalgStructuredBase_Op<"map", [
     // Implement functions necessary for DestinationStyleOpInterface.
     MutableOperandRange getDpsInitsMutable() { return getInitMutable(); }
 
-    SmallVector<OpOperand *> getOpOperandsMatchingBBargs() {
-      return getDpsInputOperands();
-    }
-
     bool payloadUsesValueFromOperand(OpOperand * opOperand) {
       if (isDpsInit(opOperand)) return false;
       return !getMatchingBlockArgument(opOperand).use_empty();
diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index c89fc59c91830..d00183a1e16a1 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -653,6 +653,9 @@ struct PadTilingInterfaceResult {
 //    interpreted as the bounding box (dynamic) value to pad to.
 /// * Use "options.paddingValues" to set the padding value of the created
 //    tensor::PadOp.
+//
+// The transformation assumes that the insertion point is set after the
+// operation to pad.
 FailureOr<PadTilingInterfaceResult>
 rewriteAsPaddedOp(OpBuilder &, TilingInterface toPad,
                   PadTilingInterfaceOptions options,
diff --git a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
index b39207fc30dd7..e00f3c1526005 100644
--- a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
+++ b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
@@ -323,8 +323,8 @@ def MemRef_ReallocOp : MemRef_Op<"realloc"> {
 
     ```mlir
     %new = memref.realloc %old : memref<64xf32> to memref<124xf32>
-    %4 = memref.load %new[%index]   // ok
-    %5 = memref.load %old[%index]   // undefined behavior
+    %4 = memref.load %new[%index] : memref<124xf32> // ok
+    %5 = memref.load %old[%index] : memref<64xf32>  // undefined behavior
     ```
   }];
 
@@ -445,9 +445,10 @@ def MemRef_AllocaScopeOp : MemRef_Op<"alloca_scope",
     operation:
 
     ```mlir
-    %result = memref.alloca_scope {
+    %result = memref.alloca_scope -> f32 {
+      %value = arith.constant 1.0 : f32
       ...
-      memref.alloca_scope.return %value
+      memref.alloca_scope.return %value : f32
     }
     ```
 
@@ -478,7 +479,7 @@ def MemRef_AllocaScopeReturnOp : MemRef_Op<"alloca_scope.return",
     to indicate which values are going to be returned. For example:
 
     ```mlir
-    memref.alloca_scope.return %value
+    memref.alloca_scope.return %value : f32
     ```
   }];
 
@@ -543,11 +544,11 @@ def MemRef_CastOp : MemRef_Op<"cast", [
     Example:
 
     ```mlir
-    Cast to concrete shape.
-        %4 = memref.cast %1 : memref<*xf32> to memref<4x?xf32>
+    // Cast to concrete shape.
+    %4 = memref.cast %1 : memref<*xf32> to memref<4x?xf32>
 
-    Erase rank information.
-        %5 = memref.cast %1 : memref<4x?xf32> to memref<*xf32>
+    // Erase rank information.
+    %5 = memref.cast %1 : memref<4x?xf32> to memref<*xf32>
     ```
   }];
 
@@ -613,8 +614,8 @@ def MemRef_DeallocOp : MemRef_Op<"dealloc", [MemRefsNormalizable]> {
     Example:
 
     ```mlir
-    %0 = memref.alloc() : memref<8x64xf32, affine_map<(d0, d1) -> (d0, d1), 1>>
-    memref.dealloc %0 : memref<8x64xf32,  affine_map<(d0, d1) -> (d0, d1), 1>>
+    %0 = memref.alloc() : memref<8x64xf32, affine_map<(d0, d1) -> (d0, d1)>, 1>
+    memref.dealloc %0 : memref<8x64xf32,  affine_map<(d0, d1) -> (d0, d1)>, 1>
     ```
   }];
 
@@ -728,13 +729,13 @@ def MemRef_DmaStartOp : MemRef_Op<"dma_start"> {
     space 1 at indices [%k, %l], would be specified as follows:
 
     ```mlir
-    %num_elements = arith.constant 256
+    %num_elements = arith.constant 256 : index
     %idx = arith.constant 0 : index
-    %tag = memref.alloc() : memref<1 x i32, affine_map<(d0) -> (d0)>, 4>
-    dma_start %src[%i, %j], %dst[%k, %l], %num_elements, %tag[%idx] :
-      memref<40 x 128 x f32>, affine_map<(d0) -> (d0)>, 0>,
-      memref<2 x 1024 x f32>, affine_map<(d0) -> (d0)>, 1>,
-      memref<1 x i32>, affine_map<(d0) -> (d0)>, 2>
+    %tag = memref.alloc() : memref<1 x i32, affine_map<(d0) -> (d0)>, 2>
+    memref.dma_start %src[%i, %j], %dst[%k, %l], %num_elements, %tag[%idx] :
+      memref<40 x 128 x f32, affine_map<(d0, d1) -> (d0, d1)>, 0>,
+      memref<2 x 1024 x f32, affine_map<(d0, d1) -> (d0, d1)>, 1>,
+      memref<1 x i32, affine_map<(d0) -> (d0)>, 2>
     ```
 
     If %stride and %num_elt_per_stride are specified, the DMA is expected to
@@ -742,8 +743,8 @@ def MemRef_DmaStartOp : MemRef_Op<"dma_start"> {
     memory space 0 until %num_elements are transferred.
 
     ```mlir
-    dma_start %src[%i, %j], %dst[%k, %l], %num_elements, %tag[%idx], %stride,
-              %num_elt_per_stride :
+    memref.dma_start %src[%i, %j], %dst[%k, %l], %num_elements, %tag[%idx], %stride,
+                     %num_elt_per_stride :
     ```
 
     * TODO: add additional operands to allow source and destination striding, and
@@ -891,10 +892,10 @@ def MemRef_DmaWaitOp : MemRef_Op<"dma_wait"> {
    Example:
 
    ```mlir
-    dma_start %src[%i, %j], %dst[%k, %l], %num_elements, %tag[%index] :
-      memref<2048 x f32>, affine_map<(d0) -> (d0)>, 0>,
-      memref<256 x f32>, affine_map<(d0) -> (d0)>, 1>
-      memref<1 x i32>, affine_map<(d0) -> (d0)>, 2>
+    memref.dma_start %src[%i, %j], %dst[%k, %l], %num_elements, %tag[%index] :
+      memref<2048 x f32, affine_map<(d0) -> (d0)>, 0>,
+      memref<256 x f32, affine_map<(d0) -> (d0)>, 1>,
+      memref<1 x i32, affine_map<(d0) -> (d0)>, 2>
     ...
     ...
     dma_wait %tag[%index], %num_elements : memref<1 x i32, affine_map<(d0) -> (d0)>, 2>
@@ -1004,8 +1005,8 @@ def MemRef_ExtractStridedMetadataOp : MemRef_Op<"extract_strided_metadata", [
 
     ```mlir
       %base, %offset, %sizes:2, %strides:2 =
-        memref.extract_strided_metadata %memref :
-          memref<10x?xf32>, index, index, index, index, index
+        memref.extract_strided_metadata %memref : memref<10x?xf32>
+          -> memref<f32>, index, index, index, index, index
 
       // After folding, the type of %m2 can be memref<10x?xf32> and further
       // folded to %memref.
@@ -1013,7 +1014,7 @@ def MemRef_ExtractStridedMetadataOp : MemRef_Op<"extract_strided_metadata", [
           offset: [%offset],
           sizes: [%sizes#0, %sizes#1],
           strides: [%strides#0, %strides#1]
-        : memref<f32> to memref<?x?xf32, offset: ?, strides: [?, ?]>
+        : memref<f32> to memref<?x?xf32, strided<[?, ?], offset:?>>
     ```
   }];
 
@@ -1182,10 +1183,10 @@ def MemRef_GlobalOp : MemRef_Op<"global", [Symbol]> {
 
     ```mlir
     // Private variable with an initial value.
-    memref.global "private" @x : memref<2xf32> = dense<0.0,2.0>
+    memref.global "private" @x : memref<2xf32> = dense<[0.0, 2.0]>
 
     // Private variable with an initial value and an alignment (power of 2).
-    memref.global "private" @x : memref<2xf32> = dense<0.0,2.0> {alignment = 64}
+    memref.global "private" @x : memref<2xf32> = dense<[0.0, 2.0]> {alignment = 64}
 
     // Declaration of an external variable.
     memref.global "private" @y : memref<4xi32>
@@ -1194,7 +1195,7 @@ def MemRef_GlobalOp : MemRef_Op<"global", [Symbol]> {
     memref.global @z : memref<3xf16> = uninitialized
 
     // Externally visible constant variable.
-    memref.global constant @c : memref<2xi32> = dense<1, 4>
+    memref.global constant @c : memref<2xi32> = dense<[1, 4]>
     ```
   }];
 
@@ -1555,7 +1556,8 @@ def MemRef_ReinterpretCastOp
     %dst = memref.reinterpret_cast %src to
       offset: [%offset],
       sizes: [%sizes],
-      strides: [%strides]
+      strides: [%strides] :
+      memref<*xf32> to memref<?x?xf32, strided<[?, ?], offset: ?>>
     ```
     means that `%dst`'s descriptor will be:
     ```mlir
@@ -1695,12 +1697,12 @@ def MemRef_ReshapeOp: MemRef_Op<"reshape", [
     ```mlir
     // Reshape statically-shaped memref.
     %dst = memref.reshape %src(%shape)
-             : (memref<4x1xf32>, memref<1xi32>) to memref<4xf32>
+             : (memref<4x1xf32>, memref<1xi32>) -> memref<4xf32>
     %dst0 = memref.reshape %src(%shape0)
-             : (memref<4x1xf32>, memref<2xi32>) to memref<2x2xf32>
+             : (memref<4x1xf32>, memref<2xi32>) -> memref<2x2xf32>
     // Flatten unranked memref.
     %dst = memref.reshape %src(%shape)
-             : (memref<*xf32>, memref<1xi32>) to memref<?xf32>
+             : (memref<*xf32>, memref<1xi32>) -> memref<?xf32>
     ```
 
     b. Source type is ranked or unranked. Shape argument has dynamic size.
@@ -1709,10 +1711,10 @@ def MemRef_ReshapeOp: MemRef_Op<"reshape", [
     ```mlir
     // Reshape dynamically-shaped 1D memref.
     %dst = memref.reshape %src(%shape)
-             : (memref<?xf32>, memref<?xi32>) to memref<*xf32>
+             : (memref<?xf32>, memref<?xi32>) -> memref<*xf32>
     // Reshape unranked memref.
     %dst = memref.reshape %src(%shape)
-             : (memref<*xf32>, memref<?xi32>) to memref<*xf32>
+             : (memref<*xf32>, memref<?xi32>) -> memref<*xf32>
     ```
   }];
 
diff --git a/mlir/include/mlir/Dialect/OpenACC/Analysis/OpenACCSupport.h b/mlir/include/mlir/Dialect/OpenACC/Analysis/OpenACCSupport.h
index 0833462ea0509..d9b2646b753f3 100644
--- a/mlir/include/mlir/Dialect/OpenACC/Analysis/OpenACCSupport.h
+++ b/mlir/include/mlir/Dialect/OpenACC/Analysis/OpenACCSupport.h
@@ -58,6 +58,9 @@
 namespace mlir {
 namespace acc {
 
+// Forward declaration for RecipeKind enum
+enum class RecipeKind : uint32_t;
+
 namespace detail {
 /// This class contains internal trait classes used by OpenACCSupport.
 /// It follows the Concept-Model pattern used throughout MLIR (e.g., in
@@ -69,6 +72,13 @@ struct OpenACCSupportTraits {
 
     /// Get the variable name for a given MLIR value.
     virtual std::string getVariableName(Value v) = 0;
+
+    /// Get the recipe name for a given kind, type and value.
+    virtual std::string getRecipeName(RecipeKind kind, Type type,
+                                      Value var) = 0;
+
+    // Used to report a case that is not supported by the implementation.
+    virtual InFlightDiagnostic emitNYI(Location loc, const Twine &message) = 0;
   };
 
   /// This class wraps a concrete OpenACCSupport implementation and forwards
@@ -84,6 +94,14 @@ struct OpenACCSupportTraits {
       return impl.getVariableName(v);
     }
 
+    std::string getRecipeName(RecipeKind kind, Type type, Value var) final {
+      return impl.getRecipeName(kind, type, var);
+    }
+
+    InFlightDiagnostic emitNYI(Location loc, const Twine &message) final {
+      return impl.emitNYI(loc, message);
+    }
+
   private:
     ImplT impl;
   };
@@ -118,6 +136,24 @@ class OpenACCSupport {
   /// \return The variable name, or an empty string if unavailable.
   std::string getVariableName(Value v);
 
+  /// Get the recipe name for a given type and value.
+  ///
+  /// \param kind The kind of recipe to get the name for.
+  /// \param type The type to get the recipe name for. Can be null if the
+  ///        var is provided instead.
+  /// \param var The MLIR value to get the recipe name for. Can be null if
+  ///        the type is provided instead.
+  /// \return The recipe name, or an empty string if not available.
+  std::string getRecipeName(RecipeKind kind, Type type, Value var);
+
+  /// Report a case that is not yet supported by the implementation.
+  ///
+  /// \param loc The location to report the unsupported case at.
+  /// \param message The message to report.
+  /// \return An in-flight diagnostic object that can be used to report the
+  ///         unsupported case.
+  InFlightDiagnostic emitNYI(Location loc, const Twine &message);
+
   /// Signal that this analysis should always be preserved so that
   /// underlying implementation registration is not lost.
   bool isInvalidated(const AnalysisManager::PreservedAnalyses &pa) {
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
index 2f87975ebaa04..2f4517ddfe754 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
@@ -152,6 +152,26 @@ def OpenACC_LoopParMode : I32EnumAttr<
   let genSpecializedAttr = 0;
 }
 
+def OpenACC_PrivateRecipe : I32EnumAttrCase<"private_recipe", 0>;
+def OpenACC_FirstprivateRecipe : I32EnumAttrCase<"firstprivate_recipe", 1>;
+def OpenACC_ReductionRecipe : I32EnumAttrCase<"reduction_recipe", 2>;
+
+def OpenACC_RecipeKind : I32EnumAttr<
+    "RecipeKind",
+    "Encodes the options for kinds of recipes availabie in acc dialect",
+    [
+      OpenACC_PrivateRecipe, OpenACC_FirstprivateRecipe,
+      OpenACC_ReductionRecipe]> {
+  let cppNamespace = "::mlir::acc";
+  let genSpecializedAttr = 0;
+}
+
+def OpenACC_RecipeKindAttr : EnumAttr<OpenACC_Dialect,
+                                             OpenACC_RecipeKind,
+                                             "recipe_kind"> {
+  let assemblyFormat = [{ ```<` $value `>` }];
+}
+
 // Type used in operation below.
 def IntOrIndex : AnyTypeOf<[AnyInteger, Index]>;
 
@@ -2116,6 +2136,56 @@ def OpenACC_KernelsOp : OpenACC_Op<"kernels",
   let hasVerifier = 1;
 }
 
+//===----------------------------------------------------------------------===//
+// acc.kernel_environment
+//===----------------------------------------------------------------------===//
+
+def OpenACC_KernelEnvironmentOp : OpenACC_Op<"kernel_environment",
+    [AttrSizedOperandSegments, RecursiveMemoryEffects, SingleBlock,
+     NoTerminator,
+     MemoryEffects<[MemWrite<OpenACC_ConstructResource>,
+                    MemRead<OpenACC_CurrentDeviceIdResource>]>]> {
+  let summary = "Decomposition of compute constructs to capture data mapping "
+                "and asynchronous behavior information";
+  let description = [{
+    The `acc.kernel_environment` operation represents a decomposition of
+    any OpenACC compute construct (acc.kernels, acc.parallel, or
+    acc.serial) that captures data mapping and asynchronous behavior:
+    - data clause operands
+    - async clause operands
+    - wait clause operands
+
+    This allows kernel execution parallelism and privatization to be
+    handled separately, facilitating eventual lowering to GPU dialect where
+    kernel launching and compute offloading are handled separately.
+  }];
+
+  let arguments = (ins
+    Variadic<AnyType>:$dataClauseOperands,
+    Variadic<IntOrIndex>:$asyncOperands,
+    OptionalAttr<DeviceTypeArrayAttr>:$asyncOperandsDeviceType,
+    OptionalAttr<DeviceTypeArrayAttr>:$asyncOnly,
+    Variadic<IntOrIndex>:$waitOperands,
+    OptionalAttr<DenseI32ArrayAttr>:$waitOperandsSegments,
+    OptionalAttr<DeviceTypeArrayAttr>:$waitOperandsDeviceType,
+    OptionalAttr<BoolArrayAttr>:$hasWaitDevnum,
+    OptionalAttr<DeviceTypeArrayAttr>:$waitOnly);
+
+  let regions = (region SizedRegion<1>:$region);
+
+  let assemblyFormat = [{
+    oilist(
+        `dataOperands` `(` $dataClauseOperands `:` type($dataClauseOperands) `)`
+      | `async` `` custom<DeviceTypeOperandsWithKeywordOnly>($asyncOperands,
+            type($asyncOperands), $asyncOperandsDeviceType, $asyncOnly)
+      | `wait` `` custom<WaitClause>($waitOperands, type($waitOperands),
+          $waitOperandsDeviceType, $waitOperandsSegments, $hasWaitDevnum,
+          $waitOnly)
+    )
+    $region attr-dict
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // 2.6.5 data Construct
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td
index 6fb9a950489f8..054c13a88a552 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td
@@ -26,4 +26,22 @@ def ComputeRegionOpInterface : OpInterface<"ComputeRegionOpInterface"> {
   ];
 }
 
+def PartialEntityAccessOpInterface : OpInterface<"PartialEntityAccessOpInterface"> {
+  let cppNamespace = "::mlir::acc";
+
+  let description = [{
+    An interface for operations that access a partial entity such as
+    field or array element access.
+  }];
+
+  let methods = [
+    InterfaceMethod<"Get the base entity being accessed", "::mlir::Value",
+      "getBaseEntity", (ins)>,
+    InterfaceMethod<"Check if this is a complete view of the entity", "bool",
+      "isCompleteView", (ins), [{
+        return false;
+      }]>,
+  ];
+}
+
 #endif // OPENACC_OPS_INTERFACES
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCTypeInterfaces.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCTypeInterfaces.td
index 93e9e3d0689f7..d1bbc7f206ce6 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCTypeInterfaces.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCTypeInterfaces.td
@@ -259,6 +259,18 @@ def OpenACC_MappableTypeInterface : TypeInterface<"MappableType"> {
         return {};
       }]
     >,
+    InterfaceMethod<
+      /*description=*/[{
+        Returns true if the dimensions of this type are not known. This occurs
+        when the MLIR type does not encode dimensional information and there is
+        no associated descriptor or metadata in the current entity that would
+        make this information extractable. For example, an opaque pointer type
+        pointing to an array without dimension information would have unknown
+        dimensions.
+      }],
+      /*retTy=*/"bool",
+      /*methodName=*/"hasUnknownDimensions"
+    >,
     InterfaceMethod<
       /*description=*/[{
         Returns explicit `acc.bounds` operations that envelop the whole
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCUtils.h b/mlir/include/mlir/Dialect/OpenACC/OpenACCUtils.h
index 0ee88c6f47b67..964735755c4a3 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCUtils.h
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCUtils.h
@@ -43,6 +43,15 @@ mlir::acc::VariableTypeCategory getTypeCategory(mlir::Value var);
 /// empty string if no name is found.
 std::string getVariableName(mlir::Value v);
 
+/// Get the recipe name for a given recipe kind and type.
+/// Returns an empty string if not possible to generate a recipe name.
+std::string getRecipeName(mlir::acc::RecipeKind kind, mlir::Type type);
+
+// Get the base entity from partial entity access. This is used for getting
+// the base `struct` from an operation that only accesses a field or the
+// base `array` from an operation that only accesses a subarray.
+mlir::Value getBaseEntity(mlir::Value val);
+
 } // namespace acc
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
index fadd3fc10bfc4..cd033c140a233 100644
--- a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
+++ b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
@@ -77,7 +77,7 @@ def ConditionOp : SCF_Op<"condition", [
 //===----------------------------------------------------------------------===//
 
 def ExecuteRegionOp : SCF_Op<"execute_region", [
-    DeclareOpInterfaceMethods<RegionBranchOpInterface>]> {
+    DeclareOpInterfaceMethods<RegionBranchOpInterface>, RecursiveMemoryEffects]> {
   let summary = "operation that executes its region exactly once";
   let description = [{
     The `scf.execute_region` operation is used to allow multiple blocks within SCF
@@ -644,6 +644,13 @@ def ForallOp : SCF_Op<"forall", [
 
     /// Returns true if the mapping specified for this forall op is linear.
     bool usesLinearMapping();
+
+    /// RegionBranchOpInterface
+
+    OperandRange getEntrySuccessorOperands(RegionSuccessor successor) {
+      return getInits();
+    }
+
   }];
 }
 
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaComplianceData.h.inc b/mlir/include/mlir/Dialect/Tosa/IR/TosaComplianceData.h.inc
index 8b5934ff0630e..c774d870a8c45 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaComplianceData.h.inc
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaComplianceData.h.inc
@@ -572,6 +572,8 @@ extensionComplianceMap = {
         {{fp8e4m3T, fp8ue8m0T, fp8e4m3T, fp8ue8m0T, fp32T},
          SpecificationVersion::V_1_1_DRAFT},
         {{fp8e5m2T, fp8ue8m0T, fp8e5m2T, fp8ue8m0T, fp32T},
+         SpecificationVersion::V_1_1_DRAFT},
+        {{mxint8T, fp8ue8m0T, mxint8T, fp8ue8m0T, fp32T},
          SpecificationVersion::V_1_1_DRAFT}}}}},
     {"tosa.max_pool2d",
      {{{Extension::int16}, {{{i16T, i16T}, SpecificationVersion::V_1_0}}},
@@ -870,14 +872,16 @@ extensionComplianceMap = {
         {{fp6e2m3T, fp8ue8m0T, bf16T}, SpecificationVersion::V_1_1_DRAFT},
         {{fp6e3m2T, fp8ue8m0T, bf16T}, SpecificationVersion::V_1_1_DRAFT},
         {{fp8e4m3T, fp8ue8m0T, bf16T}, SpecificationVersion::V_1_1_DRAFT},
-        {{fp8e5m2T, fp8ue8m0T, bf16T}, SpecificationVersion::V_1_1_DRAFT}},
+        {{fp8e5m2T, fp8ue8m0T, bf16T}, SpecificationVersion::V_1_1_DRAFT},
+        {{mxint8T, fp8ue8m0T, bf16T}, SpecificationVersion::V_1_1_DRAFT}},
        allOf},
       {{Extension::mxfp},
        {{{fp4e2m1T, fp8ue8m0T, fp32T}, SpecificationVersion::V_1_1_DRAFT},
         {{fp6e2m3T, fp8ue8m0T, fp32T}, SpecificationVersion::V_1_1_DRAFT},
         {{fp6e3m2T, fp8ue8m0T, fp32T}, SpecificationVersion::V_1_1_DRAFT},
         {{fp8e4m3T, fp8ue8m0T, fp32T}, SpecificationVersion::V_1_1_DRAFT},
-        {{fp8e5m2T, fp8ue8m0T, fp32T}, SpecificationVersion::V_1_1_DRAFT}}}}},
+        {{fp8e5m2T, fp8ue8m0T, fp32T}, SpecificationVersion::V_1_1_DRAFT},
+        {{mxint8T, fp8ue8m0T, fp32T}, SpecificationVersion::V_1_1_DRAFT}}}}},
     {"tosa.cast_to_block_scaled",
      {{{Extension::mxfp},
        {{{bf16T, fp4e2m1T, fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT},
@@ -885,12 +889,14 @@ extensionComplianceMap = {
         {{fp32T, fp6e2m3T, fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT},
         {{fp32T, fp6e3m2T, fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT},
         {{fp32T, fp8e4m3T, fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT},
-        {{fp32T, fp8e5m2T, fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT}}},
+        {{fp32T, fp8e5m2T, fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT},
+        {{fp32T, mxint8T, fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT}}},
       {{Extension::bf16, Extension::mxfp},
        {{{bf16T, fp6e2m3T, fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT},
         {{bf16T, fp6e3m2T, fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT},
         {{bf16T, fp8e4m3T, fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT},
-        {{bf16T, fp8e5m2T, fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT}},
+        {{bf16T, fp8e5m2T, fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT},
+        {{bf16T, mxint8T, fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT}},
        allOf}}},
     {"tosa.rescale",
      {{{Extension::int16},
@@ -908,7 +914,8 @@ extensionComplianceMap = {
        {{{fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT},
         {{fp6e3m2T}, SpecificationVersion::V_1_1_DRAFT},
         {{fp6e2m3T}, SpecificationVersion::V_1_1_DRAFT},
-        {{fp4e2m1T}, SpecificationVersion::V_1_1_DRAFT}}}}},
+        {{fp4e2m1T}, SpecificationVersion::V_1_1_DRAFT},
+        {{mxint8T}, SpecificationVersion::V_1_1_DRAFT}}}}},
     {"tosa.identity",
      {{{Extension::int4}, {{{i4T, i4T}, SpecificationVersion::V_1_0}}},
       {{Extension::int16}, {{{i48T, i48T}, SpecificationVersion::V_1_0}}},
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.h b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.h
index a15f073bc5fcb..2d4e7cf8b9dbd 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.h
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.h
@@ -179,6 +179,9 @@ Value createPadConstTensor(OpBuilder &builder, Location loc, Value src,
 // returns type of variable op
 RankedTensorType getVariableType(VariableOp variableOp);
 
+// Returns the bitwidth of a TOSA tensor element type
+unsigned getBitWidth(Type type);
+
 } // namespace tosa
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaProfileCompliance.h b/mlir/include/mlir/Dialect/Tosa/IR/TosaProfileCompliance.h
index 45d380c1b2e6c..ea58f49b64c44 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaProfileCompliance.h
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaProfileCompliance.h
@@ -70,7 +70,7 @@ class ProfileInfoDepot {
 
 private:
   TypeInfo convertTypeToInfo(Type type) {
-    return {type.getTypeID(), type.getIntOrFloatBitWidth()};
+    return {type.getTypeID(), tosa::getBitWidth(type)};
   }
 
   TypeInfo convertValueToInfo(Value value) {
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td
index 93843e86fd378..414b51bf4b135 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td
@@ -22,6 +22,12 @@ include "mlir/Dialect/Tosa/IR/TosaOpBase.td"
 // Tosa Type Definitions.
 //===----------------------------------------------------------------------===//
 
+// The base class for Tosa dialect types.
+class Tosa_Type<string name, string typeMnemonic, list<Trait> traits = []>
+    : TypeDef<Tosa_Dialect, name, traits> {
+  let mnemonic = typeMnemonic;
+}
+
 // The base class of a quantized type.
 // Param tuple is: [bitwidth, zeropt, smantissa, sexp, low_end, high_end].
 // Where low and high ends are 0,255 when unsigned, -128,127 when signed, for
@@ -78,13 +84,26 @@ def Tosa_QuantizedInt : AnyTypeOf<[Tosa_QuantizedType<"uint8", [8], 0>,
                                    Tosa_QuantizedType<"int16", [16, 0], 1>,
                                    Tosa_QuantizedType<"int32", [32, 0], 1>]>;
 
+//===----------------------------------------------------------------------===//
+// Custom TOSA element types.
+//===----------------------------------------------------------------------===//
+
+// MLIR doesn't have a builtin type for mxint8 yet. For now declared it as a
+// custom TOSA type. This may be changed in the future.
+def Tosa_MXInt8 : Tosa_Type<"mxint8", "mxint8"> {
+  let summary = "INT8 type as defined by OCP-MX";
+  let description = [{
+    8-bit integer format with an implicit 1/64 scale defined by OCP-MX.
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // Multi-category types.
 //===----------------------------------------------------------------------===//
-def Tosa_AnyNumber : AnyTypeOf<[Tosa_Int, Tosa_QuantizedInt, AnyFloat],
+def Tosa_AnyNumber : AnyTypeOf<[Tosa_Int, Tosa_QuantizedInt, AnyFloat, Tosa_MXInt8],
                                 "number">;
 
-def Tosa_MXFPNumber : AnyTypeOf<[F8E4M3FN, F8E5M2, F4E2M1FN, F6E2M3FN, F6E3M2FN],
+def Tosa_MXFPNumber : AnyTypeOf<[F8E4M3FN, F8E5M2, F4E2M1FN, F6E2M3FN, F6E3M2FN, Tosa_MXInt8],
                                 "micro-scaling format number">;
 def Tosa_MXFPScaleNumber : AnyTypeOf<[F8E8M0FNU], "micro-scaling format scale number">;
 
@@ -265,16 +284,6 @@ def Tosa_Buffer : MemRefOf<[Tosa_AnyNumber]>;
 def Tosa_TupleBuffer : NestedTupleOf<[Tosa_Buffer]>;
 def Tosa_BufOrTuple : AnyTypeOf<[Tosa_Buffer, Tosa_TupleBuffer]>;
 
-//===----------------------------------------------------------------------===//
-// Tosa Type Definitions.
-//===----------------------------------------------------------------------===//
-
-// The base class for Tosa dialect types.
-class Tosa_Type<string name, string typeMnemonic, list<Trait> traits = []>
-    : TypeDef<Tosa_Dialect, name, traits> {
-  let mnemonic = typeMnemonic;
-}
-
 //===----------------------------------------------------------------------===//
 // ShapeType
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td
index 62e66b3dabee8..ed69287410509 100644
--- a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td
+++ b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td
@@ -25,7 +25,7 @@ include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td"
 
 def AlternativesOp : TransformDialectOp<"alternatives",
     [DeclareOpInterfaceMethods<RegionBranchOpInterface,
-        ["getEntrySuccessorOperands", "getSuccessorRegions",
+        ["getEntrySuccessorOperands",
          "getRegionInvocationBounds"]>,
      DeclareOpInterfaceMethods<TransformOpInterface>,
      DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
@@ -624,7 +624,7 @@ def ForeachOp : TransformDialectOp<"foreach",
     [DeclareOpInterfaceMethods<TransformOpInterface>,
      DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
      DeclareOpInterfaceMethods<RegionBranchOpInterface, [
-         "getSuccessorRegions", "getEntrySuccessorOperands"]>,
+         "getEntrySuccessorOperands"]>,
      SingleBlockImplicitTerminator<"::mlir::transform::YieldOp">
     ]> {
   let summary = "Executes the body for each element of the payload";
@@ -1237,7 +1237,7 @@ def SelectOp : TransformDialectOp<"select",
 
 def SequenceOp : TransformDialectOp<"sequence",
     [DeclareOpInterfaceMethods<RegionBranchOpInterface,
-        ["getEntrySuccessorOperands", "getSuccessorRegions",
+        ["getEntrySuccessorOperands",
          "getRegionInvocationBounds"]>,
      MatchOpInterface,
      DeclareOpInterfaceMethods<TransformOpInterface>,
diff --git a/mlir/include/mlir/Dialect/Transform/TuneExtension/TuneExtensionOps.td b/mlir/include/mlir/Dialect/Transform/TuneExtension/TuneExtensionOps.td
index d095659fc4838..4079848fd203a 100644
--- a/mlir/include/mlir/Dialect/Transform/TuneExtension/TuneExtensionOps.td
+++ b/mlir/include/mlir/Dialect/Transform/TuneExtension/TuneExtensionOps.td
@@ -63,7 +63,7 @@ def KnobOp : Op<Transform_Dialect, "tune.knob", [
 
 def AlternativesOp : Op<Transform_Dialect, "tune.alternatives", [
   DeclareOpInterfaceMethods<RegionBranchOpInterface,
-        ["getEntrySuccessorOperands", "getSuccessorRegions",
+        ["getEntrySuccessorOperands",
          "getRegionInvocationBounds"]>,
   DeclareOpInterfaceMethods<TransformOpInterface>,
   DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 19a52317956d2..40352b44b6441 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -379,28 +379,28 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [DistributeLayoutAttr]> {
   );
 
   let builders = [
-    AttrBuilder<(ins "llvm::ArrayRef<int32_t>": $lane_layout,
+    AttrBuilder<(ins "llvm::ArrayRef<int32_t>": $inst_data,
+                      "llvm::ArrayRef<int32_t>": $lane_layout,
                      "llvm::ArrayRef<int32_t>": $lane_data),
       [{
         auto sg_layout = DenseI32ArrayAttr();
         auto sg_data = DenseI32ArrayAttr();
-        auto inst_data = DenseI32ArrayAttr();
         auto order = DenseI32ArrayAttr();
-        return $_get($_ctxt, sg_layout, sg_data, inst_data,
+        return $_get($_ctxt, sg_layout, sg_data,
+                     DenseI32ArrayAttr::get($_ctxt, inst_data),
                      DenseI32ArrayAttr::get($_ctxt, lane_layout),
                      DenseI32ArrayAttr::get($_ctxt, lane_data), order);
       }]>,
     AttrBuilder<(ins "llvm::ArrayRef<int32_t>": $lane_layout,
-                     "llvm::ArrayRef<int32_t>": $lane_data,
-                     "llvm::ArrayRef<int32_t>": $order),
+                     "llvm::ArrayRef<int32_t>": $lane_data),
       [{
-        return $_get($_ctxt,
-                     /*sg_layout =*/ nullptr,
-                     /*sg_data   =*/ nullptr,
-                     /*inst_data =*/ nullptr,
+        auto sg_layout = DenseI32ArrayAttr();
+        auto sg_data = DenseI32ArrayAttr();
+        auto inst_data = DenseI32ArrayAttr();
+        auto order = DenseI32ArrayAttr();
+        return $_get($_ctxt, sg_layout, sg_data, inst_data,
                      DenseI32ArrayAttr::get($_ctxt, lane_layout),
-                     DenseI32ArrayAttr::get($_ctxt, lane_data),
-                     DenseI32ArrayAttr::get($_ctxt, order));
+                     DenseI32ArrayAttr::get($_ctxt, lane_data), order);
       }]>,
     AttrBuilder<(ins "DenseI32ArrayAttr": $lane_layout,
                      "DenseI32ArrayAttr": $lane_data,
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h
deleted file mode 100644
index 8aa9536cb67c1..0000000000000
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h
+++ /dev/null
@@ -1,30 +0,0 @@
-//===- XeGPUTargetInfo.h - Target constants ---------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MLIR_DIALECT_XEGPU_IR_XEGPUTARGETINFO_H_
-#define MLIR_DIALECT_XEGPU_IR_XEGPUTARGETINFO_H_
-
-namespace mlir {
-namespace xegpu {
-/// HW dependent constants.
-/// TODO: These constants should be queried from the target information.
-namespace targetinfo {
-constexpr unsigned subgroupSize = 16; // How many lanes in a subgroup.
-/// If DPAS A or B operands have low precision element types they must be packed
-/// according to the following sizes.
-constexpr unsigned packedSizeInBitsForDefault =
-    16; // Minimum packing size per register for DPAS A.
-constexpr unsigned packedSizeInBitsForDpasB =
-    32; // Minimum packing size per register for DPAS B.
-constexpr unsigned packedSizeInBitsForGatherScatter =
-    32; // Minimum packing size per register for Gather and Scatter ops.
-} // namespace targetinfo
-} // namespace xegpu
-} // namespace mlir
-
-#endif // MLIR_DIALECT_XEGPU_IR_XEGPUTARGETINFO_H_
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
index 564d9c4d5422b..b7af5413669c9 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
@@ -43,7 +43,12 @@ def XeGPUPropagateLayout : Pass<"xegpu-propagate-layout"> {
   let options = [Option<
     "printOnly", "print-analysis-only", "bool",
     /*default=*/"false",
-    "Print the result of layout propagation analysis and exit.">];
+    "Print the result of layout propagation analysis and exit.">,
+    Option<
+    "layoutKind", "layout-kind", "std::string",
+    /*default=*/"\"lane\"",
+    "Propagate a `sg` / `inst` / `lane` level of xegpu layouts.">
+  ];
 }
 
 def XeGPUWgToSgDistribute : Pass<"xegpu-wg-to-sg-distribute"> {
diff --git a/mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h b/mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h
index dcb2ad5d67a25..b3231a173f33a 100644
--- a/mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h
+++ b/mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h
@@ -270,6 +270,8 @@ inline const uArch *getUArch(llvm::StringRef archName) {
     return PVCuArch::getInstance();
   else if (archName.equals_insensitive("bmg"))
     return BMGuArch::getInstance();
+  else
+    llvm_unreachable("No matching uArch found");
 
   return nullptr;
 }
diff --git a/mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h b/mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h
index ea33e885c78ff..8f23b89134773 100644
--- a/mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h
+++ b/mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h
@@ -29,6 +29,8 @@ namespace mlir {
 namespace xegpu {
 namespace uArch {
 
+constexpr unsigned generalPackedFormatBitSize{32};
+
 // An enum class to represent the scope of an instruction
 enum class InstructionScope { Lane, Subgroup, Workgroup, Cluster };
 enum class InstructionKind {
diff --git a/mlir/include/mlir/IR/BlockSupport.h b/mlir/include/mlir/IR/BlockSupport.h
index f9fbef2f3753f..a2e080ef4f63f 100644
--- a/mlir/include/mlir/IR/BlockSupport.h
+++ b/mlir/include/mlir/IR/BlockSupport.h
@@ -206,12 +206,12 @@ namespace ilist_detail {
 // operations to have trailing Regions without a circular include
 // dependence.
 template <>
-struct SpecificNodeAccess<
-    typename compute_node_options<::mlir::Operation>::type> : NodeAccess {
+struct SpecificNodeAccess<compute_node_options<::mlir::Operation>::type>
+    : NodeAccess {
 protected:
-  using OptionsT = typename compute_node_options<mlir::Operation>::type;
-  using pointer = typename OptionsT::pointer;
-  using const_pointer = typename OptionsT::const_pointer;
+  using OptionsT = compute_node_options<mlir::Operation>::type;
+  using pointer = OptionsT::pointer;
+  using const_pointer = OptionsT::const_pointer;
   using node_type = ilist_node_impl<OptionsT>;
 
   static node_type *getNodePtr(pointer N);
diff --git a/mlir/include/mlir/IR/Diagnostics.h b/mlir/include/mlir/IR/Diagnostics.h
index 7ff718ad7f241..a0a99f4953822 100644
--- a/mlir/include/mlir/IR/Diagnostics.h
+++ b/mlir/include/mlir/IR/Diagnostics.h
@@ -29,6 +29,7 @@ class MLIRContext;
 class Operation;
 class OperationName;
 class OpPrintingFlags;
+class OpWithFlags;
 class Type;
 class Value;
 
@@ -199,6 +200,7 @@ class Diagnostic {
 
   /// Stream in an Operation.
   Diagnostic &operator<<(Operation &op);
+  Diagnostic &operator<<(OpWithFlags op);
   Diagnostic &operator<<(Operation *op) { return *this << *op; }
   /// Append an operation with the given printing flags.
   Diagnostic &appendOp(Operation &op, const OpPrintingFlags &flags);
diff --git a/mlir/include/mlir/IR/Operation.h b/mlir/include/mlir/IR/Operation.h
index 5569392cf0b41..b2019574a820d 100644
--- a/mlir/include/mlir/IR/Operation.h
+++ b/mlir/include/mlir/IR/Operation.h
@@ -1114,6 +1114,7 @@ class OpWithFlags {
       : op(op), theFlags(flags) {}
   OpPrintingFlags &flags() { return theFlags; }
   const OpPrintingFlags &flags() const { return theFlags; }
+  Operation *getOperation() const { return op; }
 
 private:
   Operation *op;
diff --git a/mlir/include/mlir/IR/Region.h b/mlir/include/mlir/IR/Region.h
index 1fcb316750230..53d461df98710 100644
--- a/mlir/include/mlir/IR/Region.h
+++ b/mlir/include/mlir/IR/Region.h
@@ -379,6 +379,8 @@ class RegionRange
   friend RangeBaseT;
 };
 
+llvm::raw_ostream &operator<<(llvm::raw_ostream &os, Region &region);
+
 } // namespace mlir
 
 #endif // MLIR_IR_REGION_H
diff --git a/mlir/include/mlir/Interfaces/ControlFlowInterfaces.h b/mlir/include/mlir/Interfaces/ControlFlowInterfaces.h
index d63800c12d132..bfc24c18429ed 100644
--- a/mlir/include/mlir/Interfaces/ControlFlowInterfaces.h
+++ b/mlir/include/mlir/Interfaces/ControlFlowInterfaces.h
@@ -15,10 +15,15 @@
 #define MLIR_INTERFACES_CONTROLFLOWINTERFACES_H
 
 #include "mlir/IR/OpDefinition.h"
+#include "mlir/IR/Operation.h"
+#include "llvm/ADT/PointerUnion.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/raw_ostream.h"
 
 namespace mlir {
 class BranchOpInterface;
 class RegionBranchOpInterface;
+class RegionBranchTerminatorOpInterface;
 
 /// This class models how operands are forwarded to block arguments in control
 /// flow. It consists of a number, denoting how many of the successors block
@@ -186,27 +191,40 @@ class RegionSuccessor {
 public:
   /// Initialize a successor that branches to another region of the parent
   /// operation.
+  /// TODO: the default value for the regionInputs is somehow broken.
+  /// A region successor should have its input correctly set.
   RegionSuccessor(Region *region, Block::BlockArgListType regionInputs = {})
-      : region(region), inputs(regionInputs) {}
+      : successor(region), inputs(regionInputs) {
+    assert(region && "Region must not be null");
+  }
   /// Initialize a successor that branches back to/out of the parent operation.
-  RegionSuccessor(Operation::result_range results)
-      : inputs(ValueRange(results)) {}
-  /// Constructor with no arguments.
-  RegionSuccessor() : inputs(ValueRange()) {}
+  /// The target must be one of the recursive parent operations.
+  RegionSuccessor(Operation *successorOp, Operation::result_range results)
+      : successor(successorOp), inputs(ValueRange(results)) {
+    assert(successorOp && "Successor op must not be null");
+  }
 
   /// Return the given region successor. Returns nullptr if the successor is the
   /// parent operation.
-  Region *getSuccessor() const { return region; }
+  Region *getSuccessor() const { return dyn_cast<Region *>(successor); }
 
   /// Return true if the successor is the parent operation.
-  bool isParent() const { return region == nullptr; }
+  bool isParent() const { return isa<Operation *>(successor); }
 
   /// Return the inputs to the successor that are remapped by the exit values of
   /// the current region.
   ValueRange getSuccessorInputs() const { return inputs; }
 
+  bool operator==(RegionSuccessor rhs) const {
+    return successor == rhs.successor && inputs == rhs.inputs;
+  }
+
+  friend bool operator!=(RegionSuccessor lhs, RegionSuccessor rhs) {
+    return !(lhs == rhs);
+  }
+
 private:
-  Region *region{nullptr};
+  llvm::PointerUnion<Region *, Operation *> successor{nullptr};
   ValueRange inputs;
 };
 
@@ -214,64 +232,67 @@ class RegionSuccessor {
 /// `RegionBranchOpInterface`.
 /// One can branch from one of two kinds of places:
 /// * The parent operation (aka the `RegionBranchOpInterface` implementation)
-/// * A region within the parent operation.
+/// * A RegionBranchTerminatorOpInterface inside a region within the parent
+//    operation.
 class RegionBranchPoint {
 public:
   /// Returns an instance of `RegionBranchPoint` representing the parent
   /// operation.
   static constexpr RegionBranchPoint parent() { return RegionBranchPoint(); }
 
-  /// Creates a `RegionBranchPoint` that branches from the given region.
-  /// The pointer must not be null.
-  RegionBranchPoint(Region *region) : maybeRegion(region) {
-    assert(region && "Region must not be null");
-  }
-
-  RegionBranchPoint(Region &region) : RegionBranchPoint(&region) {}
+  /// Creates a `RegionBranchPoint` that branches from the given terminator.
+  inline RegionBranchPoint(RegionBranchTerminatorOpInterface predecessor);
 
   /// Explicitly stops users from constructing with `nullptr`.
   RegionBranchPoint(std::nullptr_t) = delete;
 
-  /// Constructs a `RegionBranchPoint` from the the target of a
-  /// `RegionSuccessor` instance.
-  RegionBranchPoint(RegionSuccessor successor) {
-    if (successor.isParent())
-      maybeRegion = nullptr;
-    else
-      maybeRegion = successor.getSuccessor();
-  }
-
-  /// Assigns a region being branched from.
-  RegionBranchPoint &operator=(Region &region) {
-    maybeRegion = &region;
-    return *this;
-  }
-
   /// Returns true if branching from the parent op.
-  bool isParent() const { return maybeRegion == nullptr; }
+  bool isParent() const { return predecessor == nullptr; }
 
-  /// Returns the region if branching from a region.
+  /// Returns the terminator if branching from a region.
   /// A null pointer otherwise.
-  Region *getRegionOrNull() const { return maybeRegion; }
+  Operation *getTerminatorPredecessorOrNull() const { return predecessor; }
 
   /// Returns true if the two branch points are equal.
   friend bool operator==(RegionBranchPoint lhs, RegionBranchPoint rhs) {
-    return lhs.maybeRegion == rhs.maybeRegion;
+    return lhs.predecessor == rhs.predecessor;
   }
 
 private:
   // Private constructor to encourage the use of `RegionBranchPoint::parent`.
-  constexpr RegionBranchPoint() : maybeRegion(nullptr) {}
+  constexpr RegionBranchPoint() = default;
 
   /// Internal encoding. Uses nullptr for representing branching from the parent
-  /// op and the region being branched from otherwise.
-  Region *maybeRegion;
+  /// op and the region terminator being branched from otherwise.
+  Operation *predecessor = nullptr;
 };
 
 inline bool operator!=(RegionBranchPoint lhs, RegionBranchPoint rhs) {
   return !(lhs == rhs);
 }
 
+inline llvm::raw_ostream &operator<<(llvm::raw_ostream &os,
+                                     RegionBranchPoint point) {
+  if (point.isParent())
+    return os << "<from parent>";
+  return os << "<region #"
+            << point.getTerminatorPredecessorOrNull()
+                   ->getParentRegion()
+                   ->getRegionNumber()
+            << ", terminator "
+            << OpWithFlags(point.getTerminatorPredecessorOrNull(),
+                           OpPrintingFlags().skipRegions())
+            << ">";
+}
+
+inline llvm::raw_ostream &operator<<(llvm::raw_ostream &os,
+                                     RegionSuccessor successor) {
+  if (successor.isParent())
+    return os << "<to parent>";
+  return os << "<to region #" << successor.getSuccessor()->getRegionNumber()
+            << " with " << successor.getSuccessorInputs().size() << " inputs>";
+}
+
 /// This class represents upper and lower bounds on the number of times a region
 /// of a `RegionBranchOpInterface` can be invoked. The lower bound is at least
 /// zero, but the upper bound may not be known.
@@ -348,4 +369,10 @@ struct ReturnLike : public TraitBase<ConcreteType, ReturnLike> {
 /// Include the generated interface declarations.
 #include "mlir/Interfaces/ControlFlowInterfaces.h.inc"
 
+namespace mlir {
+inline RegionBranchPoint::RegionBranchPoint(
+    RegionBranchTerminatorOpInterface predecessor)
+    : predecessor(predecessor.getOperation()) {}
+} // namespace mlir
+
 #endif // MLIR_INTERFACES_CONTROLFLOWINTERFACES_H
diff --git a/mlir/include/mlir/Interfaces/ControlFlowInterfaces.td b/mlir/include/mlir/Interfaces/ControlFlowInterfaces.td
index b8d08cc553caa..94242e3ba39ce 100644
--- a/mlir/include/mlir/Interfaces/ControlFlowInterfaces.td
+++ b/mlir/include/mlir/Interfaces/ControlFlowInterfaces.td
@@ -117,7 +117,7 @@ def BranchOpInterface : OpInterface<"BranchOpInterface"> {
 
 def RegionBranchOpInterface : OpInterface<"RegionBranchOpInterface"> {
   let description = [{
-    This interface provides information for region operations that exhibit
+    This interface provides information for region-holding operations that exhibit
     branching behavior between held regions. I.e., this interface allows for
     expressing control flow information for region holding operations.
 
@@ -126,12 +126,12 @@ def RegionBranchOpInterface : OpInterface<"RegionBranchOpInterface"> {
     be side-effect free.
 
     A "region branch point" indicates a point from which a branch originates. It
-    can indicate either a region of this op or `RegionBranchPoint::parent()`. In
-    the latter case, the branch originates from outside of the op, i.e., when
-    first executing this op.
+    can indicate either a terminator in any of the immediately nested region of
+    this op or `RegionBranchPoint::parent()`. In the latter case, the branch
+    originates from outside of the op, i.e., when first executing this op.
 
     A "region successor" indicates the target of a branch. It can indicate
-    either a region of this op or this op. In the former case, the region
+    either a region of this op or this op itself. In the former case, the region
     successor is a region pointer and a range of block arguments to which the
     "successor operands" are forwarded to. In the latter case, the control flow
     leaves this op and the region successor is a range of results of this op to
@@ -151,10 +151,10 @@ def RegionBranchOpInterface : OpInterface<"RegionBranchOpInterface"> {
     }
     ```
 
-    `scf.for` has one region. The region has two region successors: the region
-    itself and the `scf.for` op. %b is an entry successor operand. %c is a
-    successor operand. %a is a successor block argument. %r is a successor
-    result.
+    `scf.for` has one region. The `scf.yield` has two region successors: the
+    region body itself and the `scf.for` op. `%b` is an entry successor
+    operand. `%c` is a successor operand. `%a` is a successor block argument.
+    `%r` is a successor result.
   }];
   let cppNamespace = "::mlir";
 
@@ -162,16 +162,16 @@ def RegionBranchOpInterface : OpInterface<"RegionBranchOpInterface"> {
     InterfaceMethod<[{
         Returns the operands of this operation that are forwarded to the region
         successor's block arguments or this operation's results when branching
-        to `point`. `point` is guaranteed to be among the successors that are
+        to `successor`. `successor` is guaranteed to be among the successors that are
         returned by `getEntrySuccessorRegions`/`getSuccessorRegions(parent())`.
 
         Example: In the above example, this method returns the operand %b of the
-        `scf.for` op, regardless of the value of `point`. I.e., this op always
+        `scf.for` op, regardless of the value of `successor`. I.e., this op always
         forwards the same operands, regardless of whether the loop has 0 or more
         iterations.
       }],
       "::mlir::OperandRange", "getEntrySuccessorOperands",
-      (ins "::mlir::RegionBranchPoint":$point), [{}],
+      (ins "::mlir::RegionSuccessor":$successor), [{}],
       /*defaultImplementation=*/[{
         auto operandEnd = this->getOperation()->operand_end();
         return ::mlir::OperandRange(operandEnd, operandEnd);
@@ -224,6 +224,80 @@ def RegionBranchOpInterface : OpInterface<"RegionBranchOpInterface"> {
       (ins "::mlir::RegionBranchPoint":$point,
            "::llvm::SmallVectorImpl<::mlir::RegionSuccessor> &":$regions)
     >,
+    InterfaceMethod<[{
+        Returns the potential region successors when branching from any
+        terminator in `region`.
+        These are the regions that may be selected during the flow of control.
+      }],
+      "void", "getSuccessorRegions",
+      (ins "::mlir::Region&":$region,
+           "::llvm::SmallVectorImpl<::mlir::RegionSuccessor> &":$regions),
+      [{}],
+      /*defaultImplementation=*/[{
+        for (::mlir::Block &block : region) {
+          if (block.empty())
+            continue;
+          if (auto terminator =
+                  dyn_cast<RegionBranchTerminatorOpInterface>(block.back()))
+            $_op.getSuccessorRegions(RegionBranchPoint(terminator),
+                                     regions);
+        }
+      }]>,
+    InterfaceMethod<[{
+        Returns the potential branching point (predecessors) for a given successor.
+      }],
+      "void", "getPredecessors",
+      (ins "::mlir::RegionSuccessor":$successor,
+           "::llvm::SmallVectorImpl<::mlir::RegionBranchPoint> &":$predecessors),
+      [{}],
+      /*defaultImplementation=*/[{
+        ::llvm::SmallVector<::mlir::RegionSuccessor> successors;
+        $_op.getSuccessorRegions(RegionBranchPoint::parent(),
+                                 successors);
+        if (llvm::any_of(successors, [&] (const RegionSuccessor & succ) {
+            return succ.getSuccessor() == successor.getSuccessor() ||
+              (succ.isParent() && successor.isParent());
+          }))
+          predecessors.push_back(RegionBranchPoint::parent());
+        for (Region &region : $_op->getRegions()) {
+          for (::mlir::Block &block : region) {
+            if (block.empty())
+              continue;
+            if (auto terminator =
+                    dyn_cast<RegionBranchTerminatorOpInterface>(block.back())) {
+              ::llvm::SmallVector<::mlir::RegionSuccessor> successors;
+              $_op.getSuccessorRegions(RegionBranchPoint(terminator),
+                                       successors);
+              if (llvm::any_of(successors, [&] (const RegionSuccessor & succ) {
+                  return succ.getSuccessor() == successor.getSuccessor() ||
+                    (succ.isParent() && successor.isParent());
+                }))
+                predecessors.push_back(terminator);
+            }
+          }
+        }
+      }]>,
+    InterfaceMethod<[{
+        Returns the potential values across all (predecessors) for a given successor
+        input, modeled by its index (its position in the list of values).
+      }],
+      "void", "getPredecessorValues",
+      (ins "::mlir::RegionSuccessor":$successor,
+           "int":$index,
+           "::llvm::SmallVectorImpl<::mlir::Value> &":$predecessorValues),
+      [{}],
+      /*defaultImplementation=*/[{
+        ::llvm::SmallVector<::mlir::RegionBranchPoint> predecessors;
+        $_op.getPredecessors(successor, predecessors);
+        for (auto predecessor : predecessors) {
+          if (predecessor.isParent()) {
+            predecessorValues.push_back($_op.getEntrySuccessorOperands(successor)[index]);
+            continue;
+          }
+          auto terminator = cast<RegionBranchTerminatorOpInterface>(predecessor.getTerminatorPredecessorOrNull());
+          predecessorValues.push_back(terminator.getSuccessorOperands(successor)[index]);
+        }
+      }]>,
     InterfaceMethod<[{
         Populates `invocationBounds` with the minimum and maximum number of
         times this operation will invoke the attached regions (assuming the
@@ -298,7 +372,7 @@ def RegionBranchTerminatorOpInterface :
         passing them to the region successor indicated by `point`.
       }],
       "::mlir::MutableOperandRange", "getMutableSuccessorOperands",
-      (ins "::mlir::RegionBranchPoint":$point)
+      (ins "::mlir::RegionSuccessor":$point)
     >,
     InterfaceMethod<[{
         Returns the potential region successors that are branched to after this
@@ -317,7 +391,7 @@ def RegionBranchTerminatorOpInterface :
       /*defaultImplementation=*/[{
         ::mlir::Operation *op = $_op;
         ::llvm::cast<::mlir::RegionBranchOpInterface>(op->getParentOp())
-          .getSuccessorRegions(op->getParentRegion(), regions);
+          .getSuccessorRegions(::llvm::cast<::mlir::RegionBranchTerminatorOpInterface>(op), regions);
       }]
     >,
   ];
@@ -337,8 +411,8 @@ def RegionBranchTerminatorOpInterface :
     // them to the region successor given by `index`.  If `index` is None, this
     // function returns the operands that are passed as a result to the parent
     // operation.
-    ::mlir::OperandRange getSuccessorOperands(::mlir::RegionBranchPoint point) {
-      return getMutableSuccessorOperands(point);
+    ::mlir::OperandRange getSuccessorOperands(::mlir::RegionSuccessor successor) {
+      return getMutableSuccessorOperands(successor);
     }
   }];
 }
@@ -504,7 +578,7 @@ def ReturnLike : TraitList<[
         /*extraOpDeclaration=*/"",
         /*extraOpDefinition=*/[{
           ::mlir::MutableOperandRange $cppClass::getMutableSuccessorOperands(
-            ::mlir::RegionBranchPoint point) {
+            ::mlir::RegionSuccessor successor) {
             return ::mlir::MutableOperandRange(*this);
           }
         }]
diff --git a/mlir/include/mlir/Interfaces/TilingInterface.td b/mlir/include/mlir/Interfaces/TilingInterface.td
index 0c0fc88aec95a..e0516abdfcf0c 100644
--- a/mlir/include/mlir/Interfaces/TilingInterface.td
+++ b/mlir/include/mlir/Interfaces/TilingInterface.td
@@ -57,8 +57,8 @@ def TilingInterface : OpInterface<"TilingInterface"> {
     For an operation to be "tiled and fused" with its (already tiled) producer,
     an operation has to implement the following additional methods (see
     description below):
-      - `getTiledImplementationFromOperandTile`
-      - `getIterationDomainTileFromOperandTile`.
+      - `getTiledImplementationFromOperandTiles`
+      - `getIterationDomainTileFromOperandTiles`.
   }];
   let cppNamespace = "::mlir";
   let methods = [
diff --git a/mlir/include/mlir/Support/Timing.h b/mlir/include/mlir/Support/Timing.h
index a8a4bfd1c6cf1..3d61a0a7a85c9 100644
--- a/mlir/include/mlir/Support/Timing.h
+++ b/mlir/include/mlir/Support/Timing.h
@@ -44,7 +44,7 @@ class DefaultTimingManagerImpl;
 /// This is a POD type with pointer size, so it should be passed around by
 /// value. The underlying data is owned by the `TimingManager`.
 class TimingIdentifier {
-  using EntryType = llvm::StringMapEntry<std::nullopt_t>;
+  using EntryType = llvm::StringMapEntry<llvm::EmptyStringSetTag>;
 
 public:
   TimingIdentifier(const TimingIdentifier &) = default;
diff --git a/mlir/lib/Analysis/AliasAnalysis/LocalAliasAnalysis.cpp b/mlir/lib/Analysis/AliasAnalysis/LocalAliasAnalysis.cpp
index a84d10d5d609d..24cb123e51877 100644
--- a/mlir/lib/Analysis/AliasAnalysis/LocalAliasAnalysis.cpp
+++ b/mlir/lib/Analysis/AliasAnalysis/LocalAliasAnalysis.cpp
@@ -16,19 +16,21 @@
 #include "mlir/IR/Operation.h"
 #include "mlir/IR/Region.h"
 #include "mlir/IR/Value.h"
-#include "mlir/IR/ValueRange.h"
 #include "mlir/Interfaces/ControlFlowInterfaces.h"
 #include "mlir/Interfaces/FunctionInterfaces.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "mlir/Interfaces/ViewLikeInterface.h"
 #include "mlir/Support/LLVM.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/DebugLog.h"
 #include <cassert>
 #include <optional>
 #include <utility>
 
 using namespace mlir;
 
+#define DEBUG_TYPE "local-alias-analysis"
+
 //===----------------------------------------------------------------------===//
 // Underlying Address Computation
 //===----------------------------------------------------------------------===//
@@ -42,81 +44,47 @@ static void collectUnderlyingAddressValues(Value value, unsigned maxDepth,
                                            DenseSet<Value> &visited,
                                            SmallVectorImpl<Value> &output);
 
-/// Given a successor (`region`) of a RegionBranchOpInterface, collect all of
-/// the underlying values being addressed by one of the successor inputs. If the
-/// provided `region` is null, as per `RegionBranchOpInterface` this represents
-/// the parent operation.
-static void collectUnderlyingAddressValues(RegionBranchOpInterface branch,
-                                           Region *region, Value inputValue,
-                                           unsigned inputIndex,
-                                           unsigned maxDepth,
-                                           DenseSet<Value> &visited,
-                                           SmallVectorImpl<Value> &output) {
-  // Given the index of a region of the branch (`predIndex`), or std::nullopt to
-  // represent the parent operation, try to return the index into the outputs of
-  // this region predecessor that correspond to the input values of `region`. If
-  // an index could not be found, std::nullopt is returned instead.
-  auto getOperandIndexIfPred =
-      [&](RegionBranchPoint pred) -> std::optional<unsigned> {
-    SmallVector<RegionSuccessor, 2> successors;
-    branch.getSuccessorRegions(pred, successors);
-    for (RegionSuccessor &successor : successors) {
-      if (successor.getSuccessor() != region)
-        continue;
-      // Check that the successor inputs map to the given input value.
-      ValueRange inputs = successor.getSuccessorInputs();
-      if (inputs.empty()) {
-        output.push_back(inputValue);
-        break;
-      }
-      unsigned firstInputIndex, lastInputIndex;
-      if (region) {
-        firstInputIndex = cast<BlockArgument>(inputs[0]).getArgNumber();
-        lastInputIndex = cast<BlockArgument>(inputs.back()).getArgNumber();
-      } else {
-        firstInputIndex = cast<OpResult>(inputs[0]).getResultNumber();
-        lastInputIndex = cast<OpResult>(inputs.back()).getResultNumber();
-      }
-      if (firstInputIndex > inputIndex || lastInputIndex < inputIndex) {
-        output.push_back(inputValue);
-        break;
-      }
-      return inputIndex - firstInputIndex;
-    }
-    return std::nullopt;
-  };
-
-  // Check branches from the parent operation.
-  auto branchPoint = RegionBranchPoint::parent();
-  if (region)
-    branchPoint = region;
-
-  if (std::optional<unsigned> operandIndex =
-          getOperandIndexIfPred(/*predIndex=*/RegionBranchPoint::parent())) {
-    collectUnderlyingAddressValues(
-        branch.getEntrySuccessorOperands(branchPoint)[*operandIndex], maxDepth,
-        visited, output);
+/// Given a RegionBranchOpInterface operation  (`branch`), a Value`inputValue`
+/// which is an input for the provided successor (`initialSuccessor`), try to
+/// find the possible sources for the value along the control flow edges.
+static void collectUnderlyingAddressValues2(
+    RegionBranchOpInterface branch, RegionSuccessor initialSuccessor,
+    Value inputValue, unsigned inputIndex, unsigned maxDepth,
+    DenseSet<Value> &visited, SmallVectorImpl<Value> &output) {
+  LDBG() << "collectUnderlyingAddressValues2: "
+         << OpWithFlags(branch.getOperation(), OpPrintingFlags().skipRegions());
+  LDBG() << " with initialSuccessor " << initialSuccessor;
+  LDBG() << "  inputValue: " << inputValue;
+  LDBG() << "  inputIndex: " << inputIndex;
+  LDBG() << "  maxDepth: " << maxDepth;
+  ValueRange inputs = initialSuccessor.getSuccessorInputs();
+  if (inputs.empty()) {
+    LDBG() << "  input is empty, enqueue value";
+    output.push_back(inputValue);
+    return;
   }
-  // Check branches from each child region.
-  Operation *op = branch.getOperation();
-  for (Region &region : op->getRegions()) {
-    if (std::optional<unsigned> operandIndex = getOperandIndexIfPred(region)) {
-      for (Block &block : region) {
-        // Try to determine possible region-branch successor operands for the
-        // current region.
-        if (auto term = dyn_cast<RegionBranchTerminatorOpInterface>(
-                block.getTerminator())) {
-          collectUnderlyingAddressValues(
-              term.getSuccessorOperands(branchPoint)[*operandIndex], maxDepth,
-              visited, output);
-        } else if (block.getNumSuccessors()) {
-          // Otherwise, if this terminator may exit the region we can't make
-          // any assumptions about which values get passed.
-          output.push_back(inputValue);
-          return;
-        }
-      }
-    }
+  unsigned firstInputIndex, lastInputIndex;
+  if (isa<BlockArgument>(inputs[0])) {
+    firstInputIndex = cast<BlockArgument>(inputs[0]).getArgNumber();
+    lastInputIndex = cast<BlockArgument>(inputs.back()).getArgNumber();
+  } else {
+    firstInputIndex = cast<OpResult>(inputs[0]).getResultNumber();
+    lastInputIndex = cast<OpResult>(inputs.back()).getResultNumber();
+  }
+  if (firstInputIndex > inputIndex || lastInputIndex < inputIndex) {
+    LDBG() << "  !! Input index " << inputIndex << " out of range "
+           << firstInputIndex << " to " << lastInputIndex
+           << ", adding input value to output";
+    output.push_back(inputValue);
+    return;
+  }
+  SmallVector<Value> predecessorValues;
+  branch.getPredecessorValues(initialSuccessor, inputIndex - firstInputIndex,
+                              predecessorValues);
+  LDBG() << "  Found " << predecessorValues.size() << " predecessor values";
+  for (Value predecessorValue : predecessorValues) {
+    LDBG() << "    Processing predecessor value: " << predecessorValue;
+    collectUnderlyingAddressValues(predecessorValue, maxDepth, visited, output);
   }
 }
 
@@ -124,22 +92,28 @@ static void collectUnderlyingAddressValues(RegionBranchOpInterface branch,
 static void collectUnderlyingAddressValues(OpResult result, unsigned maxDepth,
                                            DenseSet<Value> &visited,
                                            SmallVectorImpl<Value> &output) {
+  LDBG() << "collectUnderlyingAddressValues (OpResult): " << result;
+  LDBG() << "  maxDepth: " << maxDepth;
+
   Operation *op = result.getOwner();
 
   // If this is a view, unwrap to the source.
   if (ViewLikeOpInterface view = dyn_cast<ViewLikeOpInterface>(op)) {
     if (result == view.getViewDest()) {
+      LDBG() << "  Unwrapping view to source: " << view.getViewSource();
       return collectUnderlyingAddressValues(view.getViewSource(), maxDepth,
                                             visited, output);
     }
   }
   // Check to see if we can reason about the control flow of this op.
   if (auto branch = dyn_cast<RegionBranchOpInterface>(op)) {
-    return collectUnderlyingAddressValues(branch, /*region=*/nullptr, result,
-                                          result.getResultNumber(), maxDepth,
-                                          visited, output);
+    LDBG() << "  Processing region branch operation";
+    return collectUnderlyingAddressValues2(
+        branch, RegionSuccessor(op, op->getResults()), result,
+        result.getResultNumber(), maxDepth, visited, output);
   }
 
+  LDBG() << "  Adding result to output: " << result;
   output.push_back(result);
 }
 
@@ -148,14 +122,23 @@ static void collectUnderlyingAddressValues(OpResult result, unsigned maxDepth,
 static void collectUnderlyingAddressValues(BlockArgument arg, unsigned maxDepth,
                                            DenseSet<Value> &visited,
                                            SmallVectorImpl<Value> &output) {
+  LDBG() << "collectUnderlyingAddressValues (BlockArgument): " << arg;
+  LDBG() << "  maxDepth: " << maxDepth;
+  LDBG() << "  argNumber: " << arg.getArgNumber();
+  LDBG() << "  isEntryBlock: " << arg.getOwner()->isEntryBlock();
+
   Block *block = arg.getOwner();
   unsigned argNumber = arg.getArgNumber();
 
   // Handle the case of a non-entry block.
   if (!block->isEntryBlock()) {
+    LDBG() << "  Processing non-entry block with "
+           << std::distance(block->pred_begin(), block->pred_end())
+           << " predecessors";
     for (auto it = block->pred_begin(), e = block->pred_end(); it != e; ++it) {
       auto branch = dyn_cast<BranchOpInterface>((*it)->getTerminator());
       if (!branch) {
+        LDBG() << "    Cannot analyze control flow, adding argument to output";
         // We can't analyze the control flow, so bail out early.
         output.push_back(arg);
         return;
@@ -165,10 +148,12 @@ static void collectUnderlyingAddressValues(BlockArgument arg, unsigned maxDepth,
       unsigned index = it.getSuccessorIndex();
       Value operand = branch.getSuccessorOperands(index)[argNumber];
       if (!operand) {
+        LDBG() << "    No operand found for argument, adding to output";
         // We can't analyze the control flow, so bail out early.
         output.push_back(arg);
         return;
       }
+      LDBG() << "    Processing operand from predecessor: " << operand;
       collectUnderlyingAddressValues(operand, maxDepth, visited, output);
     }
     return;
@@ -178,10 +163,35 @@ static void collectUnderlyingAddressValues(BlockArgument arg, unsigned maxDepth,
   Region *region = block->getParent();
   Operation *op = region->getParentOp();
   if (auto branch = dyn_cast<RegionBranchOpInterface>(op)) {
-    return collectUnderlyingAddressValues(branch, region, arg, argNumber,
-                                          maxDepth, visited, output);
+    LDBG() << "  Processing region branch operation for entry block";
+    // We have to find the successor matching the region, so that the input
+    // arguments are correctly set.
+    // TODO: this isn't comprehensive: the successor may not be reachable from
+    // the entry block.
+    SmallVector<RegionSuccessor> successors;
+    branch.getSuccessorRegions(RegionBranchPoint::parent(), successors);
+    RegionSuccessor regionSuccessor(region);
+    bool found = false;
+    for (RegionSuccessor &successor : successors) {
+      if (successor.getSuccessor() == region) {
+        LDBG() << "  Found matching region successor: " << successor;
+        found = true;
+        regionSuccessor = successor;
+        break;
+      }
+    }
+    if (!found) {
+      LDBG()
+          << "  No matching region successor found, adding argument to output";
+      output.push_back(arg);
+      return;
+    }
+    return collectUnderlyingAddressValues2(
+        branch, regionSuccessor, arg, argNumber, maxDepth, visited, output);
   }
 
+  LDBG()
+      << "  Cannot reason about underlying address, adding argument to output";
   // We can't reason about the underlying address of this argument.
   output.push_back(arg);
 }
@@ -190,17 +200,26 @@ static void collectUnderlyingAddressValues(BlockArgument arg, unsigned maxDepth,
 static void collectUnderlyingAddressValues(Value value, unsigned maxDepth,
                                            DenseSet<Value> &visited,
                                            SmallVectorImpl<Value> &output) {
+  LDBG() << "collectUnderlyingAddressValues: " << value;
+  LDBG() << "  maxDepth: " << maxDepth;
+
   // Check that we don't infinitely recurse.
-  if (!visited.insert(value).second)
+  if (!visited.insert(value).second) {
+    LDBG() << "  Value already visited, skipping";
     return;
+  }
   if (maxDepth == 0) {
+    LDBG() << "  Max depth reached, adding value to output";
     output.push_back(value);
     return;
   }
   --maxDepth;
 
-  if (BlockArgument arg = dyn_cast<BlockArgument>(value))
+  if (BlockArgument arg = dyn_cast<BlockArgument>(value)) {
+    LDBG() << "  Processing as BlockArgument";
     return collectUnderlyingAddressValues(arg, maxDepth, visited, output);
+  }
+  LDBG() << "  Processing as OpResult";
   collectUnderlyingAddressValues(cast<OpResult>(value), maxDepth, visited,
                                  output);
 }
@@ -208,9 +227,11 @@ static void collectUnderlyingAddressValues(Value value, unsigned maxDepth,
 /// Given a value, collect all of the underlying values being addressed.
 static void collectUnderlyingAddressValues(Value value,
                                            SmallVectorImpl<Value> &output) {
+  LDBG() << "collectUnderlyingAddressValues: " << value;
   DenseSet<Value> visited;
   collectUnderlyingAddressValues(value, maxUnderlyingValueSearchDepth, visited,
                                  output);
+  LDBG() << "  Collected " << output.size() << " underlying values";
 }
 
 //===----------------------------------------------------------------------===//
@@ -227,19 +248,33 @@ static LogicalResult
 getAllocEffectFor(Value value,
                   std::optional<MemoryEffects::EffectInstance> &effect,
                   Operation *&allocScopeOp) {
+  LDBG() << "getAllocEffectFor: " << value;
+
   // Try to get a memory effect interface for the parent operation.
   Operation *op;
-  if (BlockArgument arg = dyn_cast<BlockArgument>(value))
+  if (BlockArgument arg = dyn_cast<BlockArgument>(value)) {
     op = arg.getOwner()->getParentOp();
-  else
+    LDBG() << "  BlockArgument, parent op: "
+           << OpWithFlags(op, OpPrintingFlags().skipRegions());
+  } else {
     op = cast<OpResult>(value).getOwner();
+    LDBG() << "  OpResult, owner op: "
+           << OpWithFlags(op, OpPrintingFlags().skipRegions());
+  }
+
   MemoryEffectOpInterface interface = dyn_cast<MemoryEffectOpInterface>(op);
-  if (!interface)
+  if (!interface) {
+    LDBG() << "  No memory effect interface found";
     return failure();
+  }
 
   // Try to find an allocation effect on the resource.
-  if (!(effect = interface.getEffectOnValue<MemoryEffects::Allocate>(value)))
+  if (!(effect = interface.getEffectOnValue<MemoryEffects::Allocate>(value))) {
+    LDBG() << "  No allocation effect found on value";
     return failure();
+  }
+
+  LDBG() << "  Found allocation effect";
 
   // If we found an allocation effect, try to find a scope for the allocation.
   // If the resource of this allocation is automatically scoped, find the parent
@@ -247,6 +282,12 @@ getAllocEffectFor(Value value,
   if (llvm::isa<SideEffects::AutomaticAllocationScopeResource>(
           effect->getResource())) {
     allocScopeOp = op->getParentWithTrait<OpTrait::AutomaticAllocationScope>();
+    if (allocScopeOp) {
+      LDBG() << "  Automatic allocation scope found: "
+             << OpWithFlags(allocScopeOp, OpPrintingFlags().skipRegions());
+    } else {
+      LDBG() << "  Automatic allocation scope found: null";
+    }
     return success();
   }
 
@@ -255,6 +296,12 @@ getAllocEffectFor(Value value,
   // For now assume allocation scope to the function scope (we don't care if
   // pointer escape outside function).
   allocScopeOp = op->getParentOfType<FunctionOpInterface>();
+  if (allocScopeOp) {
+    LDBG() << "  Function scope found: "
+           << OpWithFlags(allocScopeOp, OpPrintingFlags().skipRegions());
+  } else {
+    LDBG() << "  Function scope found: null";
+  }
   return success();
 }
 
@@ -293,33 +340,44 @@ static std::optional<AliasResult> checkDistinctObjects(Value lhs, Value rhs) {
 
 /// Given the two values, return their aliasing behavior.
 AliasResult LocalAliasAnalysis::aliasImpl(Value lhs, Value rhs) {
-  if (lhs == rhs)
+  LDBG() << "aliasImpl: " << lhs << " vs " << rhs;
+
+  if (lhs == rhs) {
+    LDBG() << "  Same value, must alias";
     return AliasResult::MustAlias;
+  }
+
   Operation *lhsAllocScope = nullptr, *rhsAllocScope = nullptr;
   std::optional<MemoryEffects::EffectInstance> lhsAlloc, rhsAlloc;
 
   // Handle the case where lhs is a constant.
   Attribute lhsAttr, rhsAttr;
   if (matchPattern(lhs, m_Constant(&lhsAttr))) {
+    LDBG() << "  lhs is constant";
     // TODO: This is overly conservative. Two matching constants don't
     // necessarily map to the same address. For example, if the two values
     // correspond to different symbols that both represent a definition.
-    if (matchPattern(rhs, m_Constant(&rhsAttr)))
+    if (matchPattern(rhs, m_Constant(&rhsAttr))) {
+      LDBG() << "  rhs is also constant, may alias";
       return AliasResult::MayAlias;
+    }
 
     // Try to find an alloc effect on rhs. If an effect was found we can't
     // alias, otherwise we might.
-    return succeeded(getAllocEffectFor(rhs, rhsAlloc, rhsAllocScope))
-               ? AliasResult::NoAlias
-               : AliasResult::MayAlias;
+    bool rhsHasAlloc =
+        succeeded(getAllocEffectFor(rhs, rhsAlloc, rhsAllocScope));
+    LDBG() << "  rhs has alloc effect: " << rhsHasAlloc;
+    return rhsHasAlloc ? AliasResult::NoAlias : AliasResult::MayAlias;
   }
   // Handle the case where rhs is a constant.
   if (matchPattern(rhs, m_Constant(&rhsAttr))) {
+    LDBG() << "  rhs is constant";
     // Try to find an alloc effect on lhs. If an effect was found we can't
     // alias, otherwise we might.
-    return succeeded(getAllocEffectFor(lhs, lhsAlloc, lhsAllocScope))
-               ? AliasResult::NoAlias
-               : AliasResult::MayAlias;
+    bool lhsHasAlloc =
+        succeeded(getAllocEffectFor(lhs, lhsAlloc, lhsAllocScope));
+    LDBG() << "  lhs has alloc effect: " << lhsHasAlloc;
+    return lhsHasAlloc ? AliasResult::NoAlias : AliasResult::MayAlias;
   }
 
   if (std::optional<AliasResult> result = checkDistinctObjects(lhs, rhs))
@@ -329,9 +387,14 @@ AliasResult LocalAliasAnalysis::aliasImpl(Value lhs, Value rhs) {
   // an allocation effect.
   bool lhsHasAlloc = succeeded(getAllocEffectFor(lhs, lhsAlloc, lhsAllocScope));
   bool rhsHasAlloc = succeeded(getAllocEffectFor(rhs, rhsAlloc, rhsAllocScope));
+  LDBG() << "  lhs has alloc effect: " << lhsHasAlloc;
+  LDBG() << "  rhs has alloc effect: " << rhsHasAlloc;
+
   if (lhsHasAlloc == rhsHasAlloc) {
     // If both values have an allocation effect we know they don't alias, and if
     // neither have an effect we can't make an assumptions.
+    LDBG() << "  Both have same alloc status: "
+           << (lhsHasAlloc ? "NoAlias" : "MayAlias");
     return lhsHasAlloc ? AliasResult::NoAlias : AliasResult::MayAlias;
   }
 
@@ -339,6 +402,7 @@ AliasResult LocalAliasAnalysis::aliasImpl(Value lhs, Value rhs) {
   // and one without. Move the one with the effect to the lhs to make the next
   // checks simpler.
   if (rhsHasAlloc) {
+    LDBG() << "  Swapping lhs and rhs to put alloc effect on lhs";
     std::swap(lhs, rhs);
     lhsAlloc = rhsAlloc;
     lhsAllocScope = rhsAllocScope;
@@ -347,49 +411,74 @@ AliasResult LocalAliasAnalysis::aliasImpl(Value lhs, Value rhs) {
   // If the effect has a scoped allocation region, check to see if the
   // non-effect value is defined above that scope.
   if (lhsAllocScope) {
+    LDBG() << "  Checking allocation scope: "
+           << OpWithFlags(lhsAllocScope, OpPrintingFlags().skipRegions());
     // If the parent operation of rhs is an ancestor of the allocation scope, or
     // if rhs is an entry block argument of the allocation scope we know the two
     // values can't alias.
     Operation *rhsParentOp = rhs.getParentRegion()->getParentOp();
-    if (rhsParentOp->isProperAncestor(lhsAllocScope))
+    if (rhsParentOp->isProperAncestor(lhsAllocScope)) {
+      LDBG() << "  rhs parent is ancestor of alloc scope, no alias";
       return AliasResult::NoAlias;
+    }
     if (rhsParentOp == lhsAllocScope) {
       BlockArgument rhsArg = dyn_cast<BlockArgument>(rhs);
-      if (rhsArg && rhs.getParentBlock()->isEntryBlock())
+      if (rhsArg && rhs.getParentBlock()->isEntryBlock()) {
+        LDBG() << "  rhs is entry block arg of alloc scope, no alias";
         return AliasResult::NoAlias;
+      }
     }
   }
 
   // If we couldn't reason about the relationship between the two values,
   // conservatively assume they might alias.
+  LDBG() << "  Cannot reason about relationship, may alias";
   return AliasResult::MayAlias;
 }
 
 /// Given the two values, return their aliasing behavior.
 AliasResult LocalAliasAnalysis::alias(Value lhs, Value rhs) {
-  if (lhs == rhs)
+  LDBG() << "alias: " << lhs << " vs " << rhs;
+
+  if (lhs == rhs) {
+    LDBG() << "  Same value, must alias";
     return AliasResult::MustAlias;
+  }
 
   // Get the underlying values being addressed.
   SmallVector<Value, 8> lhsValues, rhsValues;
   collectUnderlyingAddressValues(lhs, lhsValues);
   collectUnderlyingAddressValues(rhs, rhsValues);
 
+  LDBG() << "  lhs underlying values: " << lhsValues.size();
+  LDBG() << "  rhs underlying values: " << rhsValues.size();
+
   // If we failed to collect for either of the values somehow, conservatively
   // assume they may alias.
-  if (lhsValues.empty() || rhsValues.empty())
+  if (lhsValues.empty() || rhsValues.empty()) {
+    LDBG() << "  Failed to collect underlying values, may alias";
     return AliasResult::MayAlias;
+  }
 
   // Check the alias results against each of the underlying values.
   std::optional<AliasResult> result;
   for (Value lhsVal : lhsValues) {
     for (Value rhsVal : rhsValues) {
+      LDBG() << "  Checking underlying values: " << lhsVal << " vs " << rhsVal;
       AliasResult nextResult = aliasImpl(lhsVal, rhsVal);
+      LDBG() << "  Result: "
+             << (nextResult == AliasResult::MustAlias ? "MustAlias"
+                 : nextResult == AliasResult::NoAlias ? "NoAlias"
+                                                      : "MayAlias");
       result = result ? result->merge(nextResult) : nextResult;
     }
   }
 
   // We should always have a valid result here.
+  LDBG() << "  Final result: "
+         << (result->isMust() ? "MustAlias"
+             : result->isNo() ? "NoAlias"
+                              : "MayAlias");
   return *result;
 }
 
@@ -398,8 +487,12 @@ AliasResult LocalAliasAnalysis::alias(Value lhs, Value rhs) {
 //===----------------------------------------------------------------------===//
 
 ModRefResult LocalAliasAnalysis::getModRef(Operation *op, Value location) {
+  LDBG() << "getModRef: " << OpWithFlags(op, OpPrintingFlags().skipRegions())
+         << " on location " << location;
+
   // Check to see if this operation relies on nested side effects.
   if (op->hasTrait<OpTrait::HasRecursiveMemoryEffects>()) {
+    LDBG() << "  Operation has recursive memory effects, returning ModAndRef";
     // TODO: To check recursive operations we need to check all of the nested
     // operations, which can result in a quadratic number of queries. We should
     // introduce some caching of some kind to help alleviate this, especially as
@@ -410,38 +503,64 @@ ModRefResult LocalAliasAnalysis::getModRef(Operation *op, Value location) {
 
   // Otherwise, check to see if this operation has a memory effect interface.
   MemoryEffectOpInterface interface = dyn_cast<MemoryEffectOpInterface>(op);
-  if (!interface)
+  if (!interface) {
+    LDBG() << "  No memory effect interface, returning ModAndRef";
     return ModRefResult::getModAndRef();
+  }
 
   // Build a ModRefResult by merging the behavior of the effects of this
   // operation.
   SmallVector<MemoryEffects::EffectInstance> effects;
   interface.getEffects(effects);
+  LDBG() << "  Found " << effects.size() << " memory effects";
 
   ModRefResult result = ModRefResult::getNoModRef();
   for (const MemoryEffects::EffectInstance &effect : effects) {
-    if (isa<MemoryEffects::Allocate, MemoryEffects::Free>(effect.getEffect()))
+    if (isa<MemoryEffects::Allocate, MemoryEffects::Free>(effect.getEffect())) {
+      LDBG() << "    Skipping alloc/free effect";
       continue;
+    }
 
     // Check for an alias between the effect and our memory location.
     // TODO: Add support for checking an alias with a symbol reference.
     AliasResult aliasResult = AliasResult::MayAlias;
-    if (Value effectValue = effect.getValue())
+    if (Value effectValue = effect.getValue()) {
+      LDBG() << "    Checking alias between effect value " << effectValue
+             << " and location " << location;
       aliasResult = alias(effectValue, location);
+      LDBG() << "    Alias result: "
+             << (aliasResult.isMust() ? "MustAlias"
+                 : aliasResult.isNo() ? "NoAlias"
+                                      : "MayAlias");
+    } else {
+      LDBG() << "    No effect value, assuming MayAlias";
+    }
 
     // If we don't alias, ignore this effect.
-    if (aliasResult.isNo())
+    if (aliasResult.isNo()) {
+      LDBG() << "    No alias, ignoring effect";
       continue;
+    }
 
     // Merge in the corresponding mod or ref for this effect.
     if (isa<MemoryEffects::Read>(effect.getEffect())) {
+      LDBG() << "    Adding Ref to result";
       result = result.merge(ModRefResult::getRef());
     } else {
       assert(isa<MemoryEffects::Write>(effect.getEffect()));
+      LDBG() << "    Adding Mod to result";
       result = result.merge(ModRefResult::getMod());
     }
-    if (result.isModAndRef())
+    if (result.isModAndRef()) {
+      LDBG() << "    Result is now ModAndRef, breaking";
       break;
+    }
   }
+
+  LDBG() << "  Final ModRef result: "
+         << (result.isModAndRef() ? "ModAndRef"
+             : result.isMod()     ? "Mod"
+             : result.isRef()     ? "Ref"
+                                  : "NoModRef");
   return result;
 }
diff --git a/mlir/lib/Analysis/DataFlow/DeadCodeAnalysis.cpp b/mlir/lib/Analysis/DataFlow/DeadCodeAnalysis.cpp
index 377f7ebe06750..0fc5b4482bf3e 100644
--- a/mlir/lib/Analysis/DataFlow/DeadCodeAnalysis.cpp
+++ b/mlir/lib/Analysis/DataFlow/DeadCodeAnalysis.cpp
@@ -501,11 +501,10 @@ void DeadCodeAnalysis::visitRegionTerminator(Operation *op,
     return;
 
   SmallVector<RegionSuccessor> successors;
-  if (auto terminator = dyn_cast<RegionBranchTerminatorOpInterface>(op))
-    terminator.getSuccessorRegions(*operands, successors);
-  else
-    branch.getSuccessorRegions(op->getParentRegion(), successors);
-
+  auto terminator = dyn_cast<RegionBranchTerminatorOpInterface>(op);
+  if (!terminator)
+    return;
+  terminator.getSuccessorRegions(*operands, successors);
   visitRegionBranchEdges(branch, op, successors);
 }
 
diff --git a/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp b/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp
index daa3db55b2852..0682e5f26785a 100644
--- a/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp
+++ b/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp
@@ -588,7 +588,9 @@ void AbstractDenseBackwardDataFlowAnalysis::visitBlock(Block *block) {
     // flow, propagate the lattice back along the control flow edge.
     if (auto branch = dyn_cast<RegionBranchOpInterface>(block->getParentOp())) {
       LDBG() << "    Exit block of region branch operation";
-      visitRegionBranchOperation(point, branch, block->getParent(), before);
+      auto terminator =
+          cast<RegionBranchTerminatorOpInterface>(block->getTerminator());
+      visitRegionBranchOperation(point, branch, terminator, before);
       return;
     }
 
diff --git a/mlir/lib/Analysis/DataFlow/SparseAnalysis.cpp b/mlir/lib/Analysis/DataFlow/SparseAnalysis.cpp
index 0d2e2ed85549d..8e63ae86753b4 100644
--- a/mlir/lib/Analysis/DataFlow/SparseAnalysis.cpp
+++ b/mlir/lib/Analysis/DataFlow/SparseAnalysis.cpp
@@ -130,7 +130,7 @@ AbstractSparseForwardDataFlowAnalysis::visitOperation(Operation *op) {
   // The results of a region branch operation are determined by control-flow.
   if (auto branch = dyn_cast<RegionBranchOpInterface>(op)) {
     visitRegionSuccessors(getProgramPointAfter(branch), branch,
-                          /*successor=*/RegionBranchPoint::parent(),
+                          /*successor=*/{branch, branch->getResults()},
                           resultLattices);
     return success();
   }
@@ -279,7 +279,7 @@ void AbstractSparseForwardDataFlowAnalysis::visitCallableOperation(
 
 void AbstractSparseForwardDataFlowAnalysis::visitRegionSuccessors(
     ProgramPoint *point, RegionBranchOpInterface branch,
-    RegionBranchPoint successor, ArrayRef<AbstractSparseLattice *> lattices) {
+    RegionSuccessor successor, ArrayRef<AbstractSparseLattice *> lattices) {
   const auto *predecessors = getOrCreateFor<PredecessorState>(point, point);
   assert(predecessors->allPredecessorsKnown() &&
          "unexpected unresolved region successors");
@@ -314,7 +314,7 @@ void AbstractSparseForwardDataFlowAnalysis::visitRegionSuccessors(
         visitNonControlFlowArgumentsImpl(
             branch,
             RegionSuccessor(
-                branch->getResults().slice(firstIndex, inputs.size())),
+                branch, branch->getResults().slice(firstIndex, inputs.size())),
             lattices, firstIndex);
       } else {
         if (!inputs.empty())
diff --git a/mlir/lib/Analysis/SliceWalk.cpp b/mlir/lib/Analysis/SliceWalk.cpp
index 817d71a3452ca..863f260cd4b6a 100644
--- a/mlir/lib/Analysis/SliceWalk.cpp
+++ b/mlir/lib/Analysis/SliceWalk.cpp
@@ -114,7 +114,7 @@ mlir::getControlFlowPredecessors(Value value) {
     if (!regionOp)
       return std::nullopt;
     // Add the control flow predecessor operands to the work list.
-    RegionSuccessor region(regionOp->getResults());
+    RegionSuccessor region(regionOp, regionOp->getResults());
     SmallVector<Value> predecessorOperands = getRegionPredecessorOperands(
         regionOp, region, opResult.getResultNumber());
     return predecessorOperands;
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 478b6aaaec83a..3a307a0756d93 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -935,7 +935,7 @@ static std::optional<uint32_t> mfmaTypeSelectCode(Type mlirElemType) {
       .Case([](Float6E2M3FNType) { return 2u; })
       .Case([](Float6E3M2FNType) { return 3u; })
       .Case([](Float4E2M1FNType) { return 4u; })
-      .Default([](Type) { return std::nullopt; });
+      .Default(std::nullopt);
 }
 
 /// If there is a scaled MFMA instruction for the input element types `aType`
@@ -989,21 +989,17 @@ mfmaOpToScaledIntrinsic(ScaledMFMAOp smfma, Chipset chipset) {
                                  smfma.getN(), smfma.getK(), 1u, chipset);
 }
 
-/// Return the `rocdl` intrinsic corresponding to a WMMA operation `wmma`
-/// if one exists. This includes checking to ensure the intrinsic is supported
-/// on the architecture you are compiling for.
-static std::optional<StringRef> wmmaOpToIntrinsic(WMMAOp wmma,
-                                                  Chipset chipset) {
-  auto sourceVectorType = cast<VectorType>(wmma.getSourceA().getType());
-  auto sourceBVectorType = cast<VectorType>(wmma.getSourceB().getType());
-  auto destVectorType = cast<VectorType>(wmma.getDestC().getType());
-  Type elemSourceType = sourceVectorType.getElementType();
-  Type elemBSourceType = sourceBVectorType.getElementType();
-  Type elemDestType = destVectorType.getElementType();
-
-  const uint32_t k = wmma.getK();
+/// Returns the `rocdl` intrinsic corresponding to a WMMA operation `wmma`
+/// for RDNA3/4 architectures.
+static std::optional<StringRef>
+wmmaOpToIntrinsicRDNA(Type elemSourceType, Type elemBSourceType,
+                      Type elemDestType, uint32_t k, bool isRDNA3) {
+  using fp8 = Float8E4M3FNType;
+  using bf8 = Float8E5M2Type;
 
+  // Handle k == 16 for RDNA3/4.
   if (k == 16) {
+    // Common patterns for RDNA3 and RDNA4.
     if (elemSourceType.isF16() && elemDestType.isF32())
       return ROCDL::wmma_f32_16x16x16_f16::getOperationName();
     if (elemSourceType.isBF16() && elemDestType.isF32())
@@ -1014,40 +1010,161 @@ static std::optional<StringRef> wmmaOpToIntrinsic(WMMAOp wmma,
       return ROCDL::wmma_bf16_16x16x16_bf16::getOperationName();
     if (elemSourceType.isInteger(8) && elemDestType.isInteger(32))
       return ROCDL::wmma_i32_16x16x16_iu8::getOperationName();
-    if (chipset.majorVersion == 11) {
+
+    // RDNA3 specific patterns.
+    if (isRDNA3) {
       if (elemSourceType.isInteger(4) && elemDestType.isInteger(32))
         return ROCDL::wmma_i32_16x16x16_iu4::getOperationName();
+      return std::nullopt;
     }
-  }
-  if (chipset.majorVersion < 12)
-    return std::nullopt;
 
-  // gfx12+
-  if (k == 16) {
-    if (isa<Float8E4M3FNType>(elemSourceType) &&
-        isa<Float8E4M3FNType>(elemBSourceType) && elemDestType.isF32())
+    // RDNA4 specific patterns (fp8/bf8).
+    if (isa<fp8>(elemSourceType) && isa<fp8>(elemBSourceType) &&
+        elemDestType.isF32())
       return ROCDL::wmma_f32_16x16x16_fp8_fp8::getOperationName();
-    if (isa<Float8E4M3FNType>(elemSourceType) &&
-        isa<Float8E5M2Type>(elemBSourceType) && elemDestType.isF32())
+    if (isa<fp8>(elemSourceType) && isa<bf8>(elemBSourceType) &&
+        elemDestType.isF32())
       return ROCDL::wmma_f32_16x16x16_fp8_bf8::getOperationName();
-    if (isa<Float8E5M2Type>(elemSourceType) &&
-        isa<Float8E5M2Type>(elemBSourceType) && elemDestType.isF32())
+    if (isa<bf8>(elemSourceType) && isa<bf8>(elemBSourceType) &&
+        elemDestType.isF32())
       return ROCDL::wmma_f32_16x16x16_bf8_bf8::getOperationName();
-    if (isa<Float8E5M2Type>(elemSourceType) &&
-        isa<Float8E4M3FNType>(elemBSourceType) && elemDestType.isF32())
+    if (isa<bf8>(elemSourceType) && isa<fp8>(elemBSourceType) &&
+        elemDestType.isF32())
       return ROCDL::wmma_f32_16x16x16_bf8_fp8::getOperationName();
     if (elemSourceType.isInteger(4) && elemDestType.isInteger(32))
       return ROCDL::wmma_i32_16x16x16_iu4::getOperationName();
 
     return std::nullopt;
   }
-  if (k == 32) {
+
+  // Handle k == 32 for RDNA4.
+  if (k == 32 && !isRDNA3) {
     if (elemSourceType.isInteger(4) && elemDestType.isInteger(32))
       return ROCDL::wmma_i32_16x16x32_iu4::getOperationName();
+  }
+
+  return std::nullopt;
+}
+
+/// Return the `rocdl` intrinsic corresponding to a WMMA operation `wmma`
+/// for the gfx1250 architecture.
+static std::optional<StringRef> wmmaOpToIntrinsicGfx1250(Type elemSourceType,
+                                                         Type elemBSourceType,
+                                                         Type elemDestType,
+                                                         uint32_t k) {
+  using fp8 = Float8E4M3FNType;
+  using bf8 = Float8E5M2Type;
+
+  if (k == 4) {
+    if (elemSourceType.isF32() && elemDestType.isF32())
+      return ROCDL::wmma_f32_16x16x4_f32::getOperationName();
+
+    return std::nullopt;
+  }
+
+  if (k == 32) {
+    if (elemSourceType.isF16() && elemDestType.isF32())
+      return ROCDL::wmma_f32_16x16x32_f16::getOperationName();
+    if (elemSourceType.isBF16() && elemDestType.isF32())
+      return ROCDL::wmma_f32_16x16x32_bf16::getOperationName();
+    if (elemSourceType.isF16() && elemDestType.isF16())
+      return ROCDL::wmma_f16_16x16x32_f16::getOperationName();
+    if (elemSourceType.isBF16() && elemDestType.isBF16())
+      return ROCDL::wmma_bf16_16x16x32_bf16::getOperationName();
+
     return std::nullopt;
   }
 
-  llvm_unreachable("unhandled WMMA case");
+  if (k == 64) {
+    if (isa<fp8>(elemSourceType) && isa<fp8>(elemBSourceType)) {
+      if (elemDestType.isF32())
+        return ROCDL::wmma_f32_16x16x64_fp8_fp8::getOperationName();
+      if (elemDestType.isF16())
+        return ROCDL::wmma_f16_16x16x64_fp8_fp8::getOperationName();
+    }
+    if (isa<fp8>(elemSourceType) && isa<bf8>(elemBSourceType)) {
+      if (elemDestType.isF32())
+        return ROCDL::wmma_f32_16x16x64_fp8_bf8::getOperationName();
+      if (elemDestType.isF16())
+        return ROCDL::wmma_f16_16x16x64_fp8_bf8::getOperationName();
+    }
+    if (isa<bf8>(elemSourceType) && isa<bf8>(elemBSourceType)) {
+      if (elemDestType.isF32())
+        return ROCDL::wmma_f32_16x16x64_bf8_bf8::getOperationName();
+      if (elemDestType.isF16())
+        return ROCDL::wmma_f16_16x16x64_bf8_bf8::getOperationName();
+    }
+    if (isa<bf8>(elemSourceType) && isa<fp8>(elemBSourceType)) {
+      if (elemDestType.isF32())
+        return ROCDL::wmma_f32_16x16x64_bf8_fp8::getOperationName();
+      if (elemDestType.isF16())
+        return ROCDL::wmma_f16_16x16x64_bf8_fp8::getOperationName();
+    }
+    if (elemSourceType.isInteger(8) && elemDestType.isInteger(32))
+      return ROCDL::wmma_i32_16x16x64_iu8::getOperationName();
+
+    return std::nullopt;
+  }
+
+  if (k == 128) {
+    if (isa<fp8>(elemSourceType) && isa<fp8>(elemBSourceType)) {
+      if (elemDestType.isF32())
+        return ROCDL::wmma_f32_16x16x128_fp8_fp8::getOperationName();
+      if (elemDestType.isF16())
+        return ROCDL::wmma_f16_16x16x128_fp8_fp8::getOperationName();
+    }
+    if (isa<fp8>(elemSourceType) && isa<bf8>(elemBSourceType)) {
+      if (elemDestType.isF32())
+        return ROCDL::wmma_f32_16x16x128_fp8_bf8::getOperationName();
+      if (elemDestType.isF16())
+        return ROCDL::wmma_f16_16x16x128_fp8_bf8::getOperationName();
+    }
+    if (isa<bf8>(elemSourceType) && isa<bf8>(elemBSourceType)) {
+      if (elemDestType.isF32())
+        return ROCDL::wmma_f32_16x16x128_bf8_bf8::getOperationName();
+      if (elemDestType.isF16())
+        return ROCDL::wmma_f16_16x16x128_bf8_bf8::getOperationName();
+    }
+    if (isa<bf8>(elemSourceType) && isa<fp8>(elemBSourceType)) {
+      if (elemDestType.isF32())
+        return ROCDL::wmma_f32_16x16x128_bf8_fp8::getOperationName();
+      if (elemDestType.isF16())
+        return ROCDL::wmma_f16_16x16x128_bf8_fp8::getOperationName();
+    }
+
+    return std::nullopt;
+  }
+
+  return std::nullopt;
+}
+
+/// Returns the `rocdl` intrinsic corresponding to a WMMA operation `wmma`
+/// if one exists. This includes checking to ensure the intrinsic is supported
+/// on the architecture you are compiling for.
+static std::optional<StringRef> wmmaOpToIntrinsic(WMMAOp wmma,
+                                                  Chipset chipset) {
+  auto sourceVectorType = cast<VectorType>(wmma.getSourceA().getType());
+  auto sourceBVectorType = cast<VectorType>(wmma.getSourceB().getType());
+  auto destVectorType = cast<VectorType>(wmma.getDestC().getType());
+  Type elemSourceType = sourceVectorType.getElementType();
+  Type elemBSourceType = sourceBVectorType.getElementType();
+  Type elemDestType = destVectorType.getElementType();
+
+  const uint32_t k = wmma.getK();
+  const bool isRDNA3 = chipset.majorVersion == 11;
+  const bool isRDNA4 = chipset.majorVersion == 12 && chipset.minorVersion == 0;
+
+  // Handle RDNA3 and RDNA4.
+  if (isRDNA3 || isRDNA4)
+    return wmmaOpToIntrinsicRDNA(elemSourceType, elemBSourceType, elemDestType,
+                                 k, isRDNA3);
+
+  // Handle gfx1250.
+  if (chipset == Chipset{12, 5, 0})
+    return wmmaOpToIntrinsicGfx1250(elemSourceType, elemBSourceType,
+                                    elemDestType, k);
+
+  return std::nullopt;
 }
 
 namespace {
diff --git a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
index 247dba101cfc1..cfdcd9cc2d86d 100644
--- a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
+++ b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
@@ -432,7 +432,7 @@ static Value getOriginalVectorValue(Value value) {
                         current = op.getSource();
                         return false;
                       })
-                      .Default([](Operation *) { return false; });
+                      .Default(false);
 
     if (!skipOp) {
       break;
diff --git a/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp b/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp
index ba57155ab9b45..03ed4d51cc744 100644
--- a/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp
+++ b/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp
@@ -240,8 +240,7 @@ struct CmpFOpLowering : public ConvertOpToLLVMPattern<arith::CmpFOp> {
 
 struct SelectOpOneToNLowering : public ConvertOpToLLVMPattern<arith::SelectOp> {
   using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
-  using Adaptor =
-      typename ConvertOpToLLVMPattern<arith::SelectOp>::OneToNOpAdaptor;
+  using Adaptor = ConvertOpToLLVMPattern<arith::SelectOp>::OneToNOpAdaptor;
 
   LogicalResult
   matchAndRewrite(arith::SelectOp op, Adaptor adaptor,
diff --git a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp
index 0fe72394b61d6..9e46b7d78baca 100644
--- a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp
+++ b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp
@@ -313,25 +313,53 @@ struct DivOpConversion : public OpConversionPattern<complex::DivOp> {
 struct ExpOpConversion : public OpConversionPattern<complex::ExpOp> {
   using OpConversionPattern<complex::ExpOp>::OpConversionPattern;
 
+  // exp(x+I*y) = exp(x)*(cos(y)+I*sin(y))
+  // Handle special cases as StableHLO implementation does:
+  // 1. When b == 0, set imag(exp(z)) = 0
+  // 2. When exp(x) == inf, use exp(x/2)*(cos(y)+I*sin(y))*exp(x/2)
   LogicalResult
   matchAndRewrite(complex::ExpOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     auto loc = op.getLoc();
     auto type = cast<ComplexType>(adaptor.getComplex().getType());
-    auto elementType = cast<FloatType>(type.getElementType());
-    arith::FastMathFlagsAttr fmf = op.getFastMathFlagsAttr();
-
-    Value real =
-        complex::ReOp::create(rewriter, loc, elementType, adaptor.getComplex());
-    Value imag =
-        complex::ImOp::create(rewriter, loc, elementType, adaptor.getComplex());
-    Value expReal = math::ExpOp::create(rewriter, loc, real, fmf.getValue());
-    Value cosImag = math::CosOp::create(rewriter, loc, imag, fmf.getValue());
+    auto ET = cast<FloatType>(type.getElementType());
+    arith::FastMathFlags fmf = op.getFastMathFlagsAttr().getValue();
+    const auto &floatSemantics = ET.getFloatSemantics();
+    ImplicitLocOpBuilder b(loc, rewriter);
+
+    Value x = complex::ReOp::create(b, ET, adaptor.getComplex());
+    Value y = complex::ImOp::create(b, ET, adaptor.getComplex());
+    Value zero = arith::ConstantOp::create(b, ET, b.getZeroAttr(ET));
+    Value half = arith::ConstantOp::create(b, ET, b.getFloatAttr(ET, 0.5));
+    Value inf = arith::ConstantOp::create(
+        b, ET, b.getFloatAttr(ET, APFloat::getInf(floatSemantics)));
+
+    Value exp = math::ExpOp::create(b, x, fmf);
+    Value xHalf = arith::MulFOp::create(b, x, half, fmf);
+    Value expHalf = math::ExpOp::create(b, xHalf, fmf);
+    Value cos = math::CosOp::create(b, y, fmf);
+    Value sin = math::SinOp::create(b, y, fmf);
+
+    Value expIsInf =
+        arith::CmpFOp::create(b, arith::CmpFPredicate::OEQ, exp, inf, fmf);
+    Value yIsZero =
+        arith::CmpFOp::create(b, arith::CmpFPredicate::OEQ, y, zero);
+
+    // Real path: select between exp(x)*cos(y) and exp(x/2)*cos(y)*exp(x/2)
+    Value realNormal = arith::MulFOp::create(b, exp, cos, fmf);
+    Value expHalfCos = arith::MulFOp::create(b, expHalf, cos, fmf);
+    Value realOverflow = arith::MulFOp::create(b, expHalfCos, expHalf, fmf);
     Value resultReal =
-        arith::MulFOp::create(rewriter, loc, expReal, cosImag, fmf.getValue());
-    Value sinImag = math::SinOp::create(rewriter, loc, imag, fmf.getValue());
-    Value resultImag =
-        arith::MulFOp::create(rewriter, loc, expReal, sinImag, fmf.getValue());
+        arith::SelectOp::create(b, expIsInf, realOverflow, realNormal);
+
+    // Imaginary part: if y == 0 return 0 else select between exp(x)*sin(y) and
+    // exp(x/2)*sin(y)*exp(x/2)
+    Value imagNormal = arith::MulFOp::create(b, exp, sin, fmf);
+    Value expHalfSin = arith::MulFOp::create(b, expHalf, sin, fmf);
+    Value imagOverflow = arith::MulFOp::create(b, expHalfSin, expHalf, fmf);
+    Value imagNonZero =
+        arith::SelectOp::create(b, expIsInf, imagOverflow, imagNormal);
+    Value resultImag = arith::SelectOp::create(b, yIsZero, zero, imagNonZero);
 
     rewriter.replaceOpWithNewOp<complex::CreateOp>(op, type, resultReal,
                                                    resultImag);
diff --git a/mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp b/mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp
index 798d8b04eed76..b75968eb31c58 100644
--- a/mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp
+++ b/mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp
@@ -137,8 +137,7 @@ static SmallVector<Value> flattenValues(ArrayRef<ValueRange> values) {
 /// op to llvm.br.
 struct BranchOpLowering : public ConvertOpToLLVMPattern<cf::BranchOp> {
   using ConvertOpToLLVMPattern<cf::BranchOp>::ConvertOpToLLVMPattern;
-  using Adaptor =
-      typename ConvertOpToLLVMPattern<cf::BranchOp>::OneToNOpAdaptor;
+  using Adaptor = ConvertOpToLLVMPattern<cf::BranchOp>::OneToNOpAdaptor;
 
   LogicalResult
   matchAndRewrite(cf::BranchOp op, Adaptor adaptor,
@@ -163,8 +162,7 @@ struct BranchOpLowering : public ConvertOpToLLVMPattern<cf::BranchOp> {
 /// branch op to llvm.cond_br.
 struct CondBranchOpLowering : public ConvertOpToLLVMPattern<cf::CondBranchOp> {
   using ConvertOpToLLVMPattern<cf::CondBranchOp>::ConvertOpToLLVMPattern;
-  using Adaptor =
-      typename ConvertOpToLLVMPattern<cf::CondBranchOp>::OneToNOpAdaptor;
+  using Adaptor = ConvertOpToLLVMPattern<cf::CondBranchOp>::OneToNOpAdaptor;
 
   LogicalResult
   matchAndRewrite(cf::CondBranchOp op, Adaptor adaptor,
@@ -204,7 +202,7 @@ struct SwitchOpLowering : public ConvertOpToLLVMPattern<cf::SwitchOp> {
   using ConvertOpToLLVMPattern<cf::SwitchOp>::ConvertOpToLLVMPattern;
 
   LogicalResult
-  matchAndRewrite(cf::SwitchOp op, typename cf::SwitchOp::Adaptor adaptor,
+  matchAndRewrite(cf::SwitchOp op, cf::SwitchOp::Adaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     // Get or convert default block.
     FailureOr<Block *> convertedDefaultBlock = getConvertedBlock(
diff --git a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
index 25f1e1b184d61..425594b3382f0 100644
--- a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
+++ b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
@@ -259,7 +259,7 @@ struct GPUShuffleConversion final : ConvertOpToLLVMPattern<gpu::ShuffleOp> {
           }
           return std::nullopt;
         })
-        .Default([](auto) { return std::nullopt; });
+        .Default(std::nullopt);
   }
 
   static std::optional<std::string> getFuncName(gpu::ShuffleMode mode,
diff --git a/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp b/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp
index a2dfc12cc9d63..a922338176f11 100644
--- a/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp
+++ b/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp
@@ -68,7 +68,7 @@ struct ClampFOpConversion final
       return LLVM::detail::handleMultidimensionalVectors(
           op.getOperation(), adaptor.getOperands(), *getTypeConverter(),
           [&](Type llvm1DVectorTy, ValueRange operands) -> Value {
-            typename math::ClampFOp::Adaptor adaptor(operands);
+            math::ClampFOp::Adaptor adaptor(operands);
             return ROCDL::FMed3Op::create(rewriter, op.getLoc(), llvm1DVectorTy,
                                           adaptor.getValue(), adaptor.getMin(),
                                           adaptor.getMax());
diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
index a9efada28a320..ec182f1db48ac 100644
--- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
+++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
@@ -846,13 +846,8 @@ struct NVGPUMBarrierInitLowering
     Value barrier = getMbarrierPtr(b, mbarrierType, adaptor.getBarriers(),
                                    adaptor.getMbarId(), rewriter);
     Value count = truncToI32(b, adaptor.getCount());
-    if (isMbarrierShared(mbarrierType)) {
-      rewriter.replaceOpWithNewOp<NVVM::MBarrierInitSharedOp>(
-          op, barrier, count, adaptor.getPredicate());
-    } else {
-      rewriter.replaceOpWithNewOp<NVVM::MBarrierInitOp>(op, barrier, count,
-                                                        adaptor.getPredicate());
-    }
+    rewriter.replaceOpWithNewOp<NVVM::MBarrierInitOp>(op, barrier, count,
+                                                      adaptor.getPredicate());
     return success();
   }
 };
diff --git a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
index 7d0a236b6f69a..76a822b05a652 100644
--- a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
+++ b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
@@ -14,6 +14,7 @@
 
 #include "mlir/Conversion/SCFToGPU/SCFToGPU.h"
 
+#include "mlir/Analysis/AliasAnalysis/LocalAliasAnalysis.h"
 #include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
@@ -27,6 +28,7 @@
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/RegionUtils.h"
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/Support/DebugLog.h"
 #include <optional>
 
@@ -625,18 +627,49 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
   bool seenSideeffects = false;
   // Whether we have left a nesting scope (and hence are no longer innermost).
   bool leftNestingScope = false;
+  LocalAliasAnalysis aliasAnalysis;
+  llvm::DenseSet<Value> writtenBuffer;
   while (!worklist.empty()) {
     Operation *op = worklist.pop_back_val();
     // Now walk over the body and clone it.
     // TODO: This is only correct if there either is no further scf.parallel
-    //       nested or this code is side-effect free. Otherwise we might need
-    //       predication. We are overly conservative for now and only allow
-    //       side-effects in the innermost scope.
+    //       nested or this code has side-effect but the memory buffer is not
+    //       alias to inner loop access buffer. Otherwise we might need
+    //       predication.
     if (auto nestedParallel = dyn_cast<ParallelOp>(op)) {
       // Before entering a nested scope, make sure there have been no
-      // sideeffects until now.
-      if (seenSideeffects)
-        return failure();
+      // sideeffects until now or the nested operations do not access the
+      // buffer written by outer scope.
+      if (seenSideeffects) {
+        WalkResult walkRes = nestedParallel.walk([&](Operation *nestedOp) {
+          if (isMemoryEffectFree(nestedOp))
+            return WalkResult::advance();
+
+          auto memEffectInterface = dyn_cast<MemoryEffectOpInterface>(nestedOp);
+          if (!memEffectInterface)
+            return WalkResult::advance();
+
+          SmallVector<MemoryEffects::EffectInstance> effects;
+          memEffectInterface.getEffects(effects);
+          for (const MemoryEffects::EffectInstance &effect : effects) {
+            if (isa<MemoryEffects::Read>(effect.getEffect()) ||
+                isa<MemoryEffects::Write>(effect.getEffect())) {
+              Value baseBuffer = effect.getValue();
+              if (!baseBuffer)
+                return WalkResult::interrupt();
+              for (Value val : writtenBuffer) {
+                if (aliasAnalysis.alias(baseBuffer, val) !=
+                    AliasResult::NoAlias) {
+                  return WalkResult::interrupt();
+                }
+              }
+            }
+          }
+          return WalkResult::advance();
+        });
+        if (walkRes.wasInterrupted())
+          return failure();
+      }
       // A nested scf.parallel needs insertion of code to compute indices.
       // Insert that now. This will also update the worklist with the loops
       // body.
@@ -650,6 +683,7 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
       rewriter.setInsertionPointAfter(parent);
       leftNestingScope = true;
       seenSideeffects = false;
+      writtenBuffer.clear();
     } else if (auto reduceOp = dyn_cast<scf::ReduceOp>(op)) {
       // Convert scf.reduction op
       auto parentLoop = op->getParentOfType<ParallelOp>();
@@ -682,6 +716,24 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
       Operation *clone = rewriter.clone(*op, cloningMap);
       cloningMap.map(op->getResults(), clone->getResults());
       // Check for side effects.
+      if (!isMemoryEffectFree(clone)) {
+        // Record the buffer accessed by the operations with write effects.
+        if (auto memEffectInterface =
+                dyn_cast<MemoryEffectOpInterface>(clone)) {
+          SmallVector<MemoryEffects::EffectInstance> effects;
+          memEffectInterface.getEffects(effects);
+          for (const MemoryEffects::EffectInstance &effect : effects) {
+            if (isa<MemoryEffects::Write>(effect.getEffect())) {
+              Value writtenBase = effect.getValue();
+              // Conservatively return failure if we cannot find the written
+              // address.
+              if (!writtenBase)
+                return failure();
+              writtenBuffer.insert(writtenBase);
+            }
+          }
+        }
+      }
       // TODO: Handle region side effects properly.
       seenSideeffects |=
           !isMemoryEffectFree(clone) || clone->getNumRegions() != 0;
diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
index 41d8d532757ad..69a317ecd101f 100644
--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
+++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
@@ -716,7 +716,7 @@ lowerReductionWithStartValue(ConversionPatternRewriter &rewriter, Location loc,
   accumulator = getOrCreateAccumulator<ReductionNeutral>(rewriter, loc,
                                                          llvmType, accumulator);
   return LLVMRedIntrinOp::create(rewriter, loc, llvmType,
-                                 /*startValue=*/accumulator, vectorOperand,
+                                 /*start_value=*/accumulator, vectorOperand,
                                  fmf);
 }
 
@@ -743,7 +743,7 @@ static Value lowerPredicatedReductionWithStartValue(
   Value vectorLength =
       createVectorLengthValue(rewriter, loc, vectorOperand.getType());
   return LLVMVPRedIntrinOp::create(rewriter, loc, llvmType,
-                                   /*startValue=*/accumulator, vectorOperand,
+                                   /*satrt_value=*/accumulator, vectorOperand,
                                    mask, vectorLength);
 }
 
diff --git a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp
index e2c7d803e5a5e..91c1aa55fdb4e 100644
--- a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp
+++ b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp
@@ -46,7 +46,7 @@ static bool isZeroConstant(Value val) {
           [](auto floatAttr) { return floatAttr.getValue().isZero(); })
       .Case<IntegerAttr>(
           [](auto intAttr) { return intAttr.getValue().isZero(); })
-      .Default([](auto) { return false; });
+      .Default(false);
 }
 
 static LogicalResult storeLoadPreconditions(PatternRewriter &rewriter,
diff --git a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
index fcbf66dbe9e45..33e8f2ed1f6ed 100644
--- a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
+++ b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
@@ -194,8 +194,8 @@ class CreateNdDescToXeVMPattern
     // If source is a memref, we need to extract the aligned pointer as index.
     // Pointer type is passed as i32 or i64 by type converter.
     if (sourceMemrefTy) {
-      if (!sourceMemrefTy.hasStaticShape()) {
-        return rewriter.notifyMatchFailure(op, "Expected static memref shape.");
+      if (!sourceMemrefTy.hasRank()) {
+        return rewriter.notifyMatchFailure(op, "Expected ranked Memref.");
       }
       baseAddr =
           memref::ExtractAlignedPointerAsIndexOp::create(rewriter, loc, source);
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index 585b6dacfa648..df955fc90b45f 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -399,13 +399,15 @@ LogicalResult WMMAOp::verify() {
 
   if (!sourceAElemType.isFloat(8) && sourceAElemType != sourceBElemType) {
     return emitOpError(
-               "source element types much match (except for fp8) but have ")
+               "source element types must match (except for fp8/bf8) but have ")
            << sourceAType << " and " << sourceBType;
   }
 
-  if (!sourceAElemType.isInteger(4) && getK() != 16) {
-    return emitOpError("K dimension must be 16 for source element type ")
-           << sourceAElemType;
+  if (isSrcFloat) {
+    if (getClamp())
+      return emitOpError("clamp flag is not supported for float types");
+    if (getUnsignedA() || getUnsignedB())
+      return emitOpError("unsigned flags are not supported for float types");
   }
   return success();
 }
diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
index e0a53cd52f143..0c3592124cdec 100644
--- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
+++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
@@ -2716,8 +2716,9 @@ LogicalResult AffineForOp::fold(FoldAdaptor adaptor,
   return success(folded);
 }
 
-OperandRange AffineForOp::getEntrySuccessorOperands(RegionBranchPoint point) {
-  assert((point.isParent() || point == getRegion()) && "invalid region point");
+OperandRange AffineForOp::getEntrySuccessorOperands(RegionSuccessor successor) {
+  assert((successor.isParent() || successor.getSuccessor() == &getRegion()) &&
+         "invalid region point");
 
   // The initial operands map to the loop arguments after the induction
   // variable or are forwarded to the results when the trip count is zero.
@@ -2726,34 +2727,41 @@ OperandRange AffineForOp::getEntrySuccessorOperands(RegionBranchPoint point) {
 
 void AffineForOp::getSuccessorRegions(
     RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> &regions) {
-  assert((point.isParent() || point == getRegion()) && "expected loop region");
+  assert((point.isParent() ||
+          point.getTerminatorPredecessorOrNull()->getParentRegion() ==
+              &getRegion()) &&
+         "expected loop region");
   // The loop may typically branch back to its body or to the parent operation.
   // If the predecessor is the parent op and the trip count is known to be at
   // least one, branch into the body using the iterator arguments. And in cases
   // we know the trip count is zero, it can only branch back to its parent.
   std::optional<uint64_t> tripCount = getTrivialConstantTripCount(*this);
-  if (point.isParent() && tripCount.has_value()) {
-    if (tripCount.value() > 0) {
-      regions.push_back(RegionSuccessor(&getRegion(), getRegionIterArgs()));
-      return;
-    }
-    if (tripCount.value() == 0) {
-      regions.push_back(RegionSuccessor(getResults()));
-      return;
+  if (tripCount.has_value()) {
+    if (!point.isParent()) {
+      // From the loop body, if the trip count is one, we can only branch back
+      // to the parent.
+      if (tripCount == 1) {
+        regions.push_back(RegionSuccessor(getOperation(), getResults()));
+        return;
+      }
+      if (tripCount == 0)
+        return;
+    } else {
+      if (tripCount.value() > 0) {
+        regions.push_back(RegionSuccessor(&getRegion(), getRegionIterArgs()));
+        return;
+      }
+      if (tripCount.value() == 0) {
+        regions.push_back(RegionSuccessor(getOperation(), getResults()));
+        return;
+      }
     }
   }
 
-  // From the loop body, if the trip count is one, we can only branch back to
-  // the parent.
-  if (!point.isParent() && tripCount == 1) {
-    regions.push_back(RegionSuccessor(getResults()));
-    return;
-  }
-
   // In all other cases, the loop may branch back to itself or the parent
   // operation.
   regions.push_back(RegionSuccessor(&getRegion(), getRegionIterArgs()));
-  regions.push_back(RegionSuccessor(getResults()));
+  regions.push_back(RegionSuccessor(getOperation(), getResults()));
 }
 
 AffineBound AffineForOp::getLowerBound() {
@@ -3142,7 +3150,7 @@ void AffineIfOp::getSuccessorRegions(
         RegionSuccessor(&getThenRegion(), getThenRegion().getArguments()));
     // If the "else" region is empty, branch bach into parent.
     if (getElseRegion().empty()) {
-      regions.push_back(getResults());
+      regions.push_back(RegionSuccessor(getOperation(), getResults()));
     } else {
       regions.push_back(
           RegionSuccessor(&getElseRegion(), getElseRegion().getArguments()));
@@ -3152,7 +3160,7 @@ void AffineIfOp::getSuccessorRegions(
 
   // If the predecessor is the `else`/`then` region, then branching into parent
   // op is valid.
-  regions.push_back(RegionSuccessor(getResults()));
+  regions.push_back(RegionSuccessor(getOperation(), getResults()));
 }
 
 LogicalResult AffineIfOp::verify() {
diff --git a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
index e08cc6f645d71..d428fbf2886ff 100644
--- a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
@@ -1106,10 +1106,7 @@ static bool isUniformDefinition(Value value,
       return false;
   }
 
-  if (!value.getType().isIntOrIndexOrFloat())
-    return false;
-
-  return true;
+  return value.getType().isIntOrIndexOrFloat();
 }
 
 /// Generates a broadcast op for the provided uniform value using the
diff --git a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
index 898d76ce8d9b5..980442efdf708 100644
--- a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
+++ b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
@@ -2751,7 +2751,7 @@ std::optional<TypedAttr> mlir::arith::getNeutralElement(Operation *op) {
           .Case([](arith::MaxSIOp op) { return AtomicRMWKind::maxs; })
           .Case([](arith::MinSIOp op) { return AtomicRMWKind::mins; })
           .Case([](arith::MulIOp op) { return AtomicRMWKind::muli; })
-          .Default([](Operation *op) { return std::nullopt; });
+          .Default(std::nullopt);
   if (!maybeKind) {
     return std::nullopt;
   }
diff --git a/mlir/lib/Dialect/Async/IR/Async.cpp b/mlir/lib/Dialect/Async/IR/Async.cpp
index dc7b07d911c17..8e4a49df76b52 100644
--- a/mlir/lib/Dialect/Async/IR/Async.cpp
+++ b/mlir/lib/Dialect/Async/IR/Async.cpp
@@ -36,8 +36,9 @@ void AsyncDialect::initialize() {
 
 constexpr char kOperandSegmentSizesAttr[] = "operandSegmentSizes";
 
-OperandRange ExecuteOp::getEntrySuccessorOperands(RegionBranchPoint point) {
-  assert(point == getBodyRegion() && "invalid region index");
+OperandRange ExecuteOp::getEntrySuccessorOperands(RegionSuccessor successor) {
+  assert(successor.getSuccessor() == &getBodyRegion() &&
+         "invalid region index");
   return getBodyOperands();
 }
 
@@ -53,8 +54,10 @@ bool ExecuteOp::areTypesCompatible(Type lhs, Type rhs) {
 void ExecuteOp::getSuccessorRegions(RegionBranchPoint point,
                                     SmallVectorImpl<RegionSuccessor> &regions) {
   // The `body` region branch back to the parent operation.
-  if (point == getBodyRegion()) {
-    regions.push_back(RegionSuccessor(getBodyResults()));
+  if (!point.isParent() &&
+      point.getTerminatorPredecessorOrNull()->getParentRegion() ==
+          &getBodyRegion()) {
+    regions.push_back(RegionSuccessor(getOperation(), getBodyResults()));
     return;
   }
 
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp
index d9d69342e42a8..8655ed3005a93 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp
@@ -95,12 +95,7 @@ getBufferizedFunctionArgType(FuncOp funcOp, int64_t index,
 /// Return the FuncOp called by `callOp`.
 static FuncOp getCalledFunction(CallOpInterface callOp,
                                 SymbolTableCollection &symbolTables) {
-  SymbolRefAttr sym =
-      llvm::dyn_cast_if_present<SymbolRefAttr>(callOp.getCallableForCallee());
-  if (!sym)
-    return nullptr;
-  return dyn_cast_or_null<FuncOp>(
-      symbolTables.lookupNearestSymbolFrom(callOp, sym));
+  return dyn_cast_or_null<FuncOp>(callOp.resolveCallableInTable(&symbolTables));
 }
 
 /// Return the FuncOp called by `callOp`.
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp
index fb7f2bb5f01d8..9ccbfd363b1df 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp
@@ -620,7 +620,8 @@ hasReadAfterWriteInterference(const DenseSet<OpOperand *> &usesRead,
           LDBG() << "\n- bufferizes out-of-place due to parallel region:\n"
                  << "  unConflictingWrite = operand "
                  << uConflictingWrite->getOperandNumber() << " of "
-                 << *uConflictingWrite->getOwner();
+                 << OpWithFlags(uConflictingWrite->getOwner(),
+                                OpPrintingFlags().skipRegions());
           return true;
         }
       }
@@ -631,7 +632,7 @@ hasReadAfterWriteInterference(const DenseSet<OpOperand *> &usesRead,
     Operation *readingOp = uRead->getOwner();
     LDBG() << "\n- check conflict:\n"
            << "  uRead = operand " << uRead->getOperandNumber() << " of "
-           << *readingOp;
+           << OpWithFlags(readingOp, OpPrintingFlags().skipRegions());
 
     // Find the definition of uRead by following the SSA use-def chain.
     // E.g.:
@@ -655,7 +656,8 @@ hasReadAfterWriteInterference(const DenseSet<OpOperand *> &usesRead,
     for (OpOperand *uConflictingWrite : usesWrite) {
       LDBG() << "  unConflictingWrite = operand "
              << uConflictingWrite->getOperandNumber() << " of "
-             << *uConflictingWrite->getOwner();
+             << OpWithFlags(uConflictingWrite->getOwner(),
+                            OpPrintingFlags().skipRegions());
 
       // Check if op dominance can be used to rule out read-after-write
       // conflicts.
@@ -975,7 +977,7 @@ bufferizableInPlaceAnalysisImpl(OpOperand &operand, OneShotAnalysisState &state,
                                 const DominanceInfo &domInfo) {
   LDBG() << "//===-------------------------------------------===//\n"
          << "Analyzing operand #" << operand.getOperandNumber() << " of "
-         << *operand.getOwner();
+         << OpWithFlags(operand.getOwner(), OpPrintingFlags().skipRegions());
 
   bool foundInterference =
       wouldCreateWriteToNonWritableBuffer(operand, state) ||
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp
index aa53f94fe839d..c233e24c2a151 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp
@@ -285,12 +285,8 @@ static void removeBufferizationAttributes(BlockArgument bbArg) {
 static func::FuncOp
 getCalledFunction(func::CallOp callOp,
                   mlir::SymbolTableCollection &symbolTable) {
-  SymbolRefAttr sym =
-      llvm::dyn_cast_if_present<SymbolRefAttr>(callOp.getCallableForCallee());
-  if (!sym)
-    return nullptr;
   return dyn_cast_or_null<func::FuncOp>(
-      symbolTable.lookupNearestSymbolFrom(callOp, sym));
+      callOp.resolveCallableInTable(&symbolTable));
 }
 
 /// Return "true" if the given function signature has tensor semantics.
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation.cpp
index b593ccab060c7..36a759c279eb7 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation.cpp
@@ -562,8 +562,11 @@ LogicalResult
 BufferDeallocation::updateFunctionSignature(FunctionOpInterface op) {
   SmallVector<TypeRange> returnOperandTypes(llvm::map_range(
       op.getFunctionBody().getOps<RegionBranchTerminatorOpInterface>(),
-      [](RegionBranchTerminatorOpInterface op) {
-        return op.getSuccessorOperands(RegionBranchPoint::parent()).getTypes();
+      [&](RegionBranchTerminatorOpInterface branchOp) {
+        return branchOp
+            .getSuccessorOperands(RegionSuccessor(
+                op.getOperation(), op.getOperation()->getResults()))
+            .getTypes();
       }));
   if (!llvm::all_equal(returnOperandTypes))
     return op->emitError(
@@ -942,8 +945,8 @@ BufferDeallocation::handleInterface(RegionBranchTerminatorOpInterface op) {
   // about, but we would need to check how many successors there are and under
   // which condition they are taken, etc.
 
-  MutableOperandRange operands =
-      op.getMutableSuccessorOperands(RegionBranchPoint::parent());
+  MutableOperandRange operands = op.getMutableSuccessorOperands(
+      RegionSuccessor(op.getOperation(), op.getOperation()->getResults()));
 
   SmallVector<Value> updatedOwnerships;
   auto result = deallocation_impl::insertDeallocOpForReturnLike(
diff --git a/mlir/lib/Dialect/ControlFlow/Transforms/CMakeLists.txt b/mlir/lib/Dialect/ControlFlow/Transforms/CMakeLists.txt
index 47740d31844f4..e9da135ed46f9 100644
--- a/mlir/lib/Dialect/ControlFlow/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/ControlFlow/Transforms/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_mlir_dialect_library(MLIRControlFlowTransforms
   BufferDeallocationOpInterfaceImpl.cpp
   BufferizableOpInterfaceImpl.cpp
+  StructuralTypeConversions.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/ControlFlow/Transforms
diff --git a/mlir/lib/Dialect/ControlFlow/Transforms/StructuralTypeConversions.cpp b/mlir/lib/Dialect/ControlFlow/Transforms/StructuralTypeConversions.cpp
new file mode 100644
index 0000000000000..5e2a742c2d64c
--- /dev/null
+++ b/mlir/lib/Dialect/ControlFlow/Transforms/StructuralTypeConversions.cpp
@@ -0,0 +1,169 @@
+//===- TypeConversion.cpp - Type Conversion of Unstructured Control Flow --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a pass to convert MLIR standard and builtin dialects
+// into the LLVM IR dialect.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/ControlFlow/Transforms/StructuralTypeConversions.h"
+
+#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/DialectConversion.h"
+
+using namespace mlir;
+
+namespace {
+
+/// Helper function for converting branch ops. This function converts the
+/// signature of the given block. If the new block signature is different from
+/// `expectedTypes`, returns "failure".
+static FailureOr<Block *> getConvertedBlock(ConversionPatternRewriter &rewriter,
+                                            const TypeConverter *converter,
+                                            Operation *branchOp, Block *block,
+                                            TypeRange expectedTypes) {
+  assert(converter && "expected non-null type converter");
+  assert(!block->isEntryBlock() && "entry blocks have no predecessors");
+
+  // There is nothing to do if the types already match.
+  if (block->getArgumentTypes() == expectedTypes)
+    return block;
+
+  // Compute the new block argument types and convert the block.
+  std::optional<TypeConverter::SignatureConversion> conversion =
+      converter->convertBlockSignature(block);
+  if (!conversion)
+    return rewriter.notifyMatchFailure(branchOp,
+                                       "could not compute block signature");
+  if (expectedTypes != conversion->getConvertedTypes())
+    return rewriter.notifyMatchFailure(
+        branchOp,
+        "mismatch between adaptor operand types and computed block signature");
+  return rewriter.applySignatureConversion(block, *conversion, converter);
+}
+
+/// Flatten the given value ranges into a single vector of values.
+static SmallVector<Value> flattenValues(ArrayRef<ValueRange> values) {
+  SmallVector<Value> result;
+  for (const ValueRange &vals : values)
+    llvm::append_range(result, vals);
+  return result;
+}
+
+/// Convert the destination block signature (if necessary) and change the
+/// operands of the branch op.
+struct BranchOpConversion : public OpConversionPattern<cf::BranchOp> {
+  using OpConversionPattern<cf::BranchOp>::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(cf::BranchOp op, OneToNOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    SmallVector<Value> flattenedAdaptor = flattenValues(adaptor.getOperands());
+    FailureOr<Block *> convertedBlock =
+        getConvertedBlock(rewriter, getTypeConverter(), op, op.getSuccessor(),
+                          TypeRange(ValueRange(flattenedAdaptor)));
+    if (failed(convertedBlock))
+      return failure();
+    rewriter.replaceOpWithNewOp<cf::BranchOp>(op, flattenedAdaptor,
+                                              *convertedBlock);
+    return success();
+  }
+};
+
+/// Convert the destination block signatures (if necessary) and change the
+/// operands of the branch op.
+struct CondBranchOpConversion : public OpConversionPattern<cf::CondBranchOp> {
+  using OpConversionPattern<cf::CondBranchOp>::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(cf::CondBranchOp op, OneToNOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    SmallVector<Value> flattenedAdaptorTrue =
+        flattenValues(adaptor.getTrueDestOperands());
+    SmallVector<Value> flattenedAdaptorFalse =
+        flattenValues(adaptor.getFalseDestOperands());
+    if (!llvm::hasSingleElement(adaptor.getCondition()))
+      return rewriter.notifyMatchFailure(op,
+                                         "expected single element condition");
+    FailureOr<Block *> convertedTrueBlock =
+        getConvertedBlock(rewriter, getTypeConverter(), op, op.getTrueDest(),
+                          TypeRange(ValueRange(flattenedAdaptorTrue)));
+    if (failed(convertedTrueBlock))
+      return failure();
+    FailureOr<Block *> convertedFalseBlock =
+        getConvertedBlock(rewriter, getTypeConverter(), op, op.getFalseDest(),
+                          TypeRange(ValueRange(flattenedAdaptorFalse)));
+    if (failed(convertedFalseBlock))
+      return failure();
+    rewriter.replaceOpWithNewOp<cf::CondBranchOp>(
+        op, llvm::getSingleElement(adaptor.getCondition()),
+        flattenedAdaptorTrue, flattenedAdaptorFalse, op.getBranchWeightsAttr(),
+        *convertedTrueBlock, *convertedFalseBlock);
+    return success();
+  }
+};
+
+/// Convert the destination block signatures (if necessary) and change the
+/// operands of the switch op.
+struct SwitchOpConversion : public OpConversionPattern<cf::SwitchOp> {
+  using OpConversionPattern<cf::SwitchOp>::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(cf::SwitchOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    // Get or convert default block.
+    FailureOr<Block *> convertedDefaultBlock = getConvertedBlock(
+        rewriter, getTypeConverter(), op, op.getDefaultDestination(),
+        TypeRange(adaptor.getDefaultOperands()));
+    if (failed(convertedDefaultBlock))
+      return failure();
+
+    // Get or convert all case blocks.
+    SmallVector<Block *> caseDestinations;
+    SmallVector<ValueRange> caseOperands = adaptor.getCaseOperands();
+    for (auto it : llvm::enumerate(op.getCaseDestinations())) {
+      Block *b = it.value();
+      FailureOr<Block *> convertedBlock =
+          getConvertedBlock(rewriter, getTypeConverter(), op, b,
+                            TypeRange(caseOperands[it.index()]));
+      if (failed(convertedBlock))
+        return failure();
+      caseDestinations.push_back(*convertedBlock);
+    }
+
+    rewriter.replaceOpWithNewOp<cf::SwitchOp>(
+        op, adaptor.getFlag(), *convertedDefaultBlock,
+        adaptor.getDefaultOperands(), adaptor.getCaseValuesAttr(),
+        caseDestinations, caseOperands);
+    return success();
+  }
+};
+
+} // namespace
+
+void mlir::cf::populateCFStructuralTypeConversions(
+    const TypeConverter &typeConverter, RewritePatternSet &patterns,
+    PatternBenefit benefit) {
+  patterns.add<BranchOpConversion, CondBranchOpConversion, SwitchOpConversion>(
+      typeConverter, patterns.getContext(), benefit);
+}
+
+void mlir::cf::populateCFStructuralTypeConversionTarget(
+    const TypeConverter &typeConverter, ConversionTarget &target) {
+  target.addDynamicallyLegalOp<cf::BranchOp, cf::CondBranchOp, cf::SwitchOp>(
+      [&](Operation *op) { return typeConverter.isLegal(op->getOperands()); });
+}
+
+void mlir::cf::populateCFStructuralTypeConversionsAndLegality(
+    const TypeConverter &typeConverter, RewritePatternSet &patterns,
+    ConversionTarget &target, PatternBenefit benefit) {
+  populateCFStructuralTypeConversions(typeConverter, patterns, benefit);
+  populateCFStructuralTypeConversionTarget(typeConverter, target);
+}
diff --git a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp
index 4754f0bfe895e..d478220221f7a 100644
--- a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp
+++ b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp
@@ -584,6 +584,10 @@ void ForOp::print(OpAsmPrinter &p) {
 LogicalResult ForOp::verifyRegions() {
   // Check that the body defines as single block argument for the induction
   // variable.
+  if (getBody()->getNumArguments() != 1)
+    return emitOpError("expected body to have a single block argument for the "
+                       "induction variable");
+
   if (getInductionVar().getType() != getLowerBound().getType())
     return emitOpError(
         "expected induction variable to be same type as bounds and step");
@@ -845,7 +849,8 @@ void IfOp::getSuccessorRegions(RegionBranchPoint point,
                                SmallVectorImpl<RegionSuccessor> &regions) {
   // The `then` and the `else` region branch back to the parent operation.
   if (!point.isParent()) {
-    regions.push_back(RegionSuccessor());
+    regions.push_back(
+        RegionSuccessor(getOperation(), getOperation()->getResults()));
     return;
   }
 
@@ -854,7 +859,8 @@ void IfOp::getSuccessorRegions(RegionBranchPoint point,
   // Don't consider the else region if it is empty.
   Region *elseRegion = &this->getElseRegion();
   if (elseRegion->empty())
-    regions.push_back(RegionSuccessor());
+    regions.push_back(
+        RegionSuccessor(getOperation(), getOperation()->getResults()));
   else
     regions.push_back(RegionSuccessor(elseRegion));
 }
@@ -871,7 +877,7 @@ void IfOp::getEntrySuccessorRegions(ArrayRef<Attribute> operands,
     if (!getElseRegion().empty())
       regions.emplace_back(&getElseRegion());
     else
-      regions.emplace_back();
+      regions.emplace_back(getOperation(), getOperation()->getResults());
   }
 }
 
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index b5f8ddaadacdf..6c6d8d2bad55d 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -2399,7 +2399,7 @@ ParseResult WarpExecuteOnLane0Op::parse(OpAsmParser &parser,
 void WarpExecuteOnLane0Op::getSuccessorRegions(
     RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> &regions) {
   if (!point.isParent()) {
-    regions.push_back(RegionSuccessor(getResults()));
+    regions.push_back(RegionSuccessor(getOperation(), getResults()));
     return;
   }
 
diff --git a/mlir/lib/Dialect/GPU/Transforms/EliminateBarriers.cpp b/mlir/lib/Dialect/GPU/Transforms/EliminateBarriers.cpp
index d2c2138d61638..025d1acf8d6ba 100644
--- a/mlir/lib/Dialect/GPU/Transforms/EliminateBarriers.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/EliminateBarriers.cpp
@@ -330,7 +330,7 @@ static Value getBase(Value v) {
               v = op.getSrc();
               return true;
             })
-            .Default([](Operation *) { return false; });
+            .Default(false);
     if (!shouldContinue)
       break;
   }
@@ -354,7 +354,7 @@ static Value propagatesCapture(Operation *op) {
       .Case([](memref::TransposeOp transpose) { return transpose.getIn(); })
       .Case<memref::ExpandShapeOp, memref::CollapseShapeOp>(
           [](auto op) { return op.getSrc(); })
-      .Default([](Operation *) { return Value(); });
+      .Default(nullptr);
 }
 
 /// Returns `true` if the given operation is known to capture the given value,
@@ -371,7 +371,7 @@ static std::optional<bool> getKnownCapturingStatus(Operation *op, Value v) {
       // These operations are known not to capture.
       .Case([](memref::DeallocOp) { return false; })
       // By default, we don't know anything.
-      .Default([](Operation *) { return std::nullopt; });
+      .Default(std::nullopt);
 }
 
 /// Returns `true` if the value may be captured by any of its users, i.e., if
diff --git a/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp b/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
index 3c447337d821f..95d5cadbd4e1a 100644
--- a/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
@@ -39,10 +39,10 @@ void GpuModuleToBinaryPass::runOnOperation() {
   RewritePatternSet patterns(&getContext());
   auto targetFormat =
       llvm::StringSwitch<std::optional<CompilationTarget>>(compilationTarget)
-          .Cases("offloading", "llvm", CompilationTarget::Offload)
-          .Cases("assembly", "isa", CompilationTarget::Assembly)
-          .Cases("binary", "bin", CompilationTarget::Binary)
-          .Cases("fatbinary", "fatbin", CompilationTarget::Fatbin)
+          .Cases({"offloading", "llvm"}, CompilationTarget::Offload)
+          .Cases({"assembly", "isa"}, CompilationTarget::Assembly)
+          .Cases({"binary", "bin"}, CompilationTarget::Binary)
+          .Cases({"fatbinary", "fatbin"}, CompilationTarget::Fatbin)
           .Default(std::nullopt);
   if (!targetFormat)
     getOperation()->emitError() << "Invalid format specified.";
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index 81c3069cec16e..ec1571a56fe4a 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -416,13 +416,39 @@ createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp op,
   if (ci.clusterSize >= 32) {
     if (chipset.majorVersion <= 9) {
       // Broadcast last value from each row to next row.
-      // Use row mask to avoid polluting rows 1 and 3.
+      // Use row mask to avoid polluting row 0 (and row 2 if wave-64).
       dpp = amdgpu::DPPOp::create(rewriter, loc, res.getType(), res, res,
                                   amdgpu::DPPPerm::row_bcast_15,
                                   rewriter.getUnitAttr(), 0xa, allBanks,
                                   /*bound_ctrl*/ false);
       res = vector::makeArithReduction(
           rewriter, loc, gpu::convertReductionKind(mode), res, dpp);
+
+      // For subgroupSize = 64, at this point lanes [16, 32) contain the full
+      // reduction over lanes [0, 32), but lanes [0, 16) do not. Similarly,
+      // lanes [48, 64) contain the full reduction over lanes [32, 64), but
+      // lanes [32, 48) do not.
+      //
+      // If subgroup size is 64 and cluster size is 64, we don't need lanes [0,
+      // 16) and [32, 48) to have the correct cluster-32 reduction values at
+      // this point, because only lane 63's value will ultimately be read in
+      // this full-cluster case.
+      //
+      // If subgroup size is 64 and cluster size is 32, we need to ensure that
+      // lanes [0, 16) and [32, 48) have the correct final cluster-32 reduction
+      // values (subgroup_reduce guarantees that all lanes within each cluster
+      // contain the final reduction value). We do this by broadcasting lane
+      // 31's value to lanes [0, 16) and lanes 63's value to lanes [32, 48).
+      //
+      // See https://gpuopen.com/learn/amd-gcn-assembly-cross-lane-operations
+      // for an illustration of how this within-cluster broadcast works with a
+      // swizzle.
+      if (ci.subgroupSize == 64 && ci.clusterSize == 32) {
+        res =
+            amdgpu::SwizzleBitModeOp::create(rewriter, loc, res, /*and_mask=*/0,
+                                             /*or_mask=*/31,
+                                             /*xor_mask=*/0);
+      }
     } else if (chipset.majorVersion <= 12) {
       // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2).
       Value uint32Max = arith::ConstantOp::create(
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
index 3eae67f4c1f98..2731069d6ef54 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
@@ -698,7 +698,7 @@ static void destructureIndices(Type currType, ArrayRef<GEPArg> indices,
                        return structType.getBody()[memberIndex];
                      return nullptr;
                    })
-                   .Default(Type(nullptr));
+                   .Default(nullptr);
   }
 }
 
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp
index cee943d2d86c6..7d9058c262562 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp
@@ -1111,7 +1111,7 @@ memsetCanUsesBeRemoved(MemsetIntr op, const MemorySlot &slot,
           .Case<IntegerType, FloatType>([](auto type) {
             return type.getWidth() % 8 == 0 && type.getWidth() > 0;
           })
-          .Default([](Type) { return false; });
+          .Default(false);
   if (!canConvertType)
     return false;
 
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp
index ac35eea66e9d6..ce93d18f56d39 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp
@@ -798,7 +798,7 @@ static bool isCompatibleImpl(Type type, DenseSet<Type> &compatibleTypes) {
           // clang-format on
           .Case<PtrLikeTypeInterface>(
               [](Type type) { return isCompatiblePtrType(type); })
-          .Default([](Type) { return false; });
+          .Default(false);
 
   if (!result)
     compatibleTypes.erase(type);
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index f0de4dbcc1d4b..a5ffb9e77fa9d 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -896,6 +896,12 @@ std::pair<mlir::Type, unsigned> NVVM::inferMMAType(NVVM::MMATypes type,
   } else if (type == NVVM::MMATypes::f32) {
     elementType = builder.getF32Type();
     numberElements = 8;
+  } else if (type == NVVM::MMATypes::f64) {
+    elementType = builder.getF64Type();
+    if (frag == NVVM::MMAFrag::a || frag == NVVM::MMAFrag::b)
+      numberElements = 1;
+    else
+      numberElements = 2;
   } else if (type == NVVM::MMATypes::tf32) {
     elementType = builder.getI32Type();
     numberElements = 4;
@@ -954,6 +960,14 @@ LogicalResult NVVM::WMMALoadOp::verify() {
     return emitOpError() << "invalid attribute combination";
   std::pair<Type, unsigned> typeInfo = inferMMATypeFromMNK(
       getEltype(), getFrag(), getM(), getN(), getK(), getContext());
+  // Special case for f64 fragments
+  Type f64Ty = Float64Type::get(getContext());
+  if (typeInfo.first == f64Ty && typeInfo.second == 1) {
+    if (getType() != f64Ty)
+      return emitOpError("expected destination type to be f64");
+    return success();
+  }
+  // Everything else is a struct
   Type dstType = LLVM::LLVMStructType::getLiteral(
       getContext(), SmallVector<Type, 8>(typeInfo.second, typeInfo.first));
   if (getType() != dstType)
@@ -1607,10 +1621,53 @@ void Tcgen05MmaSmemDescOp::createSmemDescriptor(Operation &op,
   mt.mapValue(thisOp.getRes()) = smemDesc;
 }
 
+//===----------------------------------------------------------------------===//
+// getPtx methods
+//===----------------------------------------------------------------------===//
+
+std::string NVVM::MBarrierInitOp::getPtx() {
+  unsigned addressSpace =
+      llvm::cast<LLVM::LLVMPointerType>(getAddr().getType()).getAddressSpace();
+  return (addressSpace == NVVMMemorySpace::Shared)
+             ? std::string("mbarrier.init.shared.b64 [%0], %1;")
+             : std::string("mbarrier.init.b64 [%0], %1;");
+}
+
 //===----------------------------------------------------------------------===//
 // getIntrinsicID/getIntrinsicIDAndArgs methods
 //===----------------------------------------------------------------------===//
 
+mlir::NVVM::IDArgPair MBarrierInitOp::getIntrinsicIDAndArgs(
+    Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
+  auto thisOp = cast<NVVM::MBarrierInitOp>(op);
+  unsigned addressSpace =
+      llvm::cast<LLVM::LLVMPointerType>(thisOp.getAddr().getType())
+          .getAddressSpace();
+  llvm::Intrinsic::ID id = (addressSpace == NVVMMemorySpace::Shared)
+                               ? llvm::Intrinsic::nvvm_mbarrier_init_shared
+                               : llvm::Intrinsic::nvvm_mbarrier_init;
+
+  // Fill the Intrinsic Args
+  llvm::SmallVector<llvm::Value *> args;
+  args.push_back(mt.lookupValue(thisOp.getAddr()));
+  args.push_back(mt.lookupValue(thisOp.getCount()));
+
+  return {id, std::move(args)};
+}
+
+mlir::NVVM::IDArgPair MBarrierInvalOp::getIntrinsicIDAndArgs(
+    Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
+  auto thisOp = cast<NVVM::MBarrierInvalOp>(op);
+  unsigned addressSpace =
+      llvm::cast<LLVM::LLVMPointerType>(thisOp.getAddr().getType())
+          .getAddressSpace();
+  llvm::Intrinsic::ID id = (addressSpace == NVVMMemorySpace::Shared)
+                               ? llvm::Intrinsic::nvvm_mbarrier_inval_shared
+                               : llvm::Intrinsic::nvvm_mbarrier_inval;
+
+  return {id, {mt.lookupValue(thisOp.getAddr())}};
+}
+
 #define CP_ASYNC_ID_IMPL(mod, size, suffix)                                    \
   llvm::Intrinsic::nvvm_cp_async_##mod##_shared_global_##size##suffix
 
diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index cbc565b0c8cbd..3dc45edf4a23f 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -1474,6 +1474,8 @@ void MapOp::getAsmBlockArgumentNames(Region &region,
                                      OpAsmSetValueNameFn setNameFn) {
   for (Value v : getRegionInputArgs())
     setNameFn(v, "in");
+  for (Value v : getRegionOutputArgs())
+    setNameFn(v, "init");
 }
 
 void MapOp::getAsmResultNames(function_ref<void(Value, StringRef)> setNameFn) {
@@ -1495,14 +1497,14 @@ void MapOp::build(
 
   if (bodyBuild)
     buildGenericRegion(builder, result.location, *result.regions.front(),
-                       inputs, /*outputs=*/{}, bodyBuild);
+                       inputs, /*outputs=*/{init}, bodyBuild);
 }
 
 static void addBodyWithPayloadOp(OpAsmParser &parser, OperationState &result,
                                  const OperationName &payloadOpName,
                                  const NamedAttrList &payloadOpAttrs,
                                  ArrayRef<Value> operands,
-                                 bool initFirst = false) {
+                                 bool initFirst = false, bool mapInit = true) {
   OpBuilder b(parser.getContext());
   Region *body = result.addRegion();
   Block &block = body->emplaceBlock();
@@ -1516,12 +1518,13 @@ static void addBodyWithPayloadOp(OpAsmParser &parser, OperationState &result,
   // If initFirst flag is enabled, we consider init as the first position of
   // payload operands.
   if (initFirst) {
-    payloadOpOperands.push_back(block.getArguments().back());
+    if (mapInit)
+      payloadOpOperands.push_back(block.getArguments().back());
     for (const auto &arg : block.getArguments().drop_back())
       payloadOpOperands.push_back(arg);
   } else {
     payloadOpOperands = {block.getArguments().begin(),
-                         block.getArguments().end()};
+                         block.getArguments().end() - int(!mapInit)};
   }
 
   Operation *payloadOp = b.create(
@@ -1553,8 +1556,8 @@ ParseResult MapOp::parse(OpAsmParser &parser, OperationState &result) {
   if (payloadOpName.has_value()) {
     if (!result.operands.empty())
       addBodyWithPayloadOp(parser, result, payloadOpName.value(),
-                           payloadOpAttrs,
-                           ArrayRef(result.operands).drop_back());
+                           payloadOpAttrs, ArrayRef(result.operands), false,
+                           false);
     else
       result.addRegion();
   } else {
@@ -1570,7 +1573,11 @@ ParseResult MapOp::parse(OpAsmParser &parser, OperationState &result) {
   return success();
 }
 
-static bool canUseShortForm(Block *body, bool initFirst = false) {
+static bool canUseShortForm(Block *body, bool initFirst = false,
+                            bool mapInit = true) {
+  // `intFirst == true` implies that we want to map init arg
+  if (initFirst && !mapInit)
+    return false;
   // Check if the body can be printed in short form. The following 4 conditions
   // must be satisfied:
 
@@ -1582,7 +1589,7 @@ static bool canUseShortForm(Block *body, bool initFirst = false) {
   // 2) The payload op must have the same number of operands as the number of
   //    block arguments.
   if (payload.getNumOperands() == 0 ||
-      payload.getNumOperands() != body->getNumArguments())
+      payload.getNumOperands() != body->getNumArguments() - int(!mapInit))
     return false;
 
   // 3) If `initFirst` is true (e.g., for reduction ops), the init block
@@ -1600,7 +1607,8 @@ static bool canUseShortForm(Block *body, bool initFirst = false) {
     }
   } else {
     for (const auto &[operand, bbArg] :
-         llvm::zip(payload.getOperands(), body->getArguments())) {
+         llvm::zip(payload.getOperands(),
+                   body->getArguments().drop_back(int(!mapInit)))) {
       if (bbArg != operand)
         return false;
     }
@@ -1632,7 +1640,8 @@ static void printShortForm(OpAsmPrinter &p, Operation *payloadOp) {
 
 void MapOp::print(OpAsmPrinter &p) {
   Block *mapper = getBody();
-  bool useShortForm = canUseShortForm(mapper);
+  bool useShortForm =
+      canUseShortForm(mapper, /*initFirst=*/false, /*mapInit*/ false);
   if (useShortForm) {
     printShortForm(p, &mapper->getOperations().front());
   }
@@ -1658,11 +1667,13 @@ LogicalResult MapOp::verify() {
   auto *bodyBlock = getBody();
   auto blockArgs = bodyBlock->getArguments();
 
-  // Checks if the number of `inputs` match the arity of the `mapper` region.
-  if (getInputs().size() != blockArgs.size())
+  // Checks if the number of `inputs` + `init` match the arity of the `mapper`
+  // region.
+  if (getInputs().size() + 1 != blockArgs.size())
     return emitOpError() << "expects number of operands to match the arity of "
                             "mapper, but got: "
-                         << getInputs().size() << " and " << blockArgs.size();
+                         << getInputs().size() + 1 << " and "
+                         << blockArgs.size();
 
   // The parameters of mapper should all match the element type of inputs.
   for (const auto &[bbArgType, inputArg] :
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index 794dda96d1dfa..3a433825fd31a 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -1958,7 +1958,7 @@ enum class OuterOrInnerPerm { Outer = 0, Inner = 1 };
 /// Return true if either `op` or `permutation` are empty to allow a simpler
 /// polymorphic implementation.
 template <typename RelayoutOpTy>
-bool isValidPackingPermutation(
+static bool isValidPackingPermutation(
     RelayoutOpTy op, ArrayRef<int64_t> permutation,
     OuterOrInnerPerm outerOrInnerPerm = OuterOrInnerPerm::Outer) {
   static_assert(
@@ -2464,6 +2464,8 @@ transform::PadTilingInterfaceOp::apply(transform::TransformRewriter &rewriter,
         .setPaddingSizes(getMixedPaddingSizes())
         .setPadToMultipleOf(getPadToMultipleOf());
 
+    OpBuilder::InsertionGuard g(rewriter);
+    rewriter.setInsertionPointAfter(targetOp);
     auto maybePadOps = rewriteAsPaddedOp(
         rewriter, cast<TilingInterface>(targetOp.getOperation()), options);
     if (failed(maybePadOps)) {
@@ -4320,9 +4322,10 @@ DiagnosedSilenceableFailure transform::TransposeMatmulOp::applyToOne(
 // InsertSliceToCopyOp
 //===----------------------------------------------------------------------===//
 template <typename OpTy>
-DiagnosedSilenceableFailure doit(RewriterBase &rewriter, OpTy target,
-                                 transform::ApplyToEachResultList &results,
-                                 transform::TransformState &state) {
+static DiagnosedSilenceableFailure
+doit(RewriterBase &rewriter, OpTy target,
+     transform::ApplyToEachResultList &results,
+     transform::TransformState &state) {
   static_assert(llvm::is_one_of<OpTy, tensor::InsertSliceOp,
                                 tensor::ParallelInsertSliceOp>() &&
                 "wrong op type");
@@ -4497,7 +4500,7 @@ DiagnosedSilenceableFailure transform::DecomposeWinogradOp::applyToOne(
             maybeTransformed = decomposeWinogradOutputTransformOp(rewriter, op);
             return true;
           })
-          .Default([&](Operation *op) { return false; });
+          .Default(false);
 
   if (!supported) {
     DiagnosedSilenceableFailure diag =
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Generalization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Generalization.cpp
index 3e31393fd51ed..75bb1757a55f5 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Generalization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Generalization.cpp
@@ -31,10 +31,8 @@ using namespace mlir;
 using namespace mlir::linalg;
 
 static LogicalResult generalizeNamedOpPrecondition(LinalgOp linalgOp) {
-  // Bailout if `linalgOp` is already a generic or a linalg.map. We cannot
-  // trivially generalize a `linalg.map`, as it does not use the output as
-  // region arguments in the block.
-  if (isa<GenericOp>(linalgOp) || isa<MapOp>(linalgOp))
+  // Bailout if `linalgOp` is already a generic.
+  if (isa<GenericOp>(linalgOp))
     return failure();
   // Check if the operation has exactly one region.
   if (linalgOp->getNumRegions() != 1) {
diff --git a/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp b/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp
index 3e787a2ad0ef5..52ab92f180575 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp
@@ -288,10 +288,6 @@ FailureOr<PadTilingInterfaceResult> linalg::rewriteAsPaddedOp(
     return failure();
   }
 
-  OpBuilder::InsertionGuard g(builder);
-  // Set IP after toPad because we also take the dims of toPad's output.
-  builder.setInsertionPointAfter(toPad);
-
   // 1. Get the loopUpperBounds from the TilingInterface.
   SmallVector<Range> iterationDomain = toPad.getIterationDomain(builder);
 
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp b/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp
index f05ffa8334d9c..6519c4f64dd05 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp
@@ -322,7 +322,7 @@ promoteSubViews(ImplicitLocOpBuilder &b,
                 tmp = arith::ConstantOp::create(b, IntegerAttr::get(et, 0));
               return complex::CreateOp::create(b, t, tmp, tmp);
             })
-            .Default([](auto) { return Value(); });
+            .Default(nullptr);
     if (!fillVal)
       return failure();
     linalg::FillOp::create(b, fillVal, promotionInfo->fullLocalView);
diff --git a/mlir/lib/Dialect/Linalg/Transforms/SimplifyDepthwiseConv.cpp b/mlir/lib/Dialect/Linalg/Transforms/SimplifyDepthwiseConv.cpp
index 27ccf3c2ba148..6becc1f29afbd 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/SimplifyDepthwiseConv.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/SimplifyDepthwiseConv.cpp
@@ -89,7 +89,7 @@ matchAndReplaceDepthwiseConv(Operation *operation, Value input, Value kernel,
                 ValueRange{input, collapsedKernel, iZp, kZp},
                 ValueRange{collapsedInit}, stride, dilation);
           })
-          .Default([](Operation *op) { return nullptr; });
+          .Default(nullptr);
   if (!newConv)
     return failure();
   for (auto attr : preservedAttrs)
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
index eb2d825e17e44..bd25e946908b6 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
@@ -495,13 +495,14 @@ FailureOr<PackResult> linalg::pack(RewriterBase &rewriter,
     if (failed(maybePackedDimForEachOperand))
       return failure();
     packedOperandsDims.packedDimForEachOperand = *maybePackedDimForEachOperand;
-    listOfPackedOperandsDim.pushBack(std::move(packedOperandsDims));
 
     LDBG() << "++++ After pack size #" << i << ": " << packedSizes[i];
     LDBG() << "maps: " << llvm::interleaved(indexingMaps);
     LDBG() << "iterators: " << llvm::interleaved(iteratorTypes);
     LDBG() << "packedDimForEachOperand: "
            << llvm::interleaved(packedOperandsDims.packedDimForEachOperand);
+
+    listOfPackedOperandsDim.pushBack(std::move(packedOperandsDims));
   }
 
   // Step 2. Propagate packing to all LinalgOp operands.
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index 9d62491214018..cb6199f026e03 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -656,7 +656,7 @@ mlir::linalg::getCombinerOpKind(Operation *combinerOp) {
           [&](auto op) { return CombiningKind::MUL; })
       .Case<arith::OrIOp>([&](auto op) { return CombiningKind::OR; })
       .Case<arith::XOrIOp>([&](auto op) { return CombiningKind::XOR; })
-      .Default([&](auto op) { return std::nullopt; });
+      .Default(std::nullopt);
 }
 
 /// Check whether `outputOperand` is a reduction with a single combiner
@@ -3911,21 +3911,21 @@ struct Conv1DGenerator
     Value lhs = vector::TransferReadOp::create(
         rewriter, loc, lhsType, lhsShaped, ValueRange{zero, zero, zero},
         /*padding=*/arith::getZeroConstant(rewriter, loc, lhsEltType));
-    auto maybeMaskedLhs = maybeMaskXferOp(
+    auto *maybeMaskedLhs = maybeMaskXferOp(
         lhsType.getShape(), lhsType.getScalableDims(), lhs.getDefiningOp());
 
     // Read rhs slice of size {kw, c} @ [0, 0].
     Value rhs = vector::TransferReadOp::create(
         rewriter, loc, rhsType, rhsShaped, ValueRange{zero, zero},
         /*padding=*/arith::getZeroConstant(rewriter, loc, rhsEltType));
-    auto maybeMaskedRhs = maybeMaskXferOp(
+    auto *maybeMaskedRhs = maybeMaskXferOp(
         rhsType.getShape(), rhsType.getScalableDims(), rhs.getDefiningOp());
 
     // Read res slice of size {n, w, c} @ [0, 0, 0].
     Value res = vector::TransferReadOp::create(
         rewriter, loc, resType, resShaped, ValueRange{zero, zero, zero},
         /*padding=*/arith::getZeroConstant(rewriter, loc, resEltType));
-    auto maybeMaskedRes = maybeMaskXferOp(
+    auto *maybeMaskedRes = maybeMaskXferOp(
         resType.getShape(), resType.getScalableDims(), res.getDefiningOp());
 
     //===------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
index c551fba93e367..1c21a2f270da6 100644
--- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
+++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
@@ -405,7 +405,7 @@ ParseResult AllocaScopeOp::parse(OpAsmParser &parser, OperationState &result) {
 void AllocaScopeOp::getSuccessorRegions(
     RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> &regions) {
   if (!point.isParent()) {
-    regions.push_back(RegionSuccessor(getResults()));
+    regions.push_back(RegionSuccessor(getOperation(), getResults()));
     return;
   }
 
diff --git a/mlir/lib/Dialect/MemRef/IR/ValueBoundsOpInterfaceImpl.cpp b/mlir/lib/Dialect/MemRef/IR/ValueBoundsOpInterfaceImpl.cpp
index 6fa8ce4efff3b..69afbcadb0b07 100644
--- a/mlir/lib/Dialect/MemRef/IR/ValueBoundsOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/MemRef/IR/ValueBoundsOpInterfaceImpl.cpp
@@ -98,6 +98,27 @@ struct RankOpInterface
   }
 };
 
+struct CollapseShapeOpInterface
+    : public ValueBoundsOpInterface::ExternalModel<CollapseShapeOpInterface,
+                                                   memref::CollapseShapeOp> {
+  void populateBoundsForShapedValueDim(Operation *op, Value value, int64_t dim,
+                                       ValueBoundsConstraintSet &cstr) const {
+    auto collapseOp = cast<memref::CollapseShapeOp>(op);
+    assert(value == collapseOp.getResult() && "invalid value");
+
+    // Multiply the expressions for the dimensions in the reassociation group.
+    const ReassociationIndices reassocIndices =
+        collapseOp.getReassociationIndices()[dim];
+    AffineExpr productExpr =
+        cstr.getExpr(collapseOp.getSrc(), reassocIndices[0]);
+    for (size_t i = 1; i < reassocIndices.size(); ++i) {
+      productExpr =
+          productExpr * cstr.getExpr(collapseOp.getSrc(), reassocIndices[i]);
+    }
+    cstr.bound(value)[dim] == productExpr;
+  }
+};
+
 struct SubViewOpInterface
     : public ValueBoundsOpInterface::ExternalModel<SubViewOpInterface,
                                                    SubViewOp> {
@@ -134,6 +155,8 @@ void mlir::memref::registerValueBoundsOpInterfaceExternalModels(
         memref::AllocOpInterface<memref::AllocaOp>>(*ctx);
     memref::CastOp::attachInterface<memref::CastOpInterface>(*ctx);
     memref::DimOp::attachInterface<memref::DimOpInterface>(*ctx);
+    memref::CollapseShapeOp::attachInterface<memref::CollapseShapeOpInterface>(
+        *ctx);
     memref::ExpandShapeOp::attachInterface<memref::ExpandShapeOpInterface>(
         *ctx);
     memref::GetGlobalOp::attachInterface<memref::GetGlobalOpInterface>(*ctx);
diff --git a/mlir/lib/Dialect/MemRef/Transforms/FlattenMemRefs.cpp b/mlir/lib/Dialect/MemRef/Transforms/FlattenMemRefs.cpp
index 1208fddf37e0b..e6850890bf8fe 100644
--- a/mlir/lib/Dialect/MemRef/Transforms/FlattenMemRefs.cpp
+++ b/mlir/lib/Dialect/MemRef/Transforms/FlattenMemRefs.cpp
@@ -104,7 +104,7 @@ static Value getTargetMemref(Operation *op) {
                      vector::MaskedStoreOp, vector::TransferReadOp,
                      vector::TransferWriteOp>(
           [](auto op) { return op.getBase(); })
-      .Default([](auto) { return Value{}; });
+      .Default(nullptr);
 }
 
 template <typename T>
diff --git a/mlir/lib/Dialect/OpenACC/Analysis/OpenACCSupport.cpp b/mlir/lib/Dialect/OpenACC/Analysis/OpenACCSupport.cpp
index f6b4534794eaf..40e769e7068cf 100644
--- a/mlir/lib/Dialect/OpenACC/Analysis/OpenACCSupport.cpp
+++ b/mlir/lib/Dialect/OpenACC/Analysis/OpenACCSupport.cpp
@@ -22,5 +22,24 @@ std::string OpenACCSupport::getVariableName(Value v) {
   return acc::getVariableName(v);
 }
 
+std::string OpenACCSupport::getRecipeName(RecipeKind kind, Type type,
+                                          Value var) {
+  if (impl)
+    return impl->getRecipeName(kind, type, var);
+  // The default implementation assumes that only type matters
+  // and the actual instance of variable is not relevant.
+  auto recipeName = acc::getRecipeName(kind, type);
+  if (recipeName.empty())
+    emitNYI(var ? var.getLoc() : UnknownLoc::get(type.getContext()),
+            "variable privatization (incomplete recipe name handling)");
+  return recipeName;
+}
+
+InFlightDiagnostic OpenACCSupport::emitNYI(Location loc, const Twine &message) {
+  if (impl)
+    return impl->emitNYI(loc, message);
+  return mlir::emitError(loc, "not yet implemented: " + message);
+}
+
 } // namespace acc
 } // namespace mlir
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
index ca46629919dba..35eba724a9059 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
@@ -50,11 +50,11 @@ static void attachVarNameAttr(Operation *op, OpBuilder &builder,
   }
 }
 
+template <typename T>
 struct MemRefPointerLikeModel
-    : public PointerLikeType::ExternalModel<MemRefPointerLikeModel,
-                                            MemRefType> {
+    : public PointerLikeType::ExternalModel<MemRefPointerLikeModel<T>, T> {
   Type getElementType(Type pointer) const {
-    return cast<MemRefType>(pointer).getElementType();
+    return cast<T>(pointer).getElementType();
   }
 
   mlir::acc::VariableTypeCategory
@@ -63,7 +63,7 @@ struct MemRefPointerLikeModel
     if (auto mappableTy = dyn_cast<MappableType>(varType)) {
       return mappableTy.getTypeCategory(varPtr);
     }
-    auto memrefTy = cast<MemRefType>(pointer);
+    auto memrefTy = cast<T>(pointer);
     if (!memrefTy.hasRank()) {
       // This memref is unranked - aka it could have any rank, including a
       // rank of 0 which could mean scalar. For now, return uncategorized.
@@ -296,7 +296,10 @@ void OpenACCDialect::initialize() {
   // By attaching interfaces here, we make the OpenACC dialect dependent on
   // the other dialects. This is probably better than having dialects like LLVM
   // and memref be dependent on OpenACC.
-  MemRefType::attachInterface<MemRefPointerLikeModel>(*getContext());
+  MemRefType::attachInterface<MemRefPointerLikeModel<MemRefType>>(
+      *getContext());
+  UnrankedMemRefType::attachInterface<
+      MemRefPointerLikeModel<UnrankedMemRefType>>(*getContext());
   LLVM::LLVMPointerType::attachInterface<LLVMPointerPointerLikeModel>(
       *getContext());
 }
diff --git a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp
index 89adda82646e6..fbac28e740750 100644
--- a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp
+++ b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp
@@ -11,6 +11,7 @@
 #include "mlir/Dialect/OpenACC/OpenACC.h"
 #include "mlir/Interfaces/ViewLikeInterface.h"
 #include "llvm/ADT/TypeSwitch.h"
+#include "llvm/Support/Casting.h"
 
 mlir::Operation *mlir::acc::getEnclosingComputeOp(mlir::Region &region) {
   mlir::Operation *parentOp = region.getParentOp();
@@ -106,3 +107,51 @@ std::string mlir::acc::getVariableName(mlir::Value v) {
 
   return "";
 }
+
+std::string mlir::acc::getRecipeName(mlir::acc::RecipeKind kind,
+                                     mlir::Type type) {
+  assert(kind == mlir::acc::RecipeKind::private_recipe ||
+         kind == mlir::acc::RecipeKind::firstprivate_recipe ||
+         kind == mlir::acc::RecipeKind::reduction_recipe);
+  if (!llvm::isa<mlir::acc::PointerLikeType, mlir::acc::MappableType>(type))
+    return "";
+
+  std::string recipeName;
+  llvm::raw_string_ostream ss(recipeName);
+  ss << (kind == mlir::acc::RecipeKind::private_recipe ? "privatization_"
+         : kind == mlir::acc::RecipeKind::firstprivate_recipe
+             ? "firstprivatization_"
+             : "reduction_");
+
+  // Print the type using its dialect-defined textual format.
+  type.print(ss);
+  ss.flush();
+
+  // Replace invalid characters (anything that's not a letter, number, or
+  // period) since this needs to be a valid MLIR identifier.
+  for (char &c : recipeName) {
+    if (!std::isalnum(static_cast<unsigned char>(c)) && c != '.' && c != '_') {
+      if (c == '?')
+        c = 'U';
+      else if (c == '*')
+        c = 'Z';
+      else if (c == '(' || c == ')' || c == '[' || c == ']' || c == '{' ||
+               c == '}' || c == '<' || c == '>')
+        c = '_';
+      else
+        c = 'X';
+    }
+  }
+
+  return recipeName;
+}
+
+mlir::Value mlir::acc::getBaseEntity(mlir::Value val) {
+  if (auto partialEntityAccessOp =
+          dyn_cast<PartialEntityAccessOpInterface>(val.getDefiningOp())) {
+    if (!partialEntityAccessOp.isCompleteView())
+      return partialEntityAccessOp.getBaseEntity();
+  }
+
+  return val;
+}
diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp
index 1ab01d86bcd10..2946b53c8cb36 100644
--- a/mlir/lib/Dialect/SCF/IR/SCF.cpp
+++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp
@@ -397,7 +397,7 @@ void ExecuteRegionOp::getSuccessorRegions(
   }
 
   // Otherwise, the region branches back to the parent operation.
-  regions.push_back(RegionSuccessor(getResults()));
+  regions.push_back(RegionSuccessor(getOperation(), getResults()));
 }
 
 //===----------------------------------------------------------------------===//
@@ -405,10 +405,11 @@ void ExecuteRegionOp::getSuccessorRegions(
 //===----------------------------------------------------------------------===//
 
 MutableOperandRange
-ConditionOp::getMutableSuccessorOperands(RegionBranchPoint point) {
-  assert((point.isParent() || point == getParentOp().getAfter()) &&
-         "condition op can only exit the loop or branch to the after"
-         "region");
+ConditionOp::getMutableSuccessorOperands(RegionSuccessor point) {
+  assert(
+      (point.isParent() || point.getSuccessor() == &getParentOp().getAfter()) &&
+      "condition op can only exit the loop or branch to the after"
+      "region");
   // Pass all operands except the condition to the successor region.
   return getArgsMutable();
 }
@@ -426,7 +427,7 @@ void ConditionOp::getSuccessorRegions(
     regions.emplace_back(&whileOp.getAfter(),
                          whileOp.getAfter().getArguments());
   if (!boolAttr || !boolAttr.getValue())
-    regions.emplace_back(whileOp.getResults());
+    regions.emplace_back(whileOp.getOperation(), whileOp.getResults());
 }
 
 //===----------------------------------------------------------------------===//
@@ -749,7 +750,7 @@ ForOp mlir::scf::getForInductionVarOwner(Value val) {
   return dyn_cast_or_null<ForOp>(containingOp);
 }
 
-OperandRange ForOp::getEntrySuccessorOperands(RegionBranchPoint point) {
+OperandRange ForOp::getEntrySuccessorOperands(RegionSuccessor successor) {
   return getInitArgs();
 }
 
@@ -759,7 +760,7 @@ void ForOp::getSuccessorRegions(RegionBranchPoint point,
   // back into the operation itself. It is possible for loop not to enter the
   // body.
   regions.push_back(RegionSuccessor(&getRegion(), getRegionIterArgs()));
-  regions.push_back(RegionSuccessor(getResults()));
+  regions.push_back(RegionSuccessor(getOperation(), getResults()));
 }
 
 SmallVector<Region *> ForallOp::getLoopRegions() { return {&getRegion()}; }
@@ -2053,9 +2054,10 @@ void ForallOp::getSuccessorRegions(RegionBranchPoint point,
   // parallel by multiple threads. We should not expect to branch back into
   // the forall body after the region's execution is complete.
   if (point.isParent())
-    regions.push_back(RegionSuccessor(&getRegion()));
+    regions.push_back(RegionSuccessor(&getRegion(), getRegionIterArgs()));
   else
-    regions.push_back(RegionSuccessor());
+    regions.push_back(
+        RegionSuccessor(getOperation(), getOperation()->getResults()));
 }
 
 //===----------------------------------------------------------------------===//
@@ -2333,9 +2335,10 @@ void IfOp::print(OpAsmPrinter &p) {
 
 void IfOp::getSuccessorRegions(RegionBranchPoint point,
                                SmallVectorImpl<RegionSuccessor> &regions) {
-  // The `then` and the `else` region branch back to the parent operation.
+  // The `then` and the `else` region branch back to the parent operation or one
+  // of the recursive parent operations (early exit case).
   if (!point.isParent()) {
-    regions.push_back(RegionSuccessor(getResults()));
+    regions.push_back(RegionSuccessor(getOperation(), getResults()));
     return;
   }
 
@@ -2344,7 +2347,8 @@ void IfOp::getSuccessorRegions(RegionBranchPoint point,
   // Don't consider the else region if it is empty.
   Region *elseRegion = &this->getElseRegion();
   if (elseRegion->empty())
-    regions.push_back(RegionSuccessor());
+    regions.push_back(
+        RegionSuccessor(getOperation(), getOperation()->getResults()));
   else
     regions.push_back(RegionSuccessor(elseRegion));
 }
@@ -2361,7 +2365,7 @@ void IfOp::getEntrySuccessorRegions(ArrayRef<Attribute> operands,
     if (!getElseRegion().empty())
       regions.emplace_back(&getElseRegion());
     else
-      regions.emplace_back(getResults());
+      regions.emplace_back(getOperation(), getResults());
   }
 }
 
@@ -3385,7 +3389,8 @@ void ParallelOp::getSuccessorRegions(
   // back into the operation itself. It is possible for loop not to enter the
   // body.
   regions.push_back(RegionSuccessor(&getRegion()));
-  regions.push_back(RegionSuccessor());
+  regions.push_back(RegionSuccessor(
+      getOperation(), ResultRange{getResults().end(), getResults().end()}));
 }
 
 //===----------------------------------------------------------------------===//
@@ -3431,7 +3436,7 @@ LogicalResult ReduceOp::verifyRegions() {
 }
 
 MutableOperandRange
-ReduceOp::getMutableSuccessorOperands(RegionBranchPoint point) {
+ReduceOp::getMutableSuccessorOperands(RegionSuccessor point) {
   // No operands are forwarded to the next iteration.
   return MutableOperandRange(getOperation(), /*start=*/0, /*length=*/0);
 }
@@ -3514,8 +3519,8 @@ Block::BlockArgListType WhileOp::getRegionIterArgs() {
   return getBeforeArguments();
 }
 
-OperandRange WhileOp::getEntrySuccessorOperands(RegionBranchPoint point) {
-  assert(point == getBefore() &&
+OperandRange WhileOp::getEntrySuccessorOperands(RegionSuccessor successor) {
+  assert(successor.getSuccessor() == &getBefore() &&
          "WhileOp is expected to branch only to the first region");
   return getInits();
 }
@@ -3528,15 +3533,18 @@ void WhileOp::getSuccessorRegions(RegionBranchPoint point,
     return;
   }
 
-  assert(llvm::is_contained({&getAfter(), &getBefore()}, point) &&
+  assert(llvm::is_contained(
+             {&getAfter(), &getBefore()},
+             point.getTerminatorPredecessorOrNull()->getParentRegion()) &&
          "there are only two regions in a WhileOp");
   // The body region always branches back to the condition region.
-  if (point == getAfter()) {
+  if (point.getTerminatorPredecessorOrNull()->getParentRegion() ==
+      &getAfter()) {
     regions.emplace_back(&getBefore(), getBefore().getArguments());
     return;
   }
 
-  regions.emplace_back(getResults());
+  regions.emplace_back(getOperation(), getResults());
   regions.emplace_back(&getAfter(), getAfter().getArguments());
 }
 
@@ -4445,7 +4453,7 @@ void IndexSwitchOp::getSuccessorRegions(
     RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> &successors) {
   // All regions branch back to the parent op.
   if (!point.isParent()) {
-    successors.emplace_back(getResults());
+    successors.emplace_back(getOperation(), getResults());
     return;
   }
 
diff --git a/mlir/lib/Dialect/SCF/Transforms/ForToWhile.cpp b/mlir/lib/Dialect/SCF/Transforms/ForToWhile.cpp
index ae52af5009dc9..ddcbda86cf1f3 100644
--- a/mlir/lib/Dialect/SCF/Transforms/ForToWhile.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/ForToWhile.cpp
@@ -23,7 +23,6 @@ namespace mlir {
 #include "mlir/Dialect/SCF/Transforms/Passes.h.inc"
 } // namespace mlir
 
-using namespace llvm;
 using namespace mlir;
 using scf::ForOp;
 using scf::WhileOp;
diff --git a/mlir/lib/Dialect/SCF/Transforms/ForallToFor.cpp b/mlir/lib/Dialect/SCF/Transforms/ForallToFor.cpp
index a2f03f1e1056e..00bef707fadd3 100644
--- a/mlir/lib/Dialect/SCF/Transforms/ForallToFor.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/ForallToFor.cpp
@@ -21,7 +21,6 @@ namespace mlir {
 #include "mlir/Dialect/SCF/Transforms/Passes.h.inc"
 } // namespace mlir
 
-using namespace llvm;
 using namespace mlir;
 using scf::LoopNest;
 
diff --git a/mlir/lib/Dialect/SCF/Transforms/LoopCanonicalization.cpp b/mlir/lib/Dialect/SCF/Transforms/LoopCanonicalization.cpp
index 4ebd90dbcc1d5..d380c46f7fbee 100644
--- a/mlir/lib/Dialect/SCF/Transforms/LoopCanonicalization.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/LoopCanonicalization.cpp
@@ -55,7 +55,7 @@ static bool isShapePreserving(ForOp forOp, int64_t arg) {
                              ? forOp.getInitArgs()[opResult.getResultNumber()]
                              : Value();
                 })
-                .Default([&](auto op) { return Value(); });
+                .Default(nullptr);
   }
   return false;
 }
diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp
index 0c8114d5e957e..938952ed273cd 100644
--- a/mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp
+++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp
@@ -346,7 +346,7 @@ LogicalResult spirv::CompositeConstructOp::verify() {
       llvm::TypeSwitch<Type, Type>(getType())
           .Case<spirv::CooperativeMatrixType>(
               [](auto coopType) { return coopType.getElementType(); })
-          .Default([](Type) { return nullptr; });
+          .Default(nullptr);
 
   // Case 1. -- matrices.
   if (coopElementType) {
@@ -1708,7 +1708,7 @@ LogicalResult spirv::MatrixTimesScalarOp::verify() {
       llvm::TypeSwitch<Type, Type>(getMatrix().getType())
           .Case<spirv::CooperativeMatrixType, spirv::MatrixType>(
               [](auto matrixType) { return matrixType.getElementType(); })
-          .Default([](Type) { return nullptr; });
+          .Default(nullptr);
 
   assert(elementType && "Unhandled type");
 
diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp
index f895807ea1d18..d1e275d590f78 100644
--- a/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp
+++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp
@@ -731,7 +731,7 @@ std::optional<int64_t> SPIRVType::getSizeInBytes() {
           return *elementSize * type.getNumElements();
         return std::nullopt;
       })
-      .Default(std::optional<int64_t>());
+      .Default(std::nullopt);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp b/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp
index 88e1ab6ab1e4d..cb9b7f6ec2fd2 100644
--- a/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp
+++ b/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp
@@ -1467,7 +1467,7 @@ mlir::spirv::getNativeVectorShape(Operation *op) {
   return TypeSwitch<Operation *, std::optional<SmallVector<int64_t>>>(op)
       .Case<vector::ReductionOp, vector::TransposeOp>(
           [](auto typedOp) { return getNativeVectorShapeImpl(typedOp); })
-      .Default([](Operation *) { return std::nullopt; });
+      .Default(std::nullopt);
 }
 
 LogicalResult mlir::spirv::unrollVectorsInSignatures(Operation *op) {
diff --git a/mlir/lib/Dialect/Shape/IR/Shape.cpp b/mlir/lib/Dialect/Shape/IR/Shape.cpp
index 5ba828918c22a..f0f22e5ef4a83 100644
--- a/mlir/lib/Dialect/Shape/IR/Shape.cpp
+++ b/mlir/lib/Dialect/Shape/IR/Shape.cpp
@@ -346,7 +346,7 @@ void AssumingOp::getSuccessorRegions(
   // parent, so return the correct RegionSuccessor purely based on the index
   // being None or 0.
   if (!point.isParent()) {
-    regions.push_back(RegionSuccessor(getResults()));
+    regions.push_back(RegionSuccessor(getOperation(), getResults()));
     return;
   }
 
diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
index 1a9d9e158ee75..3962e3e84dd31 100644
--- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
+++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
@@ -2597,7 +2597,7 @@ std::optional<MutableArrayRef<OpOperand>> IterateOp::getYieldedValuesMutable() {
 
 std::optional<ResultRange> IterateOp::getLoopResults() { return getResults(); }
 
-OperandRange IterateOp::getEntrySuccessorOperands(RegionBranchPoint point) {
+OperandRange IterateOp::getEntrySuccessorOperands(RegionSuccessor successor) {
   return getInitArgs();
 }
 
@@ -2607,7 +2607,7 @@ void IterateOp::getSuccessorRegions(RegionBranchPoint point,
   // or back into the operation itself.
   regions.push_back(RegionSuccessor(&getRegion(), getRegionIterArgs()));
   // It is possible for loop not to enter the body.
-  regions.push_back(RegionSuccessor(getResults()));
+  regions.push_back(RegionSuccessor(getOperation(), getResults()));
 }
 
 void CoIterateOp::build(OpBuilder &builder, OperationState &odsState,
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
index 7a26cd301eb99..1fbcf5fdc68db 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
@@ -1050,7 +1050,7 @@ class SparseInsertConverter : public OpConversionPattern<tensor::InsertOp> {
 /// Sparse codegen rule for position accesses.
 class SparseToPositionsConverter : public OpConversionPattern<ToPositionsOp> {
 public:
-  using OpAdaptor = typename ToPositionsOp::Adaptor;
+  using OpAdaptor = ToPositionsOp::Adaptor;
   using OpConversionPattern<ToPositionsOp>::OpConversionPattern;
   LogicalResult
   matchAndRewrite(ToPositionsOp op, OneToNOpAdaptor adaptor,
@@ -1073,7 +1073,7 @@ class SparseToPositionsConverter : public OpConversionPattern<ToPositionsOp> {
 class SparseToCoordinatesConverter
     : public OpConversionPattern<ToCoordinatesOp> {
 public:
-  using OpAdaptor = typename ToCoordinatesOp::Adaptor;
+  using OpAdaptor = ToCoordinatesOp::Adaptor;
   using OpConversionPattern<ToCoordinatesOp>::OpConversionPattern;
   LogicalResult
   matchAndRewrite(ToCoordinatesOp op, OneToNOpAdaptor adaptor,
@@ -1099,7 +1099,7 @@ class SparseToCoordinatesConverter
 class SparseToCoordinatesBufferConverter
     : public OpConversionPattern<ToCoordinatesBufferOp> {
 public:
-  using OpAdaptor = typename ToCoordinatesBufferOp::Adaptor;
+  using OpAdaptor = ToCoordinatesBufferOp::Adaptor;
   using OpConversionPattern<ToCoordinatesBufferOp>::OpConversionPattern;
   LogicalResult
   matchAndRewrite(ToCoordinatesBufferOp op, OneToNOpAdaptor adaptor,
@@ -1121,7 +1121,7 @@ class SparseToCoordinatesBufferConverter
 /// Sparse codegen rule for value accesses.
 class SparseToValuesConverter : public OpConversionPattern<ToValuesOp> {
 public:
-  using OpAdaptor = typename ToValuesOp::Adaptor;
+  using OpAdaptor = ToValuesOp::Adaptor;
   using OpConversionPattern<ToValuesOp>::OpConversionPattern;
   LogicalResult
   matchAndRewrite(ToValuesOp op, OneToNOpAdaptor adaptor,
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.cpp
index f53d2727c9b00..ffa8b402e0b6b 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.cpp
@@ -152,19 +152,20 @@ IterationGraphSorter IterationGraphSorter::fromGenericOp(
 }
 
 IterationGraphSorter::IterationGraphSorter(
-    SmallVector<Value> &&ins, SmallVector<AffineMap> &&loop2InsLvl, Value out,
-    AffineMap loop2OutLvl, SmallVector<utils::IteratorType> &&iterTypes,
+    SmallVector<Value> &&insArg, SmallVector<AffineMap> &&loop2InsLvlArg,
+    Value out, AffineMap loop2OutLvl,
+    SmallVector<utils::IteratorType> &&iterTypesArg,
     sparse_tensor::LoopOrderingStrategy strategy)
-    : ins(std::move(ins)), loop2InsLvl(std::move(loop2InsLvl)), out(out),
-      loop2OutLvl(loop2OutLvl), iterTypes(std::move(iterTypes)),
+    : ins(std::move(insArg)), loop2InsLvl(std::move(loop2InsLvlArg)), out(out),
+      loop2OutLvl(loop2OutLvl), iterTypes(std::move(iterTypesArg)),
       strategy(strategy) {
   // One map per tensor.
-  assert(this->loop2InsLvl.size() == this->ins.size());
+  assert(loop2InsLvl.size() == ins.size());
   // All the affine maps have the same number of dimensions (loops).
   assert(llvm::all_equal(llvm::map_range(
-      this->loop2InsLvl, [](AffineMap m) { return m.getNumDims(); })));
+      loop2InsLvl, [](AffineMap m) { return m.getNumDims(); })));
   // The number of results of the map should match the rank of the tensor.
-  assert(llvm::all_of(llvm::zip(this->loop2InsLvl, this->ins), [](auto mvPair) {
+  assert(llvm::all_of(llvm::zip(loop2InsLvl, ins), [](auto mvPair) {
     auto [m, v] = mvPair;
 
     // For ranked types the rank must match.
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.h b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.h
index b2a16e9382758..35e58edeb2562 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.h
@@ -59,10 +59,10 @@ class IterationGraphSorter {
 
 private:
   // Private constructor.
-  IterationGraphSorter(SmallVector<Value> &&ins,
-                       SmallVector<AffineMap> &&loop2InsLvl, Value out,
+  IterationGraphSorter(SmallVector<Value> &&insArg,
+                       SmallVector<AffineMap> &&loop2InsLvlArg, Value out,
                        AffineMap loop2OutLvl,
-                       SmallVector<utils::IteratorType> &&iterTypes,
+                       SmallVector<utils::IteratorType> &&iterTypesArg,
                        sparse_tensor::LoopOrderingStrategy strategy =
                            sparse_tensor::LoopOrderingStrategy::kDefault);
 
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.cpp
index 46d0baac58f06..61b5ad600a16e 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.cpp
@@ -504,6 +504,14 @@ class SimpleWrapIterator : public SparseIterator {
                      unsigned extraCursorVal = 0)
       : SparseIterator(kind, *wrap, extraCursorVal), wrap(std::move(wrap)) {}
 
+  void setSparseEmitStrategy(SparseEmitStrategy strategy) override {
+    wrap->setSparseEmitStrategy(strategy);
+  }
+
+  SparseEmitStrategy getSparseEmitStrategy() const override {
+    return wrap->getSparseEmitStrategy();
+  }
+
   SmallVector<Type> getCursorValTypes(OpBuilder &b) const override {
     return wrap->getCursorValTypes(b);
   }
@@ -979,7 +987,7 @@ class SubSectIterator : public SparseIterator {
 
 void SparseIterator::genInit(OpBuilder &b, Location l,
                              const SparseIterator *p) {
-  if (emitStrategy == SparseEmitStrategy::kDebugInterface) {
+  if (getSparseEmitStrategy() == SparseEmitStrategy::kDebugInterface) {
     std::string prefix = getDebugInterfacePrefix();
     Operation *begin = b.create(l, b.getStringAttr(prefix + ".begin"), {},
                                 getCursorValTypes(b));
@@ -994,7 +1002,7 @@ void SparseIterator::genInit(OpBuilder &b, Location l,
 }
 
 Value SparseIterator::genNotEnd(OpBuilder &b, Location l) {
-  if (emitStrategy == SparseEmitStrategy::kDebugInterface) {
+  if (getSparseEmitStrategy() == SparseEmitStrategy::kDebugInterface) {
     std::string prefix = getDebugInterfacePrefix();
     Operation *notEnd = b.create(l, b.getStringAttr(prefix + ".not_end"),
                                  getCursor(), b.getI1Type());
@@ -1005,7 +1013,7 @@ Value SparseIterator::genNotEnd(OpBuilder &b, Location l) {
 }
 
 void SparseIterator::locate(OpBuilder &b, Location l, Value crd) {
-  if (emitStrategy == SparseEmitStrategy::kDebugInterface) {
+  if (getSparseEmitStrategy() == SparseEmitStrategy::kDebugInterface) {
     std::string prefix = getDebugInterfacePrefix();
     SmallVector<Value> args = getCursor();
     args.push_back(crd);
@@ -1019,7 +1027,7 @@ void SparseIterator::locate(OpBuilder &b, Location l, Value crd) {
 }
 
 Value SparseIterator::deref(OpBuilder &b, Location l) {
-  if (emitStrategy == SparseEmitStrategy::kDebugInterface) {
+  if (getSparseEmitStrategy() == SparseEmitStrategy::kDebugInterface) {
     std::string prefix = getDebugInterfacePrefix();
     SmallVector<Value> args = getCursor();
     Operation *deref = b.create(l, b.getStringAttr(prefix + ".deref"),
@@ -1032,7 +1040,7 @@ Value SparseIterator::deref(OpBuilder &b, Location l) {
 
 ValueRange SparseIterator::forward(OpBuilder &b, Location l) {
   assert(!randomAccessible());
-  if (emitStrategy == SparseEmitStrategy::kDebugInterface) {
+  if (getSparseEmitStrategy() == SparseEmitStrategy::kDebugInterface) {
     std::string prefix = getDebugInterfacePrefix();
     Operation *next = b.create(l, b.getStringAttr(prefix + ".next"),
                                getCursor(), getCursorValTypes(b));
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.h b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.h
index 642cb1afa156b..3636f3f01adb5 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.h
@@ -177,10 +177,14 @@ class SparseIterator {
 public:
   virtual ~SparseIterator() = default;
 
-  void setSparseEmitStrategy(SparseEmitStrategy strategy) {
+  virtual void setSparseEmitStrategy(SparseEmitStrategy strategy) {
     emitStrategy = strategy;
   }
 
+  virtual SparseEmitStrategy getSparseEmitStrategy() const {
+    return emitStrategy;
+  }
+
   virtual std::string getDebugInterfacePrefix() const = 0;
   virtual SmallVector<Type> getCursorValTypes(OpBuilder &b) const = 0;
 
diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
index ac7200294a3a6..110bfdce72ea4 100644
--- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
@@ -41,10 +41,6 @@
 using namespace mlir;
 using namespace mlir::tensor;
 
-using llvm::divideCeilSigned;
-using llvm::divideFloorSigned;
-using llvm::mod;
-
 /// Materialize a single constant operation from a given attribute value with
 /// the desired resultant type.
 Operation *TensorDialect::materializeConstant(OpBuilder &builder,
diff --git a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
index bce964e47a3be..c607ece418dff 100644
--- a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -579,6 +579,7 @@ static Value lowerGenerateLikeOpBody(RewriterBase &rewriter, Location loc,
       linalg::MapOp::create(rewriter, loc, tensorType, /*inputs=*/ValueRange(),
                             /*init=*/tensorDestination);
   Block &linalgBody = linalgOp.getMapper().emplaceBlock();
+  linalgBody.addArgument(tensorType.getElementType(), loc);
 
   // Create linalg::IndexOps.
   rewriter.setInsertionPointToStart(&linalgBody);
@@ -1068,6 +1069,7 @@ struct SplatOpInterface
                                           /*inputs=*/ValueRange(),
                                           /*init=*/*tensorAlloc);
     Block &linalgBody = linalgOp.getMapper().emplaceBlock();
+    linalgBody.addArgument(tensorType.getElementType(), loc);
 
     // Create linalg::IndexOps.
     rewriter.setInsertionPointToStart(&linalgBody);
diff --git a/mlir/lib/Dialect/Tensor/Transforms/RewriteAsConstant.cpp b/mlir/lib/Dialect/Tensor/Transforms/RewriteAsConstant.cpp
index 69e649d2eebe8..bc4f5a5ac7f23 100644
--- a/mlir/lib/Dialect/Tensor/Transforms/RewriteAsConstant.cpp
+++ b/mlir/lib/Dialect/Tensor/Transforms/RewriteAsConstant.cpp
@@ -189,7 +189,7 @@ struct PadOpToConstant final : public OpRewritePattern<PadOp> {
               return constantFoldPadOp<llvm::APInt>(
                   rewriter, loc, inputAttr, integerAttr, *lowPad, *highPad);
             })
-            .Default(Value());
+            .Default(nullptr);
 
     if (!newOp)
       return rewriter.notifyMatchFailure(padTensorOp,
diff --git a/mlir/lib/Dialect/Tensor/Transforms/SwapExtractSliceWithProducerPatterns.cpp b/mlir/lib/Dialect/Tensor/Transforms/SwapExtractSliceWithProducerPatterns.cpp
index 1e3b377ab85c7..549ac7afca8ca 100644
--- a/mlir/lib/Dialect/Tensor/Transforms/SwapExtractSliceWithProducerPatterns.cpp
+++ b/mlir/lib/Dialect/Tensor/Transforms/SwapExtractSliceWithProducerPatterns.cpp
@@ -77,7 +77,7 @@ FailureOr<TilingResult> tensor::replaceInsertSlicesWithTiledConsumer(
       dyn_cast<TilingInterface>(consumerOperands.front()->getOwner());
   if (!consumerOp)
     return failure();
-  for (auto opOperand : consumerOperands.drop_front()) {
+  for (auto *opOperand : consumerOperands.drop_front()) {
     if (opOperand->getOwner() != consumerOp) {
       LLVM_DEBUG({
         llvm::dbgs()
diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
index 0aff67f0b5eba..bf3810ff231da 100644
--- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
+++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
@@ -606,6 +606,12 @@ Value mlir::tosa::createPadConstTensor(OpBuilder &builder, Location loc,
   return tosa::ConstOp::create(builder, loc, padConstType, padConstAttr);
 }
 
+unsigned mlir::tosa::getBitWidth(Type type) {
+  if (dyn_cast<tosa::mxint8Type>(type))
+    return 8;
+  return type.getIntOrFloatBitWidth();
+}
+
 //===----------------------------------------------------------------------===//
 // TOSA Operator Verifiers.
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaProfileCompliance.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaProfileCompliance.cpp
index ab363ee6b4d2a..ddd9c70402fdc 100644
--- a/mlir/lib/Dialect/Tosa/Transforms/TosaProfileCompliance.cpp
+++ b/mlir/lib/Dialect/Tosa/Transforms/TosaProfileCompliance.cpp
@@ -31,6 +31,7 @@ TosaProfileCompliance::TosaProfileCompliance() {
   const TypeInfo fp6e3m2T = {mlir::Float6E3M2FNType::getTypeID(), 6};
   const TypeInfo fp4e2m1T = {mlir::Float4E2M1FNType::getTypeID(), 4};
   const TypeInfo fp8ue8m0T = {mlir::Float8E8M0FNUType::getTypeID(), 8};
+  const TypeInfo mxint8T = {mlir::tosa::mxint8Type::getTypeID(), 8};
 
 // The profile-based compliance content below is auto-generated by a script
 // in https://git.mlplatform.org/tosa/specification.git
@@ -625,6 +626,8 @@ TosaProfileCompliance::stringifyTypeInfo(const TypeInfo &typeInfo) {
     return {"fp4e2m1"};
   } else if (typeInfo.typeID == mlir::Float8E8M0FNUType::getTypeID()) {
     return {"fp8e8m0"};
+  } else if (typeInfo.typeID == tosa::mxint8Type::getTypeID()) {
+    return {"mxint8"};
   }
   llvm_unreachable("unknown type");
 }
diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp
index 4d0b61acc4ea4..b54ed5585d72d 100644
--- a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp
+++ b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp
@@ -693,7 +693,7 @@ LogicalResult TosaValidation::levelCheckSize(Operation *op,
                                  << " shape dimension cannot be dynamic";
     }
 
-    int64_t element_bits = type.getElementTypeBitWidth();
+    int64_t element_bits = tosa::getBitWidth(getElementTypeOrSelf(type));
     int64_t element_bytes = std::max(INT64_C(1), element_bits / 8);
     int64_t size = element_bytes * type.getNumElements();
 
@@ -1217,9 +1217,10 @@ bool TosaValidation::isValidElementType(Type type, const bool allowUnsigned) {
         return true;
       }
     }
-  } else if (mlir::isa<tosa::shapeType>(type)) {
+  } else if (isa<tosa::shapeType>(type))
+    return true;
+  else if (isa<tosa::mxint8Type>(type))
     return true;
-  }
   return false;
 }
 
diff --git a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp
index 365afab3764c8..062606e7e10b6 100644
--- a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp
+++ b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp
@@ -96,9 +96,9 @@ ensurePayloadIsSeparateFromTransform(transform::TransformOpInterface transform,
 // AlternativesOp
 //===----------------------------------------------------------------------===//
 
-OperandRange
-transform::AlternativesOp::getEntrySuccessorOperands(RegionBranchPoint point) {
-  if (!point.isParent() && getOperation()->getNumOperands() == 1)
+OperandRange transform::AlternativesOp::getEntrySuccessorOperands(
+    RegionSuccessor successor) {
+  if (!successor.isParent() && getOperation()->getNumOperands() == 1)
     return getOperation()->getOperands();
   return OperandRange(getOperation()->operand_end(),
                       getOperation()->operand_end());
@@ -107,15 +107,18 @@ transform::AlternativesOp::getEntrySuccessorOperands(RegionBranchPoint point) {
 void transform::AlternativesOp::getSuccessorRegions(
     RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> &regions) {
   for (Region &alternative : llvm::drop_begin(
-           getAlternatives(),
-           point.isParent() ? 0
-                            : point.getRegionOrNull()->getRegionNumber() + 1)) {
+           getAlternatives(), point.isParent()
+                                  ? 0
+                                  : point.getTerminatorPredecessorOrNull()
+                                            ->getParentRegion()
+                                            ->getRegionNumber() +
+                                        1)) {
     regions.emplace_back(&alternative, !getOperands().empty()
                                            ? alternative.getArguments()
                                            : Block::BlockArgListType());
   }
   if (!point.isParent())
-    regions.emplace_back(getOperation()->getResults());
+    regions.emplace_back(getOperation(), getOperation()->getResults());
 }
 
 void transform::AlternativesOp::getRegionInvocationBounds(
@@ -1740,16 +1743,18 @@ void transform::ForeachOp::getSuccessorRegions(
   }
 
   // Branch back to the region or the parent.
-  assert(point == getBody() && "unexpected region index");
+  assert(point.getTerminatorPredecessorOrNull()->getParentRegion() ==
+             &getBody() &&
+         "unexpected region index");
   regions.emplace_back(bodyRegion, bodyRegion->getArguments());
-  regions.emplace_back();
+  regions.emplace_back(getOperation(), getOperation()->getResults());
 }
 
 OperandRange
-transform::ForeachOp::getEntrySuccessorOperands(RegionBranchPoint point) {
+transform::ForeachOp::getEntrySuccessorOperands(RegionSuccessor successor) {
   // Each block argument handle is mapped to a subset (one op to be precise)
   // of the payload of the corresponding `targets` operand of ForeachOp.
-  assert(point == getBody() && "unexpected region index");
+  assert(successor.getSuccessor() == &getBody() && "unexpected region index");
   return getOperation()->getOperands();
 }
 
@@ -2948,8 +2953,8 @@ void transform::SequenceOp::getEffects(
 }
 
 OperandRange
-transform::SequenceOp::getEntrySuccessorOperands(RegionBranchPoint point) {
-  assert(point == getBody() && "unexpected region index");
+transform::SequenceOp::getEntrySuccessorOperands(RegionSuccessor successor) {
+  assert(successor.getSuccessor() == &getBody() && "unexpected region index");
   if (getOperation()->getNumOperands() > 0)
     return getOperation()->getOperands();
   return OperandRange(getOperation()->operand_end(),
@@ -2966,8 +2971,10 @@ void transform::SequenceOp::getSuccessorRegions(
     return;
   }
 
-  assert(point == getBody() && "unexpected region index");
-  regions.emplace_back(getOperation()->getResults());
+  assert(point.getTerminatorPredecessorOrNull()->getParentRegion() ==
+             &getBody() &&
+         "unexpected region index");
+  regions.emplace_back(getOperation(), getOperation()->getResults());
 }
 
 void transform::SequenceOp::getRegionInvocationBounds(
diff --git a/mlir/lib/Dialect/Transform/TuneExtension/TuneExtensionOps.cpp b/mlir/lib/Dialect/Transform/TuneExtension/TuneExtensionOps.cpp
index c627158e999ed..f727118f3f9a0 100644
--- a/mlir/lib/Dialect/Transform/TuneExtension/TuneExtensionOps.cpp
+++ b/mlir/lib/Dialect/Transform/TuneExtension/TuneExtensionOps.cpp
@@ -9,6 +9,7 @@
 #include "mlir/Dialect/Transform/IR/TransformOps.h"
 #include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
 #include "mlir/IR/OpImplementation.h"
+#include "mlir/Interfaces/ControlFlowInterfaces.h"
 #include "llvm/Support/Debug.h"
 
 #include "mlir/Dialect/Transform/TuneExtension/TuneExtensionOps.h"
@@ -112,7 +113,7 @@ static void printAlternativesOpSelectedRegion(OpAsmPrinter &printer,
 }
 
 OperandRange transform::tune::AlternativesOp::getEntrySuccessorOperands(
-    RegionBranchPoint point) {
+    RegionSuccessor successor) {
   // No operands will be forwarded to the region(s).
   return getOperands().slice(0, 0);
 }
@@ -128,7 +129,7 @@ void transform::tune::AlternativesOp::getSuccessorRegions(
       for (Region &alternative : getAlternatives())
         regions.emplace_back(&alternative, Block::BlockArgListType());
   else
-    regions.emplace_back(getOperation()->getResults());
+    regions.emplace_back(getOperation(), getOperation()->getResults());
 }
 
 void transform::tune::AlternativesOp::getRegionInvocationBounds(
diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
index ad8255a95cb4e..ae3423c40040d 100644
--- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -4336,7 +4336,7 @@ OpFoldResult ExtractStridedSliceOp::fold(FoldAdaptor adaptor) {
   // ExtractStridedSliceOp(splat ConstantOp) -> ConstantOp.
   if (auto splat =
           llvm::dyn_cast_if_present<SplatElementsAttr>(adaptor.getSource()))
-    DenseElementsAttr::get(getType(), splat.getSplatValue<Attribute>());
+    return DenseElementsAttr::get(getType(), splat.getSplatValue<Attribute>());
 
   // ExtractStridedSliceOp(non-splat ConstantOp) -> ConstantOp.
   return foldExtractStridedSliceNonSplatConstant(*this, adaptor.getSource());
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index f9aa28d5203db..83406c8c75dcf 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -11,7 +11,6 @@
 #include "mlir/Dialect/Index/IR/IndexOps.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
-#include "mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h"
 #include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/DialectImplementation.h"
@@ -229,8 +228,10 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
   }
 
   if (inst_data && lane_layout && inst_data.size() != lane_layout.size()) {
-    return emitError()
-           << "expected inst_data and lane_layout to have the same rank";
+    return emitError() << "expected inst_data and lane_layout to have the same "
+                          "rank, got inst_data "
+                       << inst_data.size() << ", lane_layout "
+                       << lane_layout.size();
   }
 
   // sg_data is optional for Workgroup layout, but its presence requires
@@ -569,8 +570,8 @@ TensorDescType::verify(llvm::function_ref<InFlightDiagnostic()> emitError,
   // for gather and scatter ops, Low-precision types are packed in 32-bit units.
   unsigned bitWidth = elementType.getIntOrFloatBitWidth();
   int chunkAlignmentFactor =
-      bitWidth < targetinfo::packedSizeInBitsForGatherScatter
-          ? targetinfo::packedSizeInBitsForGatherScatter / bitWidth
+      bitWidth < xegpu::uArch::generalPackedFormatBitSize
+          ? xegpu::uArch::generalPackedFormatBitSize / bitWidth
           : 1;
   auto scatterAttr = mlir::dyn_cast_if_present<ScatterTensorDescAttr>(encoding);
   if (scatterAttr) {
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index 8fab255d6347f..90eae871a5ef3 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -14,7 +14,6 @@
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
-#include "mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h"
 #include "mlir/Dialect/XeGPU/Transforms/Passes.h"
 #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
 #include "mlir/IR/Attributes.h"
@@ -37,6 +36,8 @@
 #include "llvm/Support/LogicalResult.h"
 #include "llvm/Support/raw_ostream.h"
 
+#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
+
 namespace mlir {
 namespace xegpu {
 #define GEN_PASS_DEF_XEGPUPROPAGATELAYOUT
@@ -104,6 +105,8 @@ struct LayoutInfo {
 
   SmallVector<int> getLaneData() const;
 
+  SmallVector<int> getInstData() const;
+
   bool isSliceLayout() const {
     if (!isAssigned())
       return false;
@@ -137,6 +140,13 @@ SmallVector<int> LayoutInfo::getLaneData() const {
                              [](int64_t val) { return static_cast<int>(val); });
 }
 
+SmallVector<int> LayoutInfo::getInstData() const {
+  if (!isAssigned())
+    return {};
+  return llvm::map_to_vector(storage.getEffectiveInstDataAsInt(),
+                             [](int64_t val) { return static_cast<int>(val); });
+}
+
 void LayoutInfo::print(raw_ostream &os) const {
   if (isAssigned()) {
     os << storage;
@@ -174,12 +184,14 @@ LayoutInfo LayoutInfo::transpose(ArrayRef<int64_t> permutation) const {
 
   SmallVector<int32_t> laneLayout;
   SmallVector<int32_t> laneData;
+  SmallVector<int32_t> instData;
   for (int64_t idx : permutation) {
     laneLayout.push_back(static_cast<int32_t>(getLaneLayout()[idx]));
     laneData.push_back(static_cast<int32_t>(getLaneData()[idx]));
+    instData.push_back(static_cast<int32_t>(getInstData()[idx]));
   }
-  return LayoutInfo(
-      xegpu::LayoutAttr::get(storage.getContext(), laneLayout, laneData));
+  return LayoutInfo(xegpu::LayoutAttr::get(storage.getContext(), instData,
+                                           laneLayout, laneData));
 }
 
 //===----------------------------------------------------------------------===//
@@ -192,6 +204,28 @@ struct LayoutInfoLattice : public Lattice<LayoutInfo> {
   using Lattice::Lattice;
 };
 
+/// Helper Function to find a proper instruction multiple for the user-supplied
+/// sg-level data shape. `candidates` are uArch allowed shapes.
+/// `candidateMultiples` are uArch multiples of such shapes (e.g., block count).
+template <typename T>
+int getLargestDivisor(T dim, ArrayRef<T> candidates,
+                      ArrayRef<T> candidateMultiples = {}) {
+  static_assert(std::is_integral<T>::value, "T must be an integer type");
+  int largest = -1;
+  SmallVector<T> multiples = {1};
+  if (!candidateMultiples.empty())
+    multiples =
+        SmallVector<T>(candidateMultiples.begin(), candidateMultiples.end());
+  for (T candidate : candidates) {
+    for (T multiple : multiples) {
+      int value = static_cast<int>(candidate * multiple);
+      if (value != 0 && dim % value == 0 && value > largest)
+        largest = value;
+    }
+  }
+  return largest;
+}
+
 /// Helper Functions to get default layouts. A `default layout` is a layout that
 /// is assigned to a value when the layout is not fixed by some anchor operation
 /// (like DPAS).
@@ -200,18 +234,32 @@ struct LayoutInfoLattice : public Lattice<LayoutInfo> {
 /// For 1D vector, lane_layout is [subgroupSize] and lane_data is [1].
 /// For 2D vector, lane_layout is [1, subgroupSize] and lane_data is [1, 1].
 static LayoutInfo getDefaultSIMTLayoutInfo(mlir::MLIRContext *ctx,
-                                           unsigned rank) {
+                                           unsigned rank,
+                                           const xegpu::uArch::uArch *uArch,
+                                           ArrayRef<int> instData) {
   assert((rank == 1 || rank == 2) && "Expected 1D or 2D vector.");
   if (rank == 1) {
     return LayoutInfo(
-        xegpu::LayoutAttr::get(ctx, {xegpu::targetinfo::subgroupSize}, {1}));
+        xegpu::LayoutAttr::get(ctx, instData, {uArch->getSubgroupSize()}, {1}));
   }
   return LayoutInfo(xegpu::LayoutAttr::get(
-      ctx, {1, xegpu::targetinfo::subgroupSize}, {1, 1}));
+      ctx, instData, {1, uArch->getSubgroupSize()}, {1, 1}));
+}
+
+static LayoutInfo getDefaultSIMTLayoutInfo(mlir::MLIRContext *ctx,
+                                           unsigned rank, int subgroupSize) {
+  assert((rank == 1 || rank == 2) && "Expected 1D or 2D vector.");
+  if (rank == 1) {
+    return LayoutInfo(xegpu::LayoutAttr::get(ctx, {subgroupSize}, {1}));
+  }
+  return LayoutInfo(xegpu::LayoutAttr::get(ctx, {1, subgroupSize}, {1, 1}));
 }
 
 /// Helper to get the default layout for a vector type.
 static LayoutInfo getDefaultSIMTLayoutInfo(VectorType vectorTy,
+                                           const xegpu::uArch::uArch *uArch,
+                                           ArrayRef<int> instData,
+                                           unsigned packingSize,
                                            bool isScattered = false) {
   // Expecting a 1D or 2D vector.
   assert((vectorTy.getRank() == 1 || vectorTy.getRank() == 2) &&
@@ -221,28 +269,25 @@ static LayoutInfo getDefaultSIMTLayoutInfo(VectorType vectorTy,
          "Expected int or float element type.");
   // If the rank is 1, then return default layout for 1D vector.
   if (vectorTy.getRank() == 1)
-    return getDefaultSIMTLayoutInfo(vectorTy.getContext(), 1);
+    return getDefaultSIMTLayoutInfo(vectorTy.getContext(), 1, uArch, instData);
   // Packing factor is determined by the element type bitwidth.
-  int packingFactor = 1;
   unsigned bitwidth = vectorTy.getElementType().getIntOrFloatBitWidth();
+  int packingFactor = bitwidth < packingSize ? packingSize / bitwidth : 1;
   if (isScattered) {
-    packingFactor =
-        bitwidth < xegpu::targetinfo::packedSizeInBitsForGatherScatter
-            ? xegpu::targetinfo::packedSizeInBitsForGatherScatter / bitwidth
-            : 1;
-    return LayoutInfo(xegpu::LayoutAttr::get(
-        vectorTy.getContext(), {xegpu::targetinfo::subgroupSize, 1},
-        {1, packingFactor}));
+    return LayoutInfo(xegpu::LayoutAttr::get(vectorTy.getContext(), instData,
+                                             {uArch->getSubgroupSize(), 1},
+                                             {1, packingFactor}));
   }
-  if (bitwidth < xegpu::targetinfo::packedSizeInBitsForDefault)
-    packingFactor = xegpu::targetinfo::packedSizeInBitsForDefault / bitwidth;
-  return LayoutInfo(xegpu::LayoutAttr::get(vectorTy.getContext(),
-                                           {1, xegpu::targetinfo::subgroupSize},
+  return LayoutInfo(xegpu::LayoutAttr::get(vectorTy.getContext(), instData,
+                                           {1, uArch->getSubgroupSize()},
                                            {1, packingFactor}));
 }
 
 /// Helper to get the default layout for a vector type.
 static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy,
+                                           const xegpu::uArch::uArch *uArch,
+                                           ArrayRef<int> instData,
+                                           unsigned packingSize,
                                            bool isScattered = false) {
   // Expecting a 1D or 2D vector.
   assert((tdescTy.getRank() == 1 || tdescTy.getRank() == 2) &&
@@ -252,27 +297,18 @@ static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy,
          "Expected int or float element type.");
   // If the rank is 1, then return default layout for 1D vector.
   if (tdescTy.getRank() == 1)
-    return getDefaultSIMTLayoutInfo(tdescTy.getContext(), 1);
+    return getDefaultSIMTLayoutInfo(tdescTy.getContext(), 1, uArch, instData);
   // Packing factor is determined by the element type bitwidth.
   unsigned bitwidth = tdescTy.getElementType().getIntOrFloatBitWidth();
-
+  int subgroupSize = uArch->getSubgroupSize();
+  int packingFactor = bitwidth < packingSize ? packingSize / bitwidth : 1;
   if (isScattered) {
-    int packingFactor =
-        bitwidth < xegpu::targetinfo::packedSizeInBitsForGatherScatter
-            ? xegpu::targetinfo::packedSizeInBitsForGatherScatter / bitwidth
-            : 1;
     return LayoutInfo(xegpu::LayoutAttr::get(
-        tdescTy.getContext(), {xegpu::targetinfo::subgroupSize, 1},
-        {1, packingFactor}));
+        tdescTy.getContext(), instData, {subgroupSize, 1}, {1, packingFactor}));
   }
 
-  int packingFactor =
-      (bitwidth < xegpu::targetinfo::packedSizeInBitsForDefault)
-          ? xegpu::targetinfo::packedSizeInBitsForDefault / bitwidth
-          : 1;
-  return LayoutInfo(xegpu::LayoutAttr::get(tdescTy.getContext(),
-                                           {1, xegpu::targetinfo::subgroupSize},
-                                           {1, packingFactor}));
+  return LayoutInfo(xegpu::LayoutAttr::get(
+      tdescTy.getContext(), instData, {1, subgroupSize}, {1, packingFactor}));
 }
 
 /// Helper Function to get the expected layouts for DPAS operands. `lane_data`
@@ -281,25 +317,25 @@ static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy,
 /// `packedSizeInBitsForDefault`
 /// * For B operand, the data must be packed in minimum
 /// `packedSizeInBitsForDpasB`
-static LayoutInfo getSIMTLayoutInfoForDPASOperand(VectorType vectorTy,
-                                                  unsigned operandNum) {
+static LayoutInfo
+getSIMTLayoutInfoForDPASOperand(VectorType vectorTy, unsigned operandNum,
+                                const xegpu::uArch::uArch *uArch,
+                                ArrayRef<int> instData, unsigned packingSize) {
   Type elementTy = vectorTy.getElementType();
   assert(elementTy.isIntOrFloat() &&
          "Expected int or float type in DPAS operands");
-  SmallVector<int32_t, 2> layout({1, xegpu::targetinfo::subgroupSize});
+  SmallVector<int32_t, 2> layout({1, uArch->getSubgroupSize()});
   // For B operand, data must be packed in minimum `packedDpasBSizeInBits` and
   // must have the VNNI format.
-  if (operandNum == 1 && elementTy.getIntOrFloatBitWidth() <
-                             xegpu::targetinfo::packedSizeInBitsForDpasB) {
+  if (operandNum == 1 && elementTy.getIntOrFloatBitWidth() < packingSize) {
     SmallVector<int32_t, 2> data(
-        {static_cast<int32_t>(xegpu::targetinfo::packedSizeInBitsForDpasB /
-                              elementTy.getIntOrFloatBitWidth()),
+        {static_cast<int32_t>(packingSize / elementTy.getIntOrFloatBitWidth()),
          1});
     return LayoutInfo(
-        xegpu::LayoutAttr::get(vectorTy.getContext(), layout, data));
+        xegpu::LayoutAttr::get(vectorTy.getContext(), instData, layout, data));
   }
   // Otherwise, return the default layout for the vector type.
-  return getDefaultSIMTLayoutInfo(vectorTy);
+  return getDefaultSIMTLayoutInfo(vectorTy, uArch, instData, packingSize);
 }
 
 //===----------------------------------------------------------------------===//
@@ -456,7 +492,37 @@ void LayoutInfoPropagation::visitPrefetchNdOp(
   // Here we assign the default layout to the tensor descriptor operand of
   // prefetch.
   auto tdescTy = prefetch.getTensorDescType();
-  auto prefetchLayout = getDefaultSIMTLayoutInfo(tdescTy);
+
+  auto uArch = getUArch(getChipStr(prefetch).value_or(""));
+  const auto *uArchInstruction =
+      dyn_cast<xegpu::uArch::Subgroup2DBlockPrefetchInstruction>(
+          uArch->getInstruction(
+              xegpu::uArch::InstructionKind::Subgroup2DBlockPrefetch));
+
+  auto blockWHC =
+      uArchInstruction->getBlockWidthHeightCount(tdescTy.getElementType());
+  if (!blockWHC)
+    prefetch.emitWarning("No known block params found for the element type.");
+  auto [bWidth, bHeight, bCount] = blockWHC.value();
+  SmallVector<int> instData;
+  int instWidth = getLargestDivisor(
+      static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 1)), bWidth,
+      bCount);
+  if (instWidth == -1)
+    prefetch.emitWarning(
+        "No suitable instruction multiple found for the given shape.");
+  if (tdescTy.getRank() == 1)
+    instData = {instWidth};
+  else {
+    int instHeight = getLargestDivisor(
+        static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 2)), bHeight);
+    if (instHeight == -1)
+      prefetch.emitWarning(
+          "No suitable instruction multiple found for the given shape.");
+    instData = {instHeight, instWidth};
+  }
+  auto prefetchLayout = getDefaultSIMTLayoutInfo(
+      tdescTy, uArch, instData, uArchInstruction->getPackedFormatBitSize());
   // Propagate the layout to the source tensor descriptor.
   propagateIfChanged(operands[0], operands[0]->meet(prefetchLayout));
 }
@@ -475,10 +541,11 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp(
     reduction.emitWarning("Expecting output type to be 1D vector.");
     return;
   }
+  auto uArch = getUArch(xegpu::getChipStr(reduction).value_or(""));
   // Given that the result is 1D, the layout of the operand should be 2D with
   // default layout.
-  LayoutInfo operandLayout =
-      getDefaultSIMTLayoutInfo(reduction->getContext(), 2);
+  LayoutInfo operandLayout = getDefaultSIMTLayoutInfo(
+      reduction->getContext(), 2, uArch->getSubgroupSize());
   propagateIfChanged(operands[0], operands[0]->meet(operandLayout));
   // Accumulator should have the same layout as the result.
   propagateIfChanged(operands[1], operands[1]->meet(resultLayout));
@@ -557,15 +624,53 @@ void LayoutInfoPropagation::visitDpasOp(
     ArrayRef<const LayoutInfoLattice *> results) {
   VectorType aTy = dpas.getLhsType();
   VectorType bTy = dpas.getRhsType();
-  propagateIfChanged(
-      operands[0], operands[0]->meet(getSIMTLayoutInfoForDPASOperand(aTy, 0)));
-  propagateIfChanged(
-      operands[1], operands[1]->meet(getSIMTLayoutInfoForDPASOperand(bTy, 1)));
+
+  auto uArch = getUArch(getChipStr(dpas).value_or(""));
+  const int subgroupSize = uArch->getSubgroupSize();
+  const auto *uArchInstruction =
+      dyn_cast<xegpu::uArch::SubgroupMatrixMultiplyAcc>(uArch->getInstruction(
+          xegpu::uArch::InstructionKind::SubgroupMatrixMultiplyAcc));
+
+  const unsigned dataALen = aTy.getShape().front();
+  auto supportedALen = uArchInstruction->getSupportedM(aTy.getElementType());
+  const int maxALen =
+      getLargestDivisor(dataALen, ArrayRef<unsigned>(supportedALen));
+  if (maxALen == -1)
+    dpas.emitWarning(
+        "No suitable instruction multiple found for the given shape.");
+
+  const unsigned dataBLen = bTy.getShape().back();
+  auto supportedBLen = uArchInstruction->getSupportedK(bTy.getElementType());
+  const int maxBLen =
+      getLargestDivisor(dataBLen, ArrayRef<unsigned>(supportedBLen));
+  if (maxBLen == -1)
+    dpas.emitWarning(
+        "No suitable instruction multiple found for the given shape.");
+  SmallVector<int> instDataA = {maxALen, subgroupSize};
+  SmallVector<int> instDataB = {subgroupSize, maxBLen};
+
+  propagateIfChanged(operands[0],
+                     operands[0]->meet(getSIMTLayoutInfoForDPASOperand(
+                         aTy, 0, uArch, instDataA,
+                         uArchInstruction->getPackedFormatBitSizeA())));
+  propagateIfChanged(operands[1],
+                     operands[1]->meet(getSIMTLayoutInfoForDPASOperand(
+                         bTy, 1, uArch, instDataB,
+                         uArchInstruction->getPackedFormatBitSizeB())));
   if (operands.size() > 2) {
     VectorType cTy = dpas.getAccType();
-    propagateIfChanged(
-        operands[2],
-        operands[2]->meet(getSIMTLayoutInfoForDPASOperand(cTy, 2)));
+    const unsigned dataCLen = bTy.getShape().back();
+    auto supportedCLen = uArchInstruction->getSupportedN(bTy.getElementType());
+    const int maxCLen =
+        getLargestDivisor(dataCLen, ArrayRef<unsigned>(supportedCLen));
+    if (maxCLen == -1)
+      dpas.emitWarning(
+          "No suitable instruction multiple found for the given shape.");
+    SmallVector<int> instDataC = {maxALen, maxCLen};
+    propagateIfChanged(operands[2],
+                       operands[2]->meet(getSIMTLayoutInfoForDPASOperand(
+                           cTy, 2, uArch, instDataC,
+                           uArchInstruction->getPackedFormatBitSizeB())));
   }
 }
 
@@ -573,7 +678,38 @@ void LayoutInfoPropagation::visitDpasOp(
 void LayoutInfoPropagation::visitStoreNdOp(
     xegpu::StoreNdOp store, ArrayRef<LayoutInfoLattice *> operands,
     ArrayRef<const LayoutInfoLattice *> results) {
-  LayoutInfo storeLayout = getDefaultSIMTLayoutInfo(store.getValueType());
+
+  auto uArch = getUArch(getChipStr(store).value_or(""));
+  const auto *uArchInstruction =
+      dyn_cast<xegpu::uArch::Subgroup2DBlockStoreInstruction>(
+          uArch->getInstruction(
+              xegpu::uArch::InstructionKind::Subgroup2DBlockStore));
+  VectorType dataTy = store.getValueType();
+  auto blockWHC = uArchInstruction->getBlockWidthHeightCount(
+      store.getValueType().getElementType());
+  if (!blockWHC)
+    store.emitWarning("No known block params found for the element type.");
+  auto [bWidth, bHeight, bCount] = blockWHC.value();
+  SmallVector<int> instData;
+  int instWidth = getLargestDivisor(
+      static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 1)), bWidth,
+      bCount);
+  if (instWidth == -1)
+    store.emitWarning(
+        "No suitable instruction multiple found for the given shape.");
+  if (dataTy.getRank() == 1)
+    instData = {instWidth};
+  else {
+    int instHeight = getLargestDivisor(
+        static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 2)), bHeight);
+    if (instHeight == -1)
+      store.emitWarning(
+          "No suitable instruction multiple found for the given shape.");
+    instData = {instHeight, instWidth};
+  }
+  LayoutInfo storeLayout =
+      getDefaultSIMTLayoutInfo(store.getValueType(), uArch, instData,
+                               uArchInstruction->getPackedFormatBitSize());
   // Both operands should have the same layout
   for (LayoutInfoLattice *operand : operands)
     propagateIfChanged(operand, operand->meet(storeLayout));
@@ -694,10 +830,23 @@ void LayoutInfoPropagation::visitLoadGatherOp(
     load.emitWarning("Not propagating, non-vector payload supplied.");
     return;
   }
-  LayoutInfo layout = getDefaultSIMTLayoutInfo(payloadTy, /*scattered*/ true);
+  auto uArch = getUArch(getChipStr(load).value_or(""));
+  const int subgroupSize = uArch->getSubgroupSize();
+  SmallVector<int> instData{subgroupSize};
+  if (auto chunkSize = load.getChunkSize().value_or(0); chunkSize > 1)
+    instData.push_back(chunkSize);
+  else if (auto srcTdescTy =
+               dyn_cast<xegpu::TensorDescType>(load.getSourceType())) {
+    if (srcTdescTy.getChunkSizeAsInt() > 1)
+      instData.push_back(chunkSize);
+  }
+  LayoutInfo layout = getDefaultSIMTLayoutInfo(
+      payloadTy, uArch, instData, uArch->getGeneralPackedFormatBitSize(),
+      /*scattered*/ true);
 
   // Mask operand should have 1D default layout.
-  LayoutInfo maskLayout = getDefaultSIMTLayoutInfo(load->getContext(), 1);
+  LayoutInfo maskLayout =
+      getDefaultSIMTLayoutInfo(load->getContext(), 1, subgroupSize);
 
   // Propagate the new layout to the tensor descriptor operand.
   if (isa<xegpu::TensorDescType>(load.getSourceType()))
@@ -717,8 +866,10 @@ void LayoutInfoPropagation::visitCreateDescOp(
   // Need the layout of the descriptor to propagate to the operands.
   if (!descLayout.isAssigned())
     return;
+  auto uArch = getUArch(getChipStr(createDesc).value_or(""));
   // For offset operand propagate 1D default layout.
-  LayoutInfo layout = getDefaultSIMTLayoutInfo(createDesc->getContext(), 1);
+  LayoutInfo layout = getDefaultSIMTLayoutInfo(createDesc->getContext(), 1,
+                                               uArch->getSubgroupSize());
   propagateIfChanged(operands[1], operands[1]->meet(layout));
 }
 
@@ -735,18 +886,30 @@ void LayoutInfoPropagation::visitStoreScatterOp(
     storeScatter.emitWarning("Not propagating, non-vector payload supplied.");
     return;
   }
+  auto uArch = getUArch(getChipStr(storeScatter).value_or(""));
+  const int subgroupSize = uArch->getSubgroupSize();
+
   auto payloadShape = payloadTy.getShape();
   if (payloadShape.size() > 1)
     assert(
-        payloadShape[0] == xegpu::targetinfo::subgroupSize &&
+        payloadShape[0] == subgroupSize &&
         "Expected the first dimension of 2D tensor descriptor to be equal to "
         "subgroup size.");
 
-  LayoutInfo payloadLayout =
-      getDefaultSIMTLayoutInfo(payloadTy, /*scattered=*/true);
+  SmallVector<int> instData{subgroupSize};
+  if (auto chunkSize = storeScatter.getChunkSize().value_or(0); chunkSize > 1)
+    instData.push_back(chunkSize);
+  else if (auto dstTdescTy =
+               dyn_cast<xegpu::TensorDescType>(storeScatter.getDestType())) {
+    if (dstTdescTy.getChunkSizeAsInt() > 1)
+      instData.push_back(chunkSize);
+  }
+  LayoutInfo payloadLayout = getDefaultSIMTLayoutInfo(
+      payloadTy, uArch, instData, uArch->getGeneralPackedFormatBitSize(),
+      /*scattered=*/true);
 
   LayoutInfo maskLayout =
-      getDefaultSIMTLayoutInfo(storeScatter->getContext(), 1);
+      getDefaultSIMTLayoutInfo(storeScatter->getContext(), 1, subgroupSize);
   // Propagate the payload operand layout
   propagateIfChanged(operands[0], operands[0]->meet(payloadLayout));
   // Propagate the destination (if tdesc) operand layout
@@ -1023,9 +1186,13 @@ void XeGPUPropagateLayoutPass::runOnOperation() {
     LayoutInfo layout = analysis.getLayoutInfo(val);
     if (!layout.isAssigned())
       return {};
+    xegpu::DistributeLayoutAttr layoutAttr =
+        cast<xegpu::DistributeLayoutAttr>(layout.get());
+    if (this->layoutKind == "lane")
+      layoutAttr = layoutAttr.dropInstData();
     if (layout.isSliceLayout())
-      return cast<xegpu::SliceAttr>(layout.get());
-    return cast<xegpu::LayoutAttr>(layout.get());
+      return cast<xegpu::SliceAttr>(layoutAttr);
+    return cast<xegpu::LayoutAttr>(layoutAttr);
   };
 
   mlir::OpBuilder builder(&getContext());
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index d09dc196c0bf7..5a3b27ec6108e 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -11,10 +11,10 @@
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/Dialect/Vector/Transforms/VectorDistribution.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
-#include "mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h"
 #include "mlir/Dialect/XeGPU/Transforms/Passes.h"
 #include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
 #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
+#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
 #include "mlir/IR/AffineMap.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/Builders.h"
@@ -159,17 +159,18 @@ static bool requirePacked(const xegpu::LayoutAttr layout) {
 
 /// Helper function to check if the layout requires a transpose effect.
 static bool requireTranspose(const xegpu::LayoutAttr layout,
-                             const std::string &chipStr) {
+                             const xegpu::uArch::uArch *uArch) {
   // Return false for unsupported targets.
   // TODO: Add more support or move to target info.
-  if (chipStr != "pvc" && chipStr != "bmg")
+  if (uArch->getName().equals_insensitive("pvc") &&
+      uArch->getName().equals_insensitive("bmg"))
     return false;
   if (!layout)
     return false;
   auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
   if (laneLayout.size() != 2)
     return false;
-  return laneLayout[0] == xegpu::targetinfo::subgroupSize && laneLayout[1] == 1;
+  return laneLayout[0] == uArch->getSubgroupSize() && laneLayout[1] == 1;
 }
 
 /// Given a GPUFuncOp, this pattern creates a new GPUFuncOp and moves the body
@@ -199,6 +200,11 @@ struct MoveFuncBodyToWarpOp : public OpRewritePattern<gpu::GPUFuncOp> {
   using OpRewritePattern<gpu::GPUFuncOp>::OpRewritePattern;
   LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp,
                                 PatternRewriter &rewriter) const override {
+    auto uArch = getUArch(xegpu::getChipStr(gpuFuncOp).value_or(""));
+    if (!uArch)
+      return rewriter.notifyMatchFailure(
+          gpuFuncOp, "Subgroup distribution requires target attribute attached "
+                     "to set the warp size");
     // If the function only contains a single void return, skip.
     if (llvm::all_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
           return isa<gpu::ReturnOp>(op) && !op.getNumOperands();
@@ -230,7 +236,7 @@ struct MoveFuncBodyToWarpOp : public OpRewritePattern<gpu::GPUFuncOp> {
     ArrayRef<Type> gpuFuncResultType = gpuFuncOp.getFunctionType().getResults();
     auto warpOp = gpu::WarpExecuteOnLane0Op::create(
         rewriter, laneId.getLoc(), gpuFuncResultType, laneId,
-        xegpu::targetinfo::subgroupSize, newGpuFunc.getArguments(),
+        uArch->getSubgroupSize(), newGpuFunc.getArguments(),
         newGpuFunc.getArgumentTypes());
     Block &warpBodyBlock = warpOp.getBodyRegion().front();
     // Replace the ReturnOp of the original gpu function with a YieldOp.
@@ -495,14 +501,14 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
           warpOp, "warp result is not a xegpu::LoadNd op");
 
     auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>();
+    auto uArch = getUArch(xegpu::getChipStr(loadOp).value_or(""));
+    if (!uArch)
+      return rewriter.notifyMatchFailure(
+          loadOp, "xegpu::LoadNdOp require target attribute attached to "
+                  "determine transpose "
+                  "requirement");
     // Chip information is required to decide if the layout requires transpose
     // effect.
-    auto chipStr = xegpu::getChipStr(loadOp);
-    if (!chipStr)
-      return rewriter.notifyMatchFailure(
-          loadOp,
-          "xegpu::LoadNdOp require chip information to determine transpose "
-          "requirement");
     // Expecting offsets to be present.
     SmallVector<OpFoldResult> offsets = loadOp.getMixedOffsets();
     if (offsets.empty())
@@ -556,7 +562,7 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
     // Set the packed attribute if the layout requires it.
     newLoadOp.setPacked(requirePacked(layout));
     // Set the transpose attribute if the layout requires it.
-    if (requireTranspose(layout, chipStr.value()))
+    if (requireTranspose(layout, uArch))
       newLoadOp.setTranspose(
           DenseI64ArrayAttr::get(rewriter.getContext(), {1, 0}));
     Value distributedVal = newWarpOp.getResult(operandIdx);
diff --git a/mlir/lib/IR/Diagnostics.cpp b/mlir/lib/IR/Diagnostics.cpp
index 776b5c6588c71..f4c9242ed3479 100644
--- a/mlir/lib/IR/Diagnostics.cpp
+++ b/mlir/lib/IR/Diagnostics.cpp
@@ -138,6 +138,10 @@ Diagnostic &Diagnostic::operator<<(Operation &op) {
   return appendOp(op, OpPrintingFlags());
 }
 
+Diagnostic &Diagnostic::operator<<(OpWithFlags op) {
+  return appendOp(*op.getOperation(), op.flags());
+}
+
 Diagnostic &Diagnostic::appendOp(Operation &op, const OpPrintingFlags &flags) {
   std::string str;
   llvm::raw_string_ostream os(str);
diff --git a/mlir/lib/IR/Operation.cpp b/mlir/lib/IR/Operation.cpp
index ce421f4bf7e0e..8212d6d3d1eba 100644
--- a/mlir/lib/IR/Operation.cpp
+++ b/mlir/lib/IR/Operation.cpp
@@ -463,28 +463,26 @@ void Operation::updateOrderIfNecessary() {
 //===----------------------------------------------------------------------===//
 
 auto llvm::ilist_detail::SpecificNodeAccess<
-    typename llvm::ilist_detail::compute_node_options<
-        ::mlir::Operation>::type>::getNodePtr(pointer n) -> node_type * {
+    llvm::ilist_detail::compute_node_options<::mlir::Operation>::type>::
+    getNodePtr(pointer n) -> node_type * {
   return NodeAccess::getNodePtr<OptionsT>(n);
 }
 
 auto llvm::ilist_detail::SpecificNodeAccess<
-    typename llvm::ilist_detail::compute_node_options<
-        ::mlir::Operation>::type>::getNodePtr(const_pointer n)
-    -> const node_type * {
+    llvm::ilist_detail::compute_node_options<::mlir::Operation>::type>::
+    getNodePtr(const_pointer n) -> const node_type * {
   return NodeAccess::getNodePtr<OptionsT>(n);
 }
 
 auto llvm::ilist_detail::SpecificNodeAccess<
-    typename llvm::ilist_detail::compute_node_options<
-        ::mlir::Operation>::type>::getValuePtr(node_type *n) -> pointer {
+    llvm::ilist_detail::compute_node_options<::mlir::Operation>::type>::
+    getValuePtr(node_type *n) -> pointer {
   return NodeAccess::getValuePtr<OptionsT>(n);
 }
 
 auto llvm::ilist_detail::SpecificNodeAccess<
-    typename llvm::ilist_detail::compute_node_options<
-        ::mlir::Operation>::type>::getValuePtr(const node_type *n)
-    -> const_pointer {
+    llvm::ilist_detail::compute_node_options<::mlir::Operation>::type>::
+    getValuePtr(const node_type *n) -> const_pointer {
   return NodeAccess::getValuePtr<OptionsT>(n);
 }
 
diff --git a/mlir/lib/IR/Region.cpp b/mlir/lib/IR/Region.cpp
index 46b6298076d48..15a941f380225 100644
--- a/mlir/lib/IR/Region.cpp
+++ b/mlir/lib/IR/Region.cpp
@@ -253,6 +253,21 @@ void Region::OpIterator::skipOverBlocksWithNoOps() {
     operation = block->begin();
 }
 
+llvm::raw_ostream &mlir::operator<<(llvm::raw_ostream &os, Region &region) {
+  if (!region.getParentOp()) {
+    os << "Region has no parent op";
+  } else {
+    os << "Region #" << region.getRegionNumber() << " in operation "
+       << region.getParentOp()->getName();
+  }
+  for (auto it : llvm::enumerate(region.getBlocks())) {
+    os << "\n  Block #" << it.index() << ":";
+    for (Operation &op : it.value().getOperations())
+      os << "\n    " << OpWithFlags(&op, OpPrintingFlags().skipRegions());
+  }
+  return os;
+}
+
 //===----------------------------------------------------------------------===//
 // RegionRange
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Interfaces/ControlFlowInterfaces.cpp b/mlir/lib/Interfaces/ControlFlowInterfaces.cpp
index ca3f7666dba8a..1e56810ff7aaf 100644
--- a/mlir/lib/Interfaces/ControlFlowInterfaces.cpp
+++ b/mlir/lib/Interfaces/ControlFlowInterfaces.cpp
@@ -9,7 +9,9 @@
 #include <utility>
 
 #include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Operation.h"
 #include "mlir/Interfaces/ControlFlowInterfaces.h"
+#include "llvm/Support/DebugLog.h"
 
 using namespace mlir;
 
@@ -38,20 +40,31 @@ SuccessorOperands::SuccessorOperands(unsigned int producedOperandCount,
 std::optional<BlockArgument>
 detail::getBranchSuccessorArgument(const SuccessorOperands &operands,
                                    unsigned operandIndex, Block *successor) {
+  LDBG() << "Getting branch successor argument for operand index "
+         << operandIndex << " in successor block";
+
   OperandRange forwardedOperands = operands.getForwardedOperands();
   // Check that the operands are valid.
-  if (forwardedOperands.empty())
+  if (forwardedOperands.empty()) {
+    LDBG() << "No forwarded operands, returning nullopt";
     return std::nullopt;
+  }
 
   // Check to ensure that this operand is within the range.
   unsigned operandsStart = forwardedOperands.getBeginOperandIndex();
   if (operandIndex < operandsStart ||
-      operandIndex >= (operandsStart + forwardedOperands.size()))
+      operandIndex >= (operandsStart + forwardedOperands.size())) {
+    LDBG() << "Operand index " << operandIndex << " out of range ["
+           << operandsStart << ", "
+           << (operandsStart + forwardedOperands.size())
+           << "), returning nullopt";
     return std::nullopt;
+  }
 
   // Index the successor.
   unsigned argIndex =
       operands.getProducedOperandCount() + operandIndex - operandsStart;
+  LDBG() << "Computed argument index " << argIndex << " for successor block";
   return successor->getArgument(argIndex);
 }
 
@@ -59,9 +72,15 @@ detail::getBranchSuccessorArgument(const SuccessorOperands &operands,
 LogicalResult
 detail::verifyBranchSuccessorOperands(Operation *op, unsigned succNo,
                                       const SuccessorOperands &operands) {
+  LDBG() << "Verifying branch successor operands for successor #" << succNo
+         << " in operation " << op->getName();
+
   // Check the count.
   unsigned operandCount = operands.size();
   Block *destBB = op->getSuccessor(succNo);
+  LDBG() << "Branch has " << operandCount << " operands, target block has "
+         << destBB->getNumArguments() << " arguments";
+
   if (operandCount != destBB->getNumArguments())
     return op->emitError() << "branch has " << operandCount
                            << " operands for successor #" << succNo
@@ -69,13 +88,22 @@ detail::verifyBranchSuccessorOperands(Operation *op, unsigned succNo,
                            << destBB->getNumArguments();
 
   // Check the types.
+  LDBG() << "Checking type compatibility for "
+         << (operandCount - operands.getProducedOperandCount())
+         << " forwarded operands";
   for (unsigned i = operands.getProducedOperandCount(); i != operandCount;
        ++i) {
-    if (!cast<BranchOpInterface>(op).areTypesCompatible(
-            operands[i].getType(), destBB->getArgument(i).getType()))
+    Type operandType = operands[i].getType();
+    Type argType = destBB->getArgument(i).getType();
+    LDBG() << "Checking type compatibility: operand type " << operandType
+           << " vs argument type " << argType;
+
+    if (!cast<BranchOpInterface>(op).areTypesCompatible(operandType, argType))
       return op->emitError() << "type mismatch for bb argument #" << i
                              << " of successor #" << succNo;
   }
+
+  LDBG() << "Branch successor operand verification successful";
   return success();
 }
 
@@ -126,15 +154,15 @@ LogicalResult detail::verifyRegionBranchWeights(Operation *op) {
 
 static InFlightDiagnostic &printRegionEdgeName(InFlightDiagnostic &diag,
                                                RegionBranchPoint sourceNo,
-                                               RegionBranchPoint succRegionNo) {
+                                               RegionSuccessor succRegionNo) {
   diag << "from ";
-  if (Region *region = sourceNo.getRegionOrNull())
-    diag << "Region #" << region->getRegionNumber();
+  if (Operation *op = sourceNo.getTerminatorPredecessorOrNull())
+    diag << "Operation " << op->getName();
   else
     diag << "parent operands";
 
   diag << " to ";
-  if (Region *region = succRegionNo.getRegionOrNull())
+  if (Region *region = succRegionNo.getSuccessor())
     diag << "Region #" << region->getRegionNumber();
   else
     diag << "parent results";
@@ -145,13 +173,12 @@ static InFlightDiagnostic &printRegionEdgeName(InFlightDiagnostic &diag,
 /// `sourcePoint`. `getInputsTypesForRegion` is a function that returns the
 /// types of the inputs that flow to a successor region.
 static LogicalResult
-verifyTypesAlongAllEdges(Operation *op, RegionBranchPoint sourcePoint,
-                         function_ref<FailureOr<TypeRange>(RegionBranchPoint)>
+verifyTypesAlongAllEdges(RegionBranchOpInterface branchOp,
+                         RegionBranchPoint sourcePoint,
+                         function_ref<FailureOr<TypeRange>(RegionSuccessor)>
                              getInputsTypesForRegion) {
-  auto regionInterface = cast<RegionBranchOpInterface>(op);
-
   SmallVector<RegionSuccessor, 2> successors;
-  regionInterface.getSuccessorRegions(sourcePoint, successors);
+  branchOp.getSuccessorRegions(sourcePoint, successors);
 
   for (RegionSuccessor &succ : successors) {
     FailureOr<TypeRange> sourceTypes = getInputsTypesForRegion(succ);
@@ -160,10 +187,14 @@ verifyTypesAlongAllEdges(Operation *op, RegionBranchPoint sourcePoint,
 
     TypeRange succInputsTypes = succ.getSuccessorInputs().getTypes();
     if (sourceTypes->size() != succInputsTypes.size()) {
-      InFlightDiagnostic diag = op->emitOpError("region control flow edge ");
+      InFlightDiagnostic diag =
+          branchOp->emitOpError("region control flow edge ");
+      std::string succStr;
+      llvm::raw_string_ostream os(succStr);
+      os << succ;
       return printRegionEdgeName(diag, sourcePoint, succ)
              << ": source has " << sourceTypes->size()
-             << " operands, but target successor needs "
+             << " operands, but target successor " << os.str() << " needs "
              << succInputsTypes.size();
     }
 
@@ -171,8 +202,10 @@ verifyTypesAlongAllEdges(Operation *op, RegionBranchPoint sourcePoint,
          llvm::enumerate(llvm::zip(*sourceTypes, succInputsTypes))) {
       Type sourceType = std::get<0>(typesIdx.value());
       Type inputType = std::get<1>(typesIdx.value());
-      if (!regionInterface.areTypesCompatible(sourceType, inputType)) {
-        InFlightDiagnostic diag = op->emitOpError("along control flow edge ");
+
+      if (!branchOp.areTypesCompatible(sourceType, inputType)) {
+        InFlightDiagnostic diag =
+            branchOp->emitOpError("along control flow edge ");
         return printRegionEdgeName(diag, sourcePoint, succ)
                << ": source type #" << typesIdx.index() << " " << sourceType
                << " should match input type #" << typesIdx.index() << " "
@@ -180,6 +213,7 @@ verifyTypesAlongAllEdges(Operation *op, RegionBranchPoint sourcePoint,
       }
     }
   }
+
   return success();
 }
 
@@ -187,34 +221,18 @@ verifyTypesAlongAllEdges(Operation *op, RegionBranchPoint sourcePoint,
 LogicalResult detail::verifyTypesAlongControlFlowEdges(Operation *op) {
   auto regionInterface = cast<RegionBranchOpInterface>(op);
 
-  auto inputTypesFromParent = [&](RegionBranchPoint point) -> TypeRange {
-    return regionInterface.getEntrySuccessorOperands(point).getTypes();
+  auto inputTypesFromParent = [&](RegionSuccessor successor) -> TypeRange {
+    return regionInterface.getEntrySuccessorOperands(successor).getTypes();
   };
 
   // Verify types along control flow edges originating from the parent.
-  if (failed(verifyTypesAlongAllEdges(op, RegionBranchPoint::parent(),
-                                      inputTypesFromParent)))
+  if (failed(verifyTypesAlongAllEdges(
+          regionInterface, RegionBranchPoint::parent(), inputTypesFromParent)))
     return failure();
 
-  auto areTypesCompatible = [&](TypeRange lhs, TypeRange rhs) {
-    if (lhs.size() != rhs.size())
-      return false;
-    for (auto types : llvm::zip(lhs, rhs)) {
-      if (!regionInterface.areTypesCompatible(std::get<0>(types),
-                                              std::get<1>(types))) {
-        return false;
-      }
-    }
-    return true;
-  };
-
   // Verify types along control flow edges originating from each region.
   for (Region &region : op->getRegions()) {
-
-    // Since there can be multiple terminators implementing the
-    // `RegionBranchTerminatorOpInterface`, all should have the same operand
-    // types when passing them to the same region.
-
+    // Collect all return-like terminators in the region.
     SmallVector<RegionBranchTerminatorOpInterface> regionReturnOps;
     for (Block &block : region)
       if (!block.empty())
@@ -227,33 +245,20 @@ LogicalResult detail::verifyTypesAlongControlFlowEdges(Operation *op) {
     if (regionReturnOps.empty())
       continue;
 
-    auto inputTypesForRegion =
-        [&](RegionBranchPoint point) -> FailureOr<TypeRange> {
-      std::optional<OperandRange> regionReturnOperands;
-      for (RegionBranchTerminatorOpInterface regionReturnOp : regionReturnOps) {
-        auto terminatorOperands = regionReturnOp.getSuccessorOperands(point);
-
-        if (!regionReturnOperands) {
-          regionReturnOperands = terminatorOperands;
-          continue;
-        }
-
-        // Found more than one ReturnLike terminator. Make sure the operand
-        // types match with the first one.
-        if (!areTypesCompatible(regionReturnOperands->getTypes(),
-                                terminatorOperands.getTypes())) {
-          InFlightDiagnostic diag = op->emitOpError("along control flow edge");
-          return printRegionEdgeName(diag, region, point)
-                 << " operands mismatch between return-like terminators";
-        }
-      }
-
-      // All successors get the same set of operand types.
-      return TypeRange(regionReturnOperands->getTypes());
-    };
-
-    if (failed(verifyTypesAlongAllEdges(op, region, inputTypesForRegion)))
-      return failure();
+    // Verify types along control flow edges originating from each return-like
+    // terminator.
+    for (RegionBranchTerminatorOpInterface regionReturnOp : regionReturnOps) {
+
+      auto inputTypesForRegion =
+          [&](RegionSuccessor successor) -> FailureOr<TypeRange> {
+        OperandRange terminatorOperands =
+            regionReturnOp.getSuccessorOperands(successor);
+        return TypeRange(terminatorOperands.getTypes());
+      };
+      if (failed(verifyTypesAlongAllEdges(regionInterface, regionReturnOp,
+                                          inputTypesForRegion)))
+        return failure();
+    }
   }
 
   return success();
@@ -272,31 +277,74 @@ using StopConditionFn = function_ref<bool(Region *, ArrayRef<bool> visited)>;
 static bool traverseRegionGraph(Region *begin,
                                 StopConditionFn stopConditionFn) {
   auto op = cast<RegionBranchOpInterface>(begin->getParentOp());
+  LDBG() << "Starting region graph traversal from region #"
+         << begin->getRegionNumber() << " in operation " << op->getName();
+
   SmallVector<bool> visited(op->getNumRegions(), false);
   visited[begin->getRegionNumber()] = true;
+  LDBG() << "Initialized visited array with " << op->getNumRegions()
+         << " regions";
 
   // Retrieve all successors of the region and enqueue them in the worklist.
   SmallVector<Region *> worklist;
   auto enqueueAllSuccessors = [&](Region *region) {
-    SmallVector<RegionSuccessor> successors;
-    op.getSuccessorRegions(region, successors);
-    for (RegionSuccessor successor : successors)
-      if (!successor.isParent())
-        worklist.push_back(successor.getSuccessor());
+    LDBG() << "Enqueuing successors for region #" << region->getRegionNumber();
+    SmallVector<Attribute> operandAttributes(op->getNumOperands());
+    for (Block &block : *region) {
+      if (block.empty())
+        continue;
+      auto terminator =
+          dyn_cast<RegionBranchTerminatorOpInterface>(block.back());
+      if (!terminator)
+        continue;
+      SmallVector<RegionSuccessor> successors;
+      operandAttributes.resize(terminator->getNumOperands());
+      terminator.getSuccessorRegions(operandAttributes, successors);
+      LDBG() << "Found " << successors.size()
+             << " successors from terminator in block";
+      for (RegionSuccessor successor : successors) {
+        if (!successor.isParent()) {
+          worklist.push_back(successor.getSuccessor());
+          LDBG() << "Added region #"
+                 << successor.getSuccessor()->getRegionNumber()
+                 << " to worklist";
+        } else {
+          LDBG() << "Skipping parent successor";
+        }
+      }
+    }
   };
   enqueueAllSuccessors(begin);
+  LDBG() << "Initial worklist size: " << worklist.size();
 
   // Process all regions in the worklist via DFS.
   while (!worklist.empty()) {
     Region *nextRegion = worklist.pop_back_val();
-    if (stopConditionFn(nextRegion, visited))
+    LDBG() << "Processing region #" << nextRegion->getRegionNumber()
+           << " from worklist (remaining: " << worklist.size() << ")";
+
+    if (stopConditionFn(nextRegion, visited)) {
+      LDBG() << "Stop condition met for region #"
+             << nextRegion->getRegionNumber() << ", returning true";
       return true;
-    if (visited[nextRegion->getRegionNumber()])
+    }
+    llvm::dbgs() << "Region: " << nextRegion << "\n";
+    if (!nextRegion->getParentOp()) {
+      llvm::errs() << "Region " << *nextRegion << " has no parent op\n";
+      return false;
+    }
+    if (visited[nextRegion->getRegionNumber()]) {
+      LDBG() << "Region #" << nextRegion->getRegionNumber()
+             << " already visited, skipping";
       continue;
+    }
     visited[nextRegion->getRegionNumber()] = true;
+    LDBG() << "Marking region #" << nextRegion->getRegionNumber()
+           << " as visited";
     enqueueAllSuccessors(nextRegion);
   }
 
+  LDBG() << "Traversal completed, returning false";
   return false;
 }
 
@@ -322,18 +370,26 @@ static bool isRegionReachable(Region *begin, Region *r) {
 ///    mutually exclusive if they are not reachable from each other as per
 ///    RegionBranchOpInterface::getSuccessorRegions.
 bool mlir::insideMutuallyExclusiveRegions(Operation *a, Operation *b) {
+  LDBG() << "Checking if operations are in mutually exclusive regions: "
+         << a->getName() << " and " << b->getName();
+
   assert(a && "expected non-empty operation");
   assert(b && "expected non-empty operation");
 
   auto branchOp = a->getParentOfType<RegionBranchOpInterface>();
   while (branchOp) {
+    LDBG() << "Checking branch operation " << branchOp->getName();
+
     // Check if b is inside branchOp. (We already know that a is.)
     if (!branchOp->isProperAncestor(b)) {
+      LDBG() << "Operation b is not inside branchOp, checking next ancestor";
       // Check next enclosing RegionBranchOpInterface.
       branchOp = branchOp->getParentOfType<RegionBranchOpInterface>();
       continue;
     }
 
+    LDBG() << "Both operations are inside branchOp, finding their regions";
+
     // b is contained in branchOp. Retrieve the regions in which `a` and `b`
     // are contained.
     Region *regionA = nullptr, *regionB = nullptr;
@@ -341,63 +397,136 @@ bool mlir::insideMutuallyExclusiveRegions(Operation *a, Operation *b) {
       if (r.findAncestorOpInRegion(*a)) {
         assert(!regionA && "already found a region for a");
         regionA = &r;
+        LDBG() << "Found region #" << r.getRegionNumber() << " for operation a";
       }
       if (r.findAncestorOpInRegion(*b)) {
         assert(!regionB && "already found a region for b");
         regionB = &r;
+        LDBG() << "Found region #" << r.getRegionNumber() << " for operation b";
       }
     }
     assert(regionA && regionB && "could not find region of op");
 
+    LDBG() << "Region A: #" << regionA->getRegionNumber() << ", Region B: #"
+           << regionB->getRegionNumber();
+
     // `a` and `b` are in mutually exclusive regions if both regions are
     // distinct and neither region is reachable from the other region.
-    return regionA != regionB && !isRegionReachable(regionA, regionB) &&
-           !isRegionReachable(regionB, regionA);
+    bool regionsAreDistinct = (regionA != regionB);
+    bool aNotReachableFromB = !isRegionReachable(regionA, regionB);
+    bool bNotReachableFromA = !isRegionReachable(regionB, regionA);
+
+    LDBG() << "Regions distinct: " << regionsAreDistinct
+           << ", A not reachable from B: " << aNotReachableFromB
+           << ", B not reachable from A: " << bNotReachableFromA;
+
+    bool mutuallyExclusive =
+        regionsAreDistinct && aNotReachableFromB && bNotReachableFromA;
+    LDBG() << "Operations are mutually exclusive: " << mutuallyExclusive;
+
+    return mutuallyExclusive;
   }
 
   // Could not find a common RegionBranchOpInterface among a's and b's
   // ancestors.
+  LDBG() << "No common RegionBranchOpInterface found, operations are not "
+            "mutually exclusive";
   return false;
 }
 
 bool RegionBranchOpInterface::isRepetitiveRegion(unsigned index) {
+  LDBG() << "Checking if region #" << index << " is repetitive in operation "
+         << getOperation()->getName();
+
   Region *region = &getOperation()->getRegion(index);
-  return isRegionReachable(region, region);
+  bool isRepetitive = isRegionReachable(region, region);
+
+  LDBG() << "Region #" << index << " is repetitive: " << isRepetitive;
+  return isRepetitive;
 }
 
 bool RegionBranchOpInterface::hasLoop() {
+  LDBG() << "Checking if operation " << getOperation()->getName()
+         << " has loops";
+
   SmallVector<RegionSuccessor> entryRegions;
   getSuccessorRegions(RegionBranchPoint::parent(), entryRegions);
-  for (RegionSuccessor successor : entryRegions)
-    if (!successor.isParent() &&
-        traverseRegionGraph(successor.getSuccessor(),
-                            [](Region *nextRegion, ArrayRef<bool> visited) {
-                              // Interrupt traversal if the region was already
-                              // visited.
-                              return visited[nextRegion->getRegionNumber()];
-                            }))
-      return true;
+  LDBG() << "Found " << entryRegions.size() << " entry regions";
+
+  for (RegionSuccessor successor : entryRegions) {
+    if (!successor.isParent()) {
+      LDBG() << "Checking entry region #"
+             << successor.getSuccessor()->getRegionNumber() << " for loops";
+
+      bool hasLoop =
+          traverseRegionGraph(successor.getSuccessor(),
+                              [](Region *nextRegion, ArrayRef<bool> visited) {
+                                // Interrupt traversal if the region was already
+                                // visited.
+                                return visited[nextRegion->getRegionNumber()];
+                              });
+
+      if (hasLoop) {
+        LDBG() << "Found loop in entry region #"
+               << successor.getSuccessor()->getRegionNumber();
+        return true;
+      }
+    } else {
+      LDBG() << "Skipping parent successor";
+    }
+  }
+
+  LDBG() << "No loops found in operation";
   return false;
 }
 
 Region *mlir::getEnclosingRepetitiveRegion(Operation *op) {
+  LDBG() << "Finding enclosing repetitive region for operation "
+         << op->getName();
+
   while (Region *region = op->getParentRegion()) {
+    LDBG() << "Checking region #" << region->getRegionNumber()
+           << " in operation " << region->getParentOp()->getName();
+
     op = region->getParentOp();
-    if (auto branchOp = dyn_cast<RegionBranchOpInterface>(op))
-      if (branchOp.isRepetitiveRegion(region->getRegionNumber()))
+    if (auto branchOp = dyn_cast<RegionBranchOpInterface>(op)) {
+      LDBG()
+          << "Found RegionBranchOpInterface, checking if region is repetitive";
+      if (branchOp.isRepetitiveRegion(region->getRegionNumber())) {
+        LDBG() << "Found repetitive region #" << region->getRegionNumber();
         return region;
+      }
+    } else {
+      LDBG() << "Parent operation does not implement RegionBranchOpInterface";
+    }
   }
+
+  LDBG() << "No enclosing repetitive region found";
   return nullptr;
 }
 
 Region *mlir::getEnclosingRepetitiveRegion(Value value) {
+  LDBG() << "Finding enclosing repetitive region for value";
+
   Region *region = value.getParentRegion();
   while (region) {
+    LDBG() << "Checking region #" << region->getRegionNumber()
+           << " in operation " << region->getParentOp()->getName();
+
     Operation *op = region->getParentOp();
-    if (auto branchOp = dyn_cast<RegionBranchOpInterface>(op))
-      if (branchOp.isRepetitiveRegion(region->getRegionNumber()))
+    if (auto branchOp = dyn_cast<RegionBranchOpInterface>(op)) {
+      LDBG()
+          << "Found RegionBranchOpInterface, checking if region is repetitive";
+      if (branchOp.isRepetitiveRegion(region->getRegionNumber())) {
+        LDBG() << "Found repetitive region #" << region->getRegionNumber();
         return region;
+      }
+    } else {
+      LDBG() << "Parent operation does not implement RegionBranchOpInterface";
+    }
     region = op->getParentRegion();
   }
+
+  LDBG() << "No enclosing repetitive region found for value";
   return nullptr;
 }
diff --git a/mlir/lib/Query/Query.cpp b/mlir/lib/Query/Query.cpp
index 375e82050a481..cf8a4d293299c 100644
--- a/mlir/lib/Query/Query.cpp
+++ b/mlir/lib/Query/Query.cpp
@@ -121,12 +121,13 @@ LogicalResult MatchQuery::run(llvm::raw_ostream &os, QuerySession &qs) const {
   Operation *rootOp = qs.getRootOp();
   int matchCount = 0;
   matcher::MatchFinder finder;
+
+  StringRef functionName = matcher.getFunctionName();
   auto matches = finder.collectMatches(rootOp, std::move(matcher));
 
   // An extract call is recognized by considering if the matcher has a name.
   // TODO: Consider making the extract more explicit.
-  if (matcher.hasFunctionName()) {
-    auto functionName = matcher.getFunctionName();
+  if (!functionName.empty()) {
     std::vector<Operation *> flattenedMatches =
         finder.flattenMatchedOps(matches);
     Operation *function =
diff --git a/mlir/lib/Support/Timing.cpp b/mlir/lib/Support/Timing.cpp
index fb6f82c283df5..2e92d9c1e7835 100644
--- a/mlir/lib/Support/Timing.cpp
+++ b/mlir/lib/Support/Timing.cpp
@@ -50,7 +50,8 @@ class TimingManagerImpl {
   llvm::sys::SmartRWMutex<true> identifierMutex;
 
   /// A thread local cache of identifiers to reduce lock contention.
-  ThreadLocalCache<llvm::StringMap<llvm::StringMapEntry<std::nullopt_t> *>>
+  ThreadLocalCache<
+      llvm::StringMap<llvm::StringMapEntry<llvm::EmptyStringSetTag> *>>
       localIdentifierCache;
 
   TimingManagerImpl() : identifiers(identifierAllocator) {}
@@ -319,7 +320,6 @@ class TimerImpl {
   void mergeChildren(AsyncChildrenMap &&other) {
     for (auto &thread : other) {
       mergeChildren(std::move(thread.second));
-      assert(thread.second.empty());
     }
     other.clear();
   }
diff --git a/mlir/lib/TableGen/Type.cpp b/mlir/lib/TableGen/Type.cpp
index b31377e0de3e9..0f1bf83d1987b 100644
--- a/mlir/lib/TableGen/Type.cpp
+++ b/mlir/lib/TableGen/Type.cpp
@@ -56,7 +56,7 @@ std::optional<StringRef> TypeConstraint::getBuilderCall() const {
         StringRef value = init->getValue();
         return value.empty() ? std::optional<StringRef>() : value;
       })
-      .Default([](auto *) { return std::nullopt; });
+      .Default(std::nullopt);
 }
 
 // Return the C++ type for this type (which may just be ::mlir::Type).
diff --git a/mlir/lib/Target/LLVMIR/DebugTranslation.cpp b/mlir/lib/Target/LLVMIR/DebugTranslation.cpp
index eeb87253e5eb8..e3bcf2749be13 100644
--- a/mlir/lib/Target/LLVMIR/DebugTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/DebugTranslation.cpp
@@ -390,7 +390,7 @@ llvm::DISubrange *DebugTranslation::translateImpl(DISubrangeAttr attr) {
             .Case<>([&](LLVM::DIGlobalVariableAttr global) {
               return translate(global);
             })
-            .Default([&](Attribute attr) { return nullptr; });
+            .Default(nullptr);
     return metadata;
   };
   return llvm::DISubrange::get(llvmCtx, getMetadataOrNull(attr.getCount()),
@@ -420,10 +420,10 @@ DebugTranslation::translateImpl(DIGenericSubrangeAttr attr) {
             .Case([&](LLVM::DILocalVariableAttr local) {
               return translate(local);
             })
-            .Case<>([&](LLVM::DIGlobalVariableAttr global) {
+            .Case([&](LLVM::DIGlobalVariableAttr global) {
               return translate(global);
             })
-            .Default([&](Attribute attr) { return nullptr; });
+            .Default(nullptr);
     return metadata;
   };
   return llvm::DIGenericSubrange::get(llvmCtx,
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index f28454075f1d3..8edec990eaaba 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -4084,12 +4084,13 @@ static omp::MapInfoOp getFirstOrLastMappedMemberPtr(omp::MapInfoOp mapInfo,
 ///
 /// Fortran
 ///     map(tofrom: array(2:5, 3:2))
-///   or
-/// C++
-///   map(tofrom: array[1:4][2:3])
+///
 /// We must calculate the initial pointer offset to pass across, this function
 /// performs this using bounds.
 ///
+/// TODO/WARNING: This only supports Fortran's column major indexing currently
+/// as is noted in the note below and comments in the function, we must extend
+/// this function when we add a C++ frontend.
 /// NOTE: which while specified in row-major order it currently needs to be
 /// flipped for Fortran's column order array allocation and access (as
 /// opposed to C++'s row-major, hence the backwards processing where order is
@@ -4125,46 +4126,28 @@ calculateBoundsOffset(LLVM::ModuleTranslation &moduleTranslation,
     // with a pointer that's being treated like an array and we have the
     // underlying type e.g. an i32, or f64 etc, e.g. a fortran descriptor base
     // address (pointer pointing to the actual data) so we must caclulate the
-    // offset using a single index which the following two loops attempts to
-    // compute.
-
-    // Calculates the size offset we need to make per row e.g. first row or
-    // column only needs to be offset by one, but the next would have to be
-    // the previous row/column offset multiplied by the extent of current row.
+    // offset using a single index which the following loop attempts to
+    // compute using the standard column-major algorithm e.g for a 3D array:
     //
-    // For example ([1][10][100]):
+    // ((((c_idx * b_len) + b_idx) * a_len) + a_idx)
     //
-    //  - First row/column we move by 1 for each index increment
-    //  - Second row/column we move by 1 (first row/column) * 10 (extent/size of
-    //  current) for 10 for each index increment
-    //  - Third row/column we would move by 10 (second row/column) *
-    //  (extent/size of current) 100 for 1000 for each index increment
-    std::vector<llvm::Value *> dimensionIndexSizeOffset{builder.getInt64(1)};
-    for (size_t i = 1; i < bounds.size(); ++i) {
-      if (auto boundOp = dyn_cast_if_present<omp::MapBoundsOp>(
-              bounds[i].getDefiningOp())) {
-        dimensionIndexSizeOffset.push_back(builder.CreateMul(
-            moduleTranslation.lookupValue(boundOp.getExtent()),
-            dimensionIndexSizeOffset[i - 1]));
-      }
-    }
-
-    // Now that we have calculated how much we move by per index, we must
-    // multiply each lower bound offset in indexes by the size offset we
-    // have calculated in the previous and accumulate the results to get
-    // our final resulting offset.
+    // It is of note that it's doing column-major rather than row-major at the
+    // moment, but having a way for the frontend to indicate which major format
+    // to use or standardizing/canonicalizing the order of the bounds to compute
+    // the offset may be useful in the future when there's other frontends with
+    // different formats.
+    std::vector<llvm::Value *> dimensionIndexSizeOffset;
     for (int i = bounds.size() - 1; i >= 0; --i) {
       if (auto boundOp = dyn_cast_if_present<omp::MapBoundsOp>(
               bounds[i].getDefiningOp())) {
-        if (idx.empty())
-          idx.emplace_back(builder.CreateMul(
-              moduleTranslation.lookupValue(boundOp.getLowerBound()),
-              dimensionIndexSizeOffset[i]));
+        if (i == ((int)bounds.size() - 1))
+          idx.emplace_back(
+              moduleTranslation.lookupValue(boundOp.getLowerBound()));
         else
           idx.back() = builder.CreateAdd(
-              idx.back(), builder.CreateMul(moduleTranslation.lookupValue(
-                                                boundOp.getLowerBound()),
-                                            dimensionIndexSizeOffset[i]));
+              builder.CreateMul(idx.back(), moduleTranslation.lookupValue(
+                                                boundOp.getExtent())),
+              moduleTranslation.lookupValue(boundOp.getLowerBound()));
       }
     }
   }
diff --git a/mlir/lib/Transforms/RemoveDeadValues.cpp b/mlir/lib/Transforms/RemoveDeadValues.cpp
index e0c65b0e09774..41f3f9d76a3b1 100644
--- a/mlir/lib/Transforms/RemoveDeadValues.cpp
+++ b/mlir/lib/Transforms/RemoveDeadValues.cpp
@@ -432,8 +432,7 @@ static void processRegionBranchOp(RegionBranchOpInterface regionBranchOp,
 
   // Return the successors of `region` if the latter is not null. Else return
   // the successors of `regionBranchOp`.
-  auto getSuccessors = [&](Region *region = nullptr) {
-    auto point = region ? region : RegionBranchPoint::parent();
+  auto getSuccessors = [&](RegionBranchPoint point) {
     SmallVector<RegionSuccessor> successors;
     regionBranchOp.getSuccessorRegions(point, successors);
     return successors;
@@ -456,7 +455,8 @@ static void processRegionBranchOp(RegionBranchOpInterface regionBranchOp,
   // `nonForwardedOperands`.
   auto markNonForwardedOperands = [&](BitVector &nonForwardedOperands) {
     nonForwardedOperands.resize(regionBranchOp->getNumOperands(), true);
-    for (const RegionSuccessor &successor : getSuccessors()) {
+    for (const RegionSuccessor &successor :
+         getSuccessors(RegionBranchPoint::parent())) {
       for (OpOperand *opOperand : getForwardedOpOperands(successor))
         nonForwardedOperands.reset(opOperand->getOperandNumber());
     }
@@ -469,10 +469,13 @@ static void processRegionBranchOp(RegionBranchOpInterface regionBranchOp,
         for (Region &region : regionBranchOp->getRegions()) {
           if (region.empty())
             continue;
+          // TODO: this isn't correct in face of multiple terminators.
           Operation *terminator = region.front().getTerminator();
           nonForwardedRets[terminator] =
               BitVector(terminator->getNumOperands(), true);
-          for (const RegionSuccessor &successor : getSuccessors(&region)) {
+          for (const RegionSuccessor &successor :
+               getSuccessors(RegionBranchPoint(
+                   cast<RegionBranchTerminatorOpInterface>(terminator)))) {
             for (OpOperand *opOperand :
                  getForwardedOpOperands(successor, terminator))
               nonForwardedRets[terminator].reset(opOperand->getOperandNumber());
@@ -489,8 +492,13 @@ static void processRegionBranchOp(RegionBranchOpInterface regionBranchOp,
           DenseMap<Region *, BitVector> &argsToKeep, Region *region = nullptr) {
         Operation *terminator =
             region ? region->front().getTerminator() : nullptr;
+        RegionBranchPoint point =
+            terminator
+                ? RegionBranchPoint(
+                      cast<RegionBranchTerminatorOpInterface>(terminator))
+                : RegionBranchPoint::parent();
 
-        for (const RegionSuccessor &successor : getSuccessors(region)) {
+        for (const RegionSuccessor &successor : getSuccessors(point)) {
           Region *successorRegion = successor.getSuccessor();
           for (auto [opOperand, input] :
                llvm::zip(getForwardedOpOperands(successor, terminator),
@@ -517,7 +525,8 @@ static void processRegionBranchOp(RegionBranchOpInterface regionBranchOp,
         resultsOrArgsToKeepChanged = false;
 
         // Recompute `resultsToKeep` and `argsToKeep` based on `operandsToKeep`.
-        for (const RegionSuccessor &successor : getSuccessors()) {
+        for (const RegionSuccessor &successor :
+             getSuccessors(RegionBranchPoint::parent())) {
           Region *successorRegion = successor.getSuccessor();
           for (auto [opOperand, input] :
                llvm::zip(getForwardedOpOperands(successor),
@@ -551,7 +560,9 @@ static void processRegionBranchOp(RegionBranchOpInterface regionBranchOp,
           if (region.empty())
             continue;
           Operation *terminator = region.front().getTerminator();
-          for (const RegionSuccessor &successor : getSuccessors(&region)) {
+          for (const RegionSuccessor &successor :
+               getSuccessors(RegionBranchPoint(
+                   cast<RegionBranchTerminatorOpInterface>(terminator)))) {
             Region *successorRegion = successor.getSuccessor();
             for (auto [opOperand, input] :
                  llvm::zip(getForwardedOpOperands(successor, terminator),
diff --git a/mlir/lib/Transforms/ViewOpGraph.cpp b/mlir/lib/Transforms/ViewOpGraph.cpp
index 08cac1fe3695c..5790a77cc4e2b 100644
--- a/mlir/lib/Transforms/ViewOpGraph.cpp
+++ b/mlir/lib/Transforms/ViewOpGraph.cpp
@@ -158,7 +158,8 @@ class PrintOpPass : public impl::ViewOpGraphBase<PrintOpPass> {
 
   /// Emit a cluster (subgraph). The specified builder generates the body of the
   /// cluster. Return the anchor node of the cluster.
-  Node emitClusterStmt(function_ref<void()> builder, std::string label = "") {
+  Node emitClusterStmt(function_ref<void()> builder,
+                       const std::string &label = "") {
     int clusterId = ++counter;
     os << "subgraph cluster_" << clusterId << " {\n";
     os.indent();
@@ -269,7 +270,7 @@ class PrintOpPass : public impl::ViewOpGraphBase<PrintOpPass> {
   }
 
   /// Emit a node statement.
-  Node emitNodeStmt(std::string label, StringRef shape = kShapeNode,
+  Node emitNodeStmt(const std::string &label, StringRef shape = kShapeNode,
                     StringRef background = "") {
     int nodeId = ++counter;
     AttributeMap attrs;
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx11.mlir b/mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx11.mlir
index d1301d0089220..9fcc1473d4a18 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx11.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx11.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1100 --allow-unregistered-dialect | FileCheck %s
+// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1100 | FileCheck %s
 
 // CHECK-LABEL: @wmma_to_rocdl
 func.func @wmma_to_rocdl(%arg0 : vector<16xf16>, %arg1 : vector<8xf32>, %arg2 : vector<4xf32>,
@@ -32,5 +32,5 @@ func.func @wmma_to_rocdl(%arg0 : vector<16xf16>, %arg1 : vector<8xf32>, %arg2 :
   // CHECK: rocdl.wmma.i32.16x16x16.iu4{{.*}}: (i1, i32, i1, i32, vector<4xi32>, i1) -> vector<4xi32>
   amdgpu.wmma 16x16x16 %arg11 * %arg11 + %arg8 {clamp}: vector<8xi4>, vector<8xi4>, vector<4xi32>
 
-  func.return
+  return
 }
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx12.mlir b/mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx12.mlir
index b897323340402..57883473bbf06 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx12.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx12.mlir
@@ -1,4 +1,6 @@
-// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1200 --allow-unregistered-dialect | FileCheck %s
+// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1200 \
+// RUN:   --split-input-file --verify-diagnostics | FileCheck %s
+
 // CHECK-LABEL: @wmma_to_rocdl
 func.func @wmma_to_rocdl(%arg0 : vector<8xf16>, %arg1 : vector<4xf16>,
                          %arg2 : vector<8xf32>, %arg3 : vector<4xf32>,
@@ -66,3 +68,12 @@ func.func @wmma_to_rocdl(%arg0 : vector<8xf16>, %arg1 : vector<4xf16>,
 
   func.return
 }
+
+// -----
+
+func.func @wmma_unsupported_k(%arg0 : vector<64xf8E4M3FN>, %arg1 : vector<8xf16>) {
+  // expected-error@below {{'amdgpu.wmma' op no intrinsic matching WMMA on the given chipset}}
+  // expected-error@below {{failed to legalize operation 'amdgpu.wmma'}}
+  amdgpu.wmma 16x16x128 %arg0 * %arg0 + %arg1 : vector<64xf8E4M3FN>, vector<64xf8E4M3FN>, vector<8xf16>
+  func.return
+}
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx1250.mlir
new file mode 100644
index 0000000000000..5e77a3add3184
--- /dev/null
+++ b/mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx1250.mlir
@@ -0,0 +1,99 @@
+// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1250 \
+// RUN:   --split-input-file --verify-diagnostics | FileCheck %s
+
+// CHECK-LABEL: @wmma_k4
+func.func @wmma_k4(%arg0 : vector<2xf32>, %arg1 : vector<8xf32>) {
+  // CHECK: rocdl.wmma.f32.16x16x4.f32 %arg0, %arg0, %arg1
+  amdgpu.wmma 16x16x4 %arg0 * %arg0 + %arg1 : vector<2xf32>, vector<2xf32>, vector<8xf32>
+  return
+}
+
+// CHECK-LABEL: @wmma_k32
+func.func @wmma_k32(%arg0 : vector<16xf16>, %arg1 : vector<16xbf16>, %arg2 : vector<8xf32>,
+                    %arg3 : vector<8xf16>, %arg4 : vector<8xbf16>) {
+  // CHECK: rocdl.wmma.f32.16x16x32.f16 %arg0, %arg0, %arg2
+  amdgpu.wmma 16x16x32 %arg0 * %arg0 + %arg2 : vector<16xf16>, vector<16xf16>, vector<8xf32>
+
+  // CHECK: rocdl.wmma.f16.16x16x32.f16 %arg0, %arg0, {{.*}} : (vector<16xf16>, vector<16xf16>, vector<8xf16>, i1)
+  amdgpu.wmma 16x16x32 %arg0 * %arg0 + %arg3 : vector<16xf16>, vector<16xf16>, vector<8xf16>
+
+  // CHECK: rocdl.wmma.f32.16x16x32.bf16 {{.*}}, {{.*}}, %arg2
+  amdgpu.wmma 16x16x32 %arg1 * %arg1 + %arg2 : vector<16xbf16>, vector<16xbf16>, vector<8xf32>
+
+  // CHECK: rocdl.wmma.bf16.16x16x32.bf16 {{.*}}, {{.*}}, {{.*}}, {{.*}} : (vector<16xi16>, vector<16xi16>, vector<8xi16>, i1)
+  amdgpu.wmma 16x16x32 %arg1 * %arg1 + %arg4 : vector<16xbf16>, vector<16xbf16>, vector<8xbf16>
+
+  return
+}
+
+// CHECK-LABEL: @wmma_k64
+func.func @wmma_k64(%arg0 : vector<32xi8>, %arg1 : vector<32xf8E4M3FN>, %arg2 : vector<32xf8E5M2>,
+                    %arg3 : vector<8xi32>, %arg4 : vector<8xf32>, %arg5 : vector<8xf16>) {
+  // CHECK: rocdl.wmma.i32.16x16x64.iu8 {{.*}}, {{.*}}, {{.*}}, {{.*}}, %arg3, {{.*}}
+  amdgpu.wmma 16x16x64 %arg0 * %arg0 + %arg3 {clamp} : vector<32xi8>, vector<32xi8>, vector<8xi32>
+
+  // CHECK: rocdl.wmma.f32.16x16x64.fp8_fp8 {{.*}}, {{.*}}, %arg4
+  amdgpu.wmma 16x16x64 %arg1 * %arg1 + %arg4 : vector<32xf8E4M3FN>, vector<32xf8E4M3FN>, vector<8xf32>
+
+  // CHECK: rocdl.wmma.f16.16x16x64.fp8_fp8 {{.*}}, {{.*}}, %arg5, {{.*}} : (vector<8xi32>, vector<8xi32>, vector<8xf16>, i1)
+  amdgpu.wmma 16x16x64 %arg1 * %arg1 + %arg5 : vector<32xf8E4M3FN>, vector<32xf8E4M3FN>, vector<8xf16>
+
+  // CHECK: rocdl.wmma.f32.16x16x64.fp8_bf8 {{.*}}, {{.*}}, %arg4
+  amdgpu.wmma 16x16x64 %arg1 * %arg2 + %arg4 : vector<32xf8E4M3FN>, vector<32xf8E5M2>, vector<8xf32>
+
+  // CHECK: rocdl.wmma.f16.16x16x64.fp8_bf8 {{.*}}, {{.*}}, %arg5, {{.*}} : (vector<8xi32>, vector<8xi32>, vector<8xf16>, i1)
+  amdgpu.wmma 16x16x64 %arg1 * %arg2 + %arg5 : vector<32xf8E4M3FN>, vector<32xf8E5M2>, vector<8xf16>
+
+  // CHECK: rocdl.wmma.f32.16x16x64.bf8_bf8 {{.*}}, {{.*}}, %arg4
+  amdgpu.wmma 16x16x64 %arg2 * %arg2 + %arg4 : vector<32xf8E5M2>, vector<32xf8E5M2>, vector<8xf32>
+
+  // CHECK: rocdl.wmma.f16.16x16x64.bf8_bf8 {{.*}}, {{.*}}, %arg5, {{.*}} : (vector<8xi32>, vector<8xi32>, vector<8xf16>, i1)
+  amdgpu.wmma 16x16x64 %arg2 * %arg2 + %arg5 : vector<32xf8E5M2>, vector<32xf8E5M2>, vector<8xf16>
+
+  // CHECK: rocdl.wmma.f32.16x16x64.bf8_fp8 {{.*}}, {{.*}}, %arg4
+  amdgpu.wmma 16x16x64 %arg2 * %arg1 + %arg4 : vector<32xf8E5M2>, vector<32xf8E4M3FN>, vector<8xf32>
+
+  // CHECK: rocdl.wmma.f16.16x16x64.bf8_fp8 {{.*}}, {{.*}}, %arg5, {{.*}} : (vector<8xi32>, vector<8xi32>, vector<8xf16>, i1)
+  amdgpu.wmma 16x16x64 %arg2 * %arg1 + %arg5 : vector<32xf8E5M2>, vector<32xf8E4M3FN>, vector<8xf16>
+
+  return
+}
+
+// CHECK-LABEL: @wmma_k128
+func.func @wmma_k128(%arg0 : vector<64xf8E4M3FN>, %arg1 : vector<64xf8E5M2>,
+                     %arg2 : vector<8xf32>, %arg3 : vector<8xf16>) {
+  // CHECK: rocdl.wmma.f32.16x16x128.fp8_fp8 {{.*}}, {{.*}}, %arg2
+  amdgpu.wmma 16x16x128 %arg0 * %arg0 + %arg2 : vector<64xf8E4M3FN>, vector<64xf8E4M3FN>, vector<8xf32>
+
+  // CHECK: rocdl.wmma.f16.16x16x128.fp8_fp8 {{.*}}, {{.*}}, %arg3, {{.*}} : (vector<16xi32>, vector<16xi32>, vector<8xf16>, i1)
+  amdgpu.wmma 16x16x128 %arg0 * %arg0 + %arg3 : vector<64xf8E4M3FN>, vector<64xf8E4M3FN>, vector<8xf16>
+
+  // CHECK: rocdl.wmma.f32.16x16x128.fp8_bf8 {{.*}}, {{.*}}, %arg2
+  amdgpu.wmma 16x16x128 %arg0 * %arg1 + %arg2 : vector<64xf8E4M3FN>, vector<64xf8E5M2>, vector<8xf32>
+
+  // CHECK: rocdl.wmma.f16.16x16x128.fp8_bf8 {{.*}}, {{.*}}, %arg3, {{.*}} : (vector<16xi32>, vector<16xi32>, vector<8xf16>, i1)
+  amdgpu.wmma 16x16x128 %arg0 * %arg1 + %arg3 : vector<64xf8E4M3FN>, vector<64xf8E5M2>, vector<8xf16>
+
+  // CHECK: rocdl.wmma.f32.16x16x128.bf8_bf8 {{.*}}, {{.*}}, %arg2
+  amdgpu.wmma 16x16x128 %arg1 * %arg1 + %arg2 : vector<64xf8E5M2>, vector<64xf8E5M2>, vector<8xf32>
+
+  // CHECK: rocdl.wmma.f16.16x16x128.bf8_bf8 {{.*}}, {{.*}}, %arg3, {{.*}} : (vector<16xi32>, vector<16xi32>, vector<8xf16>, i1)
+  amdgpu.wmma 16x16x128 %arg1 * %arg1 + %arg3 : vector<64xf8E5M2>, vector<64xf8E5M2>, vector<8xf16>
+
+  // CHECK: rocdl.wmma.f32.16x16x128.bf8_fp8 {{.*}}, {{.*}}, %arg2
+  amdgpu.wmma 16x16x128 %arg1 * %arg0 + %arg2 : vector<64xf8E5M2>, vector<64xf8E4M3FN>, vector<8xf32>
+
+  // CHECK: rocdl.wmma.f16.16x16x128.bf8_fp8 {{.*}}, {{.*}}, %arg3, {{.*}} : (vector<16xi32>, vector<16xi32>, vector<8xf16>, i1)
+  amdgpu.wmma 16x16x128 %arg1 * %arg0 + %arg3 : vector<64xf8E5M2>, vector<64xf8E4M3FN>, vector<8xf16>
+
+  return
+}
+
+// -----
+
+func.func @wmma_unsupported_k(%arg0 : vector<8xf16>, %arg1 : vector<8xf32>) {
+  // expected-error@below {{'amdgpu.wmma' op no intrinsic matching WMMA on the given chipset}}
+  // expected-error@below {{failed to legalize operation 'amdgpu.wmma'}}
+  amdgpu.wmma 16x16x16 %arg0 * %arg0 + %arg1 : vector<8xf16>, vector<8xf16>, vector<8xf32>
+  return
+}
diff --git a/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir b/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir
index dec62f92c7b2e..7a82236b0656e 100644
--- a/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir
+++ b/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir
@@ -211,11 +211,25 @@ func.func @complex_exp(%arg: complex<f32>) -> complex<f32> {
 }
 // CHECK: %[[REAL:.*]] = complex.re %[[ARG]] : complex<f32>
 // CHECK: %[[IMAG:.*]] = complex.im %[[ARG]] : complex<f32>
-// CHECK-DAG: %[[COS_IMAG:.*]] = math.cos %[[IMAG]] : f32
+// CHECK-DAG: %[[ZERO:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK-DAG: %[[HALF:.*]] = arith.constant 5.000000e-01 : f32
+// CHECK-DAG: %[[INF:.*]] = arith.constant 0x7F800000 : f32
 // CHECK-DAG: %[[EXP_REAL:.*]] = math.exp %[[REAL]] : f32
-// CHECK-DAG: %[[RESULT_REAL:.]] = arith.mulf %[[EXP_REAL]], %[[COS_IMAG]] : f32
+// CHECK-DAG: %[[REAL_HALF:.*]] = arith.mulf %[[REAL]], %[[HALF]] : f32
+// CHECK-DAG: %[[EXP_HALF:.*]] = math.exp %[[REAL_HALF]] : f32
+// CHECK-DAG: %[[COS_IMAG:.*]] = math.cos %[[IMAG]] : f32
 // CHECK-DAG: %[[SIN_IMAG:.*]] = math.sin %[[IMAG]] : f32
-// CHECK-DAG: %[[RESULT_IMAG:.*]] = arith.mulf %[[EXP_REAL]], %[[SIN_IMAG]] : f32
+// CHECK-DAG: %[[IS_INF:.*]] = arith.cmpf oeq, %[[EXP_REAL]], %[[INF]] : f32
+// CHECK-DAG: %[[IS_IMAG_ZERO:.*]] = arith.cmpf oeq, %[[IMAG]], %[[ZERO]] : f32
+// CHECK-DAG: %[[REAL_NORMAL:.*]] = arith.mulf %[[EXP_REAL]], %[[COS_IMAG]] : f32
+// CHECK-DAG: %[[EXP_HALF_COS:.*]] = arith.mulf %[[EXP_HALF]], %[[COS_IMAG]] : f32
+// CHECK-DAG: %[[REAL_OVERFLOW:.*]] = arith.mulf %[[EXP_HALF_COS]], %[[EXP_HALF]] : f32
+// CHECK: %[[RESULT_REAL:.*]] = arith.select %[[IS_INF]], %[[REAL_OVERFLOW]], %[[REAL_NORMAL]] : f32
+// CHECK-DAG: %[[IMAG_NORMAL:.*]] = arith.mulf %[[EXP_REAL]], %[[SIN_IMAG]] : f32
+// CHECK-DAG: %[[EXP_HALF_SIN:.*]] = arith.mulf %[[EXP_HALF]], %[[SIN_IMAG]] : f32
+// CHECK-DAG: %[[IMAG_OVERFLOW:.*]] = arith.mulf %[[EXP_HALF_SIN]], %[[EXP_HALF]] : f32
+// CHECK-DAG: %[[IMAG_NONZERO:.*]] = arith.select %[[IS_INF]], %[[IMAG_OVERFLOW]], %[[IMAG_NORMAL]] : f32
+// CHECK: %[[RESULT_IMAG:.*]] = arith.select %[[IS_IMAG_ZERO]], %[[ZERO]], %[[IMAG_NONZERO]] : f32
 // CHECK: %[[RESULT:.*]] = complex.create %[[RESULT_REAL]], %[[RESULT_IMAG]] : complex<f32>
 // CHECK: return %[[RESULT]] : complex<f32>
 
@@ -832,11 +846,25 @@ func.func @complex_exp_with_fmf(%arg: complex<f32>) -> complex<f32> {
 }
 // CHECK: %[[REAL:.*]] = complex.re %[[ARG]] : complex<f32>
 // CHECK: %[[IMAG:.*]] = complex.im %[[ARG]] : complex<f32>
-// CHECK-DAG: %[[COS_IMAG:.*]] = math.cos %[[IMAG]] fastmath<nnan,contract> : f32
+// CHECK-DAG: %[[ZERO:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK-DAG: %[[HALF:.*]] = arith.constant 5.000000e-01 : f32
+// CHECK-DAG: %[[INF:.*]] = arith.constant 0x7F800000 : f32
 // CHECK-DAG: %[[EXP_REAL:.*]] = math.exp %[[REAL]] fastmath<nnan,contract> : f32
-// CHECK-DAG: %[[RESULT_REAL:.]] = arith.mulf %[[EXP_REAL]], %[[COS_IMAG]] fastmath<nnan,contract> : f32
+// CHECK-DAG: %[[REAL_HALF:.*]] = arith.mulf %[[REAL]], %[[HALF]] fastmath<nnan,contract> : f32
+// CHECK-DAG: %[[EXP_HALF:.*]] = math.exp %[[REAL_HALF]] fastmath<nnan,contract> : f32
+// CHECK-DAG: %[[COS_IMAG:.*]] = math.cos %[[IMAG]] fastmath<nnan,contract> : f32
 // CHECK-DAG: %[[SIN_IMAG:.*]] = math.sin %[[IMAG]] fastmath<nnan,contract> : f32
-// CHECK-DAG: %[[RESULT_IMAG:.*]] = arith.mulf %[[EXP_REAL]], %[[SIN_IMAG]] fastmath<nnan,contract> : f32
+// CHECK-DAG: %[[IS_INF:.*]] = arith.cmpf oeq, %[[EXP_REAL]], %[[INF]] fastmath<nnan,contract> : f32
+// CHECK-DAG: %[[IS_IMAG_ZERO:.*]] = arith.cmpf oeq, %[[IMAG]], %[[ZERO]] : f32
+// CHECK-DAG: %[[REAL_NORMAL:.*]] = arith.mulf %[[EXP_REAL]], %[[COS_IMAG]] fastmath<nnan,contract> : f32
+// CHECK-DAG: %[[EXP_HALF_COS:.*]] = arith.mulf %[[EXP_HALF]], %[[COS_IMAG]] fastmath<nnan,contract> : f32
+// CHECK-DAG: %[[REAL_OVERFLOW:.*]] = arith.mulf %[[EXP_HALF_COS]], %[[EXP_HALF]] fastmath<nnan,contract> : f32
+// CHECK: %[[RESULT_REAL:.*]] = arith.select %[[IS_INF]], %[[REAL_OVERFLOW]], %[[REAL_NORMAL]] : f32
+// CHECK-DAG: %[[IMAG_NORMAL:.*]] = arith.mulf %[[EXP_REAL]], %[[SIN_IMAG]] fastmath<nnan,contract> : f32
+// CHECK-DAG: %[[EXP_HALF_SIN:.*]] = arith.mulf %[[EXP_HALF]], %[[SIN_IMAG]] fastmath<nnan,contract> : f32
+// CHECK-DAG: %[[IMAG_OVERFLOW:.*]] = arith.mulf %[[EXP_HALF_SIN]], %[[EXP_HALF]] fastmath<nnan,contract> : f32
+// CHECK-DAG: %[[IMAG_NONZERO:.*]] = arith.select %[[IS_INF]], %[[IMAG_OVERFLOW]], %[[IMAG_NORMAL]] : f32
+// CHECK: %[[RESULT_IMAG:.*]] = arith.select %[[IS_IMAG_ZERO]], %[[ZERO]], %[[IMAG_NONZERO]] : f32
 // CHECK: %[[RESULT:.*]] = complex.create %[[RESULT_REAL]], %[[RESULT_IMAG]] : complex<f32>
 // CHECK: return %[[RESULT]] : complex<f32>
 
diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
index 5755ca9258283..8cce6308018e2 100644
--- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
@@ -486,7 +486,7 @@ func.func @mbarrier() {
   // CHECK: %[[barStr:.+]] =  builtin.unrealized_conversion_cast %[[barMemref]] : memref<1xi64, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
   // CHECK: %[[base:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
   // CHECK: %[[barPtr:.+]] = llvm.getelementptr %[[base]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64
-  // CHECK: nvvm.mbarrier.init.shared %[[barPtr]]
+  // CHECK: nvvm.mbarrier.init %[[barPtr]]
     nvgpu.mbarrier.init %barrier[%c0], %num_threads : !barrierType
 
   // CHECK: %[[base2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
@@ -516,7 +516,7 @@ func.func @mbarrier_nocomplete() {
   // CHECK: %[[barStr:.+]] =  builtin.unrealized_conversion_cast %[[barMemref]] : memref<1xi64, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
   // CHECK: %[[base:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
   // CHECK: %[[barPtr:.+]] = llvm.getelementptr %[[base]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64
-  // CHECK: nvvm.mbarrier.init.shared %[[barPtr]]
+  // CHECK: nvvm.mbarrier.init %[[barPtr]]
   nvgpu.mbarrier.init %barrier[%c0], %num_threads : !barrierType
 
   // CHECK: %[[base2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
@@ -592,7 +592,7 @@ func.func @mbarrier_txcount() {
     // CHECK: %[[barStr:.+]] =  builtin.unrealized_conversion_cast %[[barMemref]] : memref<1xi64, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
     // CHECK: %[[base:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
     // CHECK: %[[barPtr:.+]] = llvm.getelementptr %[[base]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64
-    // CHECK: nvvm.mbarrier.init.shared %[[barPtr]]
+    // CHECK: nvvm.mbarrier.init %[[barPtr]]
     nvgpu.mbarrier.init %barrier[%c0], %num_threads : !barrierType
 
     %tidxreg = nvvm.read.ptx.sreg.tid.x : i32
@@ -643,7 +643,7 @@ func.func @mbarrier_txcount_pred() {
     // CHECK: %[[barStr:.+]] =  builtin.unrealized_conversion_cast %[[barMemref]] : memref<1xi64, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
     // CHECK: %[[base:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
     // CHECK: %[[barPtr:.+]] = llvm.getelementptr %[[base]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64
-    // CHECK: nvvm.mbarrier.init.shared %[[barPtr]], {{.*}}, predicate = %[[P]]
+    // CHECK: nvvm.mbarrier.init %[[barPtr]], {{.*}}, predicate = %[[P]]
     nvgpu.mbarrier.init %barrier[%c0], %mine, predicate = %pred : !barrierType
 
     %txcount = arith.constant 256 : index
diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
index 6960e83be3573..fbc4c0af60360 100644
--- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
+++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
@@ -8,7 +8,7 @@
 // CHECK-LABEL: @init_mbarrier
 llvm.func @init_mbarrier(%barrier_gen : !llvm.ptr, %barrier : !llvm.ptr<3>, %count : i32, %pred : i1) {
   //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.init.shared.b64 [$0], $1;", "r,r,b" 
-  nvvm.mbarrier.init.shared %barrier, %count, predicate = %pred : !llvm.ptr<3>, i32, i1 
+  nvvm.mbarrier.init %barrier, %count, predicate = %pred : !llvm.ptr<3>, i32, i1 
   //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.init.b64 [$0], $1;", "l,r,b" 
   nvvm.mbarrier.init %barrier_gen, %count, predicate = %pred : !llvm.ptr, i32, i1
   llvm.return
diff --git a/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir b/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir
index 1dbce05be85b4..26f5a3e1f0ac0 100644
--- a/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir
+++ b/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir
@@ -641,3 +641,35 @@ func.func @parallel_reduction_1d_outside() {
 // CHECK: scf.parallel
 // CHECK-NEXT: scf.parallel
 // CHECK: scf.reduce
+
+// -----
+
+// CHECK-LABEL: @nested_parallel_with_side_effect
+func.func @nested_parallel_with_side_effect() {
+  %c65536 = arith.constant 65536 : index
+  %c2 = arith.constant 2 : index
+  %c256 = arith.constant 256 : index
+  %c0 = arith.constant 0 : index
+  %c4 = arith.constant 4 : index
+  %c1 = arith.constant 1 : index
+  %alloc_0 = memref.alloc() : memref<2x256x256xf32>
+  %alloc_1 = memref.alloc() : memref<2x4x256x256xf32>
+  %alloc_2 = memref.alloc() : memref<4x4xf32>
+  %alloc_3 = memref.alloc() : memref<4x4xf32>
+  scf.parallel (%arg2, %arg3, %arg4) = (%c0, %c0, %c0) to (%c2, %c4, %c65536) step (%c1, %c1, %c1) {
+    %1 = arith.remsi %arg4, %c256 : index
+    %2 = arith.divsi %arg4, %c256 : index
+    %4 = memref.load %alloc_0[%arg2, %2, %1] : memref<2x256x256xf32>
+    memref.store %4, %alloc_1[%arg2, %arg3, %2, %1] : memref<2x4x256x256xf32>
+    scf.parallel (%arg5) = (%c0) to (%c4) step (%c1) {
+      %5 = memref.load %alloc_2[%arg5, %c0] : memref<4x4xf32>
+      memref.store %5, %alloc_3[%arg5, %c0] : memref<4x4xf32>
+      scf.reduce
+    } {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+    scf.reduce
+  } {mapping = [#gpu.loop_dim_map<processor = block_z, map = (d0) -> (d0), bound = (d0) -> (d0)>, #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>, #gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+  return
+}
+
+// CHECK: gpu.launch
+// CHECK-NOT: scf.parallel
diff --git a/mlir/test/Conversion/XeGPUToXeVM/create_nd_tdesc.mlir b/mlir/test/Conversion/XeGPUToXeVM/create_nd_tdesc.mlir
index d6e36fa73bf04..09ef76c9d1740 100644
--- a/mlir/test/Conversion/XeGPUToXeVM/create_nd_tdesc.mlir
+++ b/mlir/test/Conversion/XeGPUToXeVM/create_nd_tdesc.mlir
@@ -4,8 +4,9 @@ gpu.module @create_nd_tdesc {
   // CHECK-LABEL: gpu.func @create_nd_tdesc
   // CHECK-SAME: %[[ARG0:.*]]: memref<16x32xf32, 1>, %[[ARG1:.*]]: ui64,
   // CHECK-SAME: %[[ARG2:.*]]: index, %[[ARG3:.*]]: index, %[[ARG4:.*]]: index, %[[ARG5:.*]]: index, %[[ARG6:.*]]: index, %[[ARG7:.*]]: index
+  // CHECK-SAME: %[[DYN:.*]]: memref<?x?xf16>) kernel {
   gpu.func @create_nd_tdesc(%src: memref<16x32xf32, 1>, %ptr: ui64, %shape1: index, %shape2: index,
-  %stride1: index, %stride2: index, %offset1: index, %offset2: index) kernel {
+  %stride1: index, %stride2: index, %offset1: index, %offset2: index, %dyn: memref<?x?xf16>) kernel {
         // CHECK: %[[VAR0:.*]] = index.castu %[[ARG1]] : ui64 to index
         // CHECK: %[[BASE_ADDR:.*]] = arith.index_castui %[[VAR0]] : index to i64
         // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<8xi32>
@@ -43,6 +44,28 @@ gpu.module @create_nd_tdesc {
         // CHECK: %[[VAR19:.*]] = vector.insert %[[OFFSET_W2]], %[[VAR18]] [4] : i32 into vector<8xi32>
         // CHECK: %[[PAYLOAD:.*]] = vector.insert %[[OFFSET_H2]], %[[VAR19]] [5] : i32 into vector<8xi32>
         %src_tdesc = xegpu.create_nd_tdesc %srcce : memref<16x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+
+        // CHECK: %[[C1:.*]] = arith.constant 1 : index
+        %c1 = arith.constant 1 : index
+        // CHECK: %[[C64:.*]] = arith.constant 64 : index
+        %size_x = arith.constant 64 : index
+        // CHECK: %[[C16:.*]] = arith.constant 16 : index
+        %BLOCK_DMODEL = arith.constant 16 : index
+        // CHECK: %[[CST_4:.*]] = arith.constant dense<0> : vector<8xi32>
+        // CHECK: %[[INTPTR_5:.*]] = memref.extract_aligned_pointer_as_index %[[DYN]] : memref<?x?xf16> -> index
+        // CHECK: %[[C0_I32_6:.*]] = arith.constant 0 : i32
+        // CHECK: %[[C0_I32_7:.*]] = arith.constant 0 : i32
+        // CHECK: %[[VAR21:.*]] = arith.index_cast %[[C16]] : index to i32
+        // CHECK: %[[VAR22:.*]] = arith.index_cast %[[C64]] : index to i32
+        // CHECK: %[[VAR23:.*]] = arith.index_castui %[[INTPTR_5]] : index to i64
+        // CHECK: %[[VAR24:.*]] = vector.bitcast %[[CST_4]] : vector<8xi32> to vector<4xi64>
+        // CHECK: %[[VAR25:.*]] = vector.insert %[[VAR23]], %[[VAR24]] [0] : i64 into vector<4xi64>
+        // CHECK: %[[VAR26:.*]] = vector.bitcast %[[VAR25]] : vector<4xi64> to vector<8xi32>
+        // CHECK: %[[VAR27:.*]] = vector.insert %[[VAR21]], %[[VAR26]] [2] : i32 into vector<8xi32>
+        // CHECK: %[[VAR28:.*]] = vector.insert %[[VAR22]], %[[VAR27]] [3] : i32 into vector<8xi32>
+        // CHECK: %[[VAR29:.*]] = vector.insert %[[C0_I32_6]], %[[VAR28]] [4] : i32 into vector<8xi32>
+        // CHECK: %[[VAR30:.*]] = vector.insert %[[C0_I32_7]], %[[VAR29]] [5] : i32 into vector<8xi32>
+        %dyn_tdesc  = xegpu.create_nd_tdesc %dyn, shape: [%size_x, %BLOCK_DMODEL], strides: [%BLOCK_DMODEL, %c1] : memref<?x?xf16> -> !xegpu.tensor_desc<16x16xf16>
         gpu.return
     }
 }
diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir
index 57847641a2d03..4c6f62a045405 100644
--- a/mlir/test/Dialect/AMDGPU/invalid.mlir
+++ b/mlir/test/Dialect/AMDGPU/invalid.mlir
@@ -156,14 +156,6 @@ func.func @wmma_no_k_dim(%arg0 : vector<16xi8>, %arg1 : vector<8xi32>) -> vector
 
 // -----
 
-func.func @wmma_wrong_m_dim(%arg0 : vector<16xi8>, %arg1 : vector<8xi32>) -> vector<8xi32> {
-  // expected-error@+1 {{'amdgpu.wmma' op attribute 'm' failed to satisfy constraint: 32-bit signless integer attribute whose value is one of {16}}}
-  %0 = amdgpu.wmma 32x16x16 %arg0 * %arg0 + %arg1 : vector<16xi8>, vector<16xi8>, vector<8xi32>
-  func.return %0 : vector<8xi32>
-}
-
-// -----
-
 func.func @wmma_wrong_n_dim(%arg0 : vector<16xi8>, %arg1 : vector<8xi32>) -> vector<8xi32> {
   // expected-error@+1 {{'amdgpu.wmma' op attribute 'n' failed to satisfy constraint: 32-bit signless integer attribute whose value is one of {16}}}
   %0 = amdgpu.wmma 16x32x16 %arg0 * %arg0 + %arg1 : vector<16xi8>, vector<16xi8>, vector<8xi32>
@@ -173,14 +165,62 @@ func.func @wmma_wrong_n_dim(%arg0 : vector<16xi8>, %arg1 : vector<8xi32>) -> vec
 // -----
 
 func.func @wmma_wrong_k_dim(%arg0 : vector<16xi8>, %arg1 : vector<8xi32>) -> vector<8xi32> {
-  // expected-error@+1 {{'amdgpu.wmma' op attribute 'k' failed to satisfy constraint: 32-bit signless integer attribute whose value is one of {16, 32}}}
+  // expected-error@+1 {{'amdgpu.wmma' op attribute 'k' failed to satisfy constraint: 32-bit signless integer attribute whose value is one of {4, 16, 32, 64, 128}}}
   %0 = amdgpu.wmma 16x16x24 %arg0 * %arg0 + %arg1 : vector<16xi8>, vector<16xi8>, vector<8xi32>
   func.return %0 : vector<8xi32>
 }
 
 // -----
 
-// Missinng `resetOffset`
+func.func @wmma_source_length_mismatch(%arg0 : vector<8xf16>, %arg1 : vector<16xf16>, %arg2 : vector<8xf32>) -> vector<8xf32> {
+  // expected-error@+1 {{'amdgpu.wmma' op source vectors have different lengths}}
+  %0 = amdgpu.wmma 16x16x16 %arg0 * %arg1 + %arg2 : vector<8xf16>, vector<16xf16>, vector<8xf32>
+  func.return %0 : vector<8xf32>
+}
+
+// -----
+
+func.func @wmma_mismatched_float_types(%arg0 : vector<8xf16>, %arg1 : vector<8xbf16>, %arg2 : vector<8xf32>) -> vector<8xf32> {
+  // expected-error@+1 {{'amdgpu.wmma' op source element types must match (except for fp8/bf8)}}
+  %0 = amdgpu.wmma 16x16x16 %arg0 * %arg1 + %arg2 : vector<8xf16>, vector<8xbf16>, vector<8xf32>
+  func.return %0 : vector<8xf32>
+}
+
+// -----
+
+func.func @wmma_mismatched_int_types(%arg0 : vector<8xi8>, %arg1 : vector<8xi4>, %arg2 : vector<8xi32>) -> vector<8xi32> {
+  // expected-error@+1 {{'amdgpu.wmma' op source element types must match (except for fp8/bf8)}}
+  %0 = amdgpu.wmma 16x16x16 %arg0 * %arg1 + %arg2 : vector<8xi8>, vector<8xi4>, vector<8xi32>
+  func.return %0 : vector<8xi32>
+}
+
+// -----
+
+func.func @wmma_clamp_float(%arg0 : vector<8xf16>, %arg1 : vector<8xf32>) -> vector<8xf32> {
+  // expected-error@+1 {{'amdgpu.wmma' op clamp flag is not supported for float types}}
+  %0 = amdgpu.wmma 16x16x16 %arg0 * %arg0 + %arg1 {clamp} : vector<8xf16>, vector<8xf16>, vector<8xf32>
+  func.return %0 : vector<8xf32>
+}
+
+// -----
+
+func.func @wmma_unsignedA_float(%arg0 : vector<8xf16>, %arg1 : vector<8xf32>) -> vector<8xf32> {
+  // expected-error@+1 {{'amdgpu.wmma' op unsigned flags are not supported for float types}}
+  %0 = amdgpu.wmma 16x16x16 %arg0 * %arg0 + %arg1 {unsignedA} : vector<8xf16>, vector<8xf16>, vector<8xf32>
+  func.return %0 : vector<8xf32>
+}
+
+// -----
+
+func.func @wmma_unsignedB_float(%arg0 : vector<8xf16>, %arg1 : vector<8xf32>) -> vector<8xf32> {
+  // expected-error@+1 {{'amdgpu.wmma' op unsigned flags are not supported for float types}}
+  %0 = amdgpu.wmma 16x16x16 %arg0 * %arg0 + %arg1 {unsignedB} : vector<8xf16>, vector<8xf16>, vector<8xf32>
+  func.return %0 : vector<8xf32>
+}
+
+// -----
+
+// Missing `resetOffset`
 func.func @fat_raw_buffer_cast_stripped_offset(%m: memref<8xi32, strided<[1], offset: ?>, #gpu.address_space<global>>) -> memref<8xi32, #amdgpu.address_space<fat_raw_buffer>> {
   // expected-error@+1 {{'amdgpu.fat_raw_buffer_cast' op expected result type to be 'memref<8xi32, strided<[1], offset: ?>, #amdgpu.address_space<fat_raw_buffer>>' but got 'memref<8xi32, #amdgpu.address_space<fat_raw_buffer>>'}}
   %ret = amdgpu.fat_raw_buffer_cast %m : memref<8xi32, strided<[1], offset: ?>, #gpu.address_space<global>> to memref<8xi32, #amdgpu.address_space<fat_raw_buffer>>
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index a33096750ee23..09134cb4704bb 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -586,6 +586,41 @@ func.func @wmma_i32_16x16x32_i4(%arg0 : vector<16xi4>, %arg1 : vector<8xi32>) ->
   func.return %0 : vector<8xi32>
 }
 
+// CHECK-LABEL: func @wmma_f32_16x16x4_f32
+func.func @wmma_f32_16x16x4_f32(%arg0 : vector<2xf32>, %arg1 : vector<8xf32>) -> vector<8xf32> {
+  // CHECK: amdgpu.wmma 16x16x4
+  %0 = amdgpu.wmma 16x16x4 %arg0 * %arg0 + %arg1 : vector<2xf32>, vector<2xf32>, vector<8xf32>
+  func.return %0 : vector<8xf32>
+}
+
+// CHECK-LABEL: func @wmma_f32_16x16x64_f8
+func.func @wmma_f32_16x16x64_f8(%arg0 : vector<32xf8E4M3FN>, %arg1 : vector<8xf32>) -> vector<8xf32> {
+  // CHECK: amdgpu.wmma 16x16x64
+  %0 = amdgpu.wmma 16x16x64 %arg0 * %arg0 + %arg1 : vector<32xf8E4M3FN>, vector<32xf8E4M3FN>, vector<8xf32>
+  func.return %0 : vector<8xf32>
+}
+
+// CHECK-LABEL: func @wmma_f32_16x16x64_bf8
+func.func @wmma_f32_16x16x64_bf8(%arg0 : vector<32xf8E5M2>, %arg1 : vector<8xf32>) -> vector<8xf32> {
+  // CHECK: amdgpu.wmma 16x16x64
+  %0 = amdgpu.wmma 16x16x64 %arg0 * %arg0 + %arg1 : vector<32xf8E5M2>, vector<32xf8E5M2>, vector<8xf32>
+  func.return %0 : vector<8xf32>
+}
+
+// CHECK-LABEL: func @wmma_f16_16x16x64_bf8
+func.func @wmma_f16_16x16x64_bf8(%arg0 : vector<32xf8E5M2>, %arg1 : vector<8xf16>) -> vector<8xf16> {
+  // CHECK: amdgpu.wmma 16x16x64
+  %0 = amdgpu.wmma 16x16x64 %arg0 * %arg0 + %arg1 : vector<32xf8E5M2>, vector<32xf8E5M2>, vector<8xf16>
+  func.return %0 : vector<8xf16>
+}
+
+// CHECK-LABEL: func @wmma_f16_16x16x64_f8
+func.func @wmma_f16_16x16x64_f8(%arg0 : vector<32xf8E4M3FN>, %arg1 : vector<8xf16>) -> vector<8xf16> {
+  // CHECK: amdgpu.wmma 16x16x64
+  %0 = amdgpu.wmma 16x16x64 %arg0 * %arg0 + %arg1 : vector<32xf8E4M3FN>, vector<32xf8E4M3FN>, vector<8xf16>
+  func.return %0 : vector<8xf16>
+}
+
 // CHECK-LABEL: func @swizzle_bitmode
 func.func @swizzle_bitmode(%arg0 : f32) -> f32 {
   // CHECK: amdgpu.swizzle_bitmode
diff --git a/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-memoryeffect-interface.mlir b/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-memoryeffect-interface.mlir
index 40a57b90c6e99..e8bb0c0f2eff6 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-memoryeffect-interface.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-memoryeffect-interface.mlir
@@ -156,3 +156,24 @@ func.func @manual_deallocation(%c: i1, %f: f32, %idx: index) -> f32 {
 //       CHECK:   cf.assert %[[true]], "expected that the block does not have ownership"
 //       CHECK:   memref.dealloc %[[manual_alloc]]
 //       CHECK:   bufferization.dealloc (%[[managed_alloc]] : memref<5xf32>) if (%[[true]])
+
+// -----
+
+// CHECK-LABEL: func.func private @properly_creates_deallocations_in_execute_region(
+// CHECK:           %[[true:.*]] = arith.constant true
+// CHECK:           scf.execute_region no_inline {
+// CHECK:             %[[alloc:.*]] = memref.alloc() {alignment = 64 : i64} : memref<1x63x378x16xui8>
+// CHECK:             bufferization.dealloc (%[[alloc]] : memref<1x63x378x16xui8>) if (%[[true]])
+
+func.func private @properly_creates_deallocations_in_execute_region(%arg1: memref<1x16x252x380xui8> ) -> (memref<1x250x378x16xui8> )  {
+  %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x250x378x16xui8>
+  scf.execute_region no_inline {
+    %subview = memref.subview %arg1[0, 0, 0, 0] [1, 16, 65, 380] [1, 1, 1, 1] : memref<1x16x252x380xui8> to memref<1x16x65x380xui8, strided<[1532160, 95760, 380, 1]>>
+    %alloc_3 = memref.alloc() {alignment = 64 : i64} : memref<1x63x378x16xui8>    
+    test.buffer_based in(%subview: memref<1x16x65x380xui8, strided<[1532160, 95760, 380, 1]>>) out(%alloc_3: memref<1x63x378x16xui8>)
+    %subview_7 = memref.subview %alloc[0, 0, 0, 0] [1, 63, 378, 16] [1, 1, 1, 1] : memref<1x250x378x16xui8> to memref<1x63x378x16xui8, strided<[1512000, 6048, 16, 1]>>
+    test.copy(%alloc_3, %subview_7) : (memref<1x63x378x16xui8>, memref<1x63x378x16xui8, strided<[1512000, 6048, 16, 1]>>)
+    scf.yield
+  }
+  return %alloc : memref<1x250x378x16xui8>
+}
diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir
index d5f834bce9b83..8db1ebb87a1e5 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir
@@ -381,15 +381,19 @@ func.func private @execute_region_test(%t1 : tensor<?xf32>)
 // -----
 
 // CHECK-LABEL: func @no_inline_execute_region_not_canonicalized
-func.func @no_inline_execute_region_not_canonicalized() {
-  %c = arith.constant 42 : i32
-  // CHECK: scf.execute_region
-  // CHECK-SAME: no_inline
-  %v = scf.execute_region -> i32 no_inline {
-    scf.yield %c : i32
+module {
+  func.func private @foo()->()
+  func.func @no_inline_execute_region_not_canonicalized() {
+    %c = arith.constant 42 : i32
+    // CHECK: scf.execute_region
+    // CHECK-SAME: no_inline
+    %v = scf.execute_region -> i32 no_inline {
+      func.call @foo():()->()
+      scf.yield %c : i32
+    }
+    // CHECK: return
+    return
   }
-  // CHECK: return
-  return
 }
 
 // -----
diff --git a/mlir/test/Dialect/EmitC/invalid_ops.mlir b/mlir/test/Dialect/EmitC/invalid_ops.mlir
index 5f594fb08c43f..f285196d466ce 100644
--- a/mlir/test/Dialect/EmitC/invalid_ops.mlir
+++ b/mlir/test/Dialect/EmitC/invalid_ops.mlir
@@ -876,3 +876,41 @@ func.func @test_do(%arg0 : !emitc.ptr<i32>) {
 
   return
 }
+
+// -----
+
+func.func @test_for_none_block_argument(%arg0: index) {
+  // expected-error@+1 {{expected body to have a single block argument for the induction variable}}
+  "emitc.for"(%arg0, %arg0, %arg0) (
+    {
+      emitc.yield
+    }
+  ) : (index, index, index) -> ()
+  return
+}
+
+// -----
+
+func.func @test_for_more_than_one_block_argument(%arg0: index) {
+  // expected-error@+1 {{expected body to have a single block argument for the induction variable}}
+  "emitc.for"(%arg0, %arg0, %arg0) (
+    {
+    ^bb0(%i0 : index, %i1 : index):
+      emitc.yield
+    }
+  ) : (index, index, index) -> ()
+  return
+}
+
+// -----
+
+func.func @test_for_unmatch_type(%arg0: index) {
+  // expected-error@+1 {{expected induction variable to be same type as bounds}}
+  "emitc.for"(%arg0, %arg0, %arg0) (
+    {
+    ^bb0(%i0 : f32):
+      emitc.yield
+    }
+  ) : (index, index, index) -> ()
+  return
+}
diff --git a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
index 87a31ca20eb7b..1adc4181e05d3 100644
--- a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
+++ b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
@@ -8,11 +8,11 @@
 
 // RUN: mlir-opt --allow-unregistered-dialect \
 // RUN:   --test-gpu-subgroup-reduce-lowering="expand-to-shuffles target=gfx942" %s \
-// RUN:   | FileCheck %s --check-prefix=CHECK-GFX9
+// RUN:   | FileCheck %s --check-prefixes=CHECK-GFX,CHECK-GFX9
 
 // RUN: mlir-opt --allow-unregistered-dialect \
 // RUN:   --test-gpu-subgroup-reduce-lowering="expand-to-shuffles target=gfx1030" %s \
-// RUN:   | FileCheck %s --check-prefix=CHECK-GFX10
+// RUN:   | FileCheck %s --check-prefixes=CHECK-GFX,CHECK-GFX10
 
 // CHECK-SUB:  gpu.module @kernels {
 // CHECK-SHFL: gpu.module @kernels {
@@ -24,8 +24,7 @@ gpu.module @kernels {
   // CHECK-SUB-SAME:     %[[ARG0:.+]]: vector<5xf16>)
   //
   // CHECK-SHFL-LABEL: gpu.func @kernel0(
-  // CHECK-GFX9-LABEL: gpu.func @kernel0(
-  // CHECK-GFX10-LABEL: gpu.func @kernel0(
+  // CHECK-GFX-LABEL: gpu.func @kernel0(
   gpu.func @kernel0(%arg0: vector<5xf16>) kernel {
     // CHECK-SUB: %[[VZ:.+]] = arith.constant dense<0.0{{.*}}> : vector<5xf16>
     // CHECK-SUB: %[[E0:.+]] = vector.extract_strided_slice %[[ARG0]] {offsets = [0], sizes = [2], strides = [1]} : vector<5xf16> to vector<2xf16>
@@ -56,8 +55,7 @@ gpu.module @kernels {
 
     // CHECK-SUB-COUNT-3: gpu.subgroup_reduce mul {{.+}} cluster(size = 4)
     // CHECK-SUB: "test.consume"
-    // CHECK-GFX9-COUNT-2: amdgpu.dpp {{.+}}
-    // CHECK-GFX10-COUNT-2: amdgpu.dpp {{.+}}
+    // CHECK-GFX-COUNT-2: amdgpu.dpp {{.+}}
     %sum2 = gpu.subgroup_reduce mul %arg0 cluster(size = 4) : (vector<5xf16>) -> (vector<5xf16>)
     "test.consume"(%sum2) : (vector<5xf16>) -> ()
 
@@ -74,8 +72,7 @@ gpu.module @kernels {
   // CHECK-SUB-SAME:     %[[ARG0:.+]]: vector<1xf32>)
   //
   // CHECK-SHFL-LABEL: gpu.func @kernel1(
-  // CHECK-GFX9-LABEL: gpu.func @kernel1(
-  // CHECK-GFX10-LABEL: gpu.func @kernel1(
+  // CHECK-GFX-LABEL: gpu.func @kernel1(
   gpu.func @kernel1(%arg0: vector<1xf32>) kernel {
     // CHECK-SUB: %[[E0:.+]] = vector.extract %[[ARG0]][0] : f32 from vector<1xf32>
     // CHECK-SUB: %[[R0:.+]] = gpu.subgroup_reduce add %[[E0]] : (f32) -> f32
@@ -100,17 +97,14 @@ gpu.module @kernels {
     // Note stride is dropped because it is == 1.
     // CHECK-SUB: gpu.subgroup_reduce add {{.+}} cluster(size = 8) : (f32) -> f32
     // CHECK-SUB: "test.consume"
-    // CHECK-GFX9-COUNT-2: amdgpu.dpp {{.+}} quad_perm
-    // CHECK-GFX9: amdgpu.dpp {{.+}} row_half_mirror
-    // CHECK-GFX10-COUNT-2: amdgpu.dpp {{.+}} quad_perm
-    // CHECK-GFX10: amdgpu.dpp {{.+}} row_half_mirror
+    // CHECK-GFX-COUNT-2: amdgpu.dpp {{.+}} quad_perm
+    // CHECK-GFX: amdgpu.dpp {{.+}} row_half_mirror
     %sum2 = gpu.subgroup_reduce add %arg0 cluster(size = 8, stride = 1) : (vector<1xf32>) -> (vector<1xf32>)
     "test.consume"(%sum2) : (vector<1xf32>) -> ()
 
     // CHECK-SUB: gpu.subgroup_reduce add {{.+}} uniform cluster(size = 8, stride = 4) : (f32) -> f32
     // CHECK-SUB: "test.consume"
-    // CHECK-GFX9-NOT: amdgpu.dpp
-    // CHECK-GFX10-NOT: amdgpu.dpp
+    // CHECK-GFX-NOT: amdgpu.dpp
     // CHECK-GFX10-NOT: rocdl.permlanex16
     %sum3 = gpu.subgroup_reduce add %arg0 uniform cluster(size = 8, stride = 4) : (vector<1xf32>) -> (vector<1xf32>)
     "test.consume"(%sum3) : (vector<1xf32>) -> ()
@@ -126,11 +120,8 @@ gpu.module @kernels {
   //
   // CHECK-SHFL-LABEL: gpu.func @kernel2(
   //
-  // CHECK-GFX9-LABEL: gpu.func @kernel2(
-  // CHECK-GFX9-NOT: amdgpu.dpp
-  //
-  // CHECK-GFX10-LABEL: gpu.func @kernel2(
-  // CHECK-GFX10-NOT: amdgpu.dpp
+  // CHECK-GFX-LABEL: gpu.func @kernel2(
+  // CHECK-GFX-NOT: amdgpu.dpp
   gpu.func @kernel2(%arg0: vector<3xi8>, %arg1: vector<4xi8>) kernel {
     // CHECK-SUB: %[[R0:.+]] = gpu.subgroup_reduce add %[[ARG0]] : (vector<3xi8>) -> vector<3xi8>
     // CHECK-SUB: "test.consume"(%[[R0]]) : (vector<3xi8>) -> ()
@@ -148,8 +139,7 @@ gpu.module @kernels {
 
   // CHECK-SHFL-LABEL: gpu.func @kernel3(
   // CHECK-SHFL-SAME:    %[[ARG0:.+]]: i32)
-  // CHECK-GFX9-LABEL: gpu.func @kernel3(
-  // CHECK-GFX10-LABEL: gpu.func @kernel3(
+  // CHECK-GFX-LABEL: gpu.func @kernel3(
   gpu.func @kernel3(%arg0: i32) kernel {
     // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32
     // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32
@@ -169,9 +159,9 @@ gpu.module @kernels {
     // CHECK-SHFL: %[[S4:.+]], %{{.+}} = gpu.shuffle xor %[[A3]], %[[C16]], %[[C32]] : i32
     // CHECK-SHFL: %[[A4:.+]] = arith.addi %[[A3]], %[[S4]] : i32
     // CHECK-SHFL: "test.consume"(%[[A4]]) : (i32) -> ()
-    
+
     // CHECK-GFX9-COUNT-6: amdgpu.dpp
-    
+
     // CHECK-GFX10-COUNT-4: amdgpu.dpp
     // CHECK-GFX10: rocdl.permlanex16
     // CHECK-GFX10-COUNT-2: rocdl.readlane
@@ -185,11 +175,8 @@ gpu.module @kernels {
   // CHECK-SHFL-LABEL: gpu.func @kernel3_clustered(
   // CHECK-SHFL-SAME:    %[[ARG0:.+]]: i32)
   //
-  // CHECK-GFX9-LABEL: gpu.func @kernel3_clustered(
-  // CHECK-GFX9-SAME:    %[[ARG0:.+]]: i32)
-  //
-  // CHECK-GFX10-LABEL: gpu.func @kernel3_clustered(
-  // CHECK-GFX10-SAME:    %[[ARG0:.+]]: i32)
+  // CHECK-GFX-LABEL: gpu.func @kernel3_clustered(
+  // CHECK-GFX-SAME:    %[[ARG0:.+]]: i32)
   gpu.func @kernel3_clustered(%arg0: i32) kernel {
     // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32
     // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32
@@ -204,19 +191,13 @@ gpu.module @kernels {
     // CHECK-SHFL: %[[A2:.+]] = arith.addi %[[A1]], %[[S2]] : i32
     // CHECK-SHFL: "test.consume"(%[[A2]]) : (i32) -> ()
 
-    // CHECK-GFX9: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]]  quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i32
-    // CHECK-GFX9: %[[A0:.+]] = arith.addi %[[ARG0]], %[[D0]] : i32
-    // CHECK-GFX9: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]]  quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i32
-    // CHECK-GFX9: %[[A1:.+]] = arith.addi %[[A0]], %[[D1]] : i32
-    // CHECK-GFX9: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]]  row_half_mirror(unit) {bound_ctrl = true} : i32
-    // CHECK-GFX9: %[[A2:.+]] = arith.addi %[[A1]], %[[D2]] : i32
-
-    // CHECK-GFX10: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]]  quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i32
-    // CHECK-GFX10: %[[A0:.+]] = arith.addi %[[ARG0]], %[[D0]] : i32
-    // CHECK-GFX10: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]]  quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i32
-    // CHECK-GFX10: %[[A1:.+]] = arith.addi %[[A0]], %[[D1]] : i32
-    // CHECK-GFX10: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]]  row_half_mirror(unit) {bound_ctrl = true} : i32
-    // CHECK-GFX10: %[[A2:.+]] = arith.addi %[[A1]], %[[D2]] : i32
+    // CHECK-GFX: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]]  quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i32
+    // CHECK-GFX: %[[A0:.+]] = arith.addi %[[ARG0]], %[[D0]] : i32
+    // CHECK-GFX: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]]  quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i32
+    // CHECK-GFX: %[[A1:.+]] = arith.addi %[[A0]], %[[D1]] : i32
+    // CHECK-GFX: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]]  row_half_mirror(unit) {bound_ctrl = true} : i32
+    // CHECK-GFX: %[[A2:.+]] = arith.addi %[[A1]], %[[D2]] : i32
+
     // CHECK-GFX10: "test.consume"(%[[A2]]) : (i32) -> ()
     %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 8) : (i32) -> i32
     "test.consume"(%sum0) : (i32) -> ()
@@ -228,11 +209,8 @@ gpu.module @kernels {
   // CHECK-SHFL-LABEL: gpu.func @kernel3_clustered_strided(
   // CHECK-SHFL-SAME:    %[[ARG0:.+]]: i32)
   //
-  // CHECK-GFX9-LABEL: gpu.func @kernel3_clustered_strided(
-  // CHECK-GFX9-NOT: amdgpu.dpp
-  //
-  // CHECK-GFX10-LABEL: gpu.func @kernel3_clustered_strided(
-  // CHECK-GFX10-NOT: amdgpu.dpp
+  // CHECK-GFX-LABEL: gpu.func @kernel3_clustered_strided(
+  // CHECK-GFX-NOT: amdgpu.dpp
   gpu.func @kernel3_clustered_strided(%arg0: i32) kernel {
     // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 4 : i32
     // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 8 : i32
@@ -256,11 +234,8 @@ gpu.module @kernels {
   // CHECK-SHFL-LABEL: gpu.func @kernel4(
   // CHECK-SHFL-SAME:    %[[ARG0:.+]]: vector<2xf16>)
   //
-  // CHECK-GFX9-LABEL: gpu.func @kernel4(
-  // CHECK-GFX9-NOT: amdgpu.dpp
-  //
-  // CHECK-GFX10-LABEL: gpu.func @kernel4(
-  // CHECK-GFX10-NOT: amdgpu.dpp
+  // CHECK-GFX-LABEL: gpu.func @kernel4(
+  // CHECK-GFX-NOT: amdgpu.dpp
   gpu.func @kernel4(%arg0: vector<2xf16>) kernel {
     // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32
     // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32
@@ -298,11 +273,8 @@ gpu.module @kernels {
   // CHECK-SHFL-LABEL: gpu.func @kernel4_clustered(
   // CHECK-SHFL-SAME:    %[[ARG0:.+]]: vector<2xf16>)
   //
-  // CHECK-GFX9-LABEL: gpu.func @kernel4_clustered(
-  // CHECK-GFX9-NOT: amdgpu.dpp
-  //
-  // CHECK-GFX10-LABEL: gpu.func @kernel4_clustered(
-  // CHECK-GFX10-NOT: amdgpu.dpp
+  // CHECK-GFX-LABEL: gpu.func @kernel4_clustered(
+  // CHECK-GFX-NOT: amdgpu.dpp
   gpu.func @kernel4_clustered(%arg0: vector<2xf16>) kernel {
     // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32
     // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32
@@ -319,10 +291,8 @@ gpu.module @kernels {
   // CHECK-SHFL-LABEL: gpu.func @kernel5(
   // CHECK-SHFL-SAME:    %[[ARG0:.+]]: i16)
   //
-  // CHECK-GFX9-LABEL: gpu.func @kernel5(
-  //
-  // CHECK-GFX10-LABEL: gpu.func @kernel5(
-  // CHECK-GFX10-SAME:    %[[ARG0:.+]]: i16)
+  // CHECK-GFX-LABEL: gpu.func @kernel5(
+  // CHECK-GFX-SAME:    %[[ARG0:.+]]: i16)
   gpu.func @kernel5(%arg0: i16) kernel {
     // CHECK-SHFL: %[[E0:.+]] = arith.extui %[[ARG0]] : i16 to i32
     // CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[E0]], {{.+}} : i32
@@ -334,7 +304,7 @@ gpu.module @kernels {
     // CHECK-SHFL: arith.trunci {{.+}} : i32 to i16
     // CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16
     // CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> ()
-    
+
     // CHECK-GFX9-COUNT-6: amdgpu.dpp
 
     // CHECK-GFX10: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]]  quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16
@@ -361,11 +331,8 @@ gpu.module @kernels {
   // CHECK-SHFL-LABEL: gpu.func @kernel5_clustered(
   // CHECK-SHFL-SAME:    %[[ARG0:.+]]: i16)
   //
-  // CHECK-GFX9-LABEL: gpu.func @kernel5_clustered
-  // CHECK-GFX9-SAME:    %[[ARG0:.+]]: i16)
-  //
-  // CHECK-GFX10-LABEL: gpu.func @kernel5_clustered
-  // CHECK-GFX10-SAME:    %[[ARG0:.+]]: i16)
+  // CHECK-GFX-LABEL: gpu.func @kernel5_clustered
+  // CHECK-GFX-SAME:    %[[ARG0:.+]]: i16)
   gpu.func @kernel5_clustered(%arg0: i16) kernel {
     // CHECK-SHFL: %[[E0:.+]] = arith.extui %[[ARG0]] : i16 to i32
     // CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[E0]], {{.+}} : i32
@@ -378,25 +345,15 @@ gpu.module @kernels {
     // CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16
     // CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> ()
 
-    // CHECK-GFX9: %[[VAR0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]]  quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16
-    // CHECK-GFX9: %[[VAR1:.+]] = arith.addi %[[ARG0]], %[[VAR0]] : i16
-    // CHECK-GFX9: %[[VAR2:.+]] = amdgpu.dpp %[[VAR1]] %[[VAR1]]  quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16
-    // CHECK-GFX9: %[[VAR3:.+]] = arith.addi %[[VAR1]], %[[VAR2]] : i16
-    // CHECK-GFX9: %[[VAR4:.+]] = amdgpu.dpp %[[VAR3]] %[[VAR3]]  row_half_mirror(unit) {bound_ctrl = true} : i16
-    // CHECK-GFX9: %[[VAR5:.+]] = arith.addi %[[VAR3]], %[[VAR4]] : i16
-    // CHECK-GFX9: %[[VAR6:.+]] = amdgpu.dpp %[[VAR5]] %[[VAR5]]  row_mirror(unit) {bound_ctrl = true} : i16
-    // CHECK-GFX9: %[[VAR7:.+]] = arith.addi %[[VAR5]], %[[VAR6]] : i16
-    // CHECK-GFX9: "test.consume"(%[[VAR7]]) : (i16) -> ()
-
-    // CHECK-GFX10: %[[VAR0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]]  quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16
-    // CHECK-GFX10: %[[VAR1:.+]] = arith.addi %[[ARG0]], %[[VAR0]] : i16
-    // CHECK-GFX10: %[[VAR2:.+]] = amdgpu.dpp %[[VAR1]] %[[VAR1]]  quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16
-    // CHECK-GFX10: %[[VAR3:.+]] = arith.addi %[[VAR1]], %[[VAR2]] : i16
-    // CHECK-GFX10: %[[VAR4:.+]] = amdgpu.dpp %[[VAR3]] %[[VAR3]]  row_half_mirror(unit) {bound_ctrl = true} : i16
-    // CHECK-GFX10: %[[VAR5:.+]] = arith.addi %[[VAR3]], %[[VAR4]] : i16
-    // CHECK-GFX10: %[[VAR6:.+]] = amdgpu.dpp %[[VAR5]] %[[VAR5]]  row_mirror(unit) {bound_ctrl = true} : i16
-    // CHECK-GFX10: %[[VAR7:.+]] = arith.addi %[[VAR5]], %[[VAR6]] : i16
-    // CHECK-GFX10: "test.consume"(%[[VAR7]]) : (i16) -> ()
+    // CHECK-GFX: %[[VAR0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]]  quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16
+    // CHECK-GFX: %[[VAR1:.+]] = arith.addi %[[ARG0]], %[[VAR0]] : i16
+    // CHECK-GFX: %[[VAR2:.+]] = amdgpu.dpp %[[VAR1]] %[[VAR1]]  quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16
+    // CHECK-GFX: %[[VAR3:.+]] = arith.addi %[[VAR1]], %[[VAR2]] : i16
+    // CHECK-GFX: %[[VAR4:.+]] = amdgpu.dpp %[[VAR3]] %[[VAR3]]  row_half_mirror(unit) {bound_ctrl = true} : i16
+    // CHECK-GFX: %[[VAR5:.+]] = arith.addi %[[VAR3]], %[[VAR4]] : i16
+    // CHECK-GFX: %[[VAR6:.+]] = amdgpu.dpp %[[VAR5]] %[[VAR5]]  row_mirror(unit) {bound_ctrl = true} : i16
+    // CHECK-GFX: %[[VAR7:.+]] = arith.addi %[[VAR5]], %[[VAR6]] : i16
+    // CHECK-GFX: "test.consume"(%[[VAR7]]) : (i16) -> ()
     %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 16) : (i16) -> i16
     "test.consume"(%sum0) : (i16) -> ()
 
@@ -407,11 +364,8 @@ gpu.module @kernels {
   // CHECK-SHFL-LABEL: gpu.func @kernel6(
   // CHECK-SHFL-SAME:    %[[ARG0:.+]]: vector<3xi8>)
   //
-  // CHECK-GFX9-LABEL: gpu.func @kernel6(
-  // CHECK-GFX9-NOT: amdgpu.dpp
-  //
-  // CHECK-GFX10-LABEL: gpu.func @kernel6(
-  // CHECK-GFX10-NOT: amdgpu.dpp
+  // CHECK-GFX-LABEL: gpu.func @kernel6(
+  // CHECK-GFX-NOT: amdgpu.dpp
   gpu.func @kernel6(%arg0: vector<3xi8>) kernel {
     // CHECK-SHFL: %[[CZ:.+]] = arith.constant dense<0> : vector<4xi8>
     // CHECK-SHFL: %[[V0:.+]] = vector.insert_strided_slice %[[ARG0]], %[[CZ]] {offsets = [0], strides = [1]} : vector<3xi8> into vector<4xi8>
@@ -433,6 +387,44 @@ gpu.module @kernels {
     gpu.return
   }
 
+  // CHECK-GFX-LABEL: gpu.func @kernel7(
+  // CHECK-GFX-SAME:    %[[ARG0:.+]]: f32)
+  //
+  //   Checks, common to gfx942 and gfx1030, of
+  //     (1) quad_perm, followed by reduction resulting in reduction over 2 consecutive lanes,
+  //     (2) quad_perm, followed by reduction resulting in reduction over 4 consecutive lanes,
+  //     (3) row_half_mirror, followed by reduction resulting in reduction over 8 consecutive lanes, and
+  //     (4) row_mirror, followed by reduction resulting in reduction over 16 consecutive lanes.
+  // CHECK-GFX: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]]  quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : f32
+  // CHECK-GFX: %[[A0:.+]] = arith.addf %[[ARG0]], %[[D0]] : f32
+  // CHECK-GFX: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]]  quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : f32
+  // CHECK-GFX: %[[A1:.+]] = arith.addf %[[A0]], %[[D1]] : f32
+  // CHECK-GFX: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]]  row_half_mirror(unit) {bound_ctrl = true} : f32
+  // CHECK-GFX: %[[A2:.+]] = arith.addf %[[A1]], %[[D2]] : f32
+  // CHECK-GFX: %[[D3:.+]] = amdgpu.dpp %[[A2]] %[[A2]]  row_mirror(unit) {bound_ctrl = true} : f32
+  // CHECK-GFX: %[[A3:.+]] = arith.addf %[[A2]], %[[D3]] : f32
+  //
+  //   Now, on gfx942:
+  //     (1) Lane 15 gets broadcast to lanes [16, 32) and lane 31 gets broadcast to lanes [48, 64], after which
+  //         the reduction in lanes [16, 32) is over the full cluster of the first 32 lanes, and the reduction in lanes
+  //         [48, 64) is over the full cluster of the last 32 lanes.
+  //     (2) Update the reduction value in lanes [0, 16) and [32, 48) with the final reduction result from
+  //         lanes [16, 32) and [48, 64), respectively.
+  // CHECK-GFX9: %[[BCAST15:.+]] = amdgpu.dpp %[[A3]] %[[A3]]  row_bcast_15(unit) {row_mask = 10 : i32} : f32
+  // CHECK-GFX9: %[[SUM:.+]] = arith.addf %[[A3]], %[[BCAST15]] : f32
+  // CHECK-GFX9: %[[SWIZ:.+]] = amdgpu.swizzle_bitmode %[[SUM]] 0 31 0 : f32
+  // CHECK-GFX9: "test.consume"(%[[SWIZ]]) : (f32) -> ()
+  //
+  //   On gfx1030, the final step is to permute the lanes and perform final reduction:
+  // CHECK-GFX10: rocdl.permlanex16
+  // CHECK-GFX10: arith.addf
+  // CHECK-GFX10: "test.consume"
+   gpu.func @kernel7(%arg0: f32) kernel {
+     %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 32) : (f32) -> (f32)
+     "test.consume"(%sum0) : (f32) -> ()
+     gpu.return
+   }
+
   // CHECK-SHFL-LABEL: gpu.func @kernel_cluster_size_is_subgroup_size(
   // CHECK-SHFL-SAME:    %[[ARG0:.+]]: vector<3xi8>)
   //
diff --git a/mlir/test/Dialect/LLVMIR/nvvm.mlir b/mlir/test/Dialect/LLVMIR/nvvm.mlir
index 0243f5eb8c862..2505e56407c2b 100644
--- a/mlir/test/Dialect/LLVMIR/nvvm.mlir
+++ b/mlir/test/Dialect/LLVMIR/nvvm.mlir
@@ -419,8 +419,8 @@ llvm.func private @mbarrier_init_generic(%barrier: !llvm.ptr) {
 
 llvm.func private @mbarrier_init_shared(%barrier: !llvm.ptr<3>) {
   %count = nvvm.read.ptx.sreg.ntid.x : i32
-  // CHECK:   nvvm.mbarrier.init.shared %{{.*}}, %{{.*}} : !llvm.ptr<3>, i32
-  nvvm.mbarrier.init.shared %barrier, %count : !llvm.ptr<3>, i32
+  // CHECK:   nvvm.mbarrier.init %{{.*}}, %{{.*}} : !llvm.ptr<3>, i32
+  nvvm.mbarrier.init %barrier, %count : !llvm.ptr<3>, i32
   llvm.return
 }
 
@@ -433,8 +433,8 @@ llvm.func private @mbarrier_inval_generic(%barrier: !llvm.ptr) {
 
 
 llvm.func private @mbarrier_inval_shared(%barrier: !llvm.ptr<3>) {
-  // CHECK:   nvvm.mbarrier.inval.shared %{{.*}} : !llvm.ptr<3>
-  nvvm.mbarrier.inval.shared %barrier : !llvm.ptr<3>
+  // CHECK:   nvvm.mbarrier.inval %{{.*}} : !llvm.ptr<3>
+  nvvm.mbarrier.inval %barrier : !llvm.ptr<3>
   llvm.return
 }
 
diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir
index d270ee8b089aa..e703600c71c8e 100644
--- a/mlir/test/Dialect/LLVMIR/rocdl.mlir
+++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir
@@ -664,6 +664,36 @@ llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
   llvm.return
 }
 
+// CHECK-LABEL @rocdl.tensor.load.to.lds
+llvm.func @rocdl.tensor.load.to.lds(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>,
+                                    %dgroup2 : vector<4xi32>, %dgroup3 : vector<4xi32>) {
+  // CHECK: rocdl.tensor.load.to.lds %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} cachepolicy 0 : vector<4xi32>, vector<8xi32>
+  rocdl.tensor.load.to.lds %dgroup0, %dgroup1, %dgroup2, %dgroup3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+  llvm.return
+}
+
+// CHECK-LABEL @rocdl.tensor.store.from.lds
+llvm.func @rocdl.tensor.store.from.lds(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>,
+                                       %dgroup2 : vector<4xi32>, %dgroup3 : vector<4xi32>) {
+  // CHECK: rocdl.tensor.store.from.lds %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} cachepolicy 0 : vector<4xi32>, vector<8xi32>
+  rocdl.tensor.store.from.lds %dgroup0, %dgroup1, %dgroup2, %dgroup3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+  llvm.return
+}
+
+// CHECK-LABEL @rocdl.tensor.load.to.lds.d2
+llvm.func @rocdl.tensor.load.to.lds.d2(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>) {
+  // CHECK: rocdl.tensor.load.to.lds.d2 %{{.*}}, %{{.*}} cachepolicy 0 : vector<4xi32>, vector<8xi32>
+  rocdl.tensor.load.to.lds.d2 %dgroup0, %dgroup1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+  llvm.return
+}
+
+// CHECK-LABEL @rocdl.tensor.store.from.lds.d2
+llvm.func @rocdl.tensor.store.from.lds.d2(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>) {
+  // CHECK: rocdl.tensor.store.from.lds.d2 %{{.*}}, %{{.*}} cachepolicy 0 : vector<4xi32>, vector<8xi32>
+  rocdl.tensor.store.from.lds.d2 %dgroup0, %dgroup1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+  llvm.return
+}
+
 llvm.func @rocdl.make.buffer.rsrc(%ptr : !llvm.ptr,
                                   %stride : i16,
                                   %numRecords : i64,
diff --git a/mlir/test/Dialect/Linalg/canonicalize.mlir b/mlir/test/Dialect/Linalg/canonicalize.mlir
index 26d2d98572f47..f4020ede4854e 100644
--- a/mlir/test/Dialect/Linalg/canonicalize.mlir
+++ b/mlir/test/Dialect/Linalg/canonicalize.mlir
@@ -1423,7 +1423,7 @@ func.func @transpose_buffer(%input: memref<?xf32>,
 func.func @recursive_effect(%arg : tensor<1xf32>) {
   %init = arith.constant dense<0.0> : tensor<1xf32>
   %mapped = linalg.map ins(%arg:tensor<1xf32>) outs(%init :tensor<1xf32>)
-            (%in : f32) {
+            (%in : f32, %out: f32) {
               vector.print %in : f32
               linalg.yield %in : f32
             }
diff --git a/mlir/test/Dialect/Linalg/generalize-named-ops.mlir b/mlir/test/Dialect/Linalg/generalize-named-ops.mlir
index ae07b1b82228c..dcdd6c8db4b21 100644
--- a/mlir/test/Dialect/Linalg/generalize-named-ops.mlir
+++ b/mlir/test/Dialect/Linalg/generalize-named-ops.mlir
@@ -386,18 +386,24 @@ func.func @generalize_batch_reduce_gemm_bf16(%lhs: memref<7x8x9xbf16>, %rhs: mem
 
 // -----
 
-// CHECK-LABEL: generalize_linalg_map
-func.func @generalize_linalg_map(%arg0: memref<1x8x8x8xf32>) {
+func.func @generalize_linalg_map(%arg0: memref<1x8x8x8xf32>, %arg1: memref<1x8x8x8xf32>, %arg2: memref<1x8x8x8xf32>) {
   %cst = arith.constant 0.000000e+00 : f32
-  // CHECK: linalg.map
-  // CHECK-NOT: linalg.generic
-  linalg.map outs(%arg0 : memref<1x8x8x8xf32>)
-    () {
-      linalg.yield %cst : f32
-    }
+  linalg.map {arith.addf} ins(%arg0, %arg1: memref<1x8x8x8xf32>, memref<1x8x8x8xf32>) outs(%arg2 : memref<1x8x8x8xf32>)
   return
 }
 
+// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
+
+// CHECK: @generalize_linalg_map
+
+// CHECK: linalg.generic
+// CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP0]], #[[MAP0]]]
+// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
+// CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<1x8x8x8xf32>, memref<1x8x8x8xf32>) outs(%{{.+}} : memref<1x8x8x8xf32>
+// CHECK:         ^{{.+}}(%[[BBARG0:.+]]: f32, %[[BBARG1:.+]]: f32, %[[BBARG2:.+]]: f32)
+// CHECK:         %[[ADD:.+]] = arith.addf %[[BBARG0]], %[[BBARG1]] : f32
+// CHECK:         linalg.yield %[[ADD]] : f32
+
 // -----
 
 func.func @generalize_add(%lhs: memref<7x14x21xf32>, %rhs: memref<7x14x21xf32>,
diff --git a/mlir/test/Dialect/Linalg/invalid.mlir b/mlir/test/Dialect/Linalg/invalid.mlir
index 40bf4d19d6b91..fabc8e610612d 100644
--- a/mlir/test/Dialect/Linalg/invalid.mlir
+++ b/mlir/test/Dialect/Linalg/invalid.mlir
@@ -681,7 +681,7 @@ func.func @map_binary_wrong_yield_operands(
    %add = linalg.map
           ins(%lhs, %rhs : tensor<64xf32>, tensor<64xf32>)
           outs(%init:tensor<64xf32>)
-          (%lhs_elem: f32, %rhs_elem: f32) {
+          (%lhs_elem: f32, %rhs_elem: f32, %out: f32) {
             %0 = arith.addf %lhs_elem, %rhs_elem: f32
             // expected-error @+1{{'linalg.yield' op expected number of yield values (2) to match the number of inits / outs operands of the enclosing LinalgOp (1)}}
             linalg.yield %0, %0: f32, f32
@@ -694,11 +694,11 @@ func.func @map_binary_wrong_yield_operands(
 func.func @map_input_mapper_arity_mismatch(
     %lhs: tensor<64xf32>, %rhs: tensor<64xf32>, %init: tensor<64xf32>)
     -> tensor<64xf32> {
-  // expected-error@+1{{'linalg.map' op expects number of operands to match the arity of mapper, but got: 2 and 3}}
+  // expected-error@+1{{'linalg.map' op expects number of operands to match the arity of mapper, but got: 3 and 4}}
   %add = linalg.map
       ins(%lhs, %rhs : tensor<64xf32>, tensor<64xf32>)
       outs(%init:tensor<64xf32>)
-      (%lhs_elem: f32, %rhs_elem: f32, %extra_elem: f32) {
+      (%lhs_elem: f32, %rhs_elem: f32, %out: f32, %extra_elem: f32) {
         %0 = arith.addf %lhs_elem, %rhs_elem: f32
         linalg.yield %0: f32
       }
@@ -714,7 +714,7 @@ func.func @map_input_mapper_type_mismatch(
   %add = linalg.map
       ins(%lhs, %rhs : tensor<64xf32>, tensor<64xf32>)
       outs(%init:tensor<64xf32>)
-      (%lhs_elem: f64, %rhs_elem: f64) {
+      (%lhs_elem: f64, %rhs_elem: f64, %out: f32) {
         %0 = arith.addf %lhs_elem, %rhs_elem: f64
         linalg.yield %0: f64
       }
@@ -730,7 +730,7 @@ func.func @map_input_output_shape_mismatch(
   %add = linalg.map
       ins(%lhs, %rhs : tensor<64x64xf32>, tensor<64x64xf32>)
       outs(%init:tensor<32xf32>)
-      (%lhs_elem: f32, %rhs_elem: f32) {
+      (%lhs_elem: f32, %rhs_elem: f32, %out: f32) {
         %0 = arith.addf %lhs_elem, %rhs_elem: f32
         linalg.yield %0: f32
       }
diff --git a/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir b/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir
index 1df15e85bac17..85cc1ffc2029e 100644
--- a/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir
+++ b/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir
@@ -339,7 +339,7 @@ func.func @map_binary(%lhs: tensor<64xf32>, %rhs: tensor<64xf32>,
    %add = linalg.map
           ins(%lhs, %rhs: tensor<64xf32>, tensor<64xf32>)
           outs(%init:tensor<64xf32>)
-          (%lhs_elem: f32, %rhs_elem: f32) {
+          (%lhs_elem: f32, %rhs_elem: f32, %out: f32) {
             %0 = arith.addf %lhs_elem, %rhs_elem: f32
             linalg.yield %0: f32
           }
diff --git a/mlir/test/Dialect/Linalg/roundtrip.mlir b/mlir/test/Dialect/Linalg/roundtrip.mlir
index 563013d4083af..74928920c695a 100644
--- a/mlir/test/Dialect/Linalg/roundtrip.mlir
+++ b/mlir/test/Dialect/Linalg/roundtrip.mlir
@@ -341,7 +341,7 @@ func.func @mixed_parallel_reduced_results(%arg0 : tensor<?x?x?xf32>,
 func.func @map_no_inputs(%init: tensor<64xf32>) -> tensor<64xf32> {
    %add = linalg.map
       outs(%init:tensor<64xf32>)
-      () {
+      (%out: f32) {
         %0 = arith.constant 0.0: f32
         linalg.yield %0: f32
       }
@@ -349,7 +349,7 @@ func.func @map_no_inputs(%init: tensor<64xf32>) -> tensor<64xf32> {
 }
 // CHECK-LABEL: func @map_no_inputs
 //       CHECK:   linalg.map outs
-//  CHECK-NEXT:   () {
+//  CHECK-NEXT:   (%[[OUT:.*]]: f32) {
 //  CHECK-NEXT:     arith.constant
 //  CHECK-NEXT:     linalg.yield
 //  CHECK-NEXT:   }
@@ -361,7 +361,7 @@ func.func @map_binary(%lhs: tensor<64xf32>, %rhs: tensor<64xf32>,
    %add = linalg.map
           ins(%lhs, %rhs: tensor<64xf32>, tensor<64xf32>)
           outs(%init:tensor<64xf32>)
-          (%lhs_elem: f32, %rhs_elem: f32) {
+          (%lhs_elem: f32, %rhs_elem: f32, %out: f32) {
             %0 = arith.addf %lhs_elem, %rhs_elem: f32
             linalg.yield %0: f32
           }
@@ -378,7 +378,7 @@ func.func @map_binary_memref(%lhs: memref<64xf32>, %rhs: memref<64xf32>,
    linalg.map
       ins(%lhs, %rhs: memref<64xf32>, memref<64xf32>)
       outs(%init:memref<64xf32>)
-      (%lhs_elem: f32, %rhs_elem: f32) {
+      (%lhs_elem: f32, %rhs_elem: f32, %out: f32) {
         %0 = arith.addf %lhs_elem, %rhs_elem: f32
         linalg.yield %0: f32
       }
@@ -393,7 +393,7 @@ func.func @map_unary(%input: tensor<64xf32>, %init: tensor<64xf32>) -> tensor<64
    %abs = linalg.map
           ins(%input:tensor<64xf32>)
           outs(%init:tensor<64xf32>)
-          (%input_elem: f32) {
+          (%input_elem: f32, %out: f32) {
             %0 = math.absf %input_elem: f32
             linalg.yield %0: f32
           }
@@ -408,7 +408,7 @@ func.func @map_unary_memref(%input: memref<64xf32>, %init: memref<64xf32>) {
    linalg.map
       ins(%input:memref<64xf32>)
       outs(%init:memref<64xf32>)
-      (%input_elem: f32) {
+      (%input_elem: f32, %out: f32) {
         %0 = math.absf %input_elem: f32
         linalg.yield %0: f32
       }
@@ -604,7 +604,7 @@ func.func @map_arith_with_attr(%lhs: tensor<64xf32>, %rhs: tensor<64xf32>,
   %add = linalg.map
           ins(%lhs, %rhs: tensor<64xf32>, tensor<64xf32>)
           outs(%init:tensor<64xf32>)
-          (%lhs_elem: f32, %rhs_elem: f32) {
+          (%lhs_elem: f32, %rhs_elem: f32, %out: f32) {
             %0 = arith.addf %lhs_elem, %rhs_elem fastmath<fast> : f32
             linalg.yield %0: f32
           }
@@ -622,7 +622,7 @@ func.func @map_arith_with_attr(%lhs: tensor<64xf32>, %rhs: tensor<64xf32>,
 
 func.func @map_not_short_form_compatible(%lhs: tensor<1x32xf32>, %rhs: tensor<1x32xf32>, %init: tensor<1x32xf32>) -> tensor<1x32xf32> {
   %mapped = linalg.map ins(%lhs, %rhs : tensor<1x32xf32>, tensor<1x32xf32>) outs(%init : tensor<1x32xf32>)
-    (%in_1: f32, %in_2: f32) {
+    (%in_1: f32, %in_2: f32, %out: f32) {
       %1 = arith.maximumf %in_1, %in_2 : f32
       linalg.yield %in_1 : f32
     }
@@ -634,7 +634,7 @@ func.func @map_not_short_form_compatible(%lhs: tensor<1x32xf32>, %rhs: tensor<1x
 // CHECK-NOT:     linalg.map { arith.maximumf } ins(%[[LHS]] : tensor<1x32xf32>
 // CHECK:         linalg.map ins(%[[LHS]], %[[RHS]] : tensor<1x32xf32>, tensor<1x32xf32>) 
 // CHECK-SAME:               outs(%[[INIT]] : tensor<1x32xf32>)
-// CHECK-NEXT:      (%[[IN1:.*]]: f32, %[[IN2:.*]]: f32) {
+// CHECK-NEXT:      (%[[IN1:.*]]: f32, %[[IN2:.*]]: f32, %[[OUT:.*]]: f32) {
 // CHECK-NEXT:        %[[MAX_RESULT:.*]] = arith.maximumf %[[IN1]], %[[IN2]] : f32
 // CHECK-NEXT:        linalg.yield %[[IN1]] : f32
 // CHECK-NEXT:    }
diff --git a/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir b/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir
index 93a03369be239..aa2c1da4b6274 100644
--- a/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir
@@ -356,7 +356,7 @@ func.func @vectorize_map(%arg0: memref<64xf32>,
     %arg1: memref<64xf32>, %arg2: memref<64xf32>) {
   linalg.map ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>)
              outs(%arg2 : memref<64xf32>)
-    (%in: f32, %in_0: f32) {
+    (%in: f32, %in_0: f32, %out: f32) {
       %0 = arith.addf %in, %in_0 : f32
       linalg.yield %0 : f32
     }
diff --git a/mlir/test/Dialect/MemRef/value-bounds-op-interface-impl.mlir b/mlir/test/Dialect/MemRef/value-bounds-op-interface-impl.mlir
index f9b81dfc7d468..d0aec68d54988 100644
--- a/mlir/test/Dialect/MemRef/value-bounds-op-interface-impl.mlir
+++ b/mlir/test/Dialect/MemRef/value-bounds-op-interface-impl.mlir
@@ -77,6 +77,24 @@ func.func @memref_expand(%m: memref<?xf32>, %sz: index) -> (index, index) {
 
 // -----
 
+//       CHECK: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 * 2)>
+// CHECK-LABEL: func @memref_collapse(
+//  CHECK-SAME:     %[[sz0:.*]]: index
+//   CHECK-DAG:   %[[c2:.*]] = arith.constant 2 : index
+//   CHECK-DAG:   %[[c12:.*]] = arith.constant 12 : index
+//       CHECK:   %[[dim:.*]] = memref.dim %{{.*}}, %[[c2]] : memref<3x4x?x2xf32>
+//       CHECK:   %[[mul:.*]] = affine.apply #[[$MAP]]()[%[[dim]]]
+//       CHECK:   return %[[c12]], %[[mul]]
+func.func @memref_collapse(%sz0: index) -> (index, index) {
+  %0 = memref.alloc(%sz0) : memref<3x4x?x2xf32>
+  %1 = memref.collapse_shape %0 [[0, 1], [2, 3]] : memref<3x4x?x2xf32> into memref<12x?xf32>
+  %2 = "test.reify_bound"(%1) {dim = 0} : (memref<12x?xf32>) -> (index)
+  %3 = "test.reify_bound"(%1) {dim = 1} : (memref<12x?xf32>) -> (index)
+  return %2, %3 : index, index
+}
+
+// -----
+
 // CHECK-LABEL: func @memref_get_global(
 //       CHECK:   %[[c4:.*]] = arith.constant 4 : index
 //       CHECK:   return %[[c4]]
diff --git a/mlir/test/Dialect/OpenACC/ops.mlir b/mlir/test/Dialect/OpenACC/ops.mlir
index 77d18da49276a..042ee2503cb95 100644
--- a/mlir/test/Dialect/OpenACC/ops.mlir
+++ b/mlir/test/Dialect/OpenACC/ops.mlir
@@ -2243,3 +2243,76 @@ func.func @test_firstprivate_map(%arg0: memref<10xf32>) {
 // CHECK-NEXT:     acc.yield
 // CHECK-NEXT:   }
 // CHECK-NEXT:   return
+
+// -----
+
+func.func @test_kernel_environment(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
+  %c1 = arith.constant 1 : index
+  %c1024 = arith.constant 1024 : index
+
+  // Create data clause operands for the kernel environment
+  %copyin = acc.copyin varPtr(%arg0 : memref<1024xf32>) -> memref<1024xf32>
+  %create = acc.create varPtr(%arg1 : memref<1024xf32>) -> memref<1024xf32>
+
+  // Kernel environment wraps gpu.launch and captures data mapping
+  acc.kernel_environment dataOperands(%copyin, %create : memref<1024xf32>, memref<1024xf32>) {
+    gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1)
+               threads(%tx, %ty, %tz) in (%block_x = %c1024, %block_y = %c1, %block_z = %c1) {
+      // Kernel body uses the mapped data
+      %val = memref.load %copyin[%tx] : memref<1024xf32>
+      %result = arith.mulf %val, %val : f32
+      memref.store %result, %create[%tx] : memref<1024xf32>
+      gpu.terminator
+    }
+  }
+
+  // Copy results back to host and deallocate device memory
+  acc.copyout accPtr(%create : memref<1024xf32>) to varPtr(%arg1 : memref<1024xf32>)
+  acc.delete accPtr(%copyin : memref<1024xf32>)
+
+  return
+}
+
+// CHECK-LABEL: func @test_kernel_environment
+// CHECK:         %[[COPYIN:.*]] = acc.copyin varPtr(%{{.*}} : memref<1024xf32>) -> memref<1024xf32>
+// CHECK:         %[[CREATE:.*]] = acc.create varPtr(%{{.*}} : memref<1024xf32>) -> memref<1024xf32>
+// CHECK:         acc.kernel_environment dataOperands(%[[COPYIN]], %[[CREATE]] : memref<1024xf32>, memref<1024xf32>) {
+// CHECK:           gpu.launch
+// CHECK:             memref.load %[[COPYIN]]
+// CHECK:             memref.store %{{.*}}, %[[CREATE]]
+// CHECK:           }
+// CHECK:         }
+// CHECK:         acc.copyout accPtr(%[[CREATE]] : memref<1024xf32>) to varPtr(%{{.*}} : memref<1024xf32>)
+// CHECK:         acc.delete accPtr(%[[COPYIN]] : memref<1024xf32>)
+
+// -----
+
+func.func @test_kernel_environment_with_async(%arg0: memref<1024xf32>) {
+  %c1 = arith.constant 1 : index
+  %c1024 = arith.constant 1024 : index
+  %async_val = arith.constant 1 : i32
+
+  %create = acc.create varPtr(%arg0 : memref<1024xf32>) async(%async_val : i32) -> memref<1024xf32>
+
+  // Kernel environment with async clause
+  acc.kernel_environment dataOperands(%create : memref<1024xf32>) async(%async_val : i32) {
+    gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1)
+               threads(%tx, %ty, %tz) in (%block_x = %c1024, %block_y = %c1, %block_z = %c1) {
+      %f0 = arith.constant 0.0 : f32
+      memref.store %f0, %create[%tx] : memref<1024xf32>
+      gpu.terminator
+    }
+  }
+
+  acc.copyout accPtr(%create : memref<1024xf32>) async(%async_val : i32) to varPtr(%arg0 : memref<1024xf32>)
+
+  return
+}
+
+// CHECK-LABEL: func @test_kernel_environment_with_async
+// CHECK:         %[[ASYNC:.*]] = arith.constant 1 : i32
+// CHECK:         %[[CREATE:.*]] = acc.create varPtr(%{{.*}} : memref<1024xf32>) async(%[[ASYNC]] : i32) -> memref<1024xf32>
+// CHECK:         acc.kernel_environment dataOperands(%[[CREATE]] : memref<1024xf32>) async(%[[ASYNC]] : i32)
+// CHECK:           gpu.launch
+// CHECK:             memref.store %{{.*}}, %[[CREATE]]
+// CHECK:         acc.copyout accPtr(%[[CREATE]] : memref<1024xf32>) async(%[[ASYNC]] : i32) to varPtr(%{{.*}} : memref<1024xf32>)
diff --git a/mlir/test/Dialect/OpenACC/support-analysis-recipename.mlir b/mlir/test/Dialect/OpenACC/support-analysis-recipename.mlir
new file mode 100644
index 0000000000000..8ea53b5d0f4d4
--- /dev/null
+++ b/mlir/test/Dialect/OpenACC/support-analysis-recipename.mlir
@@ -0,0 +1,78 @@
+// RUN: mlir-opt %s -split-input-file -test-acc-support | FileCheck %s
+
+// Test private recipe with 2D memref
+func.func @test_private_2d_memref() {
+  // Create a 2D memref
+  %0 = memref.alloca() {test.recipe_name = #acc.recipe_kind<private_recipe>} : memref<5x10xf32>
+
+  // CHECK: op=%{{.*}} = memref.alloca() {test.recipe_name = #acc.recipe_kind<private_recipe>} : memref<5x10xf32>
+  // CHECK-NEXT: getRecipeName(kind=private_recipe, type=memref<5x10xf32>)="privatization_memref_5x10xf32_"
+
+  return
+}
+
+// -----
+
+// Test firstprivate recipe with 2D memref
+func.func @test_firstprivate_2d_memref() {
+  // Create a 2D memref
+  %0 = memref.alloca() {test.recipe_name = #acc.recipe_kind<firstprivate_recipe>} : memref<8x16xf64>
+
+  // CHECK: op=%{{.*}} = memref.alloca() {test.recipe_name = #acc.recipe_kind<firstprivate_recipe>} : memref<8x16xf64>
+  // CHECK-NEXT: getRecipeName(kind=firstprivate_recipe, type=memref<8x16xf64>)="firstprivatization_memref_8x16xf64_"
+
+  return
+}
+
+// -----
+
+// Test reduction recipe with 2D memref
+func.func @test_reduction_2d_memref() {
+  // Create a 2D memref
+  %0 = memref.alloca() {test.recipe_name = #acc.recipe_kind<reduction_recipe>} : memref<4x8xi32>
+
+  // CHECK: op=%{{.*}} = memref.alloca() {test.recipe_name = #acc.recipe_kind<reduction_recipe>} : memref<4x8xi32>
+  // CHECK-NEXT: getRecipeName(kind=reduction_recipe, type=memref<4x8xi32>)="reduction_memref_4x8xi32_"
+
+  return
+}
+
+// -----
+
+// Test private recipe with dynamic memref
+func.func @test_private_dynamic_memref(%arg0: memref<5x10xi32>) {
+  // Cast to dynamic dimensions
+  %0 = memref.cast %arg0 {test.recipe_name = #acc.recipe_kind<private_recipe>} : memref<5x10xi32> to memref<?x10xi32>
+
+  // CHECK: op=%{{.*}} = memref.cast %{{.*}} {test.recipe_name = #acc.recipe_kind<private_recipe>} : memref<5x10xi32> to memref<?x10xi32>
+  // CHECK-NEXT: getRecipeName(kind=private_recipe, type=memref<?x10xi32>)="privatization_memref_Ux10xi32_"
+
+  return
+}
+
+// -----
+
+// Test private recipe with scalar memref
+func.func @test_private_scalar_memref() {
+  // Create a scalar memref (no dimensions)
+  %0 = memref.alloca() {test.recipe_name = #acc.recipe_kind<private_recipe>} : memref<i32>
+
+  // CHECK: op=%{{.*}} = memref.alloca() {test.recipe_name = #acc.recipe_kind<private_recipe>} : memref<i32>
+  // CHECK-NEXT: getRecipeName(kind=private_recipe, type=memref<i32>)="privatization_memref_i32_"
+
+  return
+}
+
+// -----
+
+// Test private recipe with unranked memref
+func.func @test_private_unranked_memref(%arg0: memref<10xi32>) {
+  // Cast to unranked memref
+  %0 = memref.cast %arg0 {test.recipe_name = #acc.recipe_kind<private_recipe>} : memref<10xi32> to memref<*xi32>
+
+  // CHECK: op=%{{.*}} = memref.cast %{{.*}} {test.recipe_name = #acc.recipe_kind<private_recipe>} : memref<10xi32> to memref<*xi32>
+  // CHECK-NEXT: getRecipeName(kind=private_recipe, type=memref<*xi32>)="privatization_memref_Zxi32_"
+
+  return
+}
+
diff --git a/mlir/test/Dialect/OpenACC/support-analysis-unsupported.mlir b/mlir/test/Dialect/OpenACC/support-analysis-unsupported.mlir
new file mode 100644
index 0000000000000..c4d5b81a1380a
--- /dev/null
+++ b/mlir/test/Dialect/OpenACC/support-analysis-unsupported.mlir
@@ -0,0 +1,18 @@
+// RUN: mlir-opt %s --split-input-file -test-acc-support -verify-diagnostics
+
+// Test emitNYI with a simple message
+func.func @test_emit_nyi() {
+  // expected-error @below {{not yet implemented: Unsupported feature in OpenACC}}
+  %0 = memref.alloca() {test.emit_nyi = "Unsupported feature in OpenACC"} : memref<10xi32>
+  return
+}
+
+// -----
+
+// Test recipe name on load operation from scalar memref
+func.func @test_recipe_load_scalar() {
+  %0 = memref.alloca() : memref<i32>
+  // expected-error @below {{not yet implemented: variable privatization (incomplete recipe name handling)}}
+  %1 = memref.load %0[] {test.recipe_name = #acc.recipe_kind<private_recipe>} : memref<i32>
+  return
+}
diff --git a/mlir/test/Dialect/SCF/invalid.mlir b/mlir/test/Dialect/SCF/invalid.mlir
index 37fc86b18e7f0..3f481ad5dbba7 100644
--- a/mlir/test/Dialect/SCF/invalid.mlir
+++ b/mlir/test/Dialect/SCF/invalid.mlir
@@ -373,7 +373,7 @@ func.func @reduceReturn_not_inside_reduce(%arg0 : f32) {
 
 func.func @std_if_incorrect_yield(%arg0: i1, %arg1: f32)
 {
-  // expected-error@+1 {{region control flow edge from Region #0 to parent results: source has 1 operands, but target successor needs 2}}
+  // expected-error@+1 {{region control flow edge from Operation scf.yield to parent results: source has 1 operands, but target successor <to parent> needs 2}}
   %x, %y = scf.if %arg0 -> (f32, f32) {
     %0 = arith.addf %arg1, %arg1 : f32
     scf.yield %0 : f32
@@ -544,7 +544,7 @@ func.func @while_invalid_terminator() {
 
 func.func @while_cross_region_type_mismatch() {
   %true = arith.constant true
-  // expected-error@+1 {{'scf.while' op region control flow edge from Region #0 to Region #1: source has 0 operands, but target successor needs 1}}
+  // expected-error@+1 {{region control flow edge from Operation scf.condition to Region #1: source has 0 operands, but target successor <to region #1 with 1 inputs> needs 1}}
   scf.while : () -> () {
     scf.condition(%true)
   } do {
@@ -557,7 +557,7 @@ func.func @while_cross_region_type_mismatch() {
 
 func.func @while_cross_region_type_mismatch() {
   %true = arith.constant true
-  // expected-error@+1 {{'scf.while' op along control flow edge from Region #0 to Region #1: source type #0 'i1' should match input type #0 'i32'}}
+  // expected-error@+1 {{along control flow edge from Operation scf.condition to Region #1: source type #0 'i1' should match input type #0 'i32'}}
   %0 = scf.while : () -> (i1) {
     scf.condition(%true) %true : i1
   } do {
@@ -570,7 +570,7 @@ func.func @while_cross_region_type_mismatch() {
 
 func.func @while_result_type_mismatch() {
   %true = arith.constant true
-  // expected-error@+1 {{'scf.while' op region control flow edge from Region #0 to parent results: source has 1 operands, but target successor needs 0}}
+  // expected-error@+1 {{region control flow edge from Operation scf.condition to parent results: source has 1 operands, but target successor <to parent> needs 0}}
   scf.while : () -> () {
     scf.condition(%true) %true : i1
   } do {
diff --git a/mlir/test/Dialect/Tensor/bufferize.mlir b/mlir/test/Dialect/Tensor/bufferize.mlir
index 296ca02564e35..5eb2360a29b8f 100644
--- a/mlir/test/Dialect/Tensor/bufferize.mlir
+++ b/mlir/test/Dialect/Tensor/bufferize.mlir
@@ -728,7 +728,7 @@ func.func @tensor.concat_dynamic_nonconcat_dim(%f: tensor<?x?xf32>, %g: tensor<?
 // CHECK-DAG:     %[[ALLOC:.*]] = memref.alloc(%[[M]], %[[N]]) {{.*}} : memref<?x3x?xf32>
 // CHECK:         %[[ALLOC_T:.*]] = bufferization.to_tensor %[[ALLOC]]
 // CHECK:         %[[MAPPED:.*]] = linalg.map outs(%[[ALLOC_T]] : tensor<?x3x?xf32>)
-// CHECK:         () {
+// CHECK:         (%[[INIT:.*]]: f32) {
 // CHECK:           linalg.yield %[[F]] : f32
 // CHECK:         }
 // CHECK:         return %[[MAPPED]] : tensor<?x3x?xf32>
diff --git a/mlir/test/Dialect/Tosa/ops.mlir b/mlir/test/Dialect/Tosa/ops.mlir
index 865f712ce1a5a..22fde3b7d28a5 100644
--- a/mlir/test/Dialect/Tosa/ops.mlir
+++ b/mlir/test/Dialect/Tosa/ops.mlir
@@ -1269,6 +1269,13 @@ func.func @test_matmul_t_block_scaled_broadcast(%arg0: tensor<?x8x32xf8E4M3FN>,
   return %0 : tensor<4x8x16xf32>
 }
 
+// -----
+// CHECK-LABEL: test_matmul_t_block_scaled_mxint8
+func.func @test_matmul_t_block_scaled_mxint8(%arg0: tensor<4x8x32x!tosa.mxint8>, %arg1: tensor<4x8x1xf8E8M0FNU>, %arg2: tensor<4x16x32x!tosa.mxint8>, %arg3: tensor<4x16x1xf8E8M0FNU>) -> tensor<4x8x16xf32> {
+  %0 = tosa.matmul_t_block_scaled %arg0, %arg1, %arg2, %arg3 {block_size = #tosa.block_size<BLOCK_SIZE_32> : i32} : (tensor<4x8x32x!tosa.mxint8>, tensor<4x8x1xf8E8M0FNU>, tensor<4x16x32x!tosa.mxint8>, tensor<4x16x1xf8E8M0FNU>) -> tensor<4x8x16xf32>
+  return %0 : tensor<4x8x16xf32>
+}
+
 // -----
 // CHECK-LABEL: test_cast_from_block_scaled_static
 func.func @test_cast_from_block_scaled_static(%arg0: tensor<4x32xf4E2M1FN>, %arg1: tensor<4x1xf8E8M0FNU>) -> tensor<4x32xf32> {
@@ -1296,3 +1303,17 @@ func.func @test_cast_to_block_scaled_unranked(%arg0: tensor<*xf32>) -> (tensor<*
   %0:2 = tosa.cast_to_block_scaled %arg0 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<*xf32>) -> (tensor<*xf4E2M1FN>, tensor<*xf8E8M0FNU>)
   return %0#0, %0#1 : tensor<*xf4E2M1FN>, tensor<*xf8E8M0FNU>
 }
+
+// -----
+// CHECK-LABEL: test_cast_to_block_scaled_mxint8
+func.func @test_cast_to_block_scaled_mxint8(%arg0: tensor<4x32xf32>) -> (tensor<4x32x!tosa.mxint8>, tensor<4x1xf8E8M0FNU>) {
+  %0:2 = tosa.cast_to_block_scaled %arg0 {block_size = #tosa.block_size<BLOCK_SIZE_32> : i32, stochastic_round = false} : (tensor<4x32xf32>) -> (tensor<4x32x!tosa.mxint8>, tensor<4x1xf8E8M0FNU>)
+  return %0#0, %0#1 : tensor<4x32x!tosa.mxint8>, tensor<4x1xf8E8M0FNU>
+}
+
+// -----
+// CHECK-LABEL: test_const_mxint8
+func.func @test_const_mxint8(%arg0 : index) -> tensor<2x!tosa.mxint8> {
+    %0 = "tosa.const"() {values = dense<"0x007F"> : tensor<2x!tosa.mxint8>} : () -> tensor<2x!tosa.mxint8>
+    return %0 : tensor<2x!tosa.mxint8>
+}
diff --git a/mlir/test/Dialect/Tosa/tosa-validation-version-1p1-valid.mlir b/mlir/test/Dialect/Tosa/tosa-validation-version-1p1-valid.mlir
index f3d8dab2f6b0f..9bd7aa8f0783e 100644
--- a/mlir/test/Dialect/Tosa/tosa-validation-version-1p1-valid.mlir
+++ b/mlir/test/Dialect/Tosa/tosa-validation-version-1p1-valid.mlir
@@ -38,7 +38,7 @@ func.func @test_argmax_int64(%arg0: tensor<1x13x13x5xf32>) -> tensor<1x13x13xi64
 // -----
 
 // CHECK-LABEL: test_const_i64
-func.func @test_const_i64(%arg0 : index) -> tensor<4xi64> {
+func.func @test_const_i64() -> tensor<4xi64> {
     %0 = "tosa.const"() {values = dense<[3, 0, 1, 2]> : tensor<4xi64>} : () -> tensor<4xi64>
     return %0 : tensor<4xi64>
 }
@@ -46,7 +46,7 @@ func.func @test_const_i64(%arg0 : index) -> tensor<4xi64> {
 // -----
 
 // CHECK-LABEL: test_const_fp6e3m2
-func.func @test_const_fp6e3m2(%arg0 : index) -> tensor<4xf6E3M2FN> {
+func.func @test_const_fp6e3m2() -> tensor<4xf6E3M2FN> {
     %0 = "tosa.const"() {values = dense<[0.0, 0.0, 0.0, 0.0]> : tensor<4xf6E3M2FN>} : () -> tensor<4xf6E3M2FN>
     return %0 : tensor<4xf6E3M2FN>
 }
@@ -82,3 +82,51 @@ func.func @test_cast_to_block_scaled_static(%arg0: tensor<4x32xf32>) -> (tensor<
   %0:2 = tosa.cast_to_block_scaled %arg0 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<4x32xf32>) -> (tensor<4x32xf6E3M2FN>, tensor<4x1xf8E8M0FNU>)
   return %0#0, %0#1 : tensor<4x32xf6E3M2FN>, tensor<4x1xf8E8M0FNU>
 }
+
+// -----
+
+// CHECK-LABEL: test_cast_to_block_scaled_mxint8
+func.func @test_cast_to_block_scaled_mxint8(%arg0: tensor<4x32xf32>) -> (tensor<4x32x!tosa.mxint8>, tensor<4x1xf8E8M0FNU>) {
+  %0:2 = tosa.cast_to_block_scaled %arg0 {block_size = #tosa.block_size<BLOCK_SIZE_32> : i32, stochastic_round = false} : (tensor<4x32xf32>) -> (tensor<4x32x!tosa.mxint8>, tensor<4x1xf8E8M0FNU>)
+  return %0#0, %0#1 : tensor<4x32x!tosa.mxint8>, tensor<4x1xf8E8M0FNU>
+}
+
+// -----
+
+// CHECK-LABEL: test_const_fp6e3m2
+func.func @test_const_fp6e3m2() -> tensor<4xf6E3M2FN> {
+    %0 = "tosa.const"() {values = dense<[0.0, 0.0, 0.0, 0.0]> : tensor<4xf6E3M2FN>} : () -> tensor<4xf6E3M2FN>
+    return %0 : tensor<4xf6E3M2FN>
+}
+
+// -----
+
+// CHECK-LABEL: test_const_mxint8
+func.func @test_const_mxint8() -> tensor<2x!tosa.mxint8> {
+    %0 = "tosa.const"() {values = dense<["0x00", "0x7F"]> : tensor<2x!tosa.mxint8>} : () -> tensor<2x!tosa.mxint8>
+    return %0 : tensor<2x!tosa.mxint8>
+}
+
+// -----
+
+// CHECK-LABEL: test_cast_f4e2m1
+func.func @test_cast_f4e2m1(%arg0: tensor<13x21x3xf4E2M1FN>) -> tensor<13x21x3xbf16> {
+  %0 = tosa.cast %arg0 : (tensor<13x21x3xf4E2M1FN>) -> tensor<13x21x3xbf16>
+  return %0 : tensor<13x21x3xbf16>
+}
+
+// -----
+
+// CHECK-LABEL: test_matmul_t_block_scaled_mxint8
+func.func @test_matmul_t_block_scaled_mxint8(%arg0: tensor<4x8x32x!tosa.mxint8>, %arg1: tensor<4x8x1xf8E8M0FNU>, %arg2: tensor<4x16x32x!tosa.mxint8>, %arg3: tensor<4x16x1xf8E8M0FNU>) -> tensor<4x8x16xf32> {
+  %0 = tosa.matmul_t_block_scaled %arg0, %arg1, %arg2, %arg3 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<4x8x32x!tosa.mxint8>, tensor<4x8x1xf8E8M0FNU>, tensor<4x16x32x!tosa.mxint8>, tensor<4x16x1xf8E8M0FNU>) -> tensor<4x8x16xf32>
+  return %0 : tensor<4x8x16xf32>
+}
+
+// -----
+
+// CHECK-LABEL: test_cast_to_block_scaled_mxint8
+func.func @test_cast_to_block_scaled_mxint8(%arg0: tensor<4x32xf32>) -> (tensor<4x32x!tosa.mxint8>, tensor<4x1xf8E8M0FNU>) {
+  %0:2 = tosa.cast_to_block_scaled %arg0 {block_size = #tosa.block_size<BLOCK_SIZE_32> : i32, stochastic_round = false} : (tensor<4x32xf32>) -> (tensor<4x32x!tosa.mxint8>, tensor<4x1xf8E8M0FNU>)
+  return %0#0, %0#1 : tensor<4x32x!tosa.mxint8>, tensor<4x1xf8E8M0FNU>
+}
diff --git a/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir b/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir
index d289d73e863c7..2780212d2917f 100644
--- a/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir
+++ b/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -test-xegpu-move-func-to-warp-op -split-input-file --allow-unregistered-dialect %s | FileCheck %s
+// RUN: mlir-opt -xevm-attach-target='chip=pvc' -test-xegpu-move-func-to-warp-op -split-input-file --allow-unregistered-dialect %s | FileCheck %s
 
 gpu.module @test {
 gpu.func @empty()  {
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
new file mode 100644
index 0000000000000..58461b8be52c4
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
@@ -0,0 +1,128 @@
+// RUN: mlir-opt -xevm-attach-target='chip=pvc' -xegpu-propagate-layout="layout-kind=inst" -split-input-file %s | FileCheck %s
+
+// CHECK-LABEL: func.func @dpas_f16(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
+// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>} dense<0.000000e+00> : vector<8x16xf32>
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
+// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]]  {layout_result_0 = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
+// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]]  {layout_result_0 = #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
+// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_result_0 = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+// CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: xegpu.store_nd %[[T4]], %[[T5]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+gpu.module @test {
+
+func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
+  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+  %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+  %2 = xegpu.load_nd %0  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+  %3 = xegpu.load_nd %1  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+  %4 = xegpu.dpas %2, %3, %cst : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+  %5 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+  xegpu.store_nd %4, %5  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+  return
+}
+}
+
+// -----
+gpu.module @test_kernel {
+  gpu.func @elementwise_with_inst_data_only(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf16>) {
+    %c0 = arith.constant 0 : index
+    %c32 = arith.constant 32 : index
+    %c1024 = arith.constant 1024 : index
+    %block_id_x = gpu.block_id x
+    %block_id_y = gpu.block_id y
+    %m = arith.muli %block_id_x, %c32 : index
+
+    %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16>
+    %b_tdesc = xegpu.create_nd_tdesc %B[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16>
+    %c_tdesc = xegpu.create_nd_tdesc %C[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16>
+
+    %out:3 = scf.for %k = %c0 to %c1024 step %c32
+      iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc)
+      -> (!xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>) {
+      //CHECK: xegpu.load_nd {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>} :
+      //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x32xf16>
+      %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16>
+      %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16>
+
+      //CHECK-COUNT: arith.addf {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x32xf16>
+      %c = arith.addf %a, %b : vector<16x32xf16>
+
+      //CHECK-COUNT: xegpu.store_nd {{.*}} : vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>>>
+      xegpu.store_nd %c, %arg2: vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16>
+
+      //CHECK-COUNT: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+      %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16>
+      %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16>
+      %c_next_tdesc = xegpu.update_nd_offset %arg2, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16>
+      scf.yield %a_next_tdesc, %b_next_tdesc, %c_next_tdesc
+        : !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>
+    }
+    gpu.return
+  }
+}
+
+// -----
+gpu.module @test_kernel {
+  gpu.func @elementwise_with_inst_data_12(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf16>) {
+    %c0 = arith.constant 0 : index
+    %c32 = arith.constant 32 : index
+    %c1024 = arith.constant 1024 : index
+    %block_id_x = gpu.block_id x
+    %block_id_y = gpu.block_id y
+    %m = arith.muli %block_id_x, %c32 : index
+
+    %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<12x32xf16>
+    %b_tdesc = xegpu.create_nd_tdesc %B[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<12x32xf16>
+    %c_tdesc = xegpu.create_nd_tdesc %C[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<12x32xf16>
+
+    %out:3 = scf.for %k = %c0 to %c1024 step %c32
+      iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc)
+      -> (!xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>) {
+      //CHECK: xegpu.load_nd {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [4, 16], lane_layout = [1, 16], lane_data = [1, 1]>} :
+      //CHECK-SAME: !xegpu.tensor_desc<12x32xf16, #xegpu.layout<inst_data = [4, 16], lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<12x32xf16>
+      %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16>
+      %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16>
+
+      //CHECK-COUNT: arith.addf {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [4, 16], lane_layout = [1, 16], lane_data = [1, 1]>} : vector<12x32xf16>
+      %c = arith.addf %a, %b : vector<12x32xf16>
+
+      //CHECK-COUNT: xegpu.store_nd {{.*}} : vector<12x32xf16>, !xegpu.tensor_desc<12x32xf16, #xegpu.layout<inst_data = [4, 16], lane_layout = [1, 16], lane_data = [1, 1]>>>
+      xegpu.store_nd %c, %arg2: vector<12x32xf16>, !xegpu.tensor_desc<12x32xf16>
+
+      //CHECK-COUNT: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<12x32xf16, #xegpu.layout<inst_data = [4, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+      %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<12x32xf16>
+      %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c0, %c32] : !xegpu.tensor_desc<12x32xf16>
+      %c_next_tdesc = xegpu.update_nd_offset %arg2, [%c0, %c32] : !xegpu.tensor_desc<12x32xf16>
+      scf.yield %a_next_tdesc, %b_next_tdesc, %c_next_tdesc
+        : !xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>
+    }
+    gpu.return
+  }
+}
+
+// -----
+gpu.module @test {
+// CHECK-LABEL: func.func @scatter_ops_chunksize(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
+// CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
+// CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
+// CHECK: %{{.*}} = xegpu.load %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64}>
+// CHECK-SAME: {layout_result_0 = #xegpu.layout<inst_data = [16, 8], lane_layout = [16, 1], lane_data = [1, 2]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
+// CHECK: xegpu.store %0, %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+func.func @scatter_ops_chunksize(%src: memref<256xf16>) {
+  %1 = arith.constant dense<1>: vector<16xi1>
+  %offset = arith.constant dense<12> : vector<16xindex>
+  %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}>
+      : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
+  xegpu.store %3, %src[%offset], %1 <{chunk_size=8}>
+      : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+  return
+}
+}
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
index 30f785ded975a..543e119d81d88 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
@@ -1,5 +1,6 @@
-// RUN: mlir-opt -xegpu-propagate-layout -split-input-file %s | FileCheck %s
+// RUN: mlir-opt -xevm-attach-target='chip=pvc' -xegpu-propagate-layout -split-input-file %s | FileCheck %s
 
+gpu.module @test {
 // CHECK-LABEL: func.func @dpas_f16(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
 // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<0.000000e+00> : vector<8x16xf32>
@@ -25,8 +26,10 @@ func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: me
   xegpu.store_nd %4, %5  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
   return
 }
+}
 
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @dpas_i8(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<8x32xi8>, %[[ARG1:[0-9a-zA-Z]+]]: vector<32x16xi8>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xi32>) {
 // CHECK: %[[T0:.*]] = xegpu.dpas %[[ARG0]], %[[ARG1]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16],
@@ -37,8 +40,10 @@ func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memre
   xegpu.store_nd %0, %1  : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32>
   return
 }
+}
 
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @load_with_transpose_effect(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
 // CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{transpose = array<i64: 1, 0>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
@@ -55,8 +60,10 @@ func.func @load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x
   xegpu.store_nd %4, %5  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
   return
 }
+}
 
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @vector_transpose(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
 // CHECK: %{{.*}} = vector.transpose %{{.*}}, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : vector<16x16xf16> to vector<16x16xf16>
@@ -73,8 +80,10 @@ func.func @vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %
   xegpu.store_nd %5, %6  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
   return
 }
+}
 
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @extf_truncf(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, %[[ARG1:[0-9a-zA-Z]+]]:
 // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>) -> vector<8x16xf32> {
@@ -88,8 +97,10 @@ func.func @extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor
   %4 = xegpu.dpas %0, %3 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
   return %4 : vector<8x16xf32>
 }
+}
 
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @load_gather_with_chunksize(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
 // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
@@ -113,8 +124,10 @@ func.func @load_gather_with_chunksize(%arg0: memref<8x16xf16>, %arg1: memref<256
   xegpu.store_nd %5, %6  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
   return
 }
+}
 
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @load_gather_1d(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>) {
 // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
@@ -132,8 +145,9 @@ func.func @load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf
   xegpu.store_nd %1, %arg1  : vector<16xf32>, !xegpu.tensor_desc<16xf32>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @store_scatter_with_chunksize(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<128xf32>) {
 // CHECK: %[[T0:.*]] = xegpu.create_tdesc %[[ARG0]], %{{.*}} : memref<128xf32>, vector<16xindex> ->
@@ -148,8 +162,9 @@ func.func @store_scatter_with_chunksize(%arg0: memref<128xf32>) {
   xegpu.store %cst, %0, %cst_0 : vector<16x8xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>, vector<16xi1>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @store_scatter_1d(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf32>) {
 // CHECK: xegpu.store %[[ARG0]], %{{.*}}, %{{.*}}  : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>,
@@ -161,8 +176,9 @@ func.func @store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) {
   xegpu.store %arg0, %0, %cst_0  : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @scatter_ops_chunksize(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
 // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
@@ -179,8 +195,9 @@ func.func @scatter_ops_chunksize(%src: memref<256xf16>) {
       : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @scatter_ops(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
 // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
@@ -195,8 +212,9 @@ func.func @scatter_ops(%src: memref<256xf16>) {
   xegpu.store %3, %src[%offset], %1 : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @vector_bitcast_i16_to_f16(
 // CHECK:       %[[LOAD0:.*]] = xegpu.load_nd %{{.*}}  {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
 // CHECK-SAME:     !xegpu.tensor_desc<8x16xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xi16>
@@ -219,8 +237,9 @@ func.func @vector_bitcast_i16_to_f16(%arg0: memref<8x16xi16>, %arg1: memref<16x1
   xegpu.store_nd %6, %7  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @vector_bitcast_i32_to_f16(
 // CHECK:      %[[LOAD:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
 // CHECK-SAME:     !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32>
@@ -239,8 +258,9 @@ func.func @vector_bitcast_i32_to_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x8
   xegpu.store_nd %6, %7  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @vector_bitcast_i16_to_i32(
 // CHECK:      %[[LOAD:.*]] = xegpu.load_nd %{{.*}}  {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}
 // CHECK-SAME:     !xegpu.tensor_desc<8x32xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>> -> vector<8x32xi16>
@@ -255,8 +275,9 @@ func.func @vector_bitcast_i16_to_i32(%arg0: memref<8x32xi16>, %arg1: memref<8x16
   xegpu.store_nd %3, %1  : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @vector_bitcast_require_cross_lane_shuffle(
 // CHECK:     %[[LOAD:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xi32> -> vector<8x16xi32>
 // CHECK:     %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
@@ -270,9 +291,10 @@ func.func @vector_bitcast_require_cross_lane_shuffle(%arg0: memref<8x16xi32>, %a
   xegpu.store_nd %3, %1  : vector<8x32xi16>, !xegpu.tensor_desc<8x32xi16>
   return
 }
-
+}
 
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @binary_op_one_use(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
 // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>,
@@ -291,8 +313,9 @@ func.func @binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.
   xegpu.store_nd %4, %arg2  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @binary_op_multiple_uses(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
 // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
@@ -312,8 +335,9 @@ func.func @binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !
   xegpu.store_nd %2, %arg3  : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @for_op(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x128xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<128x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
 // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x128xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
@@ -353,8 +377,9 @@ func.func @for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: me
   xegpu.store_nd %2#2, %3  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @if_single_use(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
 // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>,
@@ -381,8 +406,9 @@ func.func @if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tens
   xegpu.store_nd %2, %arg3  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @if_multiple_uses(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
 // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
@@ -411,8 +437,9 @@ func.func @if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.t
   xegpu.store_nd %1, %arg4  : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @vector_outer_reduction(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16x16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>) {
 // CHECK: %{{.*}} = vector.multi_reduction <add>, %[[ARG0]], %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} [0] : vector<16x16xf32> to vector<16xf32>
@@ -422,8 +449,9 @@ func.func @vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor
   xegpu.store_nd %0, %arg1  : vector<16xf32>, !xegpu.tensor_desc<16xf32>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @vector_inner_reduction(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16x16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>) {
 // CHECK: %{{.*}} = vector.multi_reduction <add>, %[[ARG0]], %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} [1] : vector<16x16xf32> to vector<16xf32>
@@ -433,8 +461,9 @@ func.func @vector_inner_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor
   xegpu.store_nd %0, %arg1  : vector<16xf32>, !xegpu.tensor_desc<16xf32>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @update_nd_offset_1d(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>) {
 // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
@@ -448,8 +477,9 @@ func.func @update_nd_offset_1d(%arg0: memref<256xf32>){
   xegpu.store_nd %1, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @update_nd_offset_2d(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf32>) {
 // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}, %{{.*}}] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
@@ -463,8 +493,9 @@ func.func @update_nd_offset_2d(%arg0: memref<256x256xf32>){
   xegpu.store_nd %1, %2 : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @prefetch_2d(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf16>) {
 // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}, %{{.*}}] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
@@ -475,8 +506,9 @@ func.func @prefetch_2d(%arg0: memref<256x256xf16>){
   xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<16x16xf16>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @prefetch_1d(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
 // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
@@ -487,8 +519,9 @@ func.func @prefetch_1d(%arg0: memref<256xf16>){
   xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<16xf16>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @scf_while_and_condition(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf32>) {
 // CHECK: %{{.*}}:3 = scf.while ({{.*}}) : (vector<16xf32>, i32, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>)
@@ -520,8 +553,9 @@ func.func @scf_while_and_condition(%arg0: memref<256xf32>, %arg1: memref<256xf32
   }
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim1_distributed(
 // CHECK-SAME:    %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
 // CHECK-SAME:    %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
@@ -541,8 +575,9 @@ func.func @vector_shape_cast_1d_to_2d_dim1_distributed(%arg0: !xegpu.tensor_desc
   xegpu.store_nd %5, %arg1  : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted(
 // CHECK-SAME:     %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
 // CHECK-SAME:     %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
@@ -563,3 +598,4 @@ func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted(%arg0: !xegpu.tensor_desc
   xegpu.store_nd %5, %arg1  : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
   return
 }
+}
diff --git a/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir b/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir
index 1bcef0a0df316..ea587e92674d7 100644
--- a/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir
+++ b/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir
@@ -49,6 +49,11 @@ func.func @conj(%arg: complex<f32>) -> complex<f32> {
   func.return %conj : complex<f32>
 }
 
+func.func @exp(%arg: complex<f32>) -> complex<f32> {
+  %exp = complex.exp %arg : complex<f32>
+  func.return %exp : complex<f32>
+}
+
 // %input contains pairs of lhs, rhs, i.e. [lhs_0, rhs_0, lhs_1, rhs_1,...]
 func.func @test_binary(%input: tensor<?xcomplex<f32>>,
                        %func: (complex<f32>, complex<f32>) -> complex<f32>) {
@@ -353,5 +358,32 @@ func.func @entry() {
   call @test_element_f64(%abs_test_cast, %abs_func)
     : (tensor<?xcomplex<f64>>, (complex<f64>) -> f64) -> ()
 
+  // complex.exp test
+  %exp_test = arith.constant dense<[
+    (1.0, 2.0),
+    // CHECK:      -1.1312
+    // CHECK-NEXT:  2.4717
+
+    // The first case to consider is overflow of exp(real_part). If computed
+    // directly, this yields inf * 0 = NaN, which is incorrect.
+    (500.0, 0.0),
+    // CHECK-NEXT:  inf
+    // CHECK-NOT:   nan
+    // CHECK-NEXT:  0
+
+    // In this case, the overflow of exp(real_part) is compensated when
+    // sin(imag_part) is close to zero, yielding a finite imaginary part.
+    (90.0238094, 5.900613e-39)
+    // CHECK-NEXT:  inf
+    // CHECK-NOT:   inf
+    // CHECK-NEXT:  7.3746
+  ]> : tensor<3xcomplex<f32>>
+  %exp_test_cast = tensor.cast %exp_test
+    :  tensor<3xcomplex<f32>> to tensor<?xcomplex<f32>>
+
+  %exp_func = func.constant @exp : (complex<f32>) -> complex<f32>
+  call @test_unary(%exp_test_cast, %exp_func)
+    : (tensor<?xcomplex<f32>>, (complex<f32>) -> complex<f32>) -> ()
+
   func.return
 }
diff --git a/mlir/test/Interfaces/TilingInterface/lower-to-loops-using-interface.mlir b/mlir/test/Interfaces/TilingInterface/lower-to-loops-using-interface.mlir
index 8cbee3cbb758b..aa8882d21698c 100644
--- a/mlir/test/Interfaces/TilingInterface/lower-to-loops-using-interface.mlir
+++ b/mlir/test/Interfaces/TilingInterface/lower-to-loops-using-interface.mlir
@@ -257,10 +257,10 @@ module attributes {transform.with_named_sequence} {
 // -----
 
 func.func @map(%lhs: memref<64xf32>,
-    %rhs: memref<64xf32>, %out: memref<64xf32>) {
+    %rhs: memref<64xf32>, %init: memref<64xf32>) {
   linalg.map ins(%lhs, %rhs : memref<64xf32>, memref<64xf32>)
-             outs(%out : memref<64xf32>)
-    (%in: f32, %in_0: f32) {
+             outs(%init : memref<64xf32>)
+    (%in: f32, %in_0: f32, %out: f32) {
       %0 = arith.addf %in, %in_0 : f32
       linalg.yield %0 : f32
     }
diff --git a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
index 09b8f593154b5..42aa2210eae1a 100644
--- a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
@@ -621,3 +621,14 @@ func.func @invalid_range_equal_bounds() {
   %0 = nvvm.read.ptx.sreg.warpsize range <i32, 32, 32> : i32
   return
 }
+
+// -----
+
+// Test for correct return type check for wmma.load fragment a for f64 
+llvm.func @nvvm_wmma_load_a_f64(%arg0: !llvm.ptr, %arg1 : i32) {
+  // expected-error @below {{'nvvm.wmma.load' op expected destination type to be f64}}
+  %0 = nvvm.wmma.load %arg0, %arg1
+    {eltype = #nvvm.mma_type<f64>, frag = #nvvm.mma_frag<a>, k = 4 : i32, layout = #nvvm.mma_layout<row>, m = 8 : i32, n = 8 : i32}
+    : (!llvm.ptr) -> !llvm.struct<(f64)>
+  llvm.return
+}
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index 594ae4849e3eb..9115de65ff0e8 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -463,6 +463,43 @@ llvm.func @nvvm_wmma_mma(%0 : i32, %1 : i32, %2 : i32, %3 : i32, %4 : i32, %5 :
   llvm.return
 }
 
+// CHECK-LABEL: @nvvm_wmma_load_a_f64
+llvm.func @nvvm_wmma_load_a_f64(%arg0: !llvm.ptr, %arg1 : i32) {
+  // CHECK: call double @llvm.nvvm.wmma.m8n8k4.load.a.row.stride.f64.p0(ptr %{{.*}}, i32 %{{.*}})
+  %0 = nvvm.wmma.load %arg0, %arg1
+    {eltype = #nvvm.mma_type<f64>, frag = #nvvm.mma_frag<a>, k = 4 : i32, layout = #nvvm.mma_layout<row>, m = 8 : i32, n = 8 : i32}
+    : (!llvm.ptr) -> f64
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_wmma_load_c_f64
+llvm.func @nvvm_wmma_load_c_f64(%arg0: !llvm.ptr, %arg1 : i32) {
+  // CHECK: call { double, double } @llvm.nvvm.wmma.m8n8k4.load.c.row.stride.f64.p0(ptr %{{.*}}, i32 %{{.*}})
+  %0 = nvvm.wmma.load %arg0, %arg1
+    {eltype = #nvvm.mma_type<f64>, frag = #nvvm.mma_frag<c>, k = 4 : i32, layout = #nvvm.mma_layout<row>, m = 8 : i32, n = 8 : i32}
+    : (!llvm.ptr) -> !llvm.struct<(f64, f64)>
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_wmma_mma_f64
+llvm.func @nvvm_wmma_mma_f64(%0 : f64, %1 : f64, %2 : f64, %3 : f64) {
+  // CHECK: { double, double } @llvm.nvvm.wmma.m8n8k4.mma.row.col.f64(double %{{.*}}, double %{{.*}}, double %{{.*}}, double %{{.*}})
+  %r = nvvm.wmma.mma %0, %1, %2, %3
+    {eltypeA = #nvvm.mma_type<f64>, eltypeB = #nvvm.mma_type<f64>, k = 4 : i32, layoutA = #nvvm.mma_layout<row>, layoutB = #nvvm.mma_layout<col>, m = 8 : i32, n = 8 : i32}
+    : (f64, f64, f64, f64)
+    -> !llvm.struct<(f64, f64)>
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_wmma_store_d_f64
+llvm.func @nvvm_wmma_store_d_f64(%arg0: !llvm.ptr, %arg1 : i32, %arg2 : f64, %arg3 : f64) {
+  // CHECK: call void @llvm.nvvm.wmma.m8n8k4.store.d.row.stride.f64.p0(ptr %{{.*}}, double %{{.*}}, double %{{.*}}, i32 %{{.*}})
+  nvvm.wmma.store %arg0, %arg1, %arg2, %arg3
+    {eltype = #nvvm.mma_type<f64>, k = 4 : i32, layout = #nvvm.mma_layout<row>, m = 8 : i32, n = 8 : i32}
+    : !llvm.ptr, f64, f64
+  llvm.return
+}
+
 // CHECK-LABEL: @cp_async
 llvm.func @cp_async(%arg0: !llvm.ptr<3>, %arg1: !llvm.ptr<1>) {
   // CHECK: call void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %{{.*}}, ptr addrspace(1) %{{.*}})
diff --git a/mlir/test/Target/LLVMIR/omptarget-record-type-with-ptr-member-host.mlir b/mlir/test/Target/LLVMIR/omptarget-record-type-with-ptr-member-host.mlir
index a1e415c35e4b6..9640f03311af7 100644
--- a/mlir/test/Target/LLVMIR/omptarget-record-type-with-ptr-member-host.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-record-type-with-ptr-member-host.mlir
@@ -81,9 +81,8 @@ module attributes {omp.is_target_device = false, omp.target_triples = ["amdgcn-a
 // CHECK: %[[ARR_SECT_SIZE:.*]] = mul i64 %[[ARR_SECT_SIZE1]], 4
 // CHECK: %[[LFULL_ARR:.*]] = load ptr, ptr @full_arr, align 8
 // CHECK: %[[FULL_ARR_PTR:.*]] = getelementptr inbounds float, ptr %[[LFULL_ARR]], i64 0
-// CHECK: %[[ARR_SECT_OFFSET1:.*]] = mul i64 %[[ARR_SECT_OFFSET2]], 1
 // CHECK: %[[LARR_SECT:.*]] = load ptr, ptr @sect_arr, align 8
-// CHECK: %[[ARR_SECT_PTR:.*]] = getelementptr inbounds i32, ptr %[[LARR_SECT]], i64 %[[ARR_SECT_OFFSET1]]
+// CHECK: %[[ARR_SECT_PTR:.*]] = getelementptr inbounds i32, ptr %[[LARR_SECT]], i64 %[[ARR_SECT_OFFSET2]]
 // CHECK: %[[SCALAR_PTR_LOAD:.*]] = load ptr, ptr %[[SCALAR_BASE]], align 8
 // CHECK: %[[FULL_ARR_DESC_SIZE:.*]] = sdiv exact i64 48, ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64)
 // CHECK: %[[FULL_ARR_SIZE_CMP:.*]] = icmp eq ptr %[[FULL_ARR_PTR]], null
diff --git a/mlir/test/Target/LLVMIR/ptr.mlir b/mlir/test/Target/LLVMIR/ptr.mlir
index 473ac0598e9ce..94b6628772634 100644
--- a/mlir/test/Target/LLVMIR/ptr.mlir
+++ b/mlir/test/Target/LLVMIR/ptr.mlir
@@ -284,8 +284,8 @@ llvm.func @ptr_add_cst() -> !ptr.ptr<#llvm.address_space<0>> {
 
 // CHECK-LABEL: define i64 @ptr_diff_scalar
 // CHECK-SAME: (ptr %[[PTR1:.*]], ptr %[[PTR2:.*]]) {
-// CHECK-NEXT:   %[[P1INT:.*]] = ptrtoint ptr %[[PTR1]] to i64
-// CHECK-NEXT:   %[[P2INT:.*]] = ptrtoint ptr %[[PTR2]] to i64
+// CHECK-NEXT:   %[[P1INT:.*]] = ptrtoaddr ptr %[[PTR1]] to i64
+// CHECK-NEXT:   %[[P2INT:.*]] = ptrtoaddr ptr %[[PTR2]] to i64
 // CHECK-NEXT:   %[[DIFF:.*]] = sub i64 %[[P1INT]], %[[P2INT]]
 // CHECK-NEXT:   ret i64 %[[DIFF]]
 // CHECK-NEXT: }
@@ -296,8 +296,8 @@ llvm.func @ptr_diff_scalar(%ptr1: !ptr.ptr<#llvm.address_space<0>>, %ptr2: !ptr.
 
 // CHECK-LABEL: define i32 @ptr_diff_scalar_i32
 // CHECK-SAME: (ptr %[[PTR1:.*]], ptr %[[PTR2:.*]]) {
-// CHECK-NEXT:   %[[P1INT:.*]] = ptrtoint ptr %[[PTR1]] to i64
-// CHECK-NEXT:   %[[P2INT:.*]] = ptrtoint ptr %[[PTR2]] to i64
+// CHECK-NEXT:   %[[P1INT:.*]] = ptrtoaddr ptr %[[PTR1]] to i64
+// CHECK-NEXT:   %[[P2INT:.*]] = ptrtoaddr ptr %[[PTR2]] to i64
 // CHECK-NEXT:   %[[DIFF:.*]] = sub i64 %[[P1INT]], %[[P2INT]]
 // CHECK-NEXT:   %[[TRUNC:.*]] = trunc i64 %[[DIFF]] to i32
 // CHECK-NEXT:   ret i32 %[[TRUNC]]
@@ -309,8 +309,8 @@ llvm.func @ptr_diff_scalar_i32(%ptr1: !ptr.ptr<#llvm.address_space<0>>, %ptr2: !
 
 // CHECK-LABEL: define <4 x i64> @ptr_diff_vector
 // CHECK-SAME: (<4 x ptr> %[[PTRS1:.*]], <4 x ptr> %[[PTRS2:.*]]) {
-// CHECK-NEXT:   %[[P1INT:.*]] = ptrtoint <4 x ptr> %[[PTRS1]] to <4 x i64>
-// CHECK-NEXT:   %[[P2INT:.*]] = ptrtoint <4 x ptr> %[[PTRS2]] to <4 x i64>
+// CHECK-NEXT:   %[[P1INT:.*]] = ptrtoaddr <4 x ptr> %[[PTRS1]] to <4 x i64>
+// CHECK-NEXT:   %[[P2INT:.*]] = ptrtoaddr <4 x ptr> %[[PTRS2]] to <4 x i64>
 // CHECK-NEXT:   %[[DIFF:.*]] = sub <4 x i64> %[[P1INT]], %[[P2INT]]
 // CHECK-NEXT:   ret <4 x i64> %[[DIFF]]
 // CHECK-NEXT: }
@@ -321,8 +321,8 @@ llvm.func @ptr_diff_vector(%ptrs1: vector<4x!ptr.ptr<#llvm.address_space<0>>>, %
 
 // CHECK-LABEL: define <8 x i32> @ptr_diff_vector_i32
 // CHECK-SAME: (<8 x ptr> %[[PTRS1:.*]], <8 x ptr> %[[PTRS2:.*]]) {
-// CHECK-NEXT:   %[[P1INT:.*]] = ptrtoint <8 x ptr> %[[PTRS1]] to <8 x i64>
-// CHECK-NEXT:   %[[P2INT:.*]] = ptrtoint <8 x ptr> %[[PTRS2]] to <8 x i64>
+// CHECK-NEXT:   %[[P1INT:.*]] = ptrtoaddr <8 x ptr> %[[PTRS1]] to <8 x i64>
+// CHECK-NEXT:   %[[P2INT:.*]] = ptrtoaddr <8 x ptr> %[[PTRS2]] to <8 x i64>
 // CHECK-NEXT:   %[[DIFF:.*]] = sub <8 x i64> %[[P1INT]], %[[P2INT]]
 // CHECK-NEXT:   %[[TRUNC:.*]] = trunc <8 x i64> %[[DIFF]] to <8 x i32>
 // CHECK-NEXT:   ret <8 x i32> %[[TRUNC]]
@@ -344,8 +344,8 @@ llvm.func @ptr_diff_with_constants() -> i64 {
 
 // CHECK-LABEL: define i64 @ptr_diff_with_flags_nsw
 // CHECK-SAME: (ptr %[[PTR1:.*]], ptr %[[PTR2:.*]]) {
-// CHECK-NEXT:   %[[P1INT:.*]] = ptrtoint ptr %[[PTR1]] to i64
-// CHECK-NEXT:   %[[P2INT:.*]] = ptrtoint ptr %[[PTR2]] to i64
+// CHECK-NEXT:   %[[P1INT:.*]] = ptrtoaddr ptr %[[PTR1]] to i64
+// CHECK-NEXT:   %[[P2INT:.*]] = ptrtoaddr ptr %[[PTR2]] to i64
 // CHECK-NEXT:   %[[DIFF:.*]] = sub nsw i64 %[[P1INT]], %[[P2INT]]
 // CHECK-NEXT:   ret i64 %[[DIFF]]
 // CHECK-NEXT: }
@@ -356,8 +356,8 @@ llvm.func @ptr_diff_with_flags_nsw(%ptr1: !ptr.ptr<#llvm.address_space<0>>, %ptr
 
 // CHECK-LABEL: define i64 @ptr_diff_with_flags_nuw
 // CHECK-SAME: (ptr %[[PTR1:.*]], ptr %[[PTR2:.*]]) {
-// CHECK-NEXT:   %[[P1INT:.*]] = ptrtoint ptr %[[PTR1]] to i64
-// CHECK-NEXT:   %[[P2INT:.*]] = ptrtoint ptr %[[PTR2]] to i64
+// CHECK-NEXT:   %[[P1INT:.*]] = ptrtoaddr ptr %[[PTR1]] to i64
+// CHECK-NEXT:   %[[P2INT:.*]] = ptrtoaddr ptr %[[PTR2]] to i64
 // CHECK-NEXT:   %[[DIFF:.*]] = sub nuw i64 %[[P1INT]], %[[P2INT]]
 // CHECK-NEXT:   ret i64 %[[DIFF]]
 // CHECK-NEXT: }
@@ -368,8 +368,8 @@ llvm.func @ptr_diff_with_flags_nuw(%ptr1: !ptr.ptr<#llvm.address_space<0>>, %ptr
 
 // CHECK-LABEL: define i64 @ptr_diff_with_flags_nsw_nuw
 // CHECK-SAME: (ptr %[[PTR1:.*]], ptr %[[PTR2:.*]]) {
-// CHECK-NEXT:   %[[P1INT:.*]] = ptrtoint ptr %[[PTR1]] to i64
-// CHECK-NEXT:   %[[P2INT:.*]] = ptrtoint ptr %[[PTR2]] to i64
+// CHECK-NEXT:   %[[P1INT:.*]] = ptrtoaddr ptr %[[PTR1]] to i64
+// CHECK-NEXT:   %[[P2INT:.*]] = ptrtoaddr ptr %[[PTR2]] to i64
 // CHECK-NEXT:   %[[DIFF:.*]] = sub nuw nsw i64 %[[P1INT]], %[[P2INT]]
 // CHECK-NEXT:   ret i64 %[[DIFF]]
 // CHECK-NEXT: }
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 30126f6bff05a..8a848221a50dd 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -1040,6 +1040,36 @@ llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
   llvm.return
 }
 
+// CHECK-LABEL: rocdl.tensor.load.to.lds
+llvm.func @rocdl.tensor.load.to.lds(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>,
+                                    %dgroup2 : vector<4xi32>, %dgroup3 : vector<4xi32>) {
+  // CHECK: call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> %{{.*}}, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i32 0)
+  rocdl.tensor.load.to.lds %dgroup0, %dgroup1, %dgroup2, %dgroup3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+  llvm.return
+}
+
+// CHECK-LABEL: rocdl.tensor.store.from.lds
+llvm.func @rocdl.tensor.store.from.lds(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>,
+                                       %dgroup2 : vector<4xi32>, %dgroup3 : vector<4xi32>) {
+  // CHECK: call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> %{{.*}}, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i32 0)
+  rocdl.tensor.store.from.lds %dgroup0, %dgroup1, %dgroup2, %dgroup3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+  llvm.return
+}
+
+// CHECK-LABEL: rocdl.tensor.load.to.lds.d2
+llvm.func @rocdl.tensor.load.to.lds.d2(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>) {
+  // CHECK: call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> %{{.*}}, <8 x i32> %{{.*}}, i32 0)
+  rocdl.tensor.load.to.lds.d2 %dgroup0, %dgroup1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+  llvm.return
+}
+
+// CHECK-LABEL: rocdl.tensor.store.from.lds.d2
+llvm.func @rocdl.tensor.store.from.lds.d2(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>) {
+  // CHECK: call void @llvm.amdgcn.tensor.store.from.lds.d2(<4 x i32> %{{.*}}, <8 x i32> %{{.*}}, i32 0)
+  rocdl.tensor.store.from.lds.d2 %dgroup0, %dgroup1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+  llvm.return
+}
+
 llvm.func @rocdl.make.buffer.rsrc(%ptr : !llvm.ptr,
                                   %stride : i16,
                                   %numRecords : i64,
diff --git a/mlir/test/Target/SPIRV/decorations-intel-cache-controls.mlir b/mlir/test/Target/SPIRV/decorations-intel-cache-controls.mlir
new file mode 100644
index 0000000000000..62d15de5ab03c
--- /dev/null
+++ b/mlir/test/Target/SPIRV/decorations-intel-cache-controls.mlir
@@ -0,0 +1,42 @@
+// RUN: mlir-translate --no-implicit-module --split-input-file --test-spirv-roundtrip --verify-diagnostics %s | FileCheck %s
+
+// CHECK-LABEL: spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [CacheControlsINTEL], [SPV_INTEL_cache_controls]> {
+spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [CacheControlsINTEL], [SPV_INTEL_cache_controls]> {
+  spirv.func @cache_controls() "None" {
+    // CHECK: spirv.Variable {cache_control_load_intel = [#spirv.cache_control_load_intel<cache_level = 0, load_cache_control = Uncached>, #spirv.cache_control_load_intel<cache_level = 1, load_cache_control = Cached>, #spirv.cache_control_load_intel<cache_level = 2, load_cache_control = InvalidateAfterR>]} : !spirv.ptr<f32, Function>
+    %0 = spirv.Variable {cache_control_load_intel = [#spirv.cache_control_load_intel<cache_level = 0, load_cache_control = Uncached>, #spirv.cache_control_load_intel<cache_level = 1, load_cache_control = Cached>, #spirv.cache_control_load_intel<cache_level = 2, load_cache_control = InvalidateAfterR>]} : !spirv.ptr<f32, Function>
+    // CHECK: spirv.Variable {cache_control_store_intel = [#spirv.cache_control_store_intel<cache_level = 0, store_cache_control = Uncached>, #spirv.cache_control_store_intel<cache_level = 1, store_cache_control = WriteThrough>, #spirv.cache_control_store_intel<cache_level = 2, store_cache_control = WriteBack>]} : !spirv.ptr<f32, Function>
+    %1 = spirv.Variable {cache_control_store_intel = [#spirv.cache_control_store_intel<cache_level = 0, store_cache_control = Uncached>, #spirv.cache_control_store_intel<cache_level = 1, store_cache_control = WriteThrough>, #spirv.cache_control_store_intel<cache_level = 2, store_cache_control = WriteBack>]} : !spirv.ptr<f32, Function>
+    spirv.Return
+  }
+}
+
+// -----
+
+spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [CacheControlsINTEL], [SPV_INTEL_cache_controls]> {
+  spirv.func @cache_controls_invalid_type() "None" {
+    // expected-error@below {{expecting array attribute of CacheControlLoadINTEL for CacheControlLoadINTEL}}
+    %0 = spirv.Variable {cache_control_load_intel = #spirv.cache_control_load_intel<cache_level = 0, load_cache_control = Uncached>} : !spirv.ptr<f32, Function>
+    spirv.Return
+  }
+}
+
+// -----
+
+spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [CacheControlsINTEL], [SPV_INTEL_cache_controls]> {
+  spirv.func @cache_controls_invalid_type() "None" {
+    // expected-error@below {{expecting array attribute of CacheControlStoreINTEL for CacheControlStoreINTEL}}
+    %0 = spirv.Variable {cache_control_store_intel = [#spirv.cache_control_store_intel<cache_level = 0, store_cache_control = Uncached>, 0 : i32]} : !spirv.ptr<f32, Function>
+    spirv.Return
+  }
+}
+
+// -----
+
+spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [CacheControlsINTEL], [SPV_INTEL_cache_controls]> {
+  spirv.func @cache_controls_invalid_type() "None" {
+    // expected-error@below {{expecting non-empty array attribute of CacheControlStoreINTEL for CacheControlStoreINTEL}}
+    %0 = spirv.Variable {cache_control_store_intel = []} : !spirv.ptr<f32, Function>
+    spirv.Return
+  }
+}
diff --git a/mlir/test/Target/SPIRV/decorations.mlir b/mlir/test/Target/SPIRV/decorations.mlir
index 90ba690e50b73..712fd17623402 100644
--- a/mlir/test/Target/SPIRV/decorations.mlir
+++ b/mlir/test/Target/SPIRV/decorations.mlir
@@ -1,27 +1,32 @@
-// RUN: mlir-translate -no-implicit-module -split-input-file -test-spirv-roundtrip -verify-diagnostics %s | FileCheck %s
+// RUN: mlir-translate --no-implicit-module --split-input-file --test-spirv-roundtrip %s | FileCheck %s
 
-spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
+// RUN: %if spirv-tools %{ rm -rf %t %}
+// RUN: %if spirv-tools %{ mkdir %t %}
+// RUN: %if spirv-tools %{ mlir-translate --no-implicit-module --serialize-spirv --split-input-file --spirv-save-validation-files-with-prefix=%t/module %s %}
+// RUN: %if spirv-tools %{ spirv-val %t %}
+
+spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, Linkage], []> {
   // CHECK: location = 0 : i32
   spirv.GlobalVariable @var {location = 0 : i32} : !spirv.ptr<vector<4xf32>, Input>
 }
 
 // -----
 
-spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
+spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, Linkage], []> {
   // CHECK: no_perspective
   spirv.GlobalVariable @var {no_perspective} : !spirv.ptr<vector<4xf32>, Input>
 }
 
 // -----
 
-spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
+spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, Linkage], []> {
   // CHECK: flat
   spirv.GlobalVariable @var {flat} : !spirv.ptr<si32, Input>
 }
 
 // -----
 
-spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
+spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, Linkage], [SPV_KHR_variable_pointers]> {
   // CHECK: aliased
   // CHECK: aliased
   spirv.GlobalVariable @var1 bind(0, 0) {aliased} : !spirv.ptr<!spirv.struct<(!spirv.array<4xf32, stride=4>[0])>, StorageBuffer>
@@ -30,28 +35,28 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
 
 // -----
 
-spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
+spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, Linkage], [SPV_KHR_variable_pointers]> {
   // CHECK: non_readable
   spirv.GlobalVariable @var bind(0, 0) {non_readable} : !spirv.ptr<!spirv.struct<(!spirv.array<4xf32, stride=4>[0])>, StorageBuffer>
 }
 
 // -----
 
-spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
+spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, Linkage], [SPV_KHR_variable_pointers]> {
   // CHECK: non_writable
   spirv.GlobalVariable @var bind(0, 0) {non_writable} : !spirv.ptr<!spirv.struct<(!spirv.array<4xf32, stride=4>[0])>, StorageBuffer>
 }
 
 // -----
 
-spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
+spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, Linkage], [SPV_KHR_variable_pointers]> {
   // CHECK: restrict
   spirv.GlobalVariable @var bind(0, 0) {restrict} : !spirv.ptr<!spirv.struct<(!spirv.array<4xf32, stride=4>[0])>, StorageBuffer>
 }
 
 // -----
 
-spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
+spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, Linkage], []> {
   // CHECK: relaxed_precision
   spirv.GlobalVariable @var {location = 0 : i32, relaxed_precision} : !spirv.ptr<vector<4xf32>, Output>
 }
@@ -84,7 +89,7 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, Linkage], []> {
 
 // -----
 
-spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Kernel], []> {
+spirv.module Logical OpenCL requires #spirv.vce<v1.0, [Kernel, Linkage], [SPV_KHR_no_integer_wrap_decoration]> {
 spirv.func @iadd_decorations(%arg: i32) -> i32 "None" {
   // CHECK: spirv.IAdd %{{.*}}, %{{.*}} {no_signed_wrap, no_unsigned_wrap}
   %0 = spirv.IAdd %arg, %arg {no_signed_wrap, no_unsigned_wrap} : i32
@@ -94,7 +99,7 @@ spirv.func @iadd_decorations(%arg: i32) -> i32 "None" {
 
 // -----
 
-spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Kernel], []> {
+spirv.module Logical OpenCL requires #spirv.vce<v1.0, [Kernel, Linkage], []> {
 spirv.func @fadd_decorations(%arg: f32) -> f32 "None" {
   // CHECK: spirv.FAdd %{{.*}}, %{{.*}} {fp_fast_math_mode = #spirv.fastmath_mode<NotNaN|NotInf|NSZ>}
   %0 = spirv.FAdd %arg, %arg {fp_fast_math_mode = #spirv.fastmath_mode<NotNaN|NotInf|NSZ>} : f32
@@ -104,7 +109,7 @@ spirv.func @fadd_decorations(%arg: f32) -> f32 "None" {
 
 // -----
 
-spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Kernel], []> {
+spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, Linkage], []> {
 spirv.func @fmul_decorations(%arg: f32) -> f32 "None" {
   // CHECK: spirv.FMul %{{.*}}, %{{.*}} {no_contraction}
   %0 = spirv.FMul %arg, %arg {no_contraction} : f32
@@ -114,7 +119,7 @@ spirv.func @fmul_decorations(%arg: f32) -> f32 "None" {
 
 // -----
 
-spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Kernel, Float16], []> {
+spirv.module Logical OpenCL requires #spirv.vce<v1.0, [Kernel, Linkage, Float16], []> {
 spirv.func @fp_rounding_mode(%arg: f32) -> f16 "None" {
   // CHECK: spirv.FConvert %arg0 {fp_rounding_mode = #spirv.fp_rounding_mode<RTN>} : f32 to f16
   %0 = spirv.FConvert %arg {fp_rounding_mode = #spirv.fp_rounding_mode<RTN>} : f32 to f16
@@ -124,51 +129,7 @@ spirv.func @fp_rounding_mode(%arg: f32) -> f16 "None" {
 
 // -----
 
-// CHECK-LABEL: spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [CacheControlsINTEL], [SPV_INTEL_cache_controls]> {
-
-spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [CacheControlsINTEL], [SPV_INTEL_cache_controls]> {
-  spirv.func @cache_controls() "None" {
-    // CHECK: spirv.Variable {cache_control_load_intel = [#spirv.cache_control_load_intel<cache_level = 0, load_cache_control = Uncached>, #spirv.cache_control_load_intel<cache_level = 1, load_cache_control = Cached>, #spirv.cache_control_load_intel<cache_level = 2, load_cache_control = InvalidateAfterR>]} : !spirv.ptr<f32, Function>
-    %0 = spirv.Variable {cache_control_load_intel = [#spirv.cache_control_load_intel<cache_level = 0, load_cache_control = Uncached>, #spirv.cache_control_load_intel<cache_level = 1, load_cache_control = Cached>, #spirv.cache_control_load_intel<cache_level = 2, load_cache_control = InvalidateAfterR>]} : !spirv.ptr<f32, Function>
-    // CHECK: spirv.Variable {cache_control_store_intel = [#spirv.cache_control_store_intel<cache_level = 0, store_cache_control = Uncached>, #spirv.cache_control_store_intel<cache_level = 1, store_cache_control = WriteThrough>, #spirv.cache_control_store_intel<cache_level = 2, store_cache_control = WriteBack>]} : !spirv.ptr<f32, Function>
-    %1 = spirv.Variable {cache_control_store_intel = [#spirv.cache_control_store_intel<cache_level = 0, store_cache_control = Uncached>, #spirv.cache_control_store_intel<cache_level = 1, store_cache_control = WriteThrough>, #spirv.cache_control_store_intel<cache_level = 2, store_cache_control = WriteBack>]} : !spirv.ptr<f32, Function>
-    spirv.Return
-  }
-}
-
-// -----
-
-spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [CacheControlsINTEL], [SPV_INTEL_cache_controls]> {
-  spirv.func @cache_controls_invalid_type() "None" {
-    // expected-error@below {{expecting array attribute of CacheControlLoadINTEL for CacheControlLoadINTEL}}
-    %0 = spirv.Variable {cache_control_load_intel = #spirv.cache_control_load_intel<cache_level = 0, load_cache_control = Uncached>} : !spirv.ptr<f32, Function>
-    spirv.Return
-  }
-}
-
-// -----
-
-spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [CacheControlsINTEL], [SPV_INTEL_cache_controls]> {
-  spirv.func @cache_controls_invalid_type() "None" {
-    // expected-error@below {{expecting array attribute of CacheControlStoreINTEL for CacheControlStoreINTEL}}
-    %0 = spirv.Variable {cache_control_store_intel = [#spirv.cache_control_store_intel<cache_level = 0, store_cache_control = Uncached>, 0 : i32]} : !spirv.ptr<f32, Function>
-    spirv.Return
-  }
-}
-
-// -----
-
-spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [CacheControlsINTEL], [SPV_INTEL_cache_controls]> {
-  spirv.func @cache_controls_invalid_type() "None" {
-    // expected-error@below {{expecting non-empty array attribute of CacheControlStoreINTEL for CacheControlStoreINTEL}}
-    %0 = spirv.Variable {cache_control_store_intel = []} : !spirv.ptr<f32, Function>
-    spirv.Return
-  }
-}
-
-// -----
-
-spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
+spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, Linkage], []> {
   // CHECK: spirv.func @relaxed_precision_arg({{%.*}}: !spirv.ptr<f32, Function> {spirv.decoration = #spirv.decoration<RelaxedPrecision>}) "None" attributes {relaxed_precision} {
   spirv.func @relaxed_precision_arg(%arg0: !spirv.ptr<f32, Function> {spirv.decoration = #spirv.decoration<RelaxedPrecision>}) -> () "None" attributes {relaxed_precision} {
     spirv.Return
diff --git a/mlir/test/Transforms/test-legalize-type-conversion.mlir b/mlir/test/Transforms/test-legalize-type-conversion.mlir
index c003f8b2cb1cd..91f83a0afaeef 100644
--- a/mlir/test/Transforms/test-legalize-type-conversion.mlir
+++ b/mlir/test/Transforms/test-legalize-type-conversion.mlir
@@ -143,3 +143,25 @@ func.func @test_signature_conversion_no_converter() {
   return
 }
 
+// -----
+
+// CHECK-LABEL: func @test_unstructured_cf_conversion(
+//  CHECK-SAME:     %[[arg0:.*]]: f64, %[[c:.*]]: i1)
+//       CHECK:   %[[cast1:.*]] = "test.cast"(%[[arg0]]) : (f64) -> f32
+//       CHECK:   "test.foo"(%[[cast1]])
+//       CHECK:   cf.br ^[[bb1:.*]](%[[arg0]] : f64)
+//       CHECK: ^[[bb1]](%[[arg1:.*]]: f64):
+//       CHECK:   cf.cond_br %[[c]], ^[[bb1]](%[[arg1]] : f64), ^[[bb2:.*]](%[[arg1]] : f64)
+//       CHECK: ^[[bb2]](%[[arg2:.*]]: f64):
+//       CHECK:   %[[cast2:.*]] = "test.cast"(%[[arg2]]) : (f64) -> f32
+//       CHECK:   "test.bar"(%[[cast2]])
+//       CHECK: return
+func.func @test_unstructured_cf_conversion(%arg0: f32, %c: i1) {
+  "test.foo"(%arg0) : (f32) -> ()
+  cf.br ^bb1(%arg0: f32)
+^bb1(%arg1: f32):
+  cf.cond_br %c, ^bb1(%arg1 : f32), ^bb2(%arg1 : f32)
+^bb2(%arg2: f32):
+  "test.bar"(%arg2) : (f32) -> ()
+  return
+}
diff --git a/mlir/test/lib/Analysis/DataFlow/TestDenseBackwardDataFlowAnalysis.cpp b/mlir/test/lib/Analysis/DataFlow/TestDenseBackwardDataFlowAnalysis.cpp
index eb0d9801e7d3f..7a7a58384fbb8 100644
--- a/mlir/test/lib/Analysis/DataFlow/TestDenseBackwardDataFlowAnalysis.cpp
+++ b/mlir/test/lib/Analysis/DataFlow/TestDenseBackwardDataFlowAnalysis.cpp
@@ -66,7 +66,7 @@ class NextAccessAnalysis : public DenseBackwardDataFlowAnalysis<NextAccess> {
 
   void visitRegionBranchControlFlowTransfer(RegionBranchOpInterface branch,
                                             RegionBranchPoint regionFrom,
-                                            RegionBranchPoint regionTo,
+                                            RegionSuccessor regionTo,
                                             const NextAccess &after,
                                             NextAccess *before) override;
 
@@ -240,7 +240,7 @@ void NextAccessAnalysis::visitCallControlFlowTransfer(
 
 void NextAccessAnalysis::visitRegionBranchControlFlowTransfer(
     RegionBranchOpInterface branch, RegionBranchPoint regionFrom,
-    RegionBranchPoint regionTo, const NextAccess &after, NextAccess *before) {
+    RegionSuccessor regionTo, const NextAccess &after, NextAccess *before) {
   LDBG() << "visitRegionBranchControlFlowTransfer: "
          << OpWithFlags(branch.getOperation(), OpPrintingFlags().skipRegions());
   LDBG() << "  regionFrom: " << (regionFrom.isParent() ? "parent" : "region");
diff --git a/mlir/test/lib/Dialect/OpenACC/TestOpenACCSupport.cpp b/mlir/test/lib/Dialect/OpenACC/TestOpenACCSupport.cpp
index 8bf984bdc2632..7c8b08489c62e 100644
--- a/mlir/test/lib/Dialect/OpenACC/TestOpenACCSupport.cpp
+++ b/mlir/test/lib/Dialect/OpenACC/TestOpenACCSupport.cpp
@@ -57,6 +57,28 @@ void TestOpenACCSupportPass::runOnOperation() {
                      << "\"\n";
       }
     }
+
+    // Check for test.recipe_name attribute. This is the marker used to identify
+    // the operations that need to be tested for getRecipeName.
+    if (auto recipeAttr =
+            op->getAttrOfType<RecipeKindAttr>("test.recipe_name")) {
+      RecipeKind kind = recipeAttr.getValue();
+      // Get the type from the first result if available
+      if (op->getNumResults() > 0) {
+        Type type = op->getResult(0).getType();
+        std::string recipeName =
+            support.getRecipeName(kind, type, op->getResult(0));
+        llvm::outs() << "op=" << *op
+                     << "\n\tgetRecipeName(kind=" << stringifyRecipeKind(kind)
+                     << ", type=" << type << ")=\"" << recipeName << "\"\n";
+      }
+    }
+
+    // Check for test.emit_nyi attribute. This is the marker used to
+    // test whether the not yet implemented case is reported correctly.
+    if (auto messageAttr = op->getAttrOfType<StringAttr>("test.emit_nyi")) {
+      support.emitNYI(op->getLoc(), messageAttr.getValue());
+    }
   });
 }
 
diff --git a/mlir/test/lib/Dialect/Test/CMakeLists.txt b/mlir/test/lib/Dialect/Test/CMakeLists.txt
index f099d01abd31a..9354a85d984c9 100644
--- a/mlir/test/lib/Dialect/Test/CMakeLists.txt
+++ b/mlir/test/lib/Dialect/Test/CMakeLists.txt
@@ -71,6 +71,7 @@ add_mlir_library(MLIRTestDialect
   )
 mlir_target_link_libraries(MLIRTestDialect PUBLIC
   MLIRControlFlowInterfaces
+  MLIRControlFlowTransforms
   MLIRDataLayoutInterfaces
   MLIRDerivedAttributeOpInterface
   MLIRDestinationStyleOpInterface
diff --git a/mlir/test/lib/Dialect/Test/TestOpDefs.cpp b/mlir/test/lib/Dialect/Test/TestOpDefs.cpp
index b211e243f234c..4d4ec02546bc7 100644
--- a/mlir/test/lib/Dialect/Test/TestOpDefs.cpp
+++ b/mlir/test/lib/Dialect/Test/TestOpDefs.cpp
@@ -633,8 +633,9 @@ ParseResult RegionIfOp::parse(OpAsmParser &parser, OperationState &result) {
                                 parser.getCurrentLocation(), result.operands);
 }
 
-OperandRange RegionIfOp::getEntrySuccessorOperands(RegionBranchPoint point) {
-  assert(llvm::is_contained({&getThenRegion(), &getElseRegion()}, point) &&
+OperandRange RegionIfOp::getEntrySuccessorOperands(RegionSuccessor successor) {
+  assert(llvm::is_contained({&getThenRegion(), &getElseRegion()},
+                            successor.getSuccessor()) &&
          "invalid region index");
   return getOperands();
 }
@@ -643,10 +644,11 @@ void RegionIfOp::getSuccessorRegions(
     RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> &regions) {
   // We always branch to the join region.
   if (!point.isParent()) {
-    if (point != getJoinRegion())
+    if (point.getTerminatorPredecessorOrNull()->getParentRegion() !=
+        &getJoinRegion())
       regions.push_back(RegionSuccessor(&getJoinRegion(), getJoinArgs()));
     else
-      regions.push_back(RegionSuccessor(getResults()));
+      regions.push_back(RegionSuccessor(getOperation(), getResults()));
     return;
   }
 
@@ -673,7 +675,7 @@ void AnyCondOp::getSuccessorRegions(RegionBranchPoint point,
   if (point.isParent())
     regions.emplace_back(&getRegion());
   else
-    regions.emplace_back(getResults());
+    regions.emplace_back(getOperation(), getResults());
 }
 
 void AnyCondOp::getRegionInvocationBounds(
@@ -1107,11 +1109,11 @@ void LoopBlockOp::getSuccessorRegions(
   if (point.isParent())
     return;
 
-  regions.emplace_back((*this)->getResults());
+  regions.emplace_back(getOperation(), getOperation()->getResults());
 }
 
-OperandRange LoopBlockOp::getEntrySuccessorOperands(RegionBranchPoint point) {
-  assert(point == getBody());
+OperandRange LoopBlockOp::getEntrySuccessorOperands(RegionSuccessor successor) {
+  assert(successor.getSuccessor() == &getBody());
   return MutableOperandRange(getInitMutable());
 }
 
@@ -1120,8 +1122,8 @@ OperandRange LoopBlockOp::getEntrySuccessorOperands(RegionBranchPoint point) {
 //===----------------------------------------------------------------------===//
 
 MutableOperandRange
-LoopBlockTerminatorOp::getMutableSuccessorOperands(RegionBranchPoint point) {
-  if (point.isParent())
+LoopBlockTerminatorOp::getMutableSuccessorOperands(RegionSuccessor successor) {
+  if (successor.isParent())
     return getExitArgMutable();
   return getNextIterArgMutable();
 }
@@ -1213,7 +1215,7 @@ void TestStoreWithARegion::getSuccessorRegions(
   if (point.isParent())
     regions.emplace_back(&getBody(), getBody().front().getArguments());
   else
-    regions.emplace_back();
+    regions.emplace_back(getOperation(), getOperation()->getResults());
 }
 
 //===----------------------------------------------------------------------===//
@@ -1227,7 +1229,7 @@ void TestStoreWithALoopRegion::getSuccessorRegions(
   // enter the body.
   regions.emplace_back(
       RegionSuccessor(&getBody(), getBody().front().getArguments()));
-  regions.emplace_back();
+  regions.emplace_back(getOperation(), getOperation()->getResults());
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td
index 05a33cf1afd94..a3430ba49a291 100644
--- a/mlir/test/lib/Dialect/Test/TestOps.td
+++ b/mlir/test/lib/Dialect/Test/TestOps.td
@@ -2581,7 +2581,7 @@ def LoopBlockTerminatorOp : TEST_Op<"loop_block_term",
 
 def TestNoTerminatorOp : TEST_Op<"switch_with_no_break", [
     NoTerminator,
-    DeclareOpInterfaceMethods<RegionBranchOpInterface, ["getSuccessorRegions"]>
+    DeclareOpInterfaceMethods<RegionBranchOpInterface>
   ]> {
   let arguments = (ins Index:$arg, DenseI64ArrayAttr:$cases);
   let regions = (region VariadicRegion<SizedRegion<1>>:$caseRegions);
diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp
index efbdbfb65d65b..fd2b943ff1296 100644
--- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp
+++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp
@@ -11,6 +11,7 @@
 #include "TestTypes.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/CommonFolders.h"
+#include "mlir/Dialect/ControlFlow/Transforms/StructuralTypeConversions.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/Func/Transforms/FuncConversions.h"
 #include "mlir/Dialect/SCF/Transforms/Patterns.h"
@@ -2042,6 +2043,10 @@ struct TestTypeConversionDriver
     });
     converter.addConversion([](IndexType type) { return type; });
     converter.addConversion([](IntegerType type, SmallVectorImpl<Type> &types) {
+      if (type.isInteger(1)) {
+        // i1 is legal.
+        types.push_back(type);
+      }
       if (type.isInteger(38)) {
         // i38 is legal.
         types.push_back(type);
@@ -2175,6 +2180,8 @@ struct TestTypeConversionDriver
                                                               converter);
     mlir::scf::populateSCFStructuralTypeConversionsAndLegality(
         converter, patterns, target);
+    mlir::cf::populateCFStructuralTypeConversionsAndLegality(converter,
+                                                             patterns, target);
 
     ConversionConfig config;
     config.allowPatternRollback = allowPatternRollback;
diff --git a/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.cpp b/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.cpp
index 496f18bc49fad..61db9d2b44461 100644
--- a/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.cpp
+++ b/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.cpp
@@ -797,7 +797,7 @@ DiagnosedSilenceableFailure mlir::test::TestProduceInvalidIR::applyToOne(
   // Provide some IR that does not verify.
   rewriter.setInsertionPointToStart(&target->getRegion(0).front());
   TestDummyPayloadOp::create(rewriter, target->getLoc(), TypeRange(),
-                             ValueRange(), /*failToVerify=*/true);
+                             ValueRange(), /*fail_to_verify=*/true);
   return DiagnosedSilenceableFailure::success();
 }
 
diff --git a/mlir/test/mlir-runner/memref-reshape.mlir b/mlir/test/mlir-runner/memref-reshape.mlir
index 8c17f1fd02358..b264e0285953f 100644
--- a/mlir/test/mlir-runner/memref-reshape.mlir
+++ b/mlir/test/mlir-runner/memref-reshape.mlir
@@ -65,8 +65,8 @@ func.func @reshape_ranked_memref_to_ranked(%input : memref<2x3xf32>,
 func.func @reshape_unranked_memref_to_ranked(%input : memref<2x3xf32>,
                                         %shape : memref<2xindex>) {
   %unranked_input = memref.cast %input : memref<2x3xf32> to memref<*xf32>
-  %output = memref.reshape %input(%shape)
-                : (memref<2x3xf32>, memref<2xindex>) -> memref<?x?xf32>
+  %output = memref.reshape %unranked_input(%shape)
+                : (memref<*xf32>, memref<2xindex>) -> memref<?x?xf32>
 
   %unranked_output = memref.cast %output : memref<?x?xf32> to memref<*xf32>
   call @printMemrefF32(%unranked_output) : (memref<*xf32>) -> ()
@@ -95,8 +95,8 @@ func.func @reshape_unranked_memref_to_unranked(%input : memref<2x3xf32>,
                                           %shape : memref<2xindex>) {
   %unranked_input = memref.cast %input : memref<2x3xf32> to memref<*xf32>
   %dyn_size_shape = memref.cast %shape : memref<2xindex> to memref<?xindex>
-  %output = memref.reshape %input(%dyn_size_shape)
-                : (memref<2x3xf32>, memref<?xindex>) -> memref<*xf32>
+  %output = memref.reshape %unranked_input(%dyn_size_shape)
+                : (memref<*xf32>, memref<?xindex>) -> memref<*xf32>
 
   call @printMemrefF32(%output) : (memref<*xf32>) -> ()
   // CHECK: rank = 2 offset = 0 sizes = [3, 2] strides = [2, 1] data =
diff --git a/mlir/test/mlir-tblgen/op-properties.td b/mlir/test/mlir-tblgen/op-properties.td
index a9c784cba0b6d..cb9bd3dc868fe 100644
--- a/mlir/test/mlir-tblgen/op-properties.td
+++ b/mlir/test/mlir-tblgen/op-properties.td
@@ -32,7 +32,7 @@ def OpWithProps : NS_Op<"op_with_props"> {
     ArrayProp<StringProp>:$strings,
     DefaultValuedProp<I32Prop, "0">:$default_int,
     OptionalProp<I64Prop>:$optional,
-    DefaultI64Array:$intArray
+    DefaultI64Array:$value
   );
 }
 
@@ -94,10 +94,10 @@ def OpWithOptionalPropsAndAttrs :
 // DECL: ::llvm::ArrayRef<std::string> getStrings()
 // DECL: using default_intTy = int32_t;
 // DECL: default_intTy default_int = 0;
-// DECL: intArrayTy intArray = ::llvm::SmallVector<int64_t>{};
-// DECL: ::llvm::ArrayRef<int64_t> getIntArray()
+// DECL: valueTy value = ::llvm::SmallVector<int64_t>{};
+// DECL: ::llvm::ArrayRef<int64_t> getValue()
 // DECL: return ::llvm::ArrayRef<int64_t>{propStorage}
-// DECL: void setIntArray(::llvm::ArrayRef<int64_t> propValue)
+// DECL: void setValue(::llvm::ArrayRef<int64_t> propValue)
 // DECL: propStorage.assign
 // DECL-LABEL: class OpWithProps :
 // DECL: setString(::llvm::StringRef newString)
@@ -111,14 +111,14 @@ def OpWithOptionalPropsAndAttrs :
 // DECL-SAME: ::llvm::ArrayRef<std::string> strings,
 // DECL-SAME: /*optional*/int32_t default_int = 0,
 // DECL-SAME: /*optional*/std::optional<int64_t> optional = std::nullopt,
-// DECL-SAME: /*optional*/::llvm::ArrayRef<int64_t> intArray = ::llvm::ArrayRef<int64_t>{});
+// DECL-SAME: /*optional*/::llvm::ArrayRef<int64_t> value = ::llvm::ArrayRef<int64_t>{});
 
 // DEFS-LABEL: OpWithProps::computePropertiesHash
-// DEFS: hash_intArray
+// DEFS: hash_value_
 // DEFS: using ::llvm::hash_value;
 // DEFS-NEXT: return hash_value(::llvm::ArrayRef<int64_t>{propStorage})
 // DEFS: hash_value(prop.optional)
-// DEFS: hash_intArray(prop.intArray)
+// DEFS: hash_value_(prop.value)
 
 // -----
 
diff --git a/mlir/test/python/CMakeLists.txt b/mlir/test/python/CMakeLists.txt
index 2c123811c2998..c81f75fc6a1af 100644
--- a/mlir/test/python/CMakeLists.txt
+++ b/mlir/test/python/CMakeLists.txt
@@ -11,7 +11,7 @@ add_public_tablegen_target(MLIRPythonTestIncGen)
 
 add_subdirectory(lib)
 
-set(MLIR_PYTHON_TEST_DEPENDS MLIRPythonModules mlir-runner)
+set(MLIR_PYTHON_TEST_DEPENDS MLIRPythonModules mlir-runner mlir_c_runner_utils mlir_runner_utils)
 if(NOT MLIR_STANDALONE_BUILD)
   list(APPEND MLIR_PYTHON_TEST_DEPENDS FileCheck count not)
 endif()
diff --git a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
index 371864830a3c1..4d9b1b2328018 100644
--- a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
@@ -1629,7 +1629,7 @@ void OpEmitter::genPropertiesSupport() {
   // Hashing for the property
 
   const char *propHashFmt = R"decl(
-  auto hash_{0} = [] (const auto &propStorage) -> llvm::hash_code {
+  auto hash_{0}_ = [] (const auto &propStorage) -> llvm::hash_code {
     using ::llvm::hash_value;
     return {1};
   };
@@ -1655,7 +1655,7 @@ void OpEmitter::genPropertiesSupport() {
         if (const auto *namedProperty =
                 llvm::dyn_cast_if_present<const NamedProperty *>(attrOrProp)) {
           if (!namedProperty->prop.getHashPropertyCall().empty()) {
-            hashMethod << "\n    hash_" << namedProperty->name << "(prop."
+            hashMethod << "\n    hash_" << namedProperty->name << "_(prop."
                        << namedProperty->name << ")";
           } else {
             hashMethod << "\n    hash_value(prop." << namedProperty->name
@@ -2632,11 +2632,13 @@ void OpEmitter::genInlineCreateBody(
     interleaveComma(nonBuilderStateArgsList, nonBuilderStateArgsOS);
     nonBuilderStateArgs = ", " + nonBuilderStateArgs;
   }
-  cWithLoc->body() << llvm::formatv(inlineCreateBody, locParamName,
-                                    nonBuilderStateArgs,
-                                    opClass.getClassName());
-  cImplicitLoc->body() << llvm::formatv(inlineCreateBodyImplicitLoc,
-                                        nonBuilderStateArgs);
+  if (cWithLoc)
+    cWithLoc->body() << llvm::formatv(inlineCreateBody, locParamName,
+                                      nonBuilderStateArgs,
+                                      opClass.getClassName());
+  if (cImplicitLoc)
+    cImplicitLoc->body() << llvm::formatv(inlineCreateBodyImplicitLoc,
+                                          nonBuilderStateArgs);
 }
 
 void OpEmitter::genSeparateArgParamBuilder() {
diff --git a/mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp b/mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp
index 6ac9a873e6154..d6203b97e00d7 100644
--- a/mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp
+++ b/mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp
@@ -766,7 +766,9 @@ void testShortDataEntryOpBuildersMappableVar(OpBuilder &b, MLIRContext &context,
 
 struct IntegerOpenACCMappableModel
     : public mlir::acc::MappableType::ExternalModel<IntegerOpenACCMappableModel,
-                                                    IntegerType> {};
+                                                    IntegerType> {
+  bool hasUnknownDimensions(mlir::Type type) const { return false; }
+};
 
 TEST_F(OpenACCOpsTest, mappableTypeBuilderDataEntry) {
   // First, set up the test by attaching MappableInterface to IntegerType.
diff --git a/mlir/unittests/Dialect/OpenACC/OpenACCUtilsTest.cpp b/mlir/unittests/Dialect/OpenACC/OpenACCUtilsTest.cpp
index 3fbbcc90a67c9..6f4e30585b2c9 100644
--- a/mlir/unittests/Dialect/OpenACC/OpenACCUtilsTest.cpp
+++ b/mlir/unittests/Dialect/OpenACC/OpenACCUtilsTest.cpp
@@ -485,3 +485,192 @@ TEST_F(OpenACCUtilsTest, getVariableNameFromCopyin) {
   std::string varName = getVariableName(copyinOp->getAccVar());
   EXPECT_EQ(varName, name);
 }
+
+//===----------------------------------------------------------------------===//
+// getRecipeName Tests
+//===----------------------------------------------------------------------===//
+
+TEST_F(OpenACCUtilsTest, getRecipeNamePrivateScalarMemref) {
+  // Create a scalar memref type
+  auto scalarMemrefTy = MemRefType::get({}, b.getI32Type());
+
+  // Test private recipe with scalar memref
+  std::string recipeName =
+      getRecipeName(RecipeKind::private_recipe, scalarMemrefTy);
+  EXPECT_EQ(recipeName, "privatization_memref_i32_");
+}
+
+TEST_F(OpenACCUtilsTest, getRecipeNameFirstprivateScalarMemref) {
+  // Create a scalar memref type
+  auto scalarMemrefTy = MemRefType::get({}, b.getF32Type());
+
+  // Test firstprivate recipe with scalar memref
+  std::string recipeName =
+      getRecipeName(RecipeKind::firstprivate_recipe, scalarMemrefTy);
+  EXPECT_EQ(recipeName, "firstprivatization_memref_f32_");
+}
+
+TEST_F(OpenACCUtilsTest, getRecipeNameReductionScalarMemref) {
+  // Create a scalar memref type
+  auto scalarMemrefTy = MemRefType::get({}, b.getI64Type());
+
+  // Test reduction recipe with scalar memref
+  std::string recipeName =
+      getRecipeName(RecipeKind::reduction_recipe, scalarMemrefTy);
+  EXPECT_EQ(recipeName, "reduction_memref_i64_");
+}
+
+TEST_F(OpenACCUtilsTest, getRecipeNamePrivate2DMemref) {
+  // Create a 2D memref type
+  auto memref2DTy = MemRefType::get({5, 10}, b.getF32Type());
+
+  // Test private recipe with 2D memref
+  std::string recipeName =
+      getRecipeName(RecipeKind::private_recipe, memref2DTy);
+  EXPECT_EQ(recipeName, "privatization_memref_5x10xf32_");
+}
+
+TEST_F(OpenACCUtilsTest, getRecipeNameFirstprivate2DMemref) {
+  // Create a 2D memref type
+  auto memref2DTy = MemRefType::get({8, 16}, b.getF64Type());
+
+  // Test firstprivate recipe with 2D memref
+  std::string recipeName =
+      getRecipeName(RecipeKind::firstprivate_recipe, memref2DTy);
+  EXPECT_EQ(recipeName, "firstprivatization_memref_8x16xf64_");
+}
+
+TEST_F(OpenACCUtilsTest, getRecipeNameReduction2DMemref) {
+  // Create a 2D memref type
+  auto memref2DTy = MemRefType::get({4, 8}, b.getI32Type());
+
+  // Test reduction recipe with 2D memref
+  std::string recipeName =
+      getRecipeName(RecipeKind::reduction_recipe, memref2DTy);
+  EXPECT_EQ(recipeName, "reduction_memref_4x8xi32_");
+}
+
+TEST_F(OpenACCUtilsTest, getRecipeNamePrivateDynamicMemref) {
+  // Create a memref with dynamic dimensions
+  auto dynamicMemrefTy =
+      MemRefType::get({ShapedType::kDynamic, 10}, b.getI32Type());
+
+  // Test private recipe with dynamic memref
+  std::string recipeName =
+      getRecipeName(RecipeKind::private_recipe, dynamicMemrefTy);
+  EXPECT_EQ(recipeName, "privatization_memref_Ux10xi32_");
+}
+
+TEST_F(OpenACCUtilsTest, getRecipeNamePrivateUnrankedMemref) {
+  // Create an unranked memref type
+  auto unrankedMemrefTy = UnrankedMemRefType::get(b.getI32Type(), 0);
+
+  // Test private recipe with unranked memref
+  std::string recipeName =
+      getRecipeName(RecipeKind::private_recipe, unrankedMemrefTy);
+  EXPECT_EQ(recipeName, "privatization_memref_Zxi32_");
+}
+
+//===----------------------------------------------------------------------===//
+// getBaseEntity Tests
+//===----------------------------------------------------------------------===//
+
+// Local implementation of PartialEntityAccessOpInterface for memref.subview.
+// This is implemented locally in the test rather than officially because memref
+// operations already have ViewLikeOpInterface, which serves a similar purpose
+// for walking through views to the base entity. This test demonstrates how
+// getBaseEntity() would work if the interface were attached to memref.subview.
+namespace {
+struct SubViewOpPartialEntityAccessOpInterface
+    : public acc::PartialEntityAccessOpInterface::ExternalModel<
+          SubViewOpPartialEntityAccessOpInterface, memref::SubViewOp> {
+  Value getBaseEntity(Operation *op) const {
+    auto subviewOp = cast<memref::SubViewOp>(op);
+    return subviewOp.getSource();
+  }
+
+  bool isCompleteView(Operation *op) const {
+    // For testing purposes, we'll consider it a partial view (return false).
+    // The real implementation would need to look at the offsets.
+    return false;
+  }
+};
+} // namespace
+
+TEST_F(OpenACCUtilsTest, getBaseEntityFromSubview) {
+  // Register the local interface implementation for memref.subview
+  memref::SubViewOp::attachInterface<SubViewOpPartialEntityAccessOpInterface>(
+      context);
+
+  // Create a base memref
+  auto memrefTy = MemRefType::get({10, 20}, b.getF32Type());
+  OwningOpRef<memref::AllocaOp> allocOp =
+      memref::AllocaOp::create(b, loc, memrefTy);
+  Value baseMemref = allocOp->getResult();
+
+  // Create a subview of the base memref with non-zero offsets
+  // This creates a 5x10 view starting at [2, 3] in the original 10x20 memref
+  SmallVector<OpFoldResult> offsets = {b.getIndexAttr(2), b.getIndexAttr(3)};
+  SmallVector<OpFoldResult> sizes = {b.getIndexAttr(5), b.getIndexAttr(10)};
+  SmallVector<OpFoldResult> strides = {b.getIndexAttr(1), b.getIndexAttr(1)};
+
+  OwningOpRef<memref::SubViewOp> subviewOp =
+      memref::SubViewOp::create(b, loc, baseMemref, offsets, sizes, strides);
+  Value subview = subviewOp->getResult();
+
+  // Test that getBaseEntity returns the base memref, not the subview
+  Value baseEntity = getBaseEntity(subview);
+  EXPECT_EQ(baseEntity, baseMemref);
+}
+
+TEST_F(OpenACCUtilsTest, getBaseEntityNoInterface) {
+  // Create a memref without the interface
+  auto memrefTy = MemRefType::get({10}, b.getI32Type());
+  OwningOpRef<memref::AllocaOp> allocOp =
+      memref::AllocaOp::create(b, loc, memrefTy);
+  Value varPtr = allocOp->getResult();
+
+  // Test that getBaseEntity returns the value itself when there's no interface
+  Value baseEntity = getBaseEntity(varPtr);
+  EXPECT_EQ(baseEntity, varPtr);
+}
+
+TEST_F(OpenACCUtilsTest, getBaseEntityChainedSubviews) {
+  // Register the local interface implementation for memref.subview
+  memref::SubViewOp::attachInterface<SubViewOpPartialEntityAccessOpInterface>(
+      context);
+
+  // Create a base memref
+  auto memrefTy = MemRefType::get({100, 200}, b.getI64Type());
+  OwningOpRef<memref::AllocaOp> allocOp =
+      memref::AllocaOp::create(b, loc, memrefTy);
+  Value baseMemref = allocOp->getResult();
+
+  // Create first subview
+  SmallVector<OpFoldResult> offsets1 = {b.getIndexAttr(10), b.getIndexAttr(20)};
+  SmallVector<OpFoldResult> sizes1 = {b.getIndexAttr(50), b.getIndexAttr(80)};
+  SmallVector<OpFoldResult> strides1 = {b.getIndexAttr(1), b.getIndexAttr(1)};
+
+  OwningOpRef<memref::SubViewOp> subview1Op =
+      memref::SubViewOp::create(b, loc, baseMemref, offsets1, sizes1, strides1);
+  Value subview1 = subview1Op->getResult();
+
+  // Create second subview (subview of subview)
+  SmallVector<OpFoldResult> offsets2 = {b.getIndexAttr(5), b.getIndexAttr(10)};
+  SmallVector<OpFoldResult> sizes2 = {b.getIndexAttr(20), b.getIndexAttr(30)};
+  SmallVector<OpFoldResult> strides2 = {b.getIndexAttr(1), b.getIndexAttr(1)};
+
+  OwningOpRef<memref::SubViewOp> subview2Op =
+      memref::SubViewOp::create(b, loc, subview1, offsets2, sizes2, strides2);
+  Value subview2 = subview2Op->getResult();
+
+  // Test that getBaseEntity on the nested subview returns the first subview
+  // (since our implementation returns the immediate source, not the ultimate
+  // base)
+  Value baseEntity = getBaseEntity(subview2);
+  EXPECT_EQ(baseEntity, subview1);
+
+  // Test that calling getBaseEntity again returns the original base
+  Value ultimateBase = getBaseEntity(baseEntity);
+  EXPECT_EQ(ultimateBase, baseMemref);
+}
diff --git a/mlir/unittests/Interfaces/ControlFlowInterfacesTest.cpp b/mlir/unittests/Interfaces/ControlFlowInterfacesTest.cpp
index f1aae15393fd3..2e6950fca6be2 100644
--- a/mlir/unittests/Interfaces/ControlFlowInterfacesTest.cpp
+++ b/mlir/unittests/Interfaces/ControlFlowInterfacesTest.cpp
@@ -13,17 +13,24 @@
 #include "mlir/IR/OpDefinition.h"
 #include "mlir/IR/OpImplementation.h"
 #include "mlir/Parser/Parser.h"
+#include "llvm/Support/DebugLog.h"
 
 #include <gtest/gtest.h>
 
 using namespace mlir;
 
 /// A dummy op that is also a terminator.
-struct DummyOp : public Op<DummyOp, OpTrait::IsTerminator> {
+struct DummyOp : public Op<DummyOp, OpTrait::IsTerminator, OpTrait::ZeroResults,
+                           OpTrait::ZeroSuccessors,
+                           RegionBranchTerminatorOpInterface::Trait> {
   using Op::Op;
   static ArrayRef<StringRef> getAttributeNames() { return {}; }
 
   static StringRef getOperationName() { return "cftest.dummy_op"; }
+
+  MutableOperandRange getMutableSuccessorOperands(RegionSuccessor point) {
+    return MutableOperandRange(getOperation(), 0, 0);
+  }
 };
 
 /// All regions of this op are mutually exclusive.
@@ -39,6 +46,8 @@ struct MutuallyExclusiveRegionsOp
   // Regions have no successors.
   void getSuccessorRegions(RegionBranchPoint point,
                            SmallVectorImpl<RegionSuccessor> &regions) {}
+  using RegionBranchOpInterface::Trait<
+      MutuallyExclusiveRegionsOp>::getSuccessorRegions;
 };
 
 /// All regions of this op call each other in a large circle.
@@ -53,13 +62,18 @@ struct LoopRegionsOp
 
   void getSuccessorRegions(RegionBranchPoint point,
                            SmallVectorImpl<RegionSuccessor> &regions) {
-    if (Region *region = point.getRegionOrNull()) {
-      if (point == (*this)->getRegion(1))
+    if (point.getTerminatorPredecessorOrNull()) {
+      Region *region =
+          point.getTerminatorPredecessorOrNull()->getParentRegion();
+      if (region == &(*this)->getRegion(1))
         // This region also branches back to the parent.
-        regions.push_back(RegionSuccessor());
+        regions.push_back(
+            RegionSuccessor(getOperation()->getParentOp(),
+                            getOperation()->getParentOp()->getResults()));
       regions.push_back(RegionSuccessor(region));
     }
   }
+  using RegionBranchOpInterface::Trait<LoopRegionsOp>::getSuccessorRegions;
 };
 
 /// Each region branches back it itself or the parent.
@@ -75,11 +89,17 @@ struct DoubleLoopRegionsOp
 
   void getSuccessorRegions(RegionBranchPoint point,
                            SmallVectorImpl<RegionSuccessor> &regions) {
-    if (Region *region = point.getRegionOrNull()) {
-      regions.push_back(RegionSuccessor());
+    if (point.getTerminatorPredecessorOrNull()) {
+      Region *region =
+          point.getTerminatorPredecessorOrNull()->getParentRegion();
+      regions.push_back(
+          RegionSuccessor(getOperation()->getParentOp(),
+                          getOperation()->getParentOp()->getResults()));
       regions.push_back(RegionSuccessor(region));
     }
   }
+  using RegionBranchOpInterface::Trait<
+      DoubleLoopRegionsOp>::getSuccessorRegions;
 };
 
 /// Regions are executed sequentially.
@@ -93,11 +113,15 @@ struct SequentialRegionsOp
   // Region 0 has Region 1 as a successor.
   void getSuccessorRegions(RegionBranchPoint point,
                            SmallVectorImpl<RegionSuccessor> &regions) {
-    if (point == (*this)->getRegion(0)) {
+    if (point.getTerminatorPredecessorOrNull() &&
+        point.getTerminatorPredecessorOrNull()->getParentRegion() ==
+            &(*this)->getRegion(0)) {
       Operation *thisOp = this->getOperation();
       regions.push_back(RegionSuccessor(&thisOp->getRegion(1)));
     }
   }
+  using RegionBranchOpInterface::Trait<
+      SequentialRegionsOp>::getSuccessorRegions;
 };
 
 /// A dialect putting all the above together.
diff --git a/offload/test/offloading/fortran/descriptor-array-slice-map.f90 b/offload/test/offloading/fortran/descriptor-array-slice-map.f90
new file mode 100644
index 0000000000000..69abb320adc35
--- /dev/null
+++ b/offload/test/offloading/fortran/descriptor-array-slice-map.f90
@@ -0,0 +1,61 @@
+! Offloading test which aims to test that an allocatable/descriptor type map
+! will allow the appropriate slicing behaviour.
+! REQUIRES: flang, amdgpu
+
+subroutine slice_writer(n, a, b, c)
+    implicit none
+    integer, intent(in) :: n
+    real(8), intent(in) :: a(n)
+    real(8), intent(in) :: b(n)
+    real(8), intent(out) :: c(n)
+    integer :: i
+
+    !$omp target teams distribute parallel do
+    do i=1,n
+       c(i) = b(i) + a(i)
+    end do
+end subroutine slice_writer
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+    implicit none
+    real(kind=8), allocatable :: a(:,:,:)
+    integer :: i, j, k, idx, idx1, idx2, idx3
+
+    i=50
+    j=100
+    k=2
+
+    allocate(a(1:i,1:j,1:k))
+
+    do idx1=1, i
+        do idx2=1, j
+            do idx3=1, k
+                a(idx1,idx2,idx3) = idx2
+            end do
+        end do
+    end do
+
+    do idx=1,k
+        !$omp target enter data map(alloc: a(1:i,:, idx))
+
+        !$omp target update to(a(1:i, 1:30, idx), &
+        !$omp&                 a(1:i, 61:100, idx))
+
+        call slice_writer(i, a(:, 1, idx), a(:, 61, idx), a(:, 31, idx))
+        call slice_writer(i, a(:, 30, idx), a(:, 100, idx), a(:, 60, idx))
+
+        !$omp target update from(a(1:i, 31:60, idx))
+        !$omp target exit data map(delete: a(1:i, :, idx))
+
+        print *, a(1, 31, idx), a(2, 31, idx), a(i, 31, idx)
+        print *, a(1, 60, idx), a(2, 60, idx), a(i, 60, idx)
+    enddo
+
+    deallocate(a)
+end program
+
+! CHECK: 62. 62. 62.
+! CHECK: 130. 130. 130.
+! CHECK: 62. 62. 62.
+! CHECK: 130. 130. 130.
diff --git a/openmp/runtime/src/z_Linux_asm.S b/openmp/runtime/src/z_Linux_asm.S
index 89359759fcb42..12fea67e000e8 100644
--- a/openmp/runtime/src/z_Linux_asm.S
+++ b/openmp/runtime/src/z_Linux_asm.S
@@ -121,8 +121,7 @@ KMP_PREFIX_UNDERSCORE(\proc):
 # endif // KMP_OS_DARWIN
 #endif // KMP_ARCH_X86 || KMP_ARCH_x86_64
 
-#if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS || KMP_OS_OPENBSD) &&     \
-    (KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32 || KMP_ARCH_ARM)
+#if KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32 || KMP_ARCH_ARM
 
 # if KMP_OS_DARWIN
 #  define KMP_PREFIX_UNDERSCORE(x) _##x  // extra underscore for OS X* symbols
@@ -237,8 +236,7 @@ KMP_PREFIX_UNDERSCORE(\proc):
 #  define PACBTI_RET
 #  define GNU_PROPERTY_BTI_PAC
 # endif
-#endif // (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS || KMP_OS_OPENBSD) && \
-          (KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32 || KMP_ARCH_ARM)
+#endif // KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32 || KMP_ARCH_ARM
 
 .macro COMMON name, size, align_power
 #if KMP_OS_DARWIN
@@ -1302,7 +1300,7 @@ KMP_LABEL(kmp_no_args):
 #endif /* KMP_ARCH_X86_64 */
 
 // '
-#if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && (KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32)
+#if KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32
 
 //------------------------------------------------------------------------
 // int
@@ -1360,10 +1358,10 @@ __tid = 8
 	PROC __kmp_invoke_microtask
 	PACBTI_C
 
-	stp	x29, x30, [sp, #-16]!
 # if OMPT_SUPPORT
 	stp	x19, x20, [sp, #-16]!
 # endif
+	stp	x29, x30, [sp, #-16]!
 	mov	x29, sp
 
 	orr	w9, wzr, #1
@@ -1417,20 +1415,20 @@ KMP_LABEL(kmp_1):
 	blr	x8
 	orr	w0, wzr, #1
 	mov	sp, x29
+	ldp	x29, x30, [sp], #16
 # if OMPT_SUPPORT
 	str	xzr, [x19]
 	ldp	x19, x20, [sp], #16
 # endif
-	ldp	x29, x30, [sp], #16
 	PACBTI_RET
 	ret
 
 	DEBUG_INFO __kmp_invoke_microtask
 // -- End  __kmp_invoke_microtask
 
-#endif /* (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && (KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32) */
+#endif /* KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32 */
 
-#if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_ARM
+#if KMP_ARCH_ARM
 
 //------------------------------------------------------------------------
 // int
@@ -1573,7 +1571,7 @@ KMP_LABEL(kmp_1):
 	DEBUG_INFO __kmp_invoke_microtask
 // -- End  __kmp_invoke_microtask
 
-#endif /* (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_ARM */
+#endif /* KMP_ARCH_ARM */
 
 #if KMP_ARCH_PPC64
 
diff --git a/openmp/runtime/src/z_Linux_util.cpp b/openmp/runtime/src/z_Linux_util.cpp
index 368c0b6e872cc..c7fe0642cea63 100644
--- a/openmp/runtime/src/z_Linux_util.cpp
+++ b/openmp/runtime/src/z_Linux_util.cpp
@@ -2736,8 +2736,7 @@ int __kmp_get_load_balance(int max) {
 
 #endif // USE_LOAD_BALANCE
 
-#if !(KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_MIC ||                            \
-      ((KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64) ||                 \
+#if !(KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_MIC || KMP_ARCH_AARCH64 ||        \
       KMP_ARCH_PPC64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 ||            \
       KMP_ARCH_ARM || KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_PPC_XCOFF ||   \
       KMP_ARCH_AARCH64_32)
diff --git a/polly/lib/Transform/CodePreparation.cpp b/polly/lib/Transform/CodePreparation.cpp
index 7c8579eb93218..d045fb6b62c90 100644
--- a/polly/lib/Transform/CodePreparation.cpp
+++ b/polly/lib/Transform/CodePreparation.cpp
@@ -27,6 +27,26 @@
 using namespace llvm;
 using namespace polly;
 
+static bool runCodePreprationImpl(Function &F, DominatorTree *DT, LoopInfo *LI,
+                                  RegionInfo *RI) {
+  // Find first non-alloca instruction. Every basic block has a non-alloca
+  // instruction, as every well formed basic block has a terminator.
+  auto &EntryBlock = F.getEntryBlock();
+  BasicBlock::iterator I = EntryBlock.begin();
+  while (isa<AllocaInst>(I))
+    ++I;
+
+  // Abort if not necessary to split
+  if (I->isTerminator() && isa<BranchInst>(I) &&
+      cast<BranchInst>(I)->isUnconditional())
+    return false;
+
+  // splitBlock updates DT, LI and RI.
+  splitEntryBlockForAlloca(&EntryBlock, DT, LI, RI);
+
+  return true;
+}
+
 namespace {
 
 /// Prepare the IR for the scop detection.
@@ -35,9 +55,6 @@ class CodePreparation final : public FunctionPass {
   CodePreparation(const CodePreparation &) = delete;
   const CodePreparation &operator=(const CodePreparation &) = delete;
 
-  LoopInfo *LI;
-  ScalarEvolution *SE;
-
   void clear();
 
 public:
@@ -58,19 +75,11 @@ class CodePreparation final : public FunctionPass {
 
 PreservedAnalyses CodePreparationPass::run(Function &F,
                                            FunctionAnalysisManager &FAM) {
-
-  // Find first non-alloca instruction. Every basic block has a non-alloca
-  // instruction, as every well formed basic block has a terminator.
-  auto &EntryBlock = F.getEntryBlock();
-  BasicBlock::iterator I = EntryBlock.begin();
-  while (isa<AllocaInst>(I))
-    ++I;
-
   auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
   auto &LI = FAM.getResult<LoopAnalysis>(F);
-
-  // splitBlock updates DT, LI and RI.
-  splitEntryBlockForAlloca(&EntryBlock, &DT, &LI, nullptr);
+  bool Changed = runCodePreprationImpl(F, &DT, &LI, nullptr);
+  if (!Changed)
+    return PreservedAnalyses::all();
 
   PreservedAnalyses PA;
   PA.preserve<DominatorTreeAnalysis>();
@@ -84,7 +93,6 @@ CodePreparation::~CodePreparation() { clear(); }
 
 void CodePreparation::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<LoopInfoWrapperPass>();
-  AU.addRequired<ScalarEvolutionWrapperPass>();
 
   AU.addPreserved<LoopInfoWrapperPass>();
   AU.addPreserved<RegionInfoPass>();
@@ -96,10 +104,11 @@ bool CodePreparation::runOnFunction(Function &F) {
   if (skipFunction(F))
     return false;
 
-  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
-  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  RegionInfo *RI = &getAnalysis<RegionInfoPass>().getRegionInfo();
 
-  splitEntryBlockForAlloca(&F.getEntryBlock(), this);
+  runCodePreprationImpl(F, DT, LI, RI);
 
   return true;
 }
diff --git a/runtimes/cmake/Modules/HandleLibC.cmake b/runtimes/cmake/Modules/HandleLibC.cmake
index 51fbf04df7e3b..01da5b260d3d4 100644
--- a/runtimes/cmake/Modules/HandleLibC.cmake
+++ b/runtimes/cmake/Modules/HandleLibC.cmake
@@ -30,6 +30,7 @@ elseif (RUNTIMES_USE_LIBC STREQUAL "llvm-libc")
   check_cxx_compiler_flag(-nostdlibinc CXX_SUPPORTS_NOSTDLIBINC_FLAG)
   if(CXX_SUPPORTS_NOSTDLIBINC_FLAG)
     target_compile_options(runtimes-libc-headers INTERFACE "-nostdlibinc")
+    target_compile_options(runtimes-libc-headers INTERFACE "-idirafter${LIBC_KERNEL_HEADERS}")
   endif()
 
   add_library(runtimes-libc-static INTERFACE)
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 599bc4b3d8bbf..5a1e0b53b021c 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -1000,6 +1000,7 @@ libc_support_library(
         ":__support_ctype_utils",
         ":__support_str_to_num_result",
         ":__support_uint128",
+        ":__support_wctype_utils",
         ":hdr_errno_macros",
     ],
 )
diff --git a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel
index 7d62afc982be8..7e5cc2e10f612 100644
--- a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel
@@ -1051,7 +1051,7 @@ gentbl_cc_library(
     strip_include_prefix = "tools/lldb-dap",
     tbl_outs = {"tools/lldb-dap/Options.inc": ["-gen-opt-parser-defs"]},
     tblgen = "//llvm:llvm-tblgen",
-    td_file = "tools/lldb-dap/Options.td",
+    td_file = "tools/lldb-dap/tool/Options.td",
     deps = ["//llvm:OptParserTdFiles"],
 )
 
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index d528daeb160cf..83414ceed5ca5 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -3781,6 +3781,7 @@ cc_library(
         ":XeGPUDialect",
         ":XeGPUPassIncGen",
         ":XeGPUUtils",
+        ":XeGPUuArch",
         "//llvm:Support",
     ],
 )
@@ -4681,6 +4682,8 @@ cc_library(
         ":ControlFlowDialect",
         ":IR",
         ":MemRefDialect",
+        ":Pass",
+        ":TransformUtils",
     ],
 )
 
@@ -7898,6 +7901,7 @@ cc_library(
     deps = [
         ":AffineDialect",
         ":AffineToStandard",
+        ":Analysis",
         ":ArithDialect",
         ":ComplexDialect",
         ":ConversionPassIncGen",
@@ -12086,6 +12090,7 @@ cc_library(
     srcs = glob(["lib/Dialect/Transform/TuneExtension/*.cpp"]),
     hdrs = glob(["include/mlir/Dialect/Transform/TuneExtension/*.h"]),
     deps = [
+        ":ControlFlowInterfaces",
         ":IR",
         ":TransformDialect",
         ":TransformDialectInterfaces",
diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
index 778f0be86025a..aa61da4667720 100644
--- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
@@ -386,6 +386,7 @@ cc_library(
         "//mlir:CallOpInterfaces",
         "//mlir:CommonFolders",
         "//mlir:ControlFlowInterfaces",
+        "//mlir:ControlFlowTransforms",
         "//mlir:DLTIDialect",
         "//mlir:DataLayoutInterfaces",
         "//mlir:DerivedAttributeOpInterface",